In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from scipy.special import expit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Alternative synthetic data source, kept for reference:
# X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature matrices: each CSV is read into a DataFrame and converted to a NumPy array.
X_train = pd.read_csv('X_train.csv').to_numpy()
X_test = pd.read_csv('X_test.csv').to_numpy()

# Label vectors: flattened to 1-D so the train and test targets share the same
# shape (previously only y_train was flattened and y_test stayed a column).
# NOTE(review): assumes each label CSV holds a single column — confirm.
y_train = pd.read_csv('y_train.csv').to_numpy().reshape(-1)
y_test = pd.read_csv('y_test.csv').to_numpy().reshape(-1)

In [3]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [4]:
def sigmoid(z):
    """Return the logistic sigmoid of ``z``.

    Thin wrapper that delegates to ``scipy.special.expit``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s), passed to ``expit`` unchanged.

    Returns
    -------
    Whatever ``expit(z)`` returns for the given input.
    """
    activation = expit(z)
    return activation

In [5]:
# def compute_loss(y_true, y_pred):
#     return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

In [6]:
def logistic_regression_sgd(X, y, learning_rate=0.01, epochs=3, random_state=None):
    """Fit binary logistic regression with per-sample stochastic gradient descent.

    Weights and bias start at zero. In every epoch the samples are visited
    once in a freshly shuffled order, and after each sample the parameters are
    moved against the gradient ``(sigmoid(x . w + b) - y) * x`` (weights) and
    ``sigmoid(x . w + b) - y`` (bias), scaled by ``learning_rate``.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample.
    y : array-like of shape (m,) or (m, 1)
        Targets; flattened to 1-D so a column vector cannot turn the bias
        into an array through broadcasting.
    learning_rate : float, default 0.01
        Step size applied to every per-sample update.
    epochs : int, default 3
        Number of full passes over the data.
    random_state : None, int, or object with a ``permutation`` method, default None
        Source of the per-epoch shuffles. ``None`` keeps using NumPy's global
        random state (so ``np.random.seed`` still controls it); an int seeds a
        private ``np.random.default_rng``; a ``Generator`` or ``RandomState``
        instance is used as given.

    Returns
    -------
    weights : numpy.ndarray of shape (n,)
    bias : float

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``y`` does not hold exactly one value per row.
    """
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"X and y have inconsistent lengths: {m} rows vs {y.shape[0]} targets"
        )

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif hasattr(random_state, "permutation"):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    weights = np.zeros(n)
    bias = 0.0

    # Shuffle an index order instead of copying X and y every epoch. Each new
    # permutation is applied on top of the previous order, which visits the
    # samples exactly as repeatedly reshuffling the arrays themselves would.
    order = np.arange(m)

    for _ in range(epochs):
        order = order[rng.permutation(m)]

        for i in order:
            xi = X[i]
            # Prediction error for this single sample.
            error = sigmoid(np.dot(xi, weights) + bias) - y[i]

            weights -= learning_rate * (error * xi)
            bias -= learning_rate * error

    return weights, bias

In [7]:
# Train the hand-written SGD model; the hyperparameters are spelled out with
# the same values as the function defaults so they are visible at the call site.
weights_sgd, bias_sgd = logistic_regression_sgd(
    X_train,
    y_train,
    learning_rate=0.01,
    epochs=3,
)

In [8]:
# Show the parameters learned by the hand-written SGD routine.
for label, value in (("Weights from SGD:", weights_sgd), ("Bias from SGD:", bias_sgd)):
    print(label, value)

Weights from SGD: [-0.18949692  0.30616352 -0.06540636  0.43299196 -0.19178846 -0.2746342
  0.1998107  -0.09073953  1.03307318 -0.01053275  0.31179407  0.41404817
  0.29090574  0.09856637  0.46533635 -0.02130242  0.34216249  0.03793722
 -0.42518941 -0.54036168 -0.76154155 -0.4093778 ]
Bias from SGD: 0.3525268615821252


In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
# Reference model: scikit-learn logistic regression fitted with the SAGA solver.
# max_iter=3 is presumably chosen to match the three-epoch default of the
# hand-written SGD — confirm. C is the inverse regularisation strength, so
# C=100 keeps the penalty weak.
# SAGA shuffles the data, so random_state is pinned to make refits reproducible.
model = LogisticRegression(max_iter=3, solver='saga', C=100, random_state=42)
model.fit(X_train, y_train)
# Alternative estimator, kept for reference:
# from sklearn.linear_model import SGDClassifier
# model = SGDClassifier(max_iter=3, tol=None, random_state=42)
# model.fit(X_train, y_train)



In [11]:
# Take the first row of coef_ and the first entry of intercept_ from the
# fitted scikit-learn model.
weights_sklearn, bias_sklearn = model.coef_[0], model.intercept_[0]

In [12]:
# Show the parameters learned by the scikit-learn model.
for label, value in (
    ("Weights from scikit-learn:", weights_sklearn),
    ("Bias from scikit-learn:", bias_sklearn),
):
    print(label, value)

Weights from scikit-learn: [-0.11328901  0.33546771 -0.10538747  0.39887194 -0.29913137 -0.3147627
  0.15037884 -0.09060104  0.92301968  0.11345633  0.26867468  0.39456753
  0.29926916  0.13753568  0.37221496  0.0902821   0.24384925  0.01638535
 -0.22818504 -0.47933165 -0.76719666 -0.37937842]
Bias from scikit-learn: 0.35824011792380006


In [13]:
# Report the absolute gap between the two fitted parameter sets.
print("\nComparison of Weights and Bias:")

weights_gap = np.abs(weights_sgd - weights_sklearn)
print("Weights difference:", weights_gap)

bias_gap = np.abs(bias_sgd - bias_sklearn)
print("Bias difference:", bias_gap)


Comparison of Weights and Bias:
Weights difference: [7.62079138e-02 2.93041870e-02 3.99811085e-02 3.41200225e-02
 1.07342907e-01 4.01284992e-02 4.94318587e-02 1.38489255e-04
 1.10053497e-01 1.23989080e-01 4.31193843e-02 1.94806428e-02
 8.36342283e-03 3.89693155e-02 9.31213892e-02 1.11584514e-01
 9.83132450e-02 2.15518651e-02 1.97004368e-01 6.10300217e-02
 5.65510929e-03 2.99993825e-02]
Bias difference: 0.005713256341674855


In [14]:
def predict(X, weights, bias):
    """Return 0/1 class predictions for ``X`` from a linear logistic model.

    Parameters
    ----------
    X : array-like
        Feature rows, combined with ``weights`` via ``np.dot``.
    weights : array-like
        Coefficient vector.
    bias : scalar
        Intercept added to every linear score.

    Returns
    -------
    Integer array holding 1 where ``sigmoid(X . weights + bias)`` is strictly
    greater than 0.5, and 0 otherwise.
    """
    logits = np.dot(X, weights) + bias
    is_positive = np.greater(sigmoid(logits), 0.5)
    return is_positive.astype(int)

In [15]:
# Classify the test features with the parameters learned by the hand-written SGD.
y_pred_sgd = predict(X=X_test, weights=weights_sgd, bias=bias_sgd)

In [16]:
from sklearn.metrics import accuracy_score

# Accuracy of the hand-written SGD predictions against the test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_sgd)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8338687754849871


In [17]:
# Accuracy of the scikit-learn model's predictions against the test labels.
y_pred_sklearn = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8340850117385394
