In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import ttest_rel

In [None]:
# Feature matrices: read each pre-split CSV and keep only the underlying NumPy array.
X_train, X_test = (
    pd.read_csv(csv_path).values
    for csv_path in ('X_train_classification.csv', 'X_test_classification.csv')
)
# Label files: same read, then flattened to 1-D vectors.
y_train, y_test = (
    pd.read_csv(csv_path).values.flatten()
    for csv_path in ('y_train_classification.csv', 'y_test_classification.csv')
)

In [None]:
# Recode the labels to the {-1, +1} convention used by the hinge-loss updates below.
# The test is `<= 0` rather than `== 0` so that the cell is idempotent: it overwrites
# y_train / y_test in place, and with `== 0` a second run would find no zeros left and
# turn every -1 into +1. For 0/1 input the result is identical to the `== 0` form.
y_train = np.where(y_train <= 0, -1, 1)
y_test = np.where(y_test <= 0, -1, 1)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def initialize_parameters(n_features):
    """Return the starting point for training: zero weights and a zero bias.

    Parameters
    ----------
    n_features : int
        Length of the weight vector to create.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is ``np.zeros(n_features)`` and ``b`` is the integer 0.
    """
    return np.zeros(n_features), 0

In [None]:
def predict(X, w, b):
    """Classify samples with the linear decision function ``X @ w + b``.

    Parameters
    ----------
    X : array-like
        Samples to classify; passed straight to ``np.dot(X, w)``.
    w : array-like
        Weight vector.
    b : float
        Bias term added to every score.

    Returns
    -------
    numpy.ndarray
        Float array containing only ``-1.0`` and ``1.0``.

    Notes
    -----
    ``np.sign`` is deliberately not used: it yields ``0`` for a sample lying exactly
    on the hyperplane (for example every sample when ``w`` and ``b`` are still zero),
    and ``0`` is not a valid label in the {-1, +1} encoding. A score of exactly zero
    is assigned to the negative class.
    """
    scores = np.dot(X, w) + b
    return np.where(scores > 0, 1.0, -1.0)

In [None]:
def sgd_svm(X, y, learning_rate=0.01, C=0.01, epochs=3):
    """Train a linear SVM with per-sample hinge-loss sub-gradient steps.

    Samples are visited in their stored order (no shuffling), once per epoch.
    For each sample the hinge margin ``1 - y_i * (x_i . w + b)`` is evaluated:

    * margin > 0 (sample inside the margin or misclassified): ``w`` is shrunk by
      ``learning_rate * w`` and pushed by ``learning_rate * C * y_i * x_i``; ``b``
      moves by ``learning_rate * C * y_i``.
    * otherwise: only the shrinkage ``w -= learning_rate * w`` is applied and
      ``b`` is left unchanged.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training samples. Anything ``np.asarray`` accepts (ndarray, DataFrame,
        nested list) is allowed.
    y : array-like with n_samples entries
        Targets; the update rule assumes they are encoded as -1 / +1.
    learning_rate : float, default 0.01
        Constant step size.
    C : float, default 0.01
        Weight of the hinge-loss term relative to the weight shrinkage.
    epochs : int, default 3
        Number of full passes over the data.

    Returns
    -------
    tuple
        ``(w, b)``: the learned weight vector and bias.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``y`` does not have one entry per row of ``X``.

    Notes
    -----
    NOTE(review): the shrinkage term is applied at full strength on every sample
    step, so its effective strength grows with the number of samples per epoch.
    Confirm this is the intended objective before comparing the weights against
    another solver.
    """
    # Coerce once so DataFrames / lists can be indexed row-wise like ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features); got shape {X.shape}")
    n_samples, n_features = X.shape
    # Without this check a longer y would be silently truncated.
    if y.shape[:1] != (n_samples,):
        raise ValueError(
            f"y must have one entry per row of X; got X with {n_samples} rows "
            f"and y with shape {y.shape}"
        )

    w, b = initialize_parameters(n_features)

    for _ in range(epochs):
        for xi, yi in zip(X, y):
            margin = 1 - yi * (np.dot(xi, w) + b)
            if margin > 0:
                # Margin violated: shrink w and step along the hinge sub-gradient.
                w = w - learning_rate * (w - C * yi * xi)
                b = b - learning_rate * (-C * yi)
            else:
                # Margin satisfied: only the shrinkage of w applies.
                w = w - learning_rate * w

    return w, b

In [None]:
# Fit the hand-written SGD SVM on the scaled training split.
# Original author's note: "change C=1 when there are high attributes"
# (presumably: use C=1 when the dataset has many features -- TODO confirm).
w_manual, b_manual = sgd_svm(
    X_train,
    y_train,
    learning_rate=0.01,
    C=0.01,
    epochs=3,
)

In [None]:
# Score the hand-written model on the held-out test split.
y_pred_manual = predict(X=X_test, w=w_manual, b=b_manual)
accuracy_manual = accuracy_score(y_true=y_test, y_pred=y_pred_manual)

In [None]:
# Baseline: scikit-learn's SGDClassifier with hinge loss, limited to 3 passes
# (tol=None disables early stopping) to mirror the hand-written run.
# Original author's note: "change C=1 when there are high attributes"
# (presumably: use C=1 when the dataset has many features -- TODO confirm).
C = 0.01
# Regularisation strength passed to SGDClassifier, derived from C and the
# number of training rows.
alpha = 1.0 / (X_train.shape[0] * C)
sgd_clf = SGDClassifier(
    loss='hinge',
    alpha=alpha,
    max_iter=3,
    tol=None,
    random_state=42,
).fit(X_train, y_train)
y_pred_sklearn = sgd_clf.predict(X_test)
accuracy_sklearn = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)

In [None]:
# Test-set accuracy of the hand-written SGD SVM.
accuracy_manual

0.85

In [None]:
# Test-set accuracy of the scikit-learn SGDClassifier baseline.
accuracy_sklearn

0.55

In [None]:
# Weight vector learned by the hand-written SVM (one entry per feature).
w_manual

array([0.00020519, 0.00715798])

In [None]:
# Weights learned by SGDClassifier; coef_ is 2-D, with a single row here.
sgd_clf.coef_

array([[0.08697866, 0.43660696]])

In [None]:
# Take the first (here: only) row of coef_ so it is a 1-D vector like w_manual.
w_sklearn = sgd_clf.coef_[0, :]

In [None]:
# Paired t-test over the coefficients of the two weight vectors, pairing them
# feature by feature.
# NOTE(review): the test treats each feature as one paired observation; with only
# two coefficients (as displayed for w_manual) that leaves a single degree of
# freedom, so the p-value carries little evidence either way -- confirm this is
# the comparison intended.
t_stat, p_value = ttest_rel(a=w_manual, b=w_sklearn)

In [None]:
# t statistic returned by ttest_rel(w_manual, w_sklearn).
t_stat

-1.5064468567617095

In [None]:
# p-value returned by ttest_rel(w_manual, w_sklearn).
p_value

0.37307500194383186

In [None]:
# Bias term fitted by SGDClassifier (the counterpart of b_manual).
sgd_clf.intercept_

array([0.41406289])