In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import ttest_rel

In [2]:
# Read the pre-split train/test features and labels from CSV.
# Feature files become 2-D arrays; label files are flattened to 1-D vectors.
X_train, X_test = (
    pd.read_csv(csv_path).values
    for csv_path in ('X_train1.csv', 'X_test1.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path).values.flatten()
    for csv_path in ('y_train1.csv', 'y_test1.csv')
)

In [3]:
# Recode class labels to the {-1, +1} convention that sgd_svm's hinge-loss
# updates rely on (assumes the CSV labels are 0/1 — TODO confirm).
# `> 0` is used instead of `== 0` so the cell is idempotent: re-running it on
# already-recoded labels leaves -1 as -1, whereas `== 0` would turn every
# label into +1 on the second run.
y_train = np.where(y_train > 0, 1, -1)
y_test = np.where(y_test > 0, 1, -1)

In [4]:
# Standardise the features: the scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [5]:
def initialize_parameters(n_features):
    """Return the starting point for training: all-zero weights and a zero bias.

    Parameters
    ----------
    n_features : int
        Length of the weight vector to create.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a 1-D NumPy array of ``n_features`` zeros
        and ``b`` is the integer ``0``.
    """
    return np.zeros(n_features), 0

In [6]:
def predict(X, w, b):
    """Predict class labels in {-1, +1} from a linear decision function.

    Parameters
    ----------
    X : array-like
        Samples, one per row; combined with ``w`` via ``np.dot``.
    w : array-like
        Weight vector.
    b : float
        Bias term added to every score.

    Returns
    -------
    numpy.ndarray
        Float array holding ``1.0`` where ``X @ w + b`` is positive and
        ``-1.0`` where it is negative or exactly zero. NaN scores stay NaN.
    """
    scores = np.asarray(np.dot(X, w) + b, dtype=float)
    # np.sign yields 0 for a score that sits exactly on the decision boundary
    # (e.g. with all-zero weights), which is not a valid class label. Ties are
    # resolved to -1, the same side scikit-learn's linear classifiers pick
    # (they predict the positive class only when the score is > 0).
    return np.where(scores == 0, -1.0, np.sign(scores))

In [7]:
def sgd_svm(X, y, learning_rate=0.01, C=1.0, epochs=3, shuffle=False, random_state=None):
    """Train a linear soft-margin SVM with per-sample subgradient steps.

    For every visited sample ``(x_i, y_i)`` the hinge margin
    ``1 - y_i * (w . x_i + b)`` is evaluated:

    * margin > 0 (sample inside the margin or misclassified):
      ``w <- w - lr * (w - C * y_i * x_i)`` and ``b <- b + lr * C * y_i``;
    * otherwise only the regulariser acts: ``w <- w - lr * w``
      (the bias is left unchanged).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training samples.
    y : array-like of length n_samples
        Labels; every entry must be -1 or +1.
    learning_rate : float, default 0.01
        Constant step size used for every update.
    C : float, default 1.0
        Weight of the hinge-loss term relative to the ``w`` shrinkage.
    epochs : int, default 3
        Number of full passes over the data.
    shuffle : bool, default False
        If True, visit the samples in a fresh random order each epoch.
        The default keeps the fixed 0..n-1 order.
    random_state : int or None, default None
        Seed for the shuffling generator; only used when ``shuffle`` is True.

    Returns
    -------
    tuple
        ``(w, b)``: the learned weight vector and bias.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, if ``y`` does not have one label per row of
        ``X``, or if ``y`` contains values other than -1 and +1.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples, n_features = X.shape
    # Without this check a longer y is silently truncated and a shorter one
    # fails mid-training with an IndexError.
    if len(y) != n_samples:
        raise ValueError(
            f"X has {n_samples} samples but y has {len(y)} labels"
        )
    # The updates below multiply by y_i, so e.g. a 0 label would contribute
    # nothing but weight shrinkage; reject such input instead of training on it.
    if not np.isin(y, (-1, 1)).all():
        raise ValueError("y must contain only -1 and +1 labels")

    w, b = initialize_parameters(n_features)
    rng = np.random.default_rng(random_state) if shuffle else None

    for _ in range(epochs):
        order = rng.permutation(n_samples) if shuffle else range(n_samples)
        for i in order:
            xi = X[i]
            yi = y[i]
            margin = 1 - yi * (np.dot(xi, w) + b)
            if margin > 0:
                # Hinge loss is active: regulariser + loss subgradient.
                w = w - learning_rate * (w - C * yi * xi)
                b = b - learning_rate * (-C * yi)
            else:
                # Sample is safely classified: only the regulariser shrinks w.
                w = w - learning_rate * w

    return w, b

In [8]:
# Fit the hand-written SVM on the training split.
# Author's note: change C=10 when there are high attributes
# (presumably "when the dataset has many features" — TODO confirm).
w_manual, b_manual = sgd_svm(
    X_train,
    y_train,
    learning_rate=0.01,
    C=1,
    epochs=3,
)

In [9]:
# Score the hand-written model on the held-out test split.
y_pred_manual = predict(X=X_test, w=w_manual, b=b_manual)
accuracy_manual = accuracy_score(y_true=y_test, y_pred=y_pred_manual)

In [10]:
# Baseline: scikit-learn's hinge-loss SGD classifier, limited to 3 passes with
# early stopping disabled (tol=None).
# NOTE(review): this is not a like-for-like run of sgd_svm. SGDClassifier's
# defaults shuffle the data and use its 'optimal' learning-rate schedule, while
# sgd_svm uses a fixed order and a constant step; sgd_svm also shrinks w by the
# full learning_rate * w on every sample, whereas alpha here is 1 / (n * C).
# Confirm whether the comparison is meant to be exact.
C = 1  # change C=10 when there are high attributes
alpha = 1.0 / (len(X_train) * C)
sgd_clf = SGDClassifier(
    loss='hinge',
    alpha=alpha,
    max_iter=3,
    tol=None,
    random_state=42,
).fit(X_train, y_train)
y_pred_sklearn = sgd_clf.predict(X_test)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)

In [11]:
# Test-set accuracy of the hand-written SGD SVM (sgd_svm + predict).
accuracy_manual

0.7888888888888889

In [12]:
# Test-set accuracy of the scikit-learn SGDClassifier baseline.
accuracy_sklearn

0.7555555555555555

In [13]:
# Weight vector learned by the hand-written sgd_svm.
w_manual

array([ 0.02359832,  0.17292294, -0.00993769,  0.07209141, -0.0702677 ,
       -0.01128185, -0.02426165, -0.00232784,  0.23595634,  0.10885654,
        0.10165723,  0.18055084,  0.13675739,  0.04455185,  0.13649581,
        0.09880037,  0.04349818, -0.02609455, -0.04539256, -0.20308784,
       -0.05955048, -0.10677053])

In [14]:
# Coefficients learned by SGDClassifier (a 2-D array with a single row here).
sgd_clf.coef_

array([[ 8.31736595e-02,  9.64745610e-01,  6.73919145e-01,
         1.77396020e+00, -4.01326639e-01, -2.55946930e-15,
         1.47433693e+00, -2.35970697e-01,  3.37408042e+00,
        -1.68322890e+00,  1.66727259e+00,  2.42683355e-01,
        -2.40160233e-01, -5.26282755e-01,  1.45893668e+00,
         1.31616459e+00,  1.18990068e+00,  7.12714975e-01,
        -3.60867048e+00, -2.47985100e+00, -3.95872163e+00,
        -1.31210685e+00]])

In [15]:
# Pull the single coefficient row out as a 1-D vector so it lines up with w_manual.
w_sklearn = sgd_clf.coef_[0, :]

In [16]:
# Paired t-test over the per-feature coefficient pairs of the two models.
# NOTE(review): this only tests whether the *mean* coefficient difference is
# zero. A large p-value does not show that the two weight vectors are similar
# (large positive and negative differences can cancel out); consider also
# reporting e.g. cosine similarity or the norm of the difference.
t_stat, p_value = ttest_rel(a=w_manual, b=w_sklearn)

In [17]:
# t statistic of the paired test on (w_manual, w_sklearn).
t_stat

0.03860986308006111

In [18]:
# p-value of the paired test on (w_manual, w_sklearn); a high value means the
# mean coefficient difference is not distinguishable from zero, nothing more.
p_value

0.9695661375639474