In [34]:
import numpy as np
import time
import sklearn.linear_model as lm
import sklearn.svm as svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score
import pickle

In [35]:
data = np.loadtxt('train.dat').astype(np.float64)
test = np.loadtxt('test.dat').astype(np.float64)

In [40]:
y = data[:,data.shape[1]-1]
X = data[:,:data.shape[1]-1]

print(f"y.shape {y.shape}")
print(f"X.shape {X.shape}")


y.shape (40000,)
X.shape (40000, 32)


In [28]:
def feature_map_method1(X):
    X = 1-2*X
    D = X.shape[1]
    for i in range(1,D):
        X[:,D-i-1] = X[:,D-i-1] * X[:,D-i]
    
    n_samples, n_features = X.shape
    outer_product = np.einsum('ij,ik->ijk', X, X)
    upper_triangle_indices = np.triu_indices(n_features,k=1)
    features = outer_product[:, upper_triangle_indices[0], upper_triangle_indices[1]]

    result = np.concatenate((X,features),axis=1)

    return result

def feature_map_method2(X):
    X = 1-2*X
    X_rev = X[:,::-1]
    X_prod = np.cumprod(X_rev,axis=1)
    X = X_prod[:,::-1]
    
    n_samples, n_features = X.shape
    outer_product = np.einsum('ij,ik->ijk', X, X)
    upper_triangle_indices = np.triu_indices(n_features,k=1)
    features = outer_product[:, upper_triangle_indices[0], upper_triangle_indices[1]]

    result = np.concatenate((X,features),axis=1)

    return result



Khatri-Rao is bullcrap crashes the kernel

In [5]:
X = data[:,:data.shape[1]-1]
start = time.time()
X = feature_map_method1(X)
end = time.time()
print(f"Time for execution:{end-start}")
print(X.shape)

Time for execution:0.4766242504119873
(40000, 528)


## Other things to try
- SGDclassifier
- LogisticRegressionCV
- GridSearchCV


## Experiments to try (check accuracy and training time)
- check l1 and l2 penalty for all solvers of Logistic Regression -> 1 graph x 2
- Take different values of tol from 1e2 to 1e-5 logistic regression -> 1 graph x 2
- Take different values of C uniformly from 0.1-1 and 2-10 (Can do for different solvers and create graphs) logistic regression -> 1 graph x 2
- Same for SVC  C and tol - do for both hinge loss and squared hinge loss -> 4 graphs ((c,tol)*(hinge,sqaured)) x 2

## Logistic Regression


In [32]:
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2)
model = lm.LogisticRegression()
model.fit(X_train,y_train)

In [33]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train accuracy {accuracy_score(y_pred,y_train)}")

y_pred = model.predict(X_val)
print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val accuracy {accuracy_score(y_val,y_pred )}")




train R2 - Score 0.987799644237626
train accuracy 0.99695
Val R2 - Score 0.9627977366142956
Val accuracy 0.9907


In [8]:
y_pred==y_val

array([ True,  True,  True, ...,  True,  True,  True])

## Ridge Classifier

In [9]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.RidgeClassifier()
model.fit(X_train,y_train)

NameError: name 'y_' is not defined

In [None]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.4032376631429143
train MAE 0.139375
Val R2 - Score 0.21533607043069414
Val MAE 0.1785625


## Ridge Classifier with Cross Validation

In [None]:
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2)
model = lm.RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.29546666673844935
train MAE 0.15846875
Val R2 - Score 0.24300670298256366
Val MAE 0.166625


## Support Vector Classifier

In [None]:
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2)
#model = svm.LinearSVC(penalty='l2',loss='hinge')
model = svm.LinearSVC()
model.fit(X_train,y_train)



In [None]:
y_pred = model.predict(X_train)
print(f"train accuracy - Score {accuracy_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)
print(f"Val accuracy - Score {accuracy_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train accuracy - Score 0.997875
train MAE 0.002125
Val accuracy - Score 0.9905
Val MAE 0.0095


## SGDClassifer

In [None]:
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2)
#model = svm.LinearSVC(penalty='l2',loss='hinge')
model = lm.SGDClassifier()
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_train)
print(f"train accuracy - Score {accuracy_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)
print(f"Val accuracy - Score {accuracy_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train accuracy - Score 0.982375
train MAE 0.017625
Val accuracy - Score 0.97675
Val MAE 0.02325
