In [9]:
import numpy as np
import time
import sklearn.linear_model as lm
import sklearn.svm as svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score

In [10]:
data = np.loadtxt('train.dat').astype(np.float64)
test = np.loadtxt('test.dat').astype(np.float64)

In [11]:
y = data[:,data.shape[1]-1]
X = data[:,:data.shape[1]-1]
print(f"y.shape {y.shape}")
print(f"X.shape {X.shape}")


y.shape (40000,)
X.shape (40000, 32)


In [12]:
def feature_map_method1(X):
    X = 1-2*X
    D = X.shape[1]
    for i in range(1,D):
        X[:,D-i-1] = X[:,D-i-1] * X[:,D-i]
    
    n_samples, n_features = X.shape
    outer_product = np.einsum('ij,ik->ijk', X, X)
    upper_triangle_indices = np.triu_indices(n_features,k=1)
    features = outer_product[:, upper_triangle_indices[0], upper_triangle_indices[1]]

    result = np.concatenate((X,features),axis=1)

    return result

def feature_map_method2(X):
    X = 1-2*X
    X_rev = X[:,::-1]
    X_prod = np.cumprod(X_rev,axis=1)
    X = X_prod[:,::-1]
    
    n_samples, n_features = X.shape
    outer_product = np.einsum('ij,ik->ijk', X, X)
    upper_triangle_indices = np.triu_indices(n_features,k=1)
    features = outer_product[:, upper_triangle_indices[0], upper_triangle_indices[1]]

    result = np.concatenate((X,features),axis=1)

    return result



Khatri-Rao is bullcrap crashes the kernel

In [13]:
X = data[:,:data.shape[1]-1]
start = time.time()
X = feature_map_method1(X)
end = time.time()
print(f"Time for execution:{end-start}")
print(X.shape)

Time for execution:0.3828730583190918
(40000, 528)


## Other things to try
- SGDclassifier
- LogisticRegressionCV
- GridSearchCV


## Experiments to try (check accuracy and training time)
- check l1 and l2 penalty for all solvers of Logistic Regression
- Take different values of tol from 1e2 to 1e-5 logistic regression
- Take different values of C uniformly from 0.1-1 and 2-10 (Can do for different solvers and create graphs) logistic regression
- Same for SVC  C and tol - do for both hinge loss and squared hinge loss

## Logistic Regression


In [14]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.LogisticRegression()
model.fit(X_train,y_train)

In [15]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train accuracy {accuracy_score(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val accuracy {accuracy_score(y_pred,y_val)}")




train R2 - Score 0.9924998124953124
train accuracy 0.998125
Val R2 - Score 0.8828406411874132
Val accuracy 0.97071875


## Ridge Classifier

In [16]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.RidgeClassifier()
model.fit(X_train,y_train)

In [17]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.4358301349510957
train MAE 0.1335
Val R2 - Score 0.21687364586394742
Val MAE 0.17903125


## Ridge Classifier with Cross Validation

In [18]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])
model.fit(X_train,y_train)

In [19]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.4277639584720131
train MAE 0.13475
Val R2 - Score 0.2305495300399163
Val MAE 0.17703125


## Support Vector Classifier

In [20]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
#model = svm.LinearSVC(penalty='l2',loss='hinge')
model = svm.LinearSVC()
model.fit(X_train,y_train)



In [21]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 1.0
train MAE 0.0
Val R2 - Score 0.8778682441101752
Val MAE 0.03053125
