In [59]:
import numpy as np
import time
import sklearn.linear_model as lm
import sklearn.svm as svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error

In [2]:
data = np.loadtxt('train.dat').astype(np.int64)

## Tasks
- Prepare Feature Map
- Split Train data - train + val
- train on train data and hyper parameter tuning on val data till get best accuracy (r-2) on val data

## Models to Try

- LinearSVC
- LogisticRegression
- Ridge Regression
- Lasso Regression


## Experiments to try

- Loss parameter in SVC (hinge loss vs sqaured hinge loss)
- Iterating over C in SVC
- Iterating over tol in SVC and logistic Regression 
- Changing penalty in SVC and l2-l1 in logistic regression


In [3]:
y = data[:,data.shape[1]-1]
X = data[:,:data.shape[1]-1]
print(f"y.shape {y.shape}")
print(f"X.shape {X.shape}")


y.shape (40000,)
X.shape (40000, 32)


In [4]:
def feature_map_method1(X):
    X = 1-2*X
    D = X.shape[1]
    for i in range(1,D):
        X[:,D-i-1] = X[:,D-i-1] * X[:,D-i]
    
    n_samples, n_features = X.shape
    outer_product = np.einsum('ij,ik->ijk', X, X)
    upper_triangle_indices = np.triu_indices(n_features,k=1)
    features = outer_product[:, upper_triangle_indices[0], upper_triangle_indices[1]]

    result = np.concatenate((X,features),axis=1)

    return result

def feature_map_method2(X):
    X = 1-2*X
    X_rev = X[:,::-1]
    X_prod = np.cumprod(X_rev,axis=1)
    X = X_prod[:,::-1]
    
    n_samples, n_features = X.shape
    outer_product = np.einsum('ij,ik->ijk', X, X)
    upper_triangle_indices = np.triu_indices(n_features,k=1)
    features = outer_product[:, upper_triangle_indices[0], upper_triangle_indices[1]]

    result = np.concatenate((X,features),axis=1)

    return result



Khatri-Rao is bullcrap crashes the kernel

In [5]:
X = data[:,:data.shape[1]-1]
start = time.time()
X = feature_map_method2(X)
end = time.time()
print(f"Time for execution:{end-start}")
print(X.shape)

Time for execution:0.4778118133544922
(40000, 528)


## Logistic Regression


In [57]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.LogisticRegression(penalty='l2',tol=1e-4)
model.fit(X_train,y_train)

In [58]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.9909935553259329
train MAE 0.00225
Val R2 - Score 0.8892328652761633
Val MAE 0.0276875


## Ridge Classifier

In [55]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.RidgeClassifier()
model.fit(X_train,y_train)

In [56]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.42011862198439043
train MAE 0.135
Val R2 - Score 0.22387990044037998
Val MAE 0.1749375


## Ridge Classifier with Cross Validation

In [53]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
model = lm.RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])
model.fit(X_train,y_train)

In [54]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.4370530337638605
train MAE 0.132375
Val R2 - Score 0.2527669937238053
Val MAE 0.17096875


## Support Vector Classifier

In [69]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.2)
#model = svm.LinearSVC(penalty='l2',loss='hinge')
model = svm.SVC(kernel='linear')
model.fit(X_train,y_train)

In [70]:
y_pred = model.predict(X_train)
print(f"train R2 - Score {r2_score(y_pred,y_train)}")
print(f"train MAE {mean_absolute_error(y_pred,y_train)}")

y_pred = model.predict(X_val)

print(f"Val R2 - Score {r2_score(y_pred,y_val)}")
print(f"Val MAE {mean_absolute_error(y_pred,y_val)}")



train R2 - Score 0.9899983097143418
train MAE 0.0025
Val R2 - Score 0.8738748891478518
Val MAE 0.03153125
