In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Read Data

In [2]:
from helpers.utils import read_data
from helpers.utils import get_features

In [3]:
# Load the training set from disk: X holds the raw inputs and y the targets
# that every model below is fitted against.
train_path = "./data/train.csv"
X, y = read_data(train_path)
# Show both shapes as a quick sanity check that rows line up between X and y.
X.shape, y.shape

((700, 5), (700,))

In [4]:
# Expand the raw inputs into the feature matrix used by all models below.
# The row count must be unchanged; only the number of columns grows.
X_features = get_features(X)
X_features.shape

(700, 21)

## 2. Cross Validation

In [5]:
import helpers.config as config

from helpers.cross_validation import CrossValidator
from helpers.utils import rmse_scoring

### 2.1 Ridge Regression

In [6]:
from sklearn.linear_model import Ridge

In [45]:
# Grid-search the Ridge penalty strength (alpha) with cross-validation.
# RMSE is the only metric tracked, and it is also the one used for the refit.
refit = "rmse"
scorings = {"rmse": rmse_scoring()}

# Candidate alphas and the fold count both come from the shared config module.
param_grid = {"alpha": config.config_lambda}

# The estimator is fitted without an intercept term.
classifier = Ridge(tol=1e-3, fit_intercept=False)

cv = CrossValidator(
    X_features,
    y,
    classifier,
    param_grid,
    config.config_K,
    scorings,
    refit=refit,
)
# fit() hands back the per-alpha results table that is inspected in the next cell.
df_train = cv.fit()

In [46]:
# Display the cross-validation results table returned by cv.fit() for Ridge.
df_train

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse
0,0.00271,0.00542,0.0002,0.0004,0.0,{'alpha': 0.0},-4.149157,-1.929628,-2.304097,-2.000639,-1.92489,-2.461682,0.855125,12
1,0.000802,0.000401,0.0,0.0,0.001,{'alpha': 0.001},-2.253038,-1.911017,-2.030677,-1.901543,-1.861436,-1.991542,0.142424,11
2,0.000879,0.001315,0.0,0.0,0.005,{'alpha': 0.005},-2.251434,-1.91242,-2.030274,-1.899697,-1.859538,-1.990673,0.14224,10
3,0.000601,0.00049,0.000403,0.000493,0.01,{'alpha': 0.01},-2.251463,-1.912509,-2.029837,-1.898485,-1.858942,-1.990247,0.142482,9
4,0.000729,0.000374,0.0002,0.0004,0.05,{'alpha': 0.05},-2.252368,-1.911771,-2.027193,-1.893324,-1.856568,-1.988245,0.143858,8
5,0.000203,0.000407,0.000799,0.000399,0.1,{'alpha': 0.1},-2.252418,-1.910901,-2.024719,-1.890746,-1.85494,-1.986745,0.144475,7
6,0.000603,0.000492,0.0,0.0,0.5,{'alpha': 0.5},-2.250871,-1.90776,-2.01464,-1.887439,-1.848112,-1.981765,0.145421,6
7,0.000217,0.000434,0.0,0.0,1.0,{'alpha': 1.0},-2.249552,-1.906859,-2.010228,-1.887603,-1.842342,-1.979317,0.145881,5
8,0.0,0.0,0.0,0.0,5.0,{'alpha': 5.0},-2.246756,-1.909411,-2.006213,-1.892034,-1.821122,-1.975107,0.148114,3
9,0.0,0.0,0.0,0.0,10.0,{'alpha': 10.0},-2.245685,-1.912618,-2.006658,-1.894706,-1.812267,-1.974387,0.149058,2


In [47]:
# cv.predict() yields predictions together with an RMSE value; only the RMSE is displayed.
# NOTE(review): which data split predict() evaluates on is decided inside
# helpers.cross_validation — confirm before comparing against other numbers.
y_pred, rmse_val = cv.predict()
rmse_val

1.879761002973425

In [48]:
# Inspect the coefficients exposed by the Ridge cross-validator
# (presumably those of the refitted best model — verify in helpers.cross_validation).
coeffs = cv.get_coeff()
coeffs

array([ 0.12518284, -0.10464149, -0.18692221,  0.21906393,  0.04093704,
       -0.04741644,  0.0248698 ,  0.03729714, -0.0827775 ,  0.01499384,
       -0.49972776, -0.69286684, -0.76943339, -0.41971174, -0.55352431,
       -0.57631292, -0.61231367, -0.61853893, -0.5592288 , -0.60708634,
       -0.599955  ])

### 2.2 Lasso

In [49]:
from sklearn.linear_model import Lasso

In [50]:
# Grid-search the Lasso penalty strength (alpha) with cross-validation.
# RMSE is the only metric tracked, and it is also the one used for the refit.
refit = "rmse"
scorings = {"rmse": rmse_scoring()}

# Lasso uses its own explicit alpha grid rather than config.config_lambda.
param_grid = {
    "alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50],
}

# No intercept term; the iteration cap is raised to 6000 for the solver.
classifier = Lasso(max_iter=6000, tol=1e-3, fit_intercept=False)

# NOTE: this rebinds `cv`, so later cells see the Lasso cross-validator.
cv = CrossValidator(
    X_features,
    y,
    classifier,
    param_grid,
    config.config_K,
    scorings,
    refit=refit,
)
# fit() hands back the per-alpha results table that is inspected in the next cell.
df_train = cv.fit()

In [51]:
# Display the cross-validation results table returned by cv.fit() for Lasso.
df_train

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse
0,0.019506,0.010974,0.003325,0.006162,0.001,{'alpha': 0.001},-2.248499,-1.904685,-2.017751,-1.888142,-1.846075,-1.98103,0.145298,3
1,0.003126,0.006251,0.0,0.0,0.005,{'alpha': 0.005},-2.252456,-1.904015,-2.002782,-1.873738,-1.858215,-1.978241,0.146036,1
2,0.003126,0.006251,0.0,0.0,0.01,{'alpha': 0.01},-2.251595,-1.906402,-2.003045,-1.884654,-1.857457,-1.98063,0.144095,2
3,0.0,0.0,0.0,0.0,0.05,{'alpha': 0.05},-2.245086,-1.917584,-1.998292,-1.891595,-1.85643,-1.981797,0.139693,5
4,0.006253,0.007658,0.0,0.0,0.1,{'alpha': 0.1},-2.230523,-1.933373,-2.000174,-1.898478,-1.84373,-1.981256,0.134574,4
5,0.003123,0.006245,0.0,0.0,0.5,{'alpha': 0.5},-2.216453,-2.077962,-2.081589,-1.965068,-1.895391,-2.047293,0.110061,6
6,0.003128,0.006256,0.0,0.0,1.0,{'alpha': 1},-2.297805,-2.323978,-2.261807,-2.132619,-2.09745,-2.222732,0.090807,7
7,0.0,0.0,0.0,0.0,5.0,{'alpha': 5},-5.112168,-5.680627,-5.42281,-5.288343,-5.382527,-5.377295,0.185642,8
8,0.003122,0.006244,0.0,0.0,10.0,{'alpha': 10},-6.39744,-6.866636,-6.655812,-6.545736,-6.62321,-6.617767,0.153065,9
9,0.0,0.0,0.0,0.0,20.0,{'alpha': 20},-6.39744,-6.866636,-6.655812,-6.545736,-6.62321,-6.617767,0.153065,9


In [52]:
# cv.predict() yields predictions together with an RMSE value; only the RMSE is displayed.
# NOTE(review): which data split predict() evaluates on is decided inside
# helpers.cross_validation — confirm before comparing against other numbers.
y_pred, rmse_val = cv.predict()
rmse_val

1.863197289645828

In [53]:
# Inspect the coefficients exposed by the Lasso cross-validator
# (presumably those of the refitted best model — verify in helpers.cross_validation).
coeffs = cv.get_coeff()
coeffs

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -2.04002836e-03, -2.00909026e+00,
       -3.18479174e+00, -7.39435273e-02, -1.12405615e+00, -5.30997286e-02,
       -2.23627686e-01, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00])

### 2.3 Conclusion: Lasso is Better

In [64]:
from helpers.utils import rmse

In [62]:
# Refit a Lasso with the cross-validated alpha on all of X_features / y.
# `cv` must still be the Lasso cross-validator from section 2.2 at this point.
best_alpha = cv.clf.best_params_["alpha"]

# Keep the estimator settings identical to the ones that were cross-validated
# (fit_intercept=False, tol=1e-3, max_iter=6000). Leaving max_iter out would
# silently fall back to the library default and fit a differently-configured
# model than the one whose score was used to pick alpha.
model_all_tr = Lasso(alpha=best_alpha, fit_intercept=False, tol=1e-3, max_iter=6000)
model_all_tr.fit(X_features, y)

Lasso(alpha=0.005, fit_intercept=False, tol=0.001)

In [63]:
model_all_tr.coef_

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.        ,
       -0.19536083, -2.10840448, -3.3994524 , -0.08697392, -0.58766454,
       -0.        , -0.27310716, -0.        , -0.        , -0.        ,
       -0.        ])

In [67]:
# In-sample check: predictions are made on the same X_features / y that
# model_all_tr was fitted on, so this RMSE is a training error, not a
# held-out estimate. Note that this rebinds y_pred from the earlier cv.predict() cell.
y_pred = model_all_tr.predict(X_features)
rmse(y, y_pred)

1.9480103681175276

## 3. Save Results

In [68]:
from helpers.utils import save_data

In [69]:
# Persist the fitted Lasso coefficients (model_all_tr.coef_) — not predictions —
# to the submission file.
save_path = "./outputs/submission.csv"
save_data(save_path, model_all_tr.coef_)