In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 300)

from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
base = pd.read_csv('Data/BaseData.csv')

In [3]:
base.shape

(2580, 299)

In [4]:
base.isnull().sum().sum()

0

In [6]:
X = base.drop(['SalePrice'], axis=1).values
y = base.SalePrice.values

X_frame = base.drop(['SalePrice'], axis=1)

folds = KFold(n_splits = 5, shuffle = True, random_state = None)

for trainIndex, testIndex in folds.split(X):
    X_train, X_test = X[trainIndex], X[testIndex]
    y_train, y_test = y[trainIndex], y[testIndex]
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #alphas = 10**np.linspace(10,-2,100)*0.5
    lassocv = LassoCV(alphas=None, cv = 10, max_iter = 100000)
    lassocv.fit(X_train_scaled, y_train)
    print(lassocv.alpha_)
    
    lasso = Lasso(alpha=lassocv.alpha_)
    print(lasso)
    lasso.fit(X_train_scaled, y_train)
    print('R2 Train is: ' + str(lasso.score(X_train_scaled, y_train)))
    print('R2 Test is: ' + str(lasso.score(X_test_scaled, y_test)))
    
    #the below offers same result as above, good back up:
    pred = lasso.predict(X_test_scaled)
    #print(r2_score(y_test, pred))
    
    ADJ_R2 = 1 - (1-r2_score(y_test, pred)) * (len(y)-1)/(len(y)-X.shape[1]-1)
    print('ADJ_R2 is: ' + str(ADJ_R2))
    
    results = pd.DataFrame(lasso.coef_, index=X_frame.columns)
    results.columns = ['Coef']
    Null = results[(results['Coef']>-1) & (results['Coef']<1)]
    Null_list = list(Null.index.values)
    Imp = results[(results['Coef']<-1) | (results['Coef']>1)]
    Imp_list = list(Imp.index.values)
    #print(Null_list)
    print('Features Dropped: ' + str(len(Null_list)))
    #print(Imp_list)
    print('Features Kept: ' + str(len(Imp_list)))
    print()

(2064, 298)
(2064,)
(516, 298)
(516,)
299.5877391910952
Lasso(alpha=299.5877391910952)
R2 Train is: 0.9340690947170441
R2 Test is: 0.9204672889179866
ADJ_R2 is: 0.9100767812886836
Features Dropped: 132
Features Kept: 166

(2064, 298)
(2064,)
(516, 298)
(516,)
225.91695896401967
Lasso(alpha=225.91695896401967)
R2 Train is: 0.9361813382756597
R2 Test is: 0.9136192874899125
ADJ_R2 is: 0.9023341264517687
Features Dropped: 117
Features Kept: 181

(2064, 298)
(2064,)
(516, 298)
(516,)
104.39510561889361
Lasso(alpha=104.39510561889361)
R2 Train is: 0.9489223197256474
R2 Test is: 0.8435135075399975
ADJ_R2 is: 0.82306941514496
Features Dropped: 74
Features Kept: 224

(2064, 298)
(2064,)
(516, 298)
(516,)
117.16396520147629
Lasso(alpha=117.16396520147629)
R2 Train is: 0.9350614457788706
R2 Test is: 0.9340831766998755
ADJ_R2 is: 0.9254715092981056
Features Dropped: 72
Features Kept: 226

(2064, 298)
(2064,)
(516, 298)
(516,)
289.79948372400384
Lasso(alpha=289.79948372400384)
R2 Train is: 0.933607