In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import os
import re
import matplotlib.pyplot as plt
import statistics
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

# Set random seed.
np.random.seed(42)


%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
df_train = pd.read_csv("../data/train_final.csv")
df_test = pd.read_csv("../data/test_final.csv")
df_test_o = pd.read_csv("../data/test.csv")
pd.set_option('display.max_columns', 4000)
pd.set_option('display.max_rows', 4000)

In [3]:
cols = ['SalePrice']

In [4]:
X = df_train.drop(cols, axis=1)
y = df_train['SalePrice']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42)

In [6]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [7]:
ols = LinearRegression()
ols.fit(Z_train, y_train)

LinearRegression()

In [8]:
lasso = LassoCV(n_alphas=100)
lasso.fit(Z_train, y_train)

LassoCV()

In [9]:
ridge = RidgeCV(alphas=np.linspace(0.1, 10, 100))
ridge.fit(Z_train, y_train)

RidgeCV(alphas=array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
        2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
        3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
        4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
        5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
        6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
        7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
        8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
       10. ]))

In [10]:
#accuracy is an impt kpi for the model
#cross_val_score(knn, X_train_sc, y_train, cv=5).mean()

In [11]:
print(ols.score(Z_train, y_train))
print(ols.score(Z_test, y_test))

0.9349935784007858
-4.132424375299485e+20


In [12]:
cross_val_score(ols, Z_train, y_train, cv=10).mean()

-8.772719218936503e+24

In [13]:
print(lasso.score(Z_train, y_train))
print(lasso.score(Z_test, y_test))

0.9280035135692906
0.910210073579524


In [14]:
cross_val_score(lasso, Z_train, y_train, cv=10).mean()

0.9115209472008736

In [15]:
print(ridge.score(Z_train, y_train))
print(ridge.score(Z_test, y_test))

0.9342252469860698
0.9060496903368966


In [16]:
cross_val_score(ridge, Z_train, y_train, cv=10).mean()

0.9065305398789685

## Coefficients

In [17]:
ols.coef_

array([-3.09883708e+02, -1.25630739e+04,  2.79034926e+03,  9.89807667e+03,
        7.25697889e+02,  1.30936779e+03, -2.27928336e+02,  1.01947228e+04,
        4.70628772e+03, -7.51956603e+15,  2.27690521e+03,  4.87687194e+03,
        3.09422807e+03, -4.17376265e+02,  6.13788970e+02, -1.22240594e+03,
        4.36317562e+03,  3.50361910e+03, -6.82717571e+02,  6.17258843e+03,
        1.82173006e+02, -4.42052401e+02, -9.54896698e+14, -1.14345324e+15,
       -1.30528291e+14,  1.20648974e+15,  4.29682776e+03,  1.17967588e+03,
        6.02494311e+02,  1.23328768e+02, -2.11811137e+03, -1.59301122e+03,
        4.31835562e+03, -4.44454936e+02,  2.80484868e+03,  1.67064703e+03,
        1.67668629e+03,  3.34682245e+03,  3.38362066e+03,  3.23311700e+03,
       -8.89565322e+02,  6.67364661e+02,  1.60208824e+03,  6.78867549e+02,
        8.11826990e+02,  8.77030404e+02,  2.25619433e+03, -3.36449035e+02,
        2.69123421e+02, -2.37423133e+01,  3.32617645e+14, -5.90721937e+02,
        9.96797432e+03, -

In [18]:
lasso.coef_

array([-0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  3.19400306e+03,
       -0.00000000e+00,  4.21042696e+02, -0.00000000e+00,  1.05974603e+04,
        3.38420076e+03,  0.00000000e+00,  2.12949764e+03,  3.53719388e+03,
        3.59188073e+03,  0.00000000e+00,  0.00000000e+00, -5.28007185e+02,
        4.23893920e+03,  3.68290404e+03, -3.78266759e+02,  6.11889225e+03,
        5.22237684e+02, -1.68053139e+02,  1.15418647e+03,  0.00000000e+00,
       -7.20359846e+02,  2.01290673e+04,  3.72865592e+03,  2.48238223e+02,
        6.21971084e+01,  0.00000000e+00, -1.49623563e+03, -1.04528403e+03,
        4.55962619e+03, -0.00000000e+00,  2.60942751e+03,  2.13525345e+03,
        1.04291167e+03,  2.70381135e+03,  3.89939415e+03, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  1.50609979e+03,  6.92790729e+02,
        5.62359243e+00,  5.46734506e+02,  1.59965445e+03, -4.98809790e+01,
       -0.00000000e+00,  0.00000000e+00, -5.75750755e+01,  0.00000000e+00,
        1.62269427e+03, -

In [19]:
ridge.coef_

array([ -277.82791188, -3323.33522012,  1388.29399358,  7885.7212472 ,
         467.61200848,  1217.13487996,  -200.52047763,  9957.27798877,
        4551.91011421,  4209.62593001,  2414.83129442,  4773.72398127,
        3012.2648916 ,  -343.40987085,   811.2592841 , -1151.29084403,
        4344.27799043,  3556.9067089 ,  -682.2091414 ,  6133.72851734,
         368.17070822,  -410.90369177,  6572.6034037 ,  8386.36880479,
        -246.91005528, 13123.48418841,  4200.06648949,  1077.86028019,
         707.15507869,   210.18475723, -1919.19598965, -1509.46501484,
        4323.03766567,  -491.72212564,  2811.93824726,  1965.31982805,
        1527.10927654,  3594.32964319,  3472.25532453,  1637.82395164,
        -343.46525309,   662.57272578,  1644.26686487,   828.71217838,
         740.84150734,   862.44306478,  2128.86118451,  -387.89368928,
         263.50686149,    63.33977328, -3834.58008938,  -197.50712157,
        7041.48053788, -7803.91185557,  3137.81841075,  2027.68985186,
      

## Null Model

In [20]:
df_train["SalePrice"].mean()

174420.64517766496

## Evaluating my model

In [21]:
#predicting using training values
pred_train = lasso.predict(X_train)

In [22]:
#getting the redisuals
resids = y_train - pred_train

In [23]:
#calculating the model's mean squared error
model_mse = np.mean(resids**2)
model_mse

6244231022714446.0

In [24]:
y_bar = df_train["SalePrice"].mean()

In [25]:
#calculating the null model's mean squared error
null_mse = np.mean((y_train - y_bar)**2)
null_mse

4240616708.6556993

In [26]:
R2_null = 1 - (model_mse / null_mse)

In [27]:
R2_null

-1472480.8232143184

The model does worse than the null model...

As such, I will remove more variables.

In [28]:
df_train_2 = pd.read_csv("../data/train_2_final.csv")
df_test_2 = pd.read_csv("../data/test_2_final.csv")
df_test_o_2 = pd.read_csv("../data/test.csv")

In [29]:
cols_2 = ['SalePrice']

In [30]:
X2 = df_train_2.drop(cols, axis=1)
y2 = df_train_2['SalePrice']

In [31]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=42)

In [32]:
sc = StandardScaler()
Z_train_2 = sc.fit_transform(X_train_2)
Z_test_2 = sc.transform(X_test_2)

In [33]:
ols2 = LinearRegression()
ols2.fit(Z_train_2, y_train_2)

LinearRegression()

In [34]:
lasso2 = LassoCV(n_alphas=1000)
lasso2.fit(Z_train_2, y_train_2)

LassoCV(n_alphas=1000)

In [35]:
ridge2 = RidgeCV(alphas=np.logspace(0, 10, 100))
ridge2.fit(Z_train_2, y_train_2)

RidgeCV(alphas=array([1.00000000e+00, 1.26185688e+00, 1.59228279e+00, 2.00923300e+00,
       2.53536449e+00, 3.19926714e+00, 4.03701726e+00, 5.09413801e+00,
       6.42807312e+00, 8.11130831e+00, 1.02353102e+01, 1.29154967e+01,
       1.62975083e+01, 2.05651231e+01, 2.59502421e+01, 3.27454916e+01,
       4.13201240e+01, 5.21400829e+01, 6.57933225e+01, 8.30217568e+01,
       1.04761575e+02, 1.32194115e+0...
       4.75081016e+07, 5.99484250e+07, 7.56463328e+07, 9.54548457e+07,
       1.20450354e+08, 1.51991108e+08, 1.91791026e+08, 2.42012826e+08,
       3.05385551e+08, 3.85352859e+08, 4.86260158e+08, 6.13590727e+08,
       7.74263683e+08, 9.77009957e+08, 1.23284674e+09, 1.55567614e+09,
       1.96304065e+09, 2.47707636e+09, 3.12571585e+09, 3.94420606e+09,
       4.97702356e+09, 6.28029144e+09, 7.92482898e+09, 1.00000000e+10]))

In [36]:
print(ols2.score(Z_train_2, y_train_2))
print(ols2.score(Z_test_2, y_test_2))

0.8649026685054739
0.8666692764236241


In [37]:
cross_val_score(ols2, Z_train_2, y_train_2, cv=10).mean()

0.8588338020316844

In [38]:
print(lasso2.score(Z_train_2, y_train_2))
print(lasso2.score(Z_test_2, y_test_2))

0.8648894332377706
0.8668293877595037


In [39]:
cross_val_score(lasso2, Z_train_2, y_train_2, cv=10).mean()

0.8587841382233734

In [40]:
print(ridge2.score(Z_train_2, y_train_2))
print(ridge2.score(Z_test_2, y_test_2))

0.8648693190111527
0.867036516254048


In [41]:
cross_val_score(ridge2, Z_train_2, y_train_2, cv=10).mean()

0.8588753077076048

In [42]:
pred_train_2 = ols2.predict(X_train_2)

In [43]:
resids2 = y_train_2 - pred_train_2

In [44]:
model_mse2 = np.mean(resids2**2)
model_mse2

4609519940553897.0

In [45]:
y_bar2 = df_train_2["SalePrice"].mean()

In [46]:
null_mse2 = np.mean((y_train_2 - y_bar2)**2)
null_mse2

4240616708.6556993

In [47]:
metrics.mean_squared_error(y_train, pred_train)

6244231022714455.0

In [48]:
np.sqrt(metrics.mean_squared_error(y_train, pred_train))

79020446.86481123

In [49]:
metrics.r2_score(y_train, pred_train)

-1473005.3887821757

## Predicting test.csv SalePrice

In [50]:
x_test_test = df_test

In [51]:
pred = lasso.predict(x_test_test)

In [52]:
pred

array([8.09348902e+07, 9.16004313e+07, 9.59601504e+07, 5.96677900e+07,
       7.59998850e+07, 3.72885088e+07, 6.12131791e+07, 7.00197943e+07,
       5.80351395e+07, 7.35028747e+07, 6.26496476e+07, 5.68788050e+07,
       7.97048184e+07, 9.83726149e+07, 8.57032279e+07, 6.33120161e+07,
       7.71096710e+07, 6.54520373e+07, 8.74049528e+07, 1.03847439e+08,
       6.14461219e+07, 5.81027810e+07, 7.63949118e+07, 1.01875631e+08,
       7.10368742e+07, 5.22734872e+07, 8.63427076e+07, 7.38132751e+07,
       6.31426300e+07, 3.14821597e+07, 5.39059229e+07, 5.57753321e+07,
       1.07086930e+08, 7.06417125e+07, 8.44807169e+07, 8.26243142e+07,
       6.39309300e+07, 5.58410508e+07, 5.45708887e+07, 6.78523282e+07,
       4.87594086e+07, 8.69310164e+07, 6.50794586e+07, 8.80781680e+07,
       6.21218171e+07, 4.04584194e+07, 8.61249335e+07, 5.65555875e+07,
       5.01112856e+07, 5.78760584e+07, 6.64886986e+07, 8.42202084e+07,
       8.29127616e+07, 6.15459838e+07, 5.27145716e+07, 5.98112568e+07,
      

In [53]:
print(lasso.score(x_test_test, pred))

1.0


In [54]:
df_pred = pd.DataFrame (pred)
df_pred.shape

(878, 1)

In [55]:
df_pred.head()

Unnamed: 0,0
0,80934890.0
1,91600430.0
2,95960150.0
3,59667790.0
4,75999890.0


In [56]:
df_test_o.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD


In [57]:
df_pred.rename(columns={
    0: 'SalePrice'
}, inplace=True)

In [58]:
df_pred["Id"] = df_test_o["Id"].copy()

In [59]:
df_pred.head()

Unnamed: 0,SalePrice,Id
0,80934890.0,2658
1,91600430.0,2718
2,95960150.0,2414
3,59667790.0,1989
4,75999890.0,625


In [60]:
#shifting column around
col_name="Id"
first_col = df_pred.pop(col_name)
df_pred.insert(0, col_name, first_col)

In [61]:
df_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,80934890.0
1,2718,91600430.0
2,2414,95960150.0
3,1989,59667790.0
4,625,75999890.0


In [62]:
df_pred.to_csv("../data/saleprice.csv", index = False)