In [1]:
# imports

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets, metrics 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import Ridge, Lasso, LassoCV

import statsmodels.api as sm

In [2]:
# suppressing scientific notation for pandas 

pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Reading in cleaned datasets

In [3]:
ames = pd.read_pickle('../data/ames_clean.pkl')
ames_test = pd.read_pickle('../data/ames_test_clean.pkl')

In [4]:
ames.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,69.049,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
ames_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,69.546,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,69.546,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


### Baseline Model 

_This will give me something to compare my future models to as I can then see how they compare to the baseline score._ 

In [6]:
ames_test['SalePrice_bl'] = ames['SalePrice'].mean()
ames_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice_bl
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,181534.626
1,2718,905108090,90,RL,69.546,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,181534.626
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,181534.626
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,181534.626
4,625,535105100,20,RL,69.546,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,181534.626


In [7]:
baseline_model = ames_test[['Id', 'SalePrice_bl']].copy()
baseline_model.head()

Unnamed: 0,Id,SalePrice_bl
0,2658,181534.626
1,2718,181534.626
2,2414,181534.626
3,1989,181534.626
4,625,181534.626


## _Now to build better models!_

In [9]:
features_num = ['Lot Frontage', 'Lot Area', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val']

In [10]:
# identify X, y 

X = ames[features_num]
y = ames['SalePrice']

In [11]:
X

Unnamed: 0,Lot Frontage,Lot Area,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Gr Liv Area,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val
0,69.049,13517,1976,2005,289.000,533.000,0.000,192.000,725.000,725,754,1479,475.000,0,44,0,0,0,0,0
1,43.000,11492,1996,1997,132.000,637.000,0.000,276.000,913.000,913,1209,2122,559.000,0,74,0,0,0,0,0
2,68.000,7922,1953,2007,0.000,731.000,0.000,326.000,1057.000,1057,0,1057,246.000,0,52,0,0,0,0,0
3,73.000,9802,2006,2007,0.000,0.000,0.000,384.000,384.000,744,700,1444,400.000,100,0,0,0,0,0,0
4,82.000,14235,1900,1993,0.000,0.000,0.000,676.000,676.000,831,614,1445,484.000,0,59,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,79.000,11449,2007,2007,0.000,1011.000,0.000,873.000,1884.000,1728,0,1728,520.000,0,276,0,0,0,0,0
2047,69.049,12342,1940,1950,0.000,262.000,0.000,599.000,861.000,861,0,861,539.000,158,0,0,0,0,0,0
2048,57.000,7558,1928,1950,0.000,0.000,0.000,896.000,896.000,1172,741,1913,342.000,0,0,0,0,0,0,0
2049,80.000,10400,1956,1956,0.000,155.000,750.000,295.000,1200.000,1200,0,1200,294.000,0,189,140,0,0,0,0


In [12]:
# train-test split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# scaling X's

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [14]:
# instantiating the model

lr = LinearRegression()

_NOTE: # I tried my first set of prediction without Scaling; I then tried scaling and called the model lr_sc. The predictions changed somewhat and I will continue to use StandardScaler for every model I do now unless something breaks._

_So, I will now call my scaled model just lr._

In [15]:
# fitting my model using scaled data (see note in cell above?)

lr.fit(X_train_sc, y_train)

LinearRegression()

In [16]:
lr.intercept_

181996.92861500918

In [17]:
lr.coef_

array([ -512.74843466,  2264.23883868, 14268.37307057, 13101.99499995,
        9014.05503594,  7057.78791201,   873.17743509,   885.05560404,
        8444.07376974, 14784.13612592, 12438.63499432, 14605.85426951,
       11111.09995468,  5567.77482522,  2269.89457361,  2678.90135273,
         805.59652254,  5670.15198755, -4944.51736148, -9100.36531219])

In [None]:
# creating dataframe to show coefficients for each feature clearly

pd.DataFrame(list(zip(X_train.columns, lr.coef_)))

In [19]:
# make predictions on scaled ames_test columns and adding these predictions to the dataframe 
ames_test['SalePrice_fn'] = lr.predict(ss.transform(ames_test[features_num]))

In [20]:
ames_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice_bl,SalePrice_fn
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,,,,0,4,2006,WD,181534.626,140685.771
1,2718,905108090,90,RL,69.546,9662,Pave,,IR1,Lvl,...,0,,,,0,8,2006,WD,181534.626,225796.997
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,,,,0,9,2006,New,181534.626,194323.567
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,,,,0,7,2007,WD,181534.626,128493.14
4,625,535105100,20,RL,69.546,9500,Pave,,IR1,Lvl,...,0,,,,0,7,2009,WD,181534.626,193068.456


In [21]:
# training set score

lr.score(X_train_sc, y_train)

0.7687301465477075

In [22]:
# test set score 

lr.score(X_test_sc, y_test)

0.8231299338712463

In [23]:
# RMSE 

np.sqrt(metrics.mean_squared_error(y_test, lr.predict(X_test_sc)))

32577.805589673284

I will make a new DataFrame with just the Id and SalePrice prediction for this model so that I can submit it. I need to set a column caled `SalePrice` to the `ames_test` Dataframe and set this column equal to these latest prediction so that it will be in the right format for submissions. 

In [74]:
ames_test['SalePrice'] = ames_test['SalePrice_fn']
submission_6 = ames_test[['Id', 'SalePrice']].copy()

In [75]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/project_2/code'

In [76]:
submission_6.to_csv('../submissions/submission_6.csv', index=False)

In [77]:
submission_6.head()

Unnamed: 0,Id,SalePrice
0,2658,140685.771
1,2718,225796.997
2,2414,194323.567
3,1989,128493.14
4,625,193068.456


### Making functions to simplify 

_KISS_

In [63]:
def standardize(X_train, X_test):
    ss = StandardScaler()
    X_train_sc = pd.DataFrame(ss.fit_transform(X_train))
    X_test_sc = pd.DataFrame(ss.transform(X_test))
    return ss, X_train_sc, X_test_sc

In [58]:
def my_model_metrics(estimator, X_test_sc, y_test):
    r2 = estimator.score(X_test_sc, y_test)
    print(f"R^2 = {r2}")
    rmse = np.sqrt(metrics.mean_squared_error(y_test, estimator.predict(X_test_sc)))
    print(f"RMSE = {rmse}")

In [31]:
def fit_predict_score(estimator, X_train_sc, y_train, X_test_sc, y_test):
    estimator.fit(X_train_sc, y_train)
    r2 = estimator.score(X_test_sc, y_test)
    print(f"R^2 = {r2}")
    rmse = np.sqrt(metrics.mean_squared_error(y_test, estimator.predict(X_test_sc)))
    print(f"RMSE = {rmse}")

### Building models that include more numeric columns

_some cells below are repeats from above for easy reference_

#### Identifying new lists of features to use in models

In [38]:
num_cols = ames.describe().columns
num_cols

Index(['Id', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',
       'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
       'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold', 'SalePrice'],
      dtype='object')

In [39]:
features_num = ['Lot Frontage', 'Lot Area', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val']

In [40]:
continuous_cols = ['Lot Area', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       '1st Flr SF', '2nd Flr SF', 'Gr Liv Area','Garage Yr Blt', 'Garage Area', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val', 'SalePrice']

In [41]:
# discrete variables 

discrete_cols = [x for x in num_cols if x not in continuous_cols]
discrete_cols

['Id',
 'PID',
 'MS SubClass',
 'Lot Frontage',
 'Overall Qual',
 'Overall Cond',
 'Low Qual Fin SF',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars',
 'Mo Sold',
 'Yr Sold']

_Figuring out which discrete numeric columns need to be removed because they are misleading -- like Id._

In [42]:
ames['MS SubClass'].unique()

array([ 60,  20,  50, 180, 160,  70, 120, 190,  85,  30,  90,  80,  75,
        45,  40, 150])

In [43]:
for feature in discrete_cols:
    print(f"{feature}: {ames[feature].unique()}")

Id: [109 544 153 ... 916 639  10]
PID: [533352170 531379050 535304180 ... 909253010 535179160 527162130]
MS SubClass: [ 60  20  50 180 160  70 120 190  85  30  90  80  75  45  40 150]
Lot Frontage: [ 69.04886562  43.          68.          73.          82.
 137.          35.          70.          21.          64.
 120.          24.          74.          93.          34.
  80.          71.          72.         109.          40.
 103.         100.          92.          65.          75.
  60.          30.          79.          41.         105.
 107.          81.          36.          63.          32.
  94.          44.          50.          48.          67.
  88.          83.          53.          58.          57.
  52.          87.         134.          56.          54.
 140.          78.          85.          90.          96.
  62.          49.          59.         155.          91.
  61.          86.         128.          77.          42.
  89.          51.          69.          55.    

In [44]:
ames['Bsmt Full Bath'].isnull().sum()

1

In [45]:
ames['Bsmt Full Bath'].fillna(ames['Bsmt Full Bath'].mean(), inplace=True)
ames['Bsmt Half Bath'].fillna(ames['Bsmt Half Bath'].mean(), inplace=True)

In [46]:
ames[discrete_cols].isnull().sum()

Id                 0
PID                0
MS SubClass        0
Lot Frontage       0
Overall Qual       0
Overall Cond       0
Low Qual Fin SF    0
Bsmt Full Bath     0
Bsmt Half Bath     0
Full Bath          0
Half Bath          0
Bedroom AbvGr      0
Kitchen AbvGr      0
TotRms AbvGrd      0
Fireplaces         0
Garage Cars        0
Mo Sold            0
Yr Sold            0
dtype: int64

In [47]:
# now that I have really taken care of null values
# I wlil all discrete variable cols to the features
# EXCEPT Id, PID
# Also leaving out Mo Sold, Yr Sold (these will be dummified)

In [48]:
feat_discrete = features_num.copy()
feat_discrete

['Lot Frontage',
 'Lot Area',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Gr Liv Area',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'Misc Val']

In [49]:
# adding discrete columns to make longer numeric col list

for feat in ['MS SubClass',
 'Lot Frontage',
 'Overall Qual',
 'Overall Cond',
 'Low Qual Fin SF',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars']:
    feat_discrete.append(feat)

In [50]:
feat_discrete

['Lot Frontage',
 'Lot Area',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Gr Liv Area',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'Misc Val',
 'MS SubClass',
 'Lot Frontage',
 'Overall Qual',
 'Overall Cond',
 'Low Qual Fin SF',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars']

#### Now building models with new lists of features identified just above 

**Model using discrete variables (```feat_discrete```)**

In [51]:
# identify X, y 

X = ames[feat_discrete]
y = ames['SalePrice']

In [52]:
# train-test split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                   test_size=0.2,
                                                   random_state=42)

In [64]:
# this time scaling with my function 

ss, X_train_sc, X_test_sc = standardize(X_train, X_test)

In [65]:
X_train_sc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,-0.011,0.878,0.964,0.836,0.442,-0.940,-0.287,2.633,1.522,1.532,...,-0.105,-0.816,-0.249,0.772,-0.736,0.197,-0.189,0.368,0.664,1.607
1,-0.011,0.395,0.242,-0.264,-0.564,1.200,-0.287,-1.274,-0.129,0.326,...,-0.105,1.105,-0.249,0.772,-0.736,-1.030,-0.189,-0.910,0.664,0.291
2,-0.011,0.502,0.013,-0.598,-0.564,0.342,-0.287,-0.686,-0.430,-0.750,...,-0.105,-0.816,-0.249,-1.049,-0.736,0.197,-0.189,-0.910,-0.934,0.291
3,0.405,-0.404,-0.775,0.836,-0.564,0.349,-0.287,-0.621,-0.360,-0.137,...,-0.105,1.105,-0.249,2.593,-0.736,0.197,-0.189,1.007,-0.934,-1.026
4,0.312,-0.075,1.062,0.979,1.379,-0.940,-0.287,1.274,0.183,-0.060,...,-0.105,-0.816,-0.249,0.772,1.291,1.424,-0.189,1.646,0.664,1.607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,-0.011,7.361,-0.447,-1.268,-0.564,1.003,-0.287,0.337,1.256,1.193,...,-0.105,1.105,-0.249,-1.049,-0.736,-1.030,-0.189,-0.271,2.262,0.291
1635,0.126,-0.184,-0.512,-1.363,-0.146,0.377,-0.287,0.019,0.299,0.071,...,-0.105,1.105,-0.249,-1.049,-0.736,-1.030,-0.189,-0.271,0.664,-1.026
1636,0.405,0.026,0.078,0.692,-0.005,0.477,1.954,-1.274,-0.044,0.350,...,-0.105,1.105,-0.249,-1.049,1.291,0.197,-0.189,-0.910,0.664,0.291
1637,-2.247,-1.493,-0.053,-0.694,0.153,-0.254,-0.287,-0.913,-1.267,-1.692,...,-0.105,-0.816,-0.249,-1.049,1.291,-1.030,-0.189,-0.910,-0.934,-1.026


In [66]:
lr = LinearRegression()

In [67]:
lr.fit(X_train_sc, y_train)

LinearRegression()

In [68]:
my_model_metrics(lr, X_test_sc, y_test)

R^2 = 0.8616140448051623
RMSE = 28816.462512006598


In [69]:
ames_test['SalePrice_fd'] = lr.predict(ss.transform(ames_test[feat_discrete]))
ames_test

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice_bl,SalePrice_fn,SalePrice_fd
0,2658,902301120,190,RM,69.000,9142,Pave,Grvl,Reg,Lvl,...,,,,0,4,2006,WD,181534.626,140685.771,130220.790
1,2718,905108090,90,RL,69.546,9662,Pave,,IR1,Lvl,...,,,,0,8,2006,WD,181534.626,225796.997,155397.327
2,2414,528218130,60,RL,58.000,17104,Pave,,IR1,Lvl,...,,,,0,9,2006,New,181534.626,194323.567,219080.759
3,1989,902207150,30,RM,60.000,8520,Pave,,Reg,Lvl,...,,,,0,7,2007,WD,181534.626,128493.140,115763.631
4,625,535105100,20,RL,69.546,9500,Pave,,IR1,Lvl,...,,,,0,7,2009,WD,181534.626,193068.456,199359.211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,80.000,8000,Pave,,Reg,Lvl,...,,,,0,11,2007,WD,181534.626,195902.817,200409.266
874,1234,535126140,60,RL,90.000,14670,Pave,,Reg,Lvl,...,,MnPrv,,0,8,2008,WD,181534.626,237258.147,223100.646
875,1373,904100040,20,RL,55.000,8250,Pave,,Reg,Lvl,...,,,,0,8,2008,WD,181534.626,131968.239,123000.534
876,1672,527425140,20,RL,60.000,9000,Pave,,Reg,Lvl,...,,GdWo,,0,5,2007,WD,181534.626,122547.241,98854.895


In [78]:
# making a new dataframe with just the Id and SalePrice predictions for this model so that I can submit it 
# need to set column called "saleprice" equal to these predictions so that I can copy that column and it will be in the right format for submissions 

ames_test['SalePrice'] = ames_test['SalePrice_fd']
submission_7 = ames_test[['Id', 'SalePrice']].copy()

In [80]:
submission_7.to_csv('../submissions/submission_7.csv', index=False)

In [81]:
submission_7.head()

Unnamed: 0,Id,SalePrice
0,2658,130220.79
1,2718,155397.327
2,2414,219080.759
3,1989,115763.631
4,625,199359.211


### Ridge Regressor 

_Linear Regression w/ Ridge Regressor regularization_

In [84]:
ridge = Ridge()

In [85]:
# this time using my fit-score-predict fxn 
# here it is as a reminder of what it looks like: 

def fit_predict_score(estimator, X_train_sc, y_train, X_test_sc, y_test):
    estimator.fit(X_train_sc, y_train)
    r2 = estimator.score(X_test_sc, y_test)
    print(f"R^2 Test = {r2}")
    rmse = np.sqrt(metrics.mean_squared_error(y_test, estimator.predict(X_test_sc)))
    print(f"RMSE = {rmse}")

In [86]:
fit_predict_score(ridge, X_train_sc, y_train, X_test_sc, y_test)

R^2 Test = 0.8618748507091332
RMSE = 28789.29556480834


As above, I will once again make a new DataFrame with just the Id and SalePrice prediction for this model so that I can submit it. I need to set a column caled `SalePrice` to the `ames_test` Dataframe and set this column equal to these latest prediction so that it will be in the right format for submissions. 

In [87]:
ames_test['SalePrice_fd_r'] = ridge.predict(ss.transform(ames_test[feat_discrete]))

ames_test['SalePrice'] = ames_test['SalePrice_fd_r']

ames_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice_bl,SalePrice_fn,SalePrice_fd,SalePrice,SalePrice_fd_r
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,,0,4,2006,WD,181534.626,140685.771,130220.79,130244.537,130244.537
1,2718,905108090,90,RL,69.546,9662,Pave,,IR1,Lvl,...,,0,8,2006,WD,181534.626,225796.997,155397.327,155263.984,155263.984
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,,0,9,2006,New,181534.626,194323.567,219080.759,218830.587,218830.587
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,,0,7,2007,WD,181534.626,128493.14,115763.631,115728.891,115728.891
4,625,535105100,20,RL,69.546,9500,Pave,,IR1,Lvl,...,,0,7,2009,WD,181534.626,193068.456,199359.211,199213.194,199213.194


In [88]:
submission_8 = ames_test[['Id', 'SalePrice']].copy()

In [90]:
submission_8.to_csv('../submissions/submission_8.csv', index=False)

In [91]:
submission_8.head()

Unnamed: 0,Id,SalePrice
0,2658,130244.537
1,2718,155263.984
2,2414,218830.587
3,1989,115728.891
4,625,199213.194


### LASSO Regression 

_Linear Regression w/ LASSO Regressor regularization_

In [94]:
lasso = Lasso()

In [96]:
fit_predict_score(lasso, X_train_sc, y_train, X_test_sc, y_test)

R^2 Test = 0.8618696827160147
RMSE = 28789.83413975456


  model = cd_fast.enet_coordinate_descent(


_The values for $r^2$ and RMSE are the same with LASSO as with Ridge._

### LASSO CV 

In [98]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3, 0 , 100)

# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas=l_alphas, cv=5)

lasso_cv

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]),
        cv=5)

In [99]:
fit_predict_score(lasso_cv, X_train_sc, y_train, X_test_sc, y_test)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

R^2 Test = 0.8618696827160147
RMSE = 28789.83413975456


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


In [101]:
# this threw error before I fit the lasso_cv as below? i.e. the function didn't work in this case
ames_test['SalePrice_fd_lcv'] = lasso_cv.predict(ss.transform(ames_test[feat_discrete]))
ames_test['SalePrice_fd_lcv'] = ames_test['SalePrice']
ames_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice_bl,SalePrice_fn,SalePrice_fd,SalePrice,SalePrice_fd_r,SalePrice_fd_lcv
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,4,2006,WD,181534.626,140685.771,130220.79,130244.537,130244.537,130244.537
1,2718,905108090,90,RL,69.546,9662,Pave,,IR1,Lvl,...,0,8,2006,WD,181534.626,225796.997,155397.327,155263.984,155263.984,155263.984
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,9,2006,New,181534.626,194323.567,219080.759,218830.587,218830.587,218830.587
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,7,2007,WD,181534.626,128493.14,115763.631,115728.891,115728.891,115728.891
4,625,535105100,20,RL,69.546,9500,Pave,,IR1,Lvl,...,0,7,2009,WD,181534.626,193068.456,199359.211,199213.194,199213.194,199213.194


R^2 Test = 0.8618696827160147   
RMSE = 28789.83413975456

Once again, the $r^2$ and RMSE scores for LassoCV are the same as for Ridge and for LASSO. As such I will not make a new submission for LASSO/LassoCV

### Model based on garages 

_Using some garage columns plus specially selected numeric columns used previously to build new model._

In [102]:
garage_cols = [col for col in ames.columns if 'garage' in col or 'Garage' in col]
garage_cols

['Garage Type',
 'Garage Yr Blt',
 'Garage Finish',
 'Garage Cars',
 'Garage Area',
 'Garage Qual',
 'Garage Cond']

In [103]:
# adding some garage columns and taking out some features from feat discrete 
numeric_w_garage = ['Lot Frontage',
 'Lot Area',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Gr Liv Area',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'MS SubClass',
 'Lot Frontage',
 'Overall Qual',
 'Overall Cond',
 'Low Qual Fin SF',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'Garage Cars',
 'has_fireplace', 
 'Garage Yr Blt',
 'Garage Area',
 'has_garage',
 'garage_attached',
 'garage_large',
 'garage_nice',
 'garage_RFn',
 'garage_Unf']

In [104]:
ames[numeric_w_garage].isnull().sum()

KeyError: "['garage_RFn', 'garage_Unf', 'garage_large', 'garage_attached', 'has_garage', 'has_fireplace', 'garage_nice'] not in index"