In [56]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [57]:
# Second Argument = Column to use as row labels in DataFrame.
train_df = pd.read_csv('train.csv', index_col=0) 
test_df = pd.read_csv('test.csv', index_col=0)

In [58]:
train_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [59]:
y_train = np.log(train_df.pop('SalePrice')) # Transform price -> ln_price to make RMSE cross-val more accurate
all_df = pd.concat((train_df, test_df), axis=0) # Combine Train & Test in order to transform the data simultaneously.

In [83]:
# Make some more variables for kicks

# Creates new features: TotSqf, TotSqf2, Age, Age2, TotFullBath, TotHalfBath

# Square Footage 
all_df["TotSqf"] = (all_df['TotalBsmtSF'] + all_df["1stFlrSF"] + all_df["2ndFlrSF"])
all_df["TotSqf2"] = all_df["TotSqf"] * all_df["TotSqf"]

# Age
all_df["Age"] = all_df["YrSold"] - all_df["YearBuilt"]
all_df["Age2"] = all_df["Age"] * all_df["Age"]

# Num Full/Half Baths
all_df["TotFullBath"] = all_df["BsmtFullBath"] + all_df["FullBath"]
all_df["TotHalfBath"] = all_df["BsmtHalfBath"] + all_df["HalfBath"]

all_df.head()

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,TotSqf,TotSqf2,Age,Age2,TotFullBath,TotHalfBath
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,856.0,854.0,0.0,,3.0,1Fam,TA,No,706.0,0.0,...,0.0,2003.0,2003.0,2008.0,2566.0,6584356.0,5.0,25.0,3.0,1.0
2.0,1262.0,0.0,0.0,,3.0,1Fam,TA,Gd,978.0,0.0,...,298.0,1976.0,1976.0,2007.0,2524.0,6370576.0,31.0,961.0,2.0,1.0
3.0,920.0,866.0,0.0,,3.0,1Fam,TA,Mn,486.0,0.0,...,0.0,2001.0,2002.0,2008.0,2706.0,7322436.0,7.0,49.0,3.0,1.0
4.0,961.0,756.0,0.0,,3.0,1Fam,Gd,No,216.0,0.0,...,0.0,1915.0,1970.0,2006.0,2473.0,6115729.0,91.0,8281.0,2.0,0.0
5.0,1145.0,1053.0,0.0,,4.0,1Fam,TA,Av,655.0,0.0,...,192.0,2000.0,2000.0,2008.0,3343.0,11175649.0,8.0,64.0,3.0,1.0


In [84]:
all_df.shape

(2920, 86)

In [85]:
y_train.head()

Id
1    12.247694
2    12.109011
3    12.317167
4    11.849398
5    12.429216
Name: SalePrice, dtype: float64

In [86]:
all_df['MSSubClass'].value_counts() # even though this variable has integer values, it's truly a categorical variable.
# We can use pd.get_dummies() in order to create dummy variables for all categorical vars. but it requires that all 

20.0     1079
60.0      575
50.0      287
120.0     182
30.0      139
160.0     128
70.0      128
80.0      118
90.0      109
190.0      61
85.0       48
75.0       23
45.0       18
180.0      17
40.0        6
nan         1
150.0       1
Name: MSSubClass, dtype: int64

In [87]:
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str) # Change MSSubClass to string 
all_dummy_df = pd.get_dummies(all_df) # Generate dummy vars for all variables
all_dummy_df.head()


Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave,Utilities_AllPub,Utilities_NoSeWa
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,856.0,854.0,0.0,3.0,706.0,0.0,1.0,0.0,150.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2.0,1262.0,0.0,0.0,3.0,978.0,0.0,0.0,1.0,284.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3.0,920.0,866.0,0.0,3.0,486.0,0.0,1.0,0.0,434.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4.0,961.0,756.0,0.0,3.0,216.0,0.0,1.0,0.0,540.0,272.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
5.0,1145.0,1053.0,0.0,4.0,655.0,0.0,1.0,0.0,490.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [88]:
# sum number of null values for each col & list in descending frequency
all_dummy_df.isnull().sum().sort_values(ascending=False).head(10) 

SalePrice       1460
LotFrontage      487
GarageYrBlt      160
MasVnrArea        24
TotHalfBath        3
TotFullBath        3
BsmtFullBath       3
BsmtHalfBath       3
TotSqf2            2
GarageCars         2
dtype: int64

In [89]:
mean_cols = all_dummy_df.mean() # Calculate Mean values for all columns
all_dummy_df = all_dummy_df.fillna(mean_cols) # Replace NA values with averages from the columns
all_dummy_df.isnull().sum().sum() # Count number of null values in DataFrame - This is the output

0

In [90]:
# Split the data back into train/test datasets
dummy_train_df = all_dummy_df.loc[train_df.index] 
dummy_test_df = all_dummy_df.loc[test_df.index]

In [91]:
# Make sure we've got it right by looking at their dataframe dimensions
dummy_train_df.shape, dummy_test_df.shape 

((1460, 311), (1460, 311))

In [92]:
# Not completely necessary, just converts dataframe to numpy array
X_train = dummy_train_df.values
X_test = dummy_test_df.values

In [93]:
# Ridge Regression with Cross Validation
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [94]:
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error");

In [95]:
ridge = Ridge(alpha=15)
ridge.fit(X_train, y_train)
y_ridge = np.exp(ridge.predict(X_test))
submission_df = pd.DataFrame(data= {'Id' : test_df.index, 'SalePrice': y_ridge})

In [96]:
submission_df.head(10)

Unnamed: 0,Id,SalePrice
0,1461.0,114573.652097
1,1462.0,157134.465986
2,1463.0,182898.438458
3,1464.0,199222.819734
4,1465.0,194371.940717
5,1466.0,171950.940986
6,1467.0,185561.091174
7,1468.0,160890.764249
8,1469.0,198631.747824
9,1470.0,118165.27108


In [98]:
submission_df.to_csv('ridge_regression.csv', index=False)