In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures

In [2]:
df = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

In [3]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
df.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

___
## Create Baseline Model For Comparison

In [5]:
# Before we feature select, we will run the baseline linear regression model, where we just guess the mean house price everytime

X = np.full_like(df['SalePrice'], df['SalePrice'].mean())
X = X.reshape(-1, 1)  
y = df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)

print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'Training RMSE: {rmse_train}')
print(f'Testing RMSE: {rmse_test}')

# So our RMSE is 79356

Training Score: 0.0
Testing Score: -0.00025950239237215733
Training RMSE: 79769.09260643576
Testing RMSE: 77091.22733067861


___
## First Model (Hollistically Selected Features)

In [6]:
# The predictors we will start with are:
    # MS Zoning 
    # Neighborhood
    # Bldg Type
    # Overall Cond
    # Year Remod/Add
    # Exter Qual
    # Bsmt Qual
    # Garage Cond
    # Year Remod/Add

___ 

## Data Cleaning

In [7]:
# Clean the columns for the predictors we want to use

In [8]:
features = ['MS Zoning', 'Neighborhood', 'Bldg Type', 'Overall Cond', 'Year Remod/Add', 'Exter Qual', 
            'Bsmt Qual', 'Garage Cond']

null_values_dict = {feature: df[feature].isnull().sum() for feature in features}
pd.DataFrame(list(null_values_dict.items()), columns=['Feature', 'Null Values']).set_index('Feature')

Unnamed: 0_level_0,Null Values
Feature,Unnamed: 1_level_1
MS Zoning,0
Neighborhood,0
Bldg Type,0
Overall Cond,0
Year Remod/Add,0
Exter Qual,0
Bsmt Qual,55
Garage Cond,114


In [9]:
# After looking at the data dictionary, we see a null value in Bsmt Qual and Garage Cond 
# We will replace those with better values 

df['Bsmt Qual'] = df['Bsmt Qual'].fillna('Nb') # Nb for 'no basement'
df['Garage Cond'] = df['Garage Cond'].fillna('Ng') # Nb for 'no garage'

null_values_dict = {feature: df[feature].isnull().sum() for feature in features}
pd.DataFrame(list(null_values_dict.items()), columns=['Feature', 'Null Values']).set_index('Feature')
# No null values now 

Unnamed: 0_level_0,Null Values
Feature,Unnamed: 1_level_1
MS Zoning,0
Neighborhood,0
Bldg Type,0
Overall Cond,0
Year Remod/Add,0
Exter Qual,0
Bsmt Qual,0
Garage Cond,0


___ 
## Create Dummy Columns

In [10]:
# Check the dtypes for the columns so we can dummify properly 
df[features].dtypes

MS Zoning         object
Neighborhood      object
Bldg Type         object
Overall Cond       int64
Year Remod/Add     int64
Exter Qual        object
Bsmt Qual         object
Garage Cond       object
dtype: object

In [11]:
# All object types will be dummfied 
# Year Remod/Add should be left as is

In [12]:
# Let us first rename these columns so there are no spaces
df.rename(columns={col: col.replace(' ', '_') for col in features}, inplace=True)

In [13]:
# Dummify all features except Year_Remod/Add

feats = ['MS_Zoning', 'Neighborhood', 'Bldg_Type', 'Exter_Qual', 'Bsmt_Qual', 'Garage_Cond']

for column in feats:
    dummies = pd.get_dummies(df[column], prefix=column, drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop(column, axis=1, inplace=True)

In [14]:
# Now we will make a model using the features we selected in the beginning
# Collect all of these dummified features
columns_after_index = df.columns[75:] # Used iloc to find the cutoff for our dummy columns
feats1 = columns_after_index.tolist()
feats1.append('Year_Remod/Add')
feats1.append('Overall_Cond')
# feats1 is a list of all of our features now

In [15]:
# We can make our model now
X = df[feats1]
y = df['SalePrice']

# Make Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

lr = LinearRegression()
lr.fit(X_train, y_train)

print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

# We see pretty good model performance! Good training and testing score, indicating bias-varaince balance

Training Score: 0.7639556808388009
Testing Score: 0.7402778662111642


In [16]:
# Check RMSE to compare with baseline model
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'Training RMSE: {rmse_train}')
print(f'Testing RMSE: {rmse_test}')

Training RMSE: 38093.41910647393
Testing RMSE: 41992.27033615909


In [17]:
# Before we move on, we will submit this model to Kaggle
lr = LinearRegression()
lr.fit(X_train, y_train)

prediction = []
for i in range(878):
    prediction.append(lr.predict(X_train)[i]) 
    
submission = test[['Id']]
submission['SalePrice'] = prediction
submission.to_csv('./datasets/submission_1.csv', index = None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['SalePrice'] = prediction


___
## Ridge and LASSO Aid For Feature Selection

In [18]:
# Next we will regularize our data so we can run Ridge and LASSO

sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [19]:
# Now we run a Ridge regression
alphas = np.logspace(0, 5, 100)
ridge_cv = RidgeCV(alphas = alphas, cv = 5)
ridge_cv.fit(Z_train, y_train)
ridge_cv.alpha_
# This is the optimal alpha, indicating regularization did not help much

4.0370172585965545

In [20]:
# Here are our R2 scores using the optimal alpha
print(ridge_cv.score(Z_train, y_train))
print(ridge_cv.score(Z_test, y_test))

0.7638023061103111
0.7400596185925026


In [21]:
# Set up a list of Lasso alphas to check
l_alphas = np.linspace(0, 1000, 200)
lasso_cv = LassoCV(alphas = l_alphas)
lasso_cv.fit(Z_train, y_train)

# The optimal value of alpha is 15, which seems a bit low
lasso_cv.alpha_

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


50.25125628140704

In [22]:
# These are the features it gave coefficients of zero on, thus we should drop them
list(np.array(feats1)[lasso_cv.coef_ == 0])

['MS_Zoning_FV', 'MS_Zoning_I (all)', 'Neighborhood_NWAmes']

___
## Redo Model With New Feature Set

In [23]:

feats2 = [element for element in feats1 if element not in list(np.array(feats1)[lasso_cv.coef_ == 0])]
len(feats2)

49

In [24]:
# Redo model
X = df[feats2]
y = df['SalePrice']

# Make Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
model = LinearRegression()
model.fit(X_train, y_train)

print(f'Training Score: {model.score(X_train, y_train)}')
print(f'Testing Score: {model.score(X_test, y_test)}')
# Slightly better model performance

Training Score: 0.7665248334245606
Testing Score: 0.7263332812532179


In [26]:
# Now we will add interaction terms and rerun our model

X = df[feats2]
y = df['SalePrice']

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

X_overfit = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_overfit,
    y,
    test_size=0.2,
    random_state=42
)

In [27]:
# Reularize
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [28]:
# Test model performance
lr = LinearRegression()
lr.fit(Z_train, y_train)
print(f'Training Score: {lr.score(Z_train, y_train)}')
print(f'Testing Score: {lr.score(Z_test, y_test)}')

# This model is way too overfit and thus adding interaction terms hurt the model prospects

Training Score: 0.8280582910019311
Testing Score: -1.803139029724081e+24
