## Linear, Lasso, and Ridge Regression on significant numeric data and 5 categorical variables

| Model Features | --- | --- | --- |---|
| --- | --- |--- | --- |---|
| **Numeric** | --- |--- | --- |---|
| Overall Qual x Total SF | Total SF x Garage Area | Overall Qual^2 | Year Remod/Add x Total SF | Total SF^2 |
| Overall Qual x Garage Area | Overall Qual x Total Bathrooms | Total SF x Total Bathrooms | Overall Qual x Year Remod/Add | Total Bathrooms x Garage Area|
| Total SF | Overall Qual |--- | --- |---|
| --- | --- |--- | --- |---|
| **Categorical** | --- | --- | --- |---|
| MS SubClass | Neighborhood | Condition 1 | Exter Qual | Kitchen Qual  |
| --- | --- | --- | --- |---|


In [1]:
#importing libraries
from io import StringIO

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression,LassoCV,RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
#import re

# Data Dictionary - [Link](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt) 

In [2]:
# Read the Kaggle hold-out set and the cleaned training set from ../datasets.
testing_data = pd.read_csv("../datasets/complete_kaggle_test.csv")
housing_data = pd.read_csv("../datasets/complete_training_data.csv")

In [3]:
# Inspect the column names available in the training data.
housing_data.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [4]:
# Numeric predictors fed to every model: interaction/polynomial terms first,
# followed by the two plain features.
xvars = [
    "Overall Qual x Total SF",
    "Total SF x Garage Area",
    "Overall Qual^2",
    "Year Remod/Add x Total SF",
    "Total SF^2",
    "Overall Qual x Garage Area",
    "Overall Qual x Total Bathrooms",
    "Total SF x Total Bathrooms",
    "Overall Qual x Year Remod/Add",
    "Total Bathrooms x Garage Area",
    "Total SF",
    "Overall Qual",
]

In [5]:
# One-hot encode the five categorical predictors of the training data.
# After encoding, a fixed list of levels is removed from each block.
ms_subclass_dummies = pd.get_dummies(
    housing_data["MS SubClass"], prefix="SubClass"
).drop(columns=["SubClass_150", "SubClass_40"])

neighborhood_dummies = pd.get_dummies(
    housing_data["Neighborhood"]
).drop(columns=["Landmrk", "GrnHill", "Greens", "Blueste"])

condition_1_dummies = pd.get_dummies(
    housing_data["Condition 1"]
).drop(columns=["RRNe", "RRNn"])

exter_qual_dummies = pd.get_dummies(
    housing_data["Exter Qual"], prefix="ExQ"
).drop(columns="ExQ_Fa")

kitchen_qual_dummies = pd.get_dummies(
    housing_data["Kitchen Qual"], prefix="Kit"
).drop(columns="Kit_Fa")

In [6]:
# Target is the sale price; the starting feature matrix holds only the numeric predictors.
y = housing_data["SalePrice"]
X = housing_data.loc[:, xvars]

In [7]:
# Assemble the full design matrix: numeric/interaction features plus the dummy columns.
# It is built from housing_data[xvars] rather than from X itself, so re-running this
# cell does not append the dummy columns a second time.
X = pd.concat(
    [
        housing_data[xvars],
        ms_subclass_dummies,
        neighborhood_dummies,
        condition_1_dummies,
        exter_qual_dummies,
        kitchen_qual_dummies,
    ],
    axis=1,
)

In [8]:
# Hold out 30% of the rows for testing (70% train); the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2020,
)

In [9]:
# Check the size of the training split (rows, feature columns).
X_train.shape

(1416, 63)

In [10]:
# Learn the scaling parameters from the training split only, then apply them to both splits.
ss = StandardScaler().fit(X_train)
Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

In [11]:
# The three estimators compared below: plain OLS, cross-validated ridge over a
# log-spaced alpha grid, and cross-validated lasso with 300 candidate alphas.
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.logspace(0, 5, num=100))
lasso = LassoCV(n_alphas=300)

In [12]:
# 5-fold cross-validation on the training data. The mean R2 is quite good at about 0.91
linreg_scores = cross_val_score(linreg, Z_train, y_train, cv=5)
linreg_scores.mean()

0.9091628424107101

In [13]:
# Same 5-fold cross-validation, this time for the lasso estimator; show the mean score.
lasso_scores = cross_val_score(estimator=lasso, X=Z_train, y=y_train, cv=5)
lasso_scores.mean()

0.9055305679907562

In [14]:
# Same 5-fold cross-validation, this time for the ridge estimator; show the mean score.
ridge_scores = cross_val_score(estimator=ridge, X=Z_train, y=y_train, cv=5)
ridge_scores.mean()

0.9081491392467667

In [15]:
# Fit the linear regression on the standardized training features.
linreg.fit(Z_train,y_train)

LinearRegression()

In [16]:
# Show the fitted model's score on the training split, then on the held-out split.
display(
    linreg.score(Z_train, y_train),
    linreg.score(Z_test, y_test),
)

0.9210735676272604

0.9071488499309683

In [17]:
# statsmodels does not add an intercept on its own, so prepend a constant column
# to the standardized training features; the target is used unchanged.
y_train_sm = y_train
Z_train_sm = sm.add_constant(Z_train)

In [18]:
# Fit the same least-squares model with statsmodels to obtain inference statistics.
sm_model = sm.OLS(endog=y_train_sm, exog=Z_train_sm).fit()

In [19]:
# Build the statsmodels summary of the fitted OLS model.
results_summary = sm_model.summary()

In [20]:
# Show the first summary table (overall fit statistics such as R-squared, AIC and BIC).
results_summary.tables[0]

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.921
Model:,OLS,Adj. R-squared:,0.917
Method:,Least Squares,F-statistic:,250.4
Date:,"Sun, 16 Aug 2020",Prob (F-statistic):,0.0
Time:,18:10:34,Log-Likelihood:,-16208.0
No. Observations:,1416,AIC:,32540.0
Df Residuals:,1352,BIC:,32880.0
Df Model:,63,,
Covariance Type:,nonrobust,,


In [21]:
# Convert the coefficient table of the statsmodels summary into a DataFrame.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas versions; a file-like object is
# accepted by old and new versions alike.
results_as_html = results_summary.tables[1].as_html()
coef = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]

In [22]:
# Feature names, taken from the columns of the training feature matrix.
indexes = X_train.columns

In [23]:
# Row labels for the coefficient table: the intercept first, then every feature name.
complete_indexes = ['const', *indexes]

In [24]:
# Attach the row labels as a column; the list needs one entry per row of the coefficient table.
coef["indexes"] = complete_indexes

In [25]:
# Use the labels as the row index (modifies coef in place; the trailing ';' suppresses cell output).
coef.set_index("indexes",inplace=True);

In [26]:
# Preview the first 13 rows of the coefficient table (intercept first).
coef.head(13)

Unnamed: 0_level_0,coef,std err,t,P>|t|,[0.025,0.975]
indexes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
const,182700.0,615.479,296.816,0.0,181000.0,184000.0
Overall Qual x Total SF,60410.0,11100.0,5.422,0.0,38600.0,82300.0
Total SF x Garage Area,4592.3729,6185.158,0.742,0.458,-7541.177,16700.0
Overall Qual^2,-5450.2817,9306.979,-0.586,0.558,-23700.0,12800.0
Year Remod/Add x Total SF,395500.0,114000.0,3.463,0.001,171000.0,619000.0
Total SF^2,8183.1488,6349.666,1.289,0.198,-4273.118,20600.0
Overall Qual x Garage Area,4448.6976,5346.239,0.832,0.405,-6039.128,14900.0
Overall Qual x Total Bathrooms,6142.9403,5904.927,1.04,0.298,-5440.875,17700.0
Total SF x Total Bathrooms,9715.8239,6418.432,1.514,0.13,-2875.344,22300.0
Overall Qual x Year Remod/Add,-135600.0,88400.0,-1.534,0.125,-309000.0,37800.0


### Conclusions from modeling

The train score for this model is 0.921 (test score is 0.907). I think that this model fits the data very well and has neither too much bias nor too much variance.

### Calculating sale prices for the Kaggle test data (need to output .csv w/ header Id,SalePrice)

In [27]:
# Select the same numeric/interaction features (xvars) from the Kaggle test data.
X_kaggle = testing_data[xvars]

In [28]:
# One-hot encode the Kaggle categoricals and align each block to the columns of the
# corresponding training dummies. Reindexing drops levels the model was not trained
# on and adds all-zero columns for training levels absent from the Kaggle data, so
# the columns match the training matrix without hand-maintained drop lists.
kaggle_ms_subclass_dummies = (
    pd.get_dummies(testing_data["MS SubClass"], prefix="SubClass")
    .reindex(columns=ms_subclass_dummies.columns, fill_value=0)
)

kaggle_neighborhood_dummies = (
    pd.get_dummies(testing_data["Neighborhood"])
    .reindex(columns=neighborhood_dummies.columns, fill_value=0)
)

kaggle_condition_1_dummies = (
    pd.get_dummies(testing_data["Condition 1"])
    .reindex(columns=condition_1_dummies.columns, fill_value=0)
)

kaggle_exter_qual_dummies = (
    pd.get_dummies(testing_data["Exter Qual"], prefix="ExQ")
    .reindex(columns=exter_qual_dummies.columns, fill_value=0)
)

kaggle_kitchen_qual_dummies = (
    pd.get_dummies(testing_data["Kitchen Qual"], prefix="Kit")
    .reindex(columns=kitchen_qual_dummies.columns, fill_value=0)
)

In [29]:
# Assemble the Kaggle design matrix: numeric/interaction features plus the dummy columns.
# It is built from testing_data[xvars] rather than from X_kaggle itself, so re-running
# this cell does not append the dummy columns a second time.
X_kaggle = pd.concat(
    [
        testing_data[xvars],
        kaggle_ms_subclass_dummies,
        kaggle_neighborhood_dummies,
        kaggle_condition_1_dummies,
        kaggle_exter_qual_dummies,
        kaggle_kitchen_qual_dummies,
    ],
    axis=1,
)

In [30]:
# Standardize the Kaggle features with the scaler that was fitted on the training split
# (transform only -- the scaler is not refit here).
Z_kaggle = ss.transform(X_kaggle)

In [31]:
# Sanity check: the Kaggle feature matrix must have the same columns, in the same
# order, as X_train for the fitted model's predictions to make sense.
# Prints nothing when the two match. A differing column count is reported explicitly
# instead of raising IndexError (Kaggle has fewer columns) or going unnoticed
# (Kaggle has extra columns).
if len(X_kaggle.columns) != len(X_train.columns):
    print("column count differs:", len(X_train.columns), "train vs", len(X_kaggle.columns), "kaggle")
for position, (train_col, kaggle_col) in enumerate(zip(X_train.columns, X_kaggle.columns)):
    if train_col != kaggle_col:
        print(position, train_col)

In [32]:
# Predict sale prices for the Kaggle rows with the fitted linear model, store them
# as a SalePrice column, and preview the first row.
price_X_testing = linreg.predict(Z_kaggle)
testing_data = testing_data.assign(SalePrice=price_X_testing)
testing_data.head(1)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Total SF x Garage Area,Overall Qual^2,Year Remod/Add x Total SF,Total SF^2,Overall Qual x Garage Area,Overall Qual x Total Bathrooms,Total SF x Total Bathrooms,Overall Qual x Year Remod/Add,Total Bathrooms x Garage Area,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,1297120,36,5748600,8690704,2640,12.0,5896.0,11700,880.0,129411.178933


In [33]:
# Keep only the two columns required for the submission and write them out without the index.
ols_sigfeat_fit = testing_data.loc[:, ["Id", "SalePrice"]]
ols_sigfeat_fit.to_csv("../datasets/ols_sigfeat_fit.csv", index=False)