In [1]:
#Import packages to be used
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
# Imports raw data
housing_import = pd.read_csv('trimmed_data.csv', index_col='Id')

In [3]:
# Cleaning out NaN values remaining
housing_import = housing_import[housing_import.MasVnrType.isnull() == False]
housing = housing_import.drop('MiscFeature', axis=1)

In [4]:
pd.get_dummies(housing)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,1stFlrSF,LowQualFinSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,856,0,...,0,0,0,1,0,0,0,0,1,0
2,20,80.0,9600,6,8,1976,1976,0.0,1262,0,...,0,0,0,1,0,0,0,0,1,0
3,60,68.0,11250,7,5,2001,2002,162.0,920,0,...,0,0,0,1,0,0,0,0,1,0
4,70,60.0,9550,7,5,1915,1970,0.0,961,0,...,0,0,0,1,1,0,0,0,0,0
5,60,84.0,14260,8,5,2000,2000,350.0,1145,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62.0,7917,6,5,1999,2000,0.0,953,0,...,0,0,0,1,0,0,0,0,1,0
1457,20,85.0,13175,6,6,1978,1988,119.0,2073,0,...,0,0,0,1,0,0,0,0,1,0
1458,70,66.0,9042,7,9,1941,2006,0.0,1188,0,...,0,0,0,1,0,0,0,0,1,0
1459,20,68.0,9717,5,6,1950,1996,0.0,1078,0,...,0,0,0,1,0,0,0,0,1,0


My interpretation of the get dummies function is that it took each categorical variable and added a variable to the dataframe for each category and populated that variable with a binary to capture datapoints from that category. 

In [5]:
housing_dummies = pd.get_dummies(housing)

In [6]:
# Creating Target and Feature Arrays
# .values returns numpy array instead of dataframe
X = housing_dummies.drop('SalePrice', axis=1).values
y = housing_dummies.SalePrice.values



In [7]:
# Unsure as what I need as far as reshaping goes
X.shape
y.shape

y = y.reshape(-1, 1)

In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [9]:
X_train.shape

(1089, 273)

In [10]:
# Creating a dataframe for performance metrics
d = {'R_Squared': [], 'RMSE': [], 'MAPE': []}
metrics = pd.DataFrame(data=d)
metrics.index.name = 'Model'
metrics

Unnamed: 0_level_0,R_Squared,RMSE,MAPE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [11]:
# Create LinearRegression Object
reg = LinearRegression()

In [12]:
# Fit the data
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
# Create Predictions based on fit
y_pred_test = reg.predict(X_test)
y_pred_train = reg.predict(X_train)

In [14]:
# Performance Metrics for training set
print("R^2: {}".format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Root Mean Squared Error: {}".format(rmse))
error = (y_train-y_pred_train)/y_train * 100
mape = abs(error).mean()
print('Mean Absolute Percent Error: {}'.format(mape))

R^2: 0.930668487832045
Root Mean Squared Error: 20867.61910938707
Mean Absolute Percent Error: 8.179160424146515


In [15]:
# Adding metrics to the metrics dataframe
d = {'R_Squared': [reg.score(X_train, y_train)], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Linear_Train'])
temp.index.name = 'Model'
metrics = metrics.append(temp)

In [16]:
# Performance Metrics for test set
print("R^2: {}".format(reg.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Root Mean Squared Error: {}".format(rmse))
error = (y_test-y_pred_test)/y_test * 100
mape = abs(error).mean()
print('Mean Absolute Percent Error: {}'.format(mape))

R^2: 0.8770252691748901
Root Mean Squared Error: 27774.277800625732
Mean Absolute Percent Error: 11.10724000427329


In [17]:
# Adding metrics to the metrics dataframe
d = {'R_Squared': [reg.score(X_test, y_test)], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Linear_Test'])
temp.index.name = 'Model'
metrics = metrics.append(temp)

In [18]:
# Applying Ridge Regression

# Train test split -- Random state 42 should produce same split as above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

# Creating Ridge object, fitting, and predicting. Normalize = true ensures all variables are on the same scale
ridge = Ridge(alpha=0.1, normalize = True)
ridge.fit(X_train, y_train)
ridge_pred_test = ridge.predict(X_test)
ridge_pred_train = ridge.predict(X_train)


In [19]:
# Metrics for train set
print("R^2: {}".format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_train, ridge_pred_train))
print("Root Mean Squared Error: {}".format(rmse))
error = (y_train-ridge_pred_train)/y_train * 100
mape = abs(error).mean()
print('Mean Absolute Percent Error: {}'.format(mape))

R^2: 0.9247829736114249
Root Mean Squared Error: 21735.300298706716
Mean Absolute Percent Error: 8.211178219406166


In [20]:
# Adding metrics to the metrics dataframe
d = {'R_Squared': [ridge.score(X_train, y_train)], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Train'])
temp.index.name = 'Model'
metrics = metrics.append(temp)

In [21]:
# Performance Metrics for test set
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred_test))
print("Root Mean Squared Error: {}".format(rmse))
error = (y_test-ridge_pred_test)/y_test * 100
mape = abs(error).mean()
print('Mean Absolute Percent Error: {}'.format(mape))

R^2: 0.8946196664700052
Root Mean Squared Error: 25710.742518631756
Mean Absolute Percent Error: 9.869781484030518


In [22]:
# Adding metrics to the metrics dataframe
d = {'R_Squared': [ridge.score(X_test, y_test)], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Test'])
temp.index.name = 'Model'
metrics = metrics.append(temp)

In [23]:
# Applying Lasso Regression

# Train test split -- Random state 42 should produce same split as above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

lasso = Lasso(alpha = 0.1, normalize = True)
lasso.fit(X_train, y_train)
lasso_pred_test = lasso.predict(X_test)
lasso_pred_train = lasso.predict(X_train)

  positive)


In [24]:
# Performance metrics for train set
print("R^2: {}".format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_train, lasso_pred_train))
print("Root Mean Squared Error: {}".format(rmse))
error = (y_train-lasso_pred_train)/y_train * 100
mape = abs(error).mean()
print('Mean Absolute Percent Error: {}'.format(mape))

R^2: 0.9306625361304
Root Mean Squared Error: 20868.514771213853
Mean Absolute Percent Error: 49.564077550621406


In [25]:
# Adding metrics to the metrics dataframe
d = {'R_Squared': [lasso.score(X_train, y_train)], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Lasso_Train'])
temp.index.name = 'Model'
metrics = metrics.append(temp)

In [26]:
# Performance Metrics for test set
print("R^2: {}".format(lasso.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred_test))
print("Root Mean Squared Error: {}".format(rmse))
error = (y_test-lasso_pred_test)/y_test * 100
mape = abs(error).mean()
print('Mean Absolute Percent Error: {}'.format(mape))

R^2: 0.8870814680713186
Root Mean Squared Error: 26614.44697352724
Mean Absolute Percent Error: 50.80983987084027


In [27]:
# Adding metrics to the metrics dataframe
d = {'R_Squared': [lasso.score(X_test, y_test)], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Lasso_Test'])
temp.index.name = 'Model'
metrics = metrics.append(temp)

In [28]:
metrics

Unnamed: 0_level_0,R_Squared,RMSE,MAPE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Linear_Train,0.930668,20867.619109,8.17916
Linear_Test,0.877025,27774.277801,11.10724
Ridge_Train,0.924783,21735.300299,8.211178
Ridge_Test,0.89462,25710.742519,9.869781
Lasso_Train,0.930663,20868.514771,49.564078
Lasso_Test,0.887081,26614.446974,50.80984
