In [11]:
# README FIRST : This it the modeling and training section from `MainCodingFile.ipynb`
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import OneHotEncoder
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning);
from itertools import cycle

%matplotlib inline

In [12]:
# train
df = pd.read_csv('../datasets/cleaned_training_data.csv')
df.shape

  df = pd.read_csv('../datasets/cleaned_training_data.csv')


(2051, 306)

In [13]:
# test
kaggle_data = pd.read_csv('../datasets/cleaned_testing_data.csv')
kaggle_data.shape

(878, 288)

## Training and Modeling Section With `Train.csv` file imported as `df`

In [14]:
features = ['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars','Total Bsmt SF',
            '1st Flr SF','Year Built','Year Remod/Add','Full Bath','Garage Yr Blt','TotRms AbvGrd',
            'Mas Vnr Area','Fireplaces','BsmtFin SF 1','Wood Deck SF','Open Porch SF',
            'After 2000','Exter Qual','Overall Qual','Gr Liv Area Times Garage Area',
            'Kitchen Qual_Ex','Kitchen Qual_Fa','Kitchen Qual_Gd','Kitchen Qual_TA','Total Bsmt SF Times 1st Flr SF',
            'Heating QC_Ex','Heating QC_Fa','Heating QC_Gd','Heating QC_TA','1st Flr SF Times 2nd Flr SF',
            'Neighborhood_NridgHt','Neighborhood_Timber','Bsmt Full Bath','Neighborhood_StoneBr',
            'Neighborhood_Somerst','Neighborhood_NoRidge',
            'Not Kitchen Qual_TA','Not Heating QC_TA','Not Neighborhood_OldTown'
           ]



X = df[features]
y = df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                    train_size=0.90)


### Null Model prediction

In [15]:
predict_baseline = np.mean(y_train)

In [16]:
predict_baseline

182064.33983739838

### This is the mean of housing prices from the training data for our model. 

##### Create an array with same dimensions as test to find metrics of our base model.

In [17]:
y_pred_base = np.full_like(y_test, 182064.33983739838)

In [18]:
mean_squared_error(y_test, y_pred_base)

5922129689.684466

In [19]:
print(f"RMSE Ridge | {np.sqrt(mean_squared_error(y_test, y_pred_base))}")

RMSE Ridge | 76955.37466405102


### We can do better than that! Let us begin the modeling process! 

#### Scale Values

In [20]:
ss = StandardScaler()

X_train = ss.fit_transform(X_train)

X_test = ss.transform(X_test)

##### Check for null values - recall we filled all nulls with median values!

In [21]:
df[features].isna().sum()

Overall Qual                      0
Gr Liv Area                       0
Garage Area                       0
Garage Cars                       0
Total Bsmt SF                     0
1st Flr SF                        0
Year Built                        0
Year Remod/Add                    0
Full Bath                         0
Garage Yr Blt                     0
TotRms AbvGrd                     0
Mas Vnr Area                      0
Fireplaces                        0
BsmtFin SF 1                      0
Wood Deck SF                      0
Open Porch SF                     0
After 2000                        0
Exter Qual                        0
Overall Qual                      0
Gr Liv Area Times Garage Area     0
Kitchen Qual_Ex                   0
Kitchen Qual_Fa                   0
Kitchen Qual_Gd                   0
Kitchen Qual_TA                   0
Total Bsmt SF Times 1st Flr SF    0
Heating QC_Ex                     0
Heating QC_Fa                     0
Heating QC_Gd               

In [22]:
# Instantiate a linear regression model.
linmod = LinearRegression()

# Fit our linear regression model to the training data.
linmod.fit(X_train,y_train)

In [23]:
lasso = Lasso()

ridge = Ridge()

In [24]:
lasso.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(


In [25]:
ridge.fit(X_train,y_train)

In [26]:
# Print out intercept and coefficients.
print(f'Intercept: {linmod.intercept_}')
print(f'Coefficient: {linmod.coef_}')

Intercept: 182065.01499640834
Coefficient: [-3.31948790e+17  1.32320000e+04 -9.09200000e+03  1.80800000e+03
  2.73600000e+04  1.30080000e+04  3.06400000e+03  4.96400000e+03
 -6.36000000e+02 -8.32000000e+02  2.13200000e+03  4.09000000e+03
  5.48600000e+03  8.29600000e+03  2.04900000e+03  7.22000000e+02
  2.32400000e+03  2.58000000e+03  3.31948790e+17  2.57160000e+04
  2.40920159e+16  1.40342575e+16  4.44564765e+16  2.48799973e+16
 -4.19720000e+04  2.31200000e+03 -2.21000000e+02  7.64500000e+02
 -5.93048384e+14 -5.27800000e+03  6.65600000e+03  1.65050000e+03
  1.38200000e+03  6.57700000e+03  1.75000000e+03  4.60600000e+03
 -2.06067804e+16 -5.93048384e+14  1.86100000e+03]


In [27]:
# Print out intercept and coefficients - lasso
print(f'Intercept: {lasso.intercept_}')
print(f'Coefficient: {lasso.coef_}')

Intercept: 182064.33983739838
Coefficient: [ 53611.04829968  13121.64530994  -8941.93722285   1794.11149518
  27312.87364097  13023.42811885   3070.80650266   4942.35447428
   -646.23659825   -814.15010697   2141.59993342   4064.52037651
   5479.86440098   8250.30632264   2045.24955411    724.37576572
   2356.87385556   2567.07444475 -37140.40132651  25668.22536616
   9642.75162567   -566.66386506  -3368.4439683     300.70800329
 -41963.93079535   1941.25305732   -356.08999837    479.48054581
  -3048.45572372  -5231.49371091   6674.56488064   1645.27758112
   1352.06138172   6645.23092312   1733.23415343   4591.50476528
   4304.94516053  -1506.53148092   1862.71830857]


In [28]:
# Print out intercept and coefficients - ridge
print(f'Intercept: {ridge.intercept_}')
print(f'Coefficient: {ridge.coef_}')

Intercept: 182064.33983739838
Coefficient: [  8238.38167365  13074.205634    -8454.00466445   1774.90777505
  26959.75566083  13077.90420467   3075.0974914    4920.44348196
   -623.12811513   -817.24735647   2158.29891523   4072.27989999
   5491.05703532   8219.19350169   2044.36068214    729.30240531
   2350.56283334   2578.55511377   8238.38167361  24899.0128461
   9758.57826346   -507.60563889  -3175.4868255   -1908.44467986
 -41412.22593954   2282.5440157    -240.8348613     721.14185332
   -621.23271736  -4934.73640099   6700.8453815    1651.93200426
   1380.73934825   6654.73708596   1735.71652877   4612.37214104
   1908.44467987    621.23271744   1869.77209971]


In [29]:
#predict
prediction_lin = linmod.predict(X_test)

#actual
actual = y_test

train_score_lin = linmod.score(X_train, y_train)
test_score_lin = linmod.score(X_test, y_test)

print("The train score for lin model is {}".format(train_score_lin))
print("The test score for lin model is {}".format(test_score_lin))

The train score for lin model is 0.8731987050307888
The test score for lin model is 0.8971382603045467


In [30]:
# And same for the other models 
#predict
prediction_lasso = lasso.predict(X_test)

#actual
actual = y_test

train_score_lasso = lasso.score(X_train, y_train)
test_score_lasso = lasso.score(X_test, y_test)

print("The train score for lasso model is {}".format(train_score_lasso))
print("The test score for lasso model is {}".format(test_score_lasso))

The train score for lasso model is 0.8732174963791982
The test score for lasso model is 0.8967215151800544


In [31]:
#predict
prediction_ridge = ridge.predict(X_test)

#actual
actual = y_test

train_score_ridge = ridge.score(X_train, y_train)
test_score_ridge = ridge.score(X_test, y_test)

print("The train score for ridge model is {}".format(train_score_ridge))
print("The test score for ridge model is {}".format(test_score_ridge))

The train score for ridge model is 0.8732124382219942
The test score for ridge model is 0.8967139629736494


# Other Fits

A look at non-linearity, polynomial features for a better characteristic equation. 

In [32]:
poly = PolynomialFeatures(degree=2,include_bias=False)

In [33]:
X_train = poly.fit_transform(X_train)

X_test = poly.transform(X_test)

Cubed order units $\text{m}^3$ term present in characteristic equation. 

In [34]:
linmod.fit(X_train,y_train)

In [35]:
lasso_model = LassoCV(max_iter=10000)

In [36]:
lasso_model.fit(X_train,y_train)

In [37]:
ridge.fit(X_train,y_train)

In [38]:
r_alphas = np.logspace(0,3,100)

# Cross-validate over our list of ridge alphas.
ridge_model = RidgeCV(alphas=r_alphas, cv = 5, scoring='r2')
# Fit model using best ridge alpha!
ridge_model.fit(X_train, y_train)

In [39]:
# And same for the other models 
#predict
prediction_lasso = lasso_model.predict(X_test)

#actual
actual = y_test

train_score_lasso = lasso_model.score(X_train, y_train)
test_score_lasso = lasso_model.score(X_test, y_test)

print("The train score for lasso model is {}".format(train_score_lasso))
print("The test score for lasso model is {}".format(test_score_lasso))
print(f"{round(((train_score_lasso - test_score_lasso)*100),2)}% difference")

The train score for lasso model is 0.9282990746778413
The test score for lasso model is 0.927615651356227
0.07% difference


In [40]:
#predict
prediction_ridge = ridge_model.predict(X_test)

#actual
actual = y_test

train_score_ridge = ridge_model.score(X_train, y_train)
test_score_ridge = ridge_model.score(X_test, y_test)

print("The train score for ridge model is {}".format(train_score_ridge))
print("The test score for ridge model is {}".format(test_score_ridge))
print(f"{round(((train_score_ridge - test_score_ridge)*100),2)}% difference")

The train score for ridge model is 0.9361103626754603
The test score for ridge model is 0.9286080427137793
0.75% difference


# Error Metrics Section

In [41]:
print(f"MAE Ridge | {mean_absolute_error(y_test, prediction_ridge)}")

MAE Ridge | 15264.692066931315


In [42]:
print(f"MAE Lasso | {mean_absolute_error(y_test, prediction_lasso)}")

MAE Lasso | 14448.121266793032


In [43]:
print(f"MAE Linear | {mean_absolute_error(y_test, prediction_lin)}")

MAE Linear | 17255.294797148545


In [44]:
# RSS Ridge
sum((y_test - prediction_ridge)**2)

86579811155.35228

In [45]:
# RSS Lasso
sum((y_test - prediction_lasso)**2)

87783322861.64438

In [46]:
# RSS Linear
sum((y_test - prediction_lin)**2)

124744443722.68575

In [47]:
# MSE Ridge
mean_squared_error(y_test, prediction_ridge)

420290345.4143317

In [48]:
# MSE Lasso
mean_squared_error(y_test, prediction_lasso)

426132635.250701

In [49]:
# MSE Linear
mean_squared_error(y_test, prediction_lin)

605555552.0518726

In [50]:
# Root Mean Squared Error Ridge
print(f"RMSE Ridge | {np.sqrt(mean_squared_error(y_test, prediction_ridge))}")

RMSE Ridge | 20500.984010879372


In [51]:
# Root Mean Squared Error Lasso
print(f"RMSE Lasso | {np.sqrt(mean_squared_error(y_test, prediction_lasso))}")

RMSE Lasso | 20642.98028993636


In [52]:
# Root Mean Squared Error Linear
print(f"RMSE Linear | {np.sqrt(mean_squared_error(y_test, prediction_lin))}")

RMSE Linear | 24608.03836253253


# Export Section

In [53]:
df.shape

(2051, 306)

In [54]:
kaggle_data.shape

(878, 288)

In [55]:
X_train.shape

(1845, 819)

In [56]:
X_test.shape

(206, 819)

In [57]:
kaggle_data[features] = ss.transform(kaggle_data[features])

In [58]:
kaggle_data[features].columns

Index(['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars',
       'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Year Remod/Add',
       'Full Bath', 'Garage Yr Blt', 'TotRms AbvGrd', 'Mas Vnr Area',
       'Fireplaces', 'BsmtFin SF 1', 'Wood Deck SF', 'Open Porch SF',
       'After 2000', 'Exter Qual', 'Overall Qual',
       'Gr Liv Area Times Garage Area', 'Kitchen Qual_Ex', 'Kitchen Qual_Fa',
       'Kitchen Qual_Gd', 'Kitchen Qual_TA', 'Total Bsmt SF Times 1st Flr SF',
       'Heating QC_Ex', 'Heating QC_Fa', 'Heating QC_Gd', 'Heating QC_TA',
       '1st Flr SF Times 2nd Flr SF', 'Neighborhood_NridgHt',
       'Neighborhood_Timber', 'Bsmt Full Bath', 'Neighborhood_StoneBr',
       'Neighborhood_Somerst', 'Neighborhood_NoRidge', 'Not Kitchen Qual_TA',
       'Not Heating QC_TA', 'Not Neighborhood_OldTown'],
      dtype='object')

In [59]:
kaggle_data[features].shape

(878, 39)

##### Ignore following warning we have verified there are no conflicts

In [60]:
# Transform test kaggle data
X_transformed = poly.transform(kaggle_data[features])  # X is your input data



In [61]:
X_transformed.shape

(878, 819)

In [62]:
kaggle_data.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour',
       ...
       'Misc Feature_Shed', 'Sale Type_CWD', 'Sale Type_Con',
       'Sale Type_ConLD', 'Sale Type_ConLI', 'Sale Type_ConLw',
       'Sale Type_New', 'Sale Type_Oth', 'Sale Type_VWD', 'Sale Type_WD '],
      dtype='object', length=288)

In [63]:
new_frame_to_pred = pd.DataFrame(X_transformed, columns=poly.get_feature_names_out(features))
new_frame_to_pred;

# Only Run the Following Cell When You Are Ready to Export!

Again, we ignore the warning as it turns out the problem has been 
resolved via indexing methods to confirm columns have been read in correctly. 

In [64]:
# #Only uncomment to create file - currently needs renaming
# submission = pd.DataFrame({'id': kaggle_data['Id']})
# submission['SalePrice'] = lasso_model.predict(new_frame_to_pred)

# submission.to_csv('../datasets/mean_submission_lasso_model_func_1835.csv', index=False)