# Predicting the Sale Price of a House in Ames, Iowa

### Production Model

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV 
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

# Widen pandas' display limits so wide/long frames are not truncated in output
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)

### EDA and Cleaning

In [2]:
# Read the training set (train.csv in the datasets folder) into a DataFrame
file_name_path = './datasets/train.csv'
df = pd.read_csv(filepath_or_buffer=file_name_path)

In [3]:
# Remove the columns that are not used in this analysis
df = df.drop(columns=[
    'Id', 'PID', 'Utilities', 'Condition 2',
    'Roof Matl', 'Heating', 'Low Qual Fin SF', 'Functional',
    'Fireplace Qu', 'Paved Drive', 'Pool Area', 'Pool QC',
    'Mas Vnr Type', 'Mas Vnr Area', 'Bsmt Exposure', 'BsmtFin Type 2',
    'Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Finish', 'Garage Yr Blt',
    'Garage Qual', 'Garage Cond',
])

In [4]:
# Store 'MS SubClass' values as strings rather than numbers
# (string columns are the ones picked up for one-hot encoding later on)
df['MS SubClass'] = df['MS SubClass'].map(str)

In [5]:
# Per-column null counts, showing only the columns that have at least one null
df.isnull().sum().loc[lambda null_counts: null_counts > 0]

Lot Frontage       330
Alley             1911
Bsmt Qual           55
Bsmt Cond           55
BsmtFin Type 1      55
BsmtFin SF 1         1
BsmtFin SF 2         1
Bsmt Unf SF          1
Total Bsmt SF        1
Garage Type        113
Garage Cars          1
Garage Area          1
Fence             1651
Misc Feature      1986
dtype: int64

In [6]:
# Discard the rows that have a null in any of these three columns
df = df.dropna(subset=['Garage Area', 'Garage Cars', 'Total Bsmt SF'])

In [7]:
# Replace the remaining nulls with explicit placeholders, one per column:
# 0 for the numeric 'Lot Frontage', and a descriptive label for each
# categorical column. A single vectorised fillna replaces the previous
# element-wise map(lambda ...) calls and produces the same values.
df = df.fillna({
    'Lot Frontage': 0,
    'Alley': 'no_alley',
    'Bsmt Qual': 'no_bsmt',
    'Bsmt Cond': 'no_bsmt',
    'BsmtFin Type 1': 'no_bsmt',
    'Garage Type': 'no_garage',
    'Fence': 'no_fence',
    'Misc Feature': 'none',
})

In [8]:
# Remove the columns flagged as multicollinear
df = df.drop(columns=[
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    '1st Flr SF',
    '2nd Flr SF',
])

In [9]:
# Treat rows with 'Gr Liv Area' above 3500 as outliers and remove them
index_to_drop = df.index[df['Gr Liv Area'] > 3500].values.astype(int)
df = df.drop(index=index_to_drop)

### Preprocessing and Feature Engineering

In [10]:
# One-hot encode categorical features: collect every object-dtype column.
# The 'object' dtype string is used because the np.object alias was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on access).
features_to_change = list(df.select_dtypes(include=['object']).columns)

In [11]:
# Expand each categorical column into indicator columns; drop_first=True omits
# the first level of every feature
df = pd.get_dummies(
    data=df,
    columns=features_to_change,
    drop_first=True,
)

In [12]:
# Create a function to calculate the y-predictions and residuals for a given model

def check_residual(model, x_test_data, y_test_data):
    """Return the predictions and residuals of a model on a data set.

    Parameters
    ----------
    model : fitted estimator or the string 'baseline'
        Any object with a ``predict`` method. Pass ``'baseline'`` to use a
        constant prediction instead.
    x_test_data : array-like
        Input passed to ``model.predict``. For the baseline, the values whose
        mean (``x_test_data.mean()``) is used as the constant prediction.
    y_test_data : array-like
        True target values.

    Returns
    -------
    tuple
        ``(y_predictions, residuals)`` where
        ``residuals = y_test_data - y_predictions``. For the baseline,
        ``y_predictions`` is a NumPy array, so the subtraction also works when
        ``y_test_data`` is a plain list.
    """
    # isinstance guard: only a literal string selects the baseline branch
    if isinstance(model, str) and model == 'baseline':
        # Constant prediction repeated once per target value
        y_predictions = np.full(len(y_test_data), x_test_data.mean())
    else:
        y_predictions = model.predict(x_test_data)

    residuals = y_test_data - y_predictions

    return (y_predictions, residuals)

In [13]:
# Create a function to calculate R2, MSE, RMSE

def evaluate_model(model, X, y):
    """Print R2, MSE and RMSE for a model evaluated against ``y``.

    Parameters
    ----------
    model : fitted estimator or the string 'baseline'
        Any object with a ``predict`` method. Pass ``'baseline'`` to score a
        constant prediction instead.
    X : array-like or scalar
        Input passed to ``model.predict``. For the baseline, the constant
        value predicted for every observation (printed as the average price).
    y : array-like
        True target values.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # isinstance guard: only a literal string selects the baseline branch
    if isinstance(model, str) and model == 'baseline':
        # Repeat the constant once per observation so the metrics can be computed
        y_pred = np.full(len(y), X)
        print(f'Baseline: Avg Price = {X}')
    else:
        # Predict once and reuse the result for every metric
        y_pred = model.predict(X)

    mse = mean_squared_error(y, y_pred)
    print(f'R2 Score: {r2_score(y, y_pred)}')
    print(f'MSE: {mse}')
    print(f'RMSE: {np.sqrt(mse)}')

In [14]:
# Target is 'SalePrice'; every other column is a feature
y_lasso = df['SalePrice']
X_lasso = df.drop(columns='SalePrice')

In [16]:
# Remove a hand-picked set of indicator columns from the feature matrix
# NOTE(review): the selection criterion is not shown in this notebook — TODO document it
X_lasso = X_lasso.drop(columns=[
    'Misc Feature_TenC',
    'Electrical_Mix',
    'Heating QC_Po',
    'Bsmt Cond_Fa',
    'Garage Type_no_garage',
    'Bsmt Cond_Po',
    'Exterior 1st_CBlock',
    'BsmtFin Type 1_no_bsmt',
    'Exterior 1st_ImStucc',
    'Neighborhood_Landmrk',
    'Exterior 1st_Stone',
    'Bsmt Qual_no_bsmt',
    'Exterior 2nd_Stone',
    'Misc Feature_none',
    'MS Zoning_C (all)',
    'Bsmt Cond_no_bsmt',
    'Neighborhood_GrnHill',
    'Alley_no_alley',
    'Fence_no_fence',
])

In [17]:
# Split features and target into train and test sets (seeded for reproducibility;
# no test_size is given, so scikit-learn's default proportion applies)
XL_train, XL_test, yl_train, yl_test = train_test_split(
    X_lasso,
    y_lasso,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply them
# to both splits
ss_lasso = StandardScaler().fit(XL_train)
XLs_train = ss_lasso.transform(XL_train)
XLs_test = ss_lasso.transform(XL_test)

### Modeling

In [18]:
# Candidate Lasso alphas: 100 log-spaced values from 1e-3 to 1
l_alphas = np.logspace(start=-3, stop=0, num=100)

# 5-fold cross-validated search over the candidate alphas, fitted on the
# scaled training data
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=50000).fit(XLs_train, yl_train)

# Score on the training split and on the held-out test split
(lasso_cv.score(XLs_train, yl_train), lasso_cv.score(XLs_test, yl_test))

(0.9286577521862895, 0.9059810796058696)

In [19]:
# Mean cross-validated score of the estimator on the scaled training data
# NOTE(review): XLs_train was scaled using the whole training split, so each
# validation fold has already influenced the scaling statistics
cross_val_score(estimator=lasso_cv, X=XLs_train, y=yl_train).mean()

0.8919610872942642

### Model Coefficients

In [20]:
# Fitted coefficients indexed by feature name, largest first
(
    pd.DataFrame(
        {'coef_value': lasso_cv.coef_},
        index=pd.Index(X_lasso.columns, name='Feature'),
    )
    .sort_values(by='coef_value', ascending=False)
)

Unnamed: 0_level_0,coef_value
Feature,Unnamed: 1_level_1
Gr Liv Area,25001.751692
Total Bsmt SF,13018.862891
MS SubClass_20,12435.041165
Year Built,11347.105597
Overall Qual,9772.625652
MS SubClass_60,8098.109039
MS SubClass_50,7677.490134
Overall Cond,7063.687348
Lot Area,6775.763518
MS SubClass_30,6689.415472


### Model Metrics

In [23]:
# Report R2, MSE and RMSE on the held-out test split
evaluate_model(model=lasso_cv, X=XLs_test, y=yl_test)

R2 Score: 0.9059810796058696
MSE: 609178044.6667025
RMSE: 24681.53246187729


In [22]:
# Test-split predictions and residuals (actual minus predicted)
yl_predictions, yl_residuals = check_residual(
    model=lasso_cv,
    x_test_data=XLs_test,
    y_test_data=yl_test,
)

### Model Insights

Residual plots indicate that the model performs less well on outliers, but overall its performance is satisfactory. The model found strong predictors of house sale price. It shows low bias and low variance: R² is about 0.93 on the training split, 0.91 on the test split, and 0.89 under cross-validation.

We recommend re-validating the model over time to confirm that it remains accurate.

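Because the features were standardized before fitting, each coefficient can be interpreted as the expected change in sale price (in the units of `SalePrice`) for a one-standard-deviation increase in that feature, holding all other features constant.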