## Project Code

Preparation:

In [213]:
%matplotlib inline
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


pd.set_option('precision', 2)                    # Setting prices to exclude decimal points

pd.set_option('display.max_columns', 500)        # Setting DataFrame columns to show all of them

pd.options.mode.chained_assignment = None        # default='warning signs', when adding new columns to data frames

from sklearn.model_selection import train_test_split     # split data into random train and test subsets

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold, cross_val_score

Open files to dataframes:

In [157]:
# Open housing data into a dataframe
dtitles = pd.DataFrame.from_csv('Housing_data/test.csv', index_col=None)

# Open housing price data into a dataframe
dprices = pd.DataFrame.from_csv('Housing_data/sample_submission.csv', index_col=None)

# open weather data file
weather_d = pd.DataFrame.from_csv('Weather_data/weather.csv', index_col=None)           

Wrangle housing data:

In [158]:
# Merge/combine the properties' descriptions and its sales prices into a new dataframe.
h_data = pd.merge(dtitles, dprices, on='Id')

# Filtering from 2006-2009!
hs_data = h_data[(h_data.YrSold >= 2006) & (h_data.YrSold <= 2009)]                            

# Rename columns
hs_data = hs_data.rename(columns={"MoSold":"month", "YrSold":"year"})          

# Building dataframe to a series datetime index                                                                          
hs_data.loc[:, 'day'] = 1                                                       

# adding column as one datetime.
date = pd.to_datetime(hs_data[['year', 'month', 'day']])                        
hs_data.loc[:, 'Date'] = date
hs_data.loc[:, 'Date'] = hs_data.Date.dt.to_period('M')
                                                                                   
# Seting, sorting and renaming index
hs_data = hs_data.set_index('Date').sort_index()                               
hs_data.drop(['day'], axis=1, inplace=True)

# changing/rename index to string object for later joining
hs_data.index = hs_data.index.map(str).rename('Year_Month')          

Wrangle weather data and join dataframes:

In [159]:
# Sorting to data needed
weather_d1 = weather_d[weather_d.STATION_NAME == 'AMES 8 WSW IA US']                   
weather_d2 = weather_d1[(weather_d1.DATE >= 20060101) & (weather_d1.DATE <= 20091231)]
weather_d3 = weather_d2[['DATE', 'PRCP', 'SNOW', 'TMAX', 'TMIN']]

 # Getting date to datetime, setting and sortinging index to ('Date')
weather_d3.loc[:, 'Date'] = pd.to_datetime(weather_d3.DATE, format='%Y%m%d')           
weather_d4 = weather_d3[['Date', 'PRCP', 'SNOW', 'TMAX', 'TMIN']]                      

# Building and setting index to year, and month.
weather_d4['Date'] =weather_d4.Date.dt.to_period('M')                                   
weather_data = weather_d4.set_index('Date').sort_index()                                 

# Changed corrupted values to Not a Number(NaN)
weather_data[weather_data == -9999] = np.nan                                   

# Changed names for columns
col = ['Avg_Prcp', 'Avg_Snow', 'Avg_Tmax', 'Avg_Tmin']                          
weather_data.columns = col

# Getting weather averages
wth_data_avg = weather_data.groupby(weather_data.index).mean()                 

# Change index type to string object for later merging of dataframes
wth_data_avg.index = wth_data_avg.index.map(str).rename('Year_Month') 

# join new housing and weather dataframes
hs_wth_data = hs_data.join(wth_data_avg)

# changing index to dates
hs_wth_df = hs_wth_data.reset_index(level=['Year_Month'])

# Drop 'Id' column, not needed
data = hs_wth_df.drop('Id', 1)

Split data, build values for not a number values(NaN)

In [160]:
#data.isnull().sum().sort_values(ascending=False).head(30) <---> Function to see values
# Remove columns/features with more than 1000 NaN values
data = data.dropna(axis=1, thresh=1000)

#data_object_sum = data.select_dtypes(include=['object']).isnull().sum() <---> Function to see non-numerical NaN values
# Object/Categorical data NaN values droped. Keep all NaN values for float and int.
data = data.dropna(axis=0, subset=['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual',
                                         'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 
                                         'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType'])
# Value drop
X, y = data.drop(['Year_Month', 'SalePrice'], axis=1), data.SalePrice

# Data split
X_trainp, X_testp, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 5)

print(X_trainp.shape)
print(X_testp.shape)
print(y_train.shape)
print(y_test.shape)

(819, 78)
(351, 78)
(819,)
(351,)


Fill not a number(NaN) with values for X_train

In [161]:
# Fill float columns' NaN values with means
X_trainp[['LotFrontage', 'MasVnrArea','GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']] = X_trainp[
    ['LotFrontage', 'MasVnrArea','GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']].fillna(
    X_trainp[['LotFrontage', 'MasVnrArea','GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']].mean())

# Fill int columns' NaN values with mode, most frequent
X_trainp[['GarageYrBlt','BsmtFullBath','BsmtHalfBath', 'GarageCars']] = X_trainp[
    ['GarageYrBlt','BsmtFullBath','BsmtHalfBath', 'GarageCars']].fillna(
    X_trainp[['GarageYrBlt','BsmtFullBath','BsmtHalfBath', 'GarageCars']].mode().iloc[0])


# build dummies/1-hot encode for two or more categorical values for each column/feature.
# for more than binary outcome values, we use n(total number of possiable values/outcomes)-1 = new n (number of variables
# to capture all the information about the specify feature)
X_train = pd.get_dummies(X_trainp, columns=['MSZoning', 'Street','LotShape','LandContour','Utilities','LotConfig'
            ,'LandSlope', 'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl'
            ,'Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond'
            ,'BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual'
            ,'Functional','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleType'
            ,'SaleCondition'], drop_first=True)
X_train = X_train.drop(['LandSlope_Sev', 'Condition2_PosN', 'RoofStyle_Mansard', 'RoofStyle_Shed', 'RoofMatl_Tar&Grv'
             , 'RoofMatl_WdShake', 'RoofMatl_WdShngl', 'Exterior1st_BrkComm', 'Foundation_Wood', 'HeatingQC_Po'
             , 'Functional_Maj2', 'GarageQual_Po', 'SaleType_ConLI', 'SaleType_ConLw'], axis=1)
X_train.shape

(819, 193)

Fill not a number(NaN) with values for X_test

In [162]:
#X_testp
# Fill float columns' NaN values with means
X_testp[['LotFrontage', 'MasVnrArea','GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']] = X_testp[
    ['LotFrontage', 'MasVnrArea','GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']].fillna(
    X_trainp[['LotFrontage', 'MasVnrArea','GarageArea', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1']].mean())

# Fill int columns' NaN values with mode, most frequent
X_testp[['GarageYrBlt','BsmtFullBath','BsmtHalfBath', 'GarageCars']] = X_testp[
    ['GarageYrBlt','BsmtFullBath','BsmtHalfBath', 'GarageCars']].fillna(
    X_trainp[['GarageYrBlt','BsmtFullBath','BsmtHalfBath', 'GarageCars']].mode().iloc[0])


# build dummies/1-hot encode for two or more categorical values for each column/feature.
# for more than binary outcome values, we use n(total number of possiable values/outcomes)-1 = new n (number of variables
# to capture all the information about the specify feature)
X_test = pd.get_dummies(X_testp, columns=['MSZoning', 'Street','LotShape','LandContour','Utilities','LotConfig'
            ,'LandSlope', 'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl'
            ,'Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond'
            ,'BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual'
            ,'Functional','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleType'
            ,'SaleCondition'], drop_first = True)
X_test.shape

(351, 193)

In [223]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(819, 193)
(351, 193)
(819,)
(351,)


In [228]:
# It averages to one overall mean, from the mean prediction of each individual tree
# to improve the predictive accuracy and controls overfitting from single decision trees
regr = RandomForestRegressor()

# Range of different parameters, try to find the bests between these.
param_grid = {'n_estimators': [100, 500, 700],
             'max_features' : [10, 20, 30],
             'max_depth'    : [10, 20]}

# Search for previous specify parameters, within the given estimator RandomForestRegressor()
regr_grid = GridSearchCV(regr, param_grid)

# Build a forest of trees from the training set (X_train, y_train)
regr_grid.fit(X_train, y_train)

# regr_grid.best_params_ --> Function to see best parameters
# Take best found parameters and predict values
y_estimate = regr_grid.best_estimator_.predict(X_test)

In [229]:
regr_grid.best_params_

{'max_depth': 20, 'max_features': 30, 'n_estimators': 500}

In [230]:
y_estimate[:20]

array([ 164585.4678521 ,  188002.81741133,  182624.74546677,
        178485.82419633,  162288.56653307,  198353.56293129,
        161993.84030478,  194546.13003704,  198531.3039898 ,
        156653.8145522 ,  186276.6248501 ,  202986.02484887,
        156829.02573184,  169101.494552  ,  182764.43816862,
        174228.87302294,  173504.32630467,  182377.48749162,
        178625.72476595,  197866.90376442])

Validate performance:

In [231]:
np.mean((y_train - regr_grid.predict(X_train)) ** 2)

5005750.240254967

In [232]:
np.mean((y_test - y_estimate) ** 2)

58598248.44668831