Feature Engineering

In [1]:
# Loading original data
import pandas as pd
data = pd.read_csv('data.xls')
df = pd.DataFrame(data)

In [2]:
# creating two new features consudering current year is 2024 see ref1 of documentation report
df['age_of_house'] = 2024 - df.yr_built
df['years_from_renovation'] = 2024 - df.yr_renovated

In [3]:
# now preprocessing again like day 2 see ref2 of documentation report
num_features = ['bedrooms' ,'bathrooms' ,'sqft_living' , 'sqft_lot' ,'sqft_above' , 'sqft_basement' , 'age_of_house','years_from_renovation']
cat_features = ['yr_built','yr_renovated','city','floors' ,'waterfront' , 'view' , 'condition']

In [4]:
# Normalizing and Encoding
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df[num_features] = scaler.fit_transform(df[num_features])

In [5]:
# Encoding
one_hot_encoded = pd.get_dummies(df , columns = cat_features)

In [9]:
#dropping unnecessary columns see ref3 of documentation report
final_df = one_hot_encoded.drop(['date' , 'country','statezip','street'],axis = 1)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,age_of_house,years_from_renovation,yr_built_1900,...,view_0,view_1,view_2,view_3,view_4,condition_1,condition_2,condition_3,condition_4,condition_5
0,3.130000e+05,-0.441122,-0.843204,-0.829971,-0.193434,-0.565224,-0.672464,0.531014,-1.221670,False,...,True,False,False,False,False,False,False,True,False,False
1,2.384000e+06,1.759705,0.432802,1.568528,-0.161718,1.789559,-0.069128,1.674693,0.825693,False,...,False,False,False,False,True,False,False,False,False,True
2,3.420000e+05,-0.441122,-0.205201,-0.217367,-0.080978,0.119171,-0.672464,0.161000,0.825693,False,...,True,False,False,False,False,False,False,False,True,False
3,4.200000e+05,-0.441122,0.113800,-0.144686,-0.190145,-0.959621,1.482306,0.261913,0.825693,False,...,True,False,False,False,False,False,False,False,True,False
4,5.500000e+05,0.659291,0.432802,-0.206984,-0.121306,-0.797222,1.051352,-0.175376,-1.208396,False,...,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,3.081667e+05,-0.441122,-0.524202,-0.653458,-0.236689,-0.368025,-0.672464,0.564651,-1.195121,False,...,True,False,False,False,False,False,False,False,True,False
4596,5.343333e+05,-0.441122,0.432802,-0.705374,-0.202882,-0.426025,-0.672464,-0.410840,-1.225755,False,...,True,False,False,False,False,False,False,True,False,False
4597,4.169042e+05,-0.441122,0.432802,0.904009,-0.218462,1.371962,-0.672464,-1.285418,0.825693,False,...,True,False,False,False,False,False,False,True,False,False
4598,2.034000e+05,0.659291,-0.205201,-0.051238,-0.229164,-0.878421,1.525401,-0.108101,0.825693,False,...,True,False,False,False,False,False,False,True,False,False


In [7]:
#splitting into training and testing
from sklearn.model_selection import train_test_split

y = final_df['price'] # target variable
X = final_df.drop('price',axis=1) # features

X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

Model training

In [8]:
#linear regression model
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train , y_train)

In [9]:
# decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train,y_train)

In [10]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor()
rf_model.fit(X_train,y_train)

In [11]:
# predictions
y_predict_lr = lr_model.predict(X_test)
y_predict_dt = dt_model.predict(X_test)
y_predict_rf = rf_model.predict(X_test)

In [12]:
# Evaluation
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score

# lr_model Evaluation
lr_mse = mean_squared_error(y_test,y_predict_lr)
lr_mae = mean_absolute_error(y_test,y_predict_lr)
lr_r2 = r2_score(y_test,y_predict_lr)

# dt_model Evaluation
dt_mse = mean_squared_error(y_test,y_predict_dt)
dt_mae = mean_absolute_error(y_test,y_predict_dt)
dt_r2 = r2_score(y_test,y_predict_dt)

# rf_model Evaluation
rf_mse = mean_squared_error(y_test,y_predict_rf)
rf_mae = mean_absolute_error(y_test,y_predict_rf)
rf_r2 = r2_score(y_test,y_predict_rf)

In [13]:
print('Linear regression Model Accuracy :')
print('MSE : ',lr_mse)
print('MAE : ',lr_mae)
print('R2 score : ',lr_r2)
print()
print('Decision Tree Model Accuracy :')
print('MSE : ',dt_mse)
print('MAE : ',dt_mae)
print('R2 score : ',dt_r2)
print()
print('Random Forest Model Accuracy :')
print('MSE : ',rf_mse)
print('MAE : ',rf_mae)
print('R2 score : ',rf_r2)

Linear regression Model Accuracy :
MSE :  3.1286236993463385e+29
MAE :  30893476072066.266
R2 score :  -3.067740282403312e+17

Decision Tree Model Accuracy :
MSE :  1011680679600.8002
MAE :  205828.27800113044
R2 score :  0.00800675568974274

Random Forest Model Accuracy :
MSE :  973095321903.5906
MAE :  166793.07586365216
R2 score :  0.045841237395996215


In [14]:
# more details ref4 of documentation report

Model Tuning

In [29]:
# I will be using grid search for all of the models
from sklearn.model_selection import GridSearchCV
import numpy as np

In [30]:
# dictionary for parameters to be tuned
rf_para = { 'max_depth' : [ 10, 20 , 30, 40] ,
           'min_samples_split' : [ 2, 5,10],
           'n_estimators':np.arange(5,50,5) }

In [31]:
grid_search = GridSearchCV(estimator=rf_model,param_grid = rf_para,cv=5,scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

In [33]:
# Fiting Random Forest model
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [34]:
grid_search.best_params_

{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 45}

In [35]:
grid_search.best_score_

-59549911807.139786

In [36]:
y_predict_gs = grid_search.predict(X_test)

In [37]:
gs_tuned_rf_mse = mean_squared_error(y_test,y_predict_gs)
gs_tuned_rf_mae = mean_absolute_error(y_test,y_predict_gs)
gs_tunned_rf_r2 = r2_score(y_test,y_predict_gs)

In [39]:
print('Grid Search RF tuned Model Accuracy :')
print('MSE : ',gs_tuned_rf_mse)
print('MAE : ',gs_tuned_rf_mae)
print('R2 score : ',gs_tunned_rf_r2)

Random Forest Model Accuracy :
MSE :  975022605714.2084
MAE :  167810.13273239468
R2 score :  0.04395145877458795


In [10]:
final_df.to_csv('cleaned_data.csv',index=False)