In [47]:
import xlrd
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [13]:
df = pd.read_excel('DS - Assignment Part 1 data set.xlsx')

In [14]:
df.drop(columns=['latitude','longitude'],inplace=True)

In [15]:
def read_date(date):
    return xlrd.xldate.xldate_as_datetime(date, 0)

In [16]:
df['Transaction date'] = pd.to_datetime(df['Transaction date'].apply(read_date)).dt.date

In [17]:
df.head(2)

Unnamed: 0,Transaction date,House Age,Distance from nearest Metro station (km),Number of convenience stores,Number of bedrooms,House size (sqft),House price of unit area
0,1905-07-04,32.0,84.87882,10,1,575,37.9
1,1905-07-04,19.5,306.5947,9,2,1240,42.2


In [22]:
df.corr(numeric_only=True)[['House price of unit area']]

Unnamed: 0,House price of unit area
House Age,-0.210567
Distance from nearest Metro station (km),-0.673613
Number of convenience stores,0.571005
Number of bedrooms,0.050265
House size (sqft),0.046489
House price of unit area,1.0


- Distance from nearest Metro station has a strong negative correlation with the per unit price of a house, so when the distance from the nearest station increases the price decreases as it is more inconvenient for people to travel to their jobs or other places.
- As the # of convenience stores increase the per unit price of the house also increases, as people can get their groceries and other needs quickly with ease. Hence people are ready to pay more for the convenience.
- As house age increases per unit price decreases as older houses might need more work done upon before we can move, they'll also be required to be redeveloped earlier than newer houses, increasing the expenses of the owners.
- The house size and # of bedrooms aren't highly correlated with per unit price, but affect the overall price of it.

In [31]:
temp = df.drop(columns='Transaction date')
vif_data = pd.DataFrame()
vif_data["feature"] = temp.columns
vif_data["VIF"] = [variance_inflation_factor(temp.values, i) for i in range(len(temp.columns))]

In [32]:
vif_data

Unnamed: 0,feature,VIF
0,House Age,3.041041
1,Distance from nearest Metro station (km),2.115252
2,Number of convenience stores,5.100274
3,Number of bedrooms,15.843422
4,House size (sqft),18.427444
5,House price of unit area,8.546484


- The house size and the number of bedrooms would be highly correlated, hence the spike in VIF values

In [33]:
df.drop(columns='Number of bedrooms',inplace=True)

In [35]:
df.columns

Index(['Transaction date', 'House Age',
       'Distance from nearest Metro station (km)',
       'Number of convenience stores', 'House size (sqft)',
       'House price of unit area'],
      dtype='object')

In [40]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Transaction date','House price of unit area']),df['House price of unit area'],test_size=0.25)

# Predicting house prices

## Linear regression

In [77]:
model = LinearRegression()
model.fit(x_train,y_train)

In [78]:
pred = model.predict(x_test)

In [79]:
mean_squared_error(y_test,pred)**(0.5)

8.985855849886558

In [80]:
pred_train = model.predict(x_train)

In [81]:
mean_squared_error(y_train,pred_train)**0.5

9.291108063769983

## Lasso

In [82]:
model = Lasso()
model.fit(x_train,y_train)

In [83]:
pred = model.predict(x_test)

In [84]:
mean_squared_error(y_test,pred)**0.5

9.049606497286273

In [85]:
pred_train = model.predict(x_train)

In [86]:
mean_squared_error(y_train,pred_train)**0.5

9.301550825392804

## Ridge

In [87]:
model = Ridge()
model.fit(x_train,y_train)

In [88]:
pred = model.predict(x_test)

In [89]:
mean_squared_error(y_test,pred)**0.5

8.986043179180653

In [90]:
pred_train = model.predict(x_train)

In [91]:
mean_squared_error(y_train,pred_train)**0.5

9.291108206059334

## Random Forest

In [60]:
model = RandomForestRegressor(n_jobs=-1,n_estimators=50,max_depth=)
model.fit(x_train,y_train)

In [76]:
list(df.columns[1:][:-1])

['House Age',
 'Distance from nearest Metro station (km)',
 'Number of convenience stores',
 'House size (sqft)']

In [71]:
model.feature_importances_

array([0.21234856, 0.66216962, 0.04595209, 0.07952973])

In [61]:
pred = model.predict(x_test)

In [62]:
mean_squared_error(y_test,pred)**0.5

6.787831723017298

In [63]:
pred_train = model.predict(x_train)

In [64]:
mean_squared_error(y_train,pred_train)**0.5

3.5072336953143566

- The data is not very complex to use ensemble models, hence using a Random Forest leads to overfitting.
- We don't have many features hence using Lasso won't be a good option as it might remove some features from the model.
- Hence we can go with a simple linear model or Ridge regression.