# Task for Today  

***

## London House Sales Prediction  

Given *data about houses in London*, let's try to predict how many **houses will be sold** in a given month and area.

We will use a variety of regression models to make our predictions.

# Getting Started

In [14]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [15]:
data = pd.read_csv('housing_in_london_monthly_variables.csv')

In [16]:
data

Unnamed: 0,date,area,average_price,code,houses_sold,no_of_crimes,borough_flag
0,1995-01-01,city of london,91449,E09000001,17.0,,1
1,1995-02-01,city of london,82203,E09000001,7.0,,1
2,1995-03-01,city of london,79121,E09000001,14.0,,1
3,1995-04-01,city of london,77101,E09000001,7.0,,1
4,1995-05-01,city of london,84409,E09000001,10.0,,1
...,...,...,...,...,...,...,...
13544,2019-09-01,england,249942,E92000001,64605.0,,0
13545,2019-10-01,england,249376,E92000001,68677.0,,0
13546,2019-11-01,england,248515,E92000001,67814.0,,0
13547,2019-12-01,england,250410,E92000001,,,0


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13549 entries, 0 to 13548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           13549 non-null  object 
 1   area           13549 non-null  object 
 2   average_price  13549 non-null  int64  
 3   code           13549 non-null  object 
 4   houses_sold    13455 non-null  float64
 5   no_of_crimes   7439 non-null   float64
 6   borough_flag   13549 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 741.1+ KB


# Preprocessing

In [18]:
def preprocess_inputs(df):
    df = df.copy()

    # Drop redundant columns
    df = df.drop('code', axis=1)

    # Drop columns with too many missing values
    df = df.drop('no_of_crimes', axis=1)

    # Drop rows with missing target values
    missing_target_rows = df[df['houses_sold'].isna()].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)

    # Extract date features
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].apply(lambda x: x.year)
    df['month'] = df['date'].apply(lambda x: x.month)
    df = df.drop('date', axis=1)

    # One-hot encode the area column
    area_dummies = pd.get_dummies(df['area'], prefix='area')
    df = pd.concat([df, area_dummies], axis=1)
    df = df.drop('area', axis=1)

    # Split df into X and y
    y = df['average_price']
    X = df.drop('average_price', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [19]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [20]:
X_train

Unnamed: 0,houses_sold,borough_flag,year,month,area_barking and dagenham,area_barnet,area_bexley,area_brent,area_bromley,area_camden,...,area_south east,area_south west,area_southwark,area_sutton,area_tower hamlets,area_waltham forest,area_wandsworth,area_west midlands,area_westminster,area_yorks and the humber
10752,-0.005908,-1.657173,1.542734,1.315615,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
10236,0.222400,-1.657173,-0.963307,1.024950,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
4512,-0.298115,0.603437,-1.380980,-1.300369,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
9208,-0.293663,0.603437,0.985836,1.024950,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,6.754536,-0.152846,-0.152117,-0.152482,-0.151385
2672,-0.303063,0.603437,1.542734,-0.428374,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,-0.298033,0.603437,-1.659429,0.734285,-0.145406,-0.150649,6.720615,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
5192,-0.281790,0.603437,-0.545633,1.606280,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
12172,0.323815,-1.657173,0.707387,0.443620,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
235,-0.318976,0.603437,0.985836,0.443620,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385


In [21]:
y_train

10752    128885
10236    137190
4512      73211
9208     342013
2672     473140
          ...  
905       64510
5192     203931
12172    196204
235      761544
13349    173811
Name: average_price, Length: 9418, dtype: int64

# Training

In [22]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 9418, number of used features: 49
[LightGBM] [Info] Start training from score 261386.268741
                              LightGBM trained.
               

# Results

In [23]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred)**2))
    print(name + " RMSE: {:.4f}".format(rmse))

                     Linear Regression RMSE: 72260.3762
 Linear Regression (L2 Regularization) RMSE: 72254.9678
 Linear Regression (L1 Regularization) RMSE: 72255.2946
                   K-Nearest Neighbors RMSE: 17400.8290
                        Neural Network RMSE: 283461.5660
Support Vector Machine (Linear Kernel) RMSE: 315162.8536
   Support Vector Machine (RBF Kernel) RMSE: 190058.0919
                         Decision Tree RMSE: 15635.7151
                         Random Forest RMSE: 11770.5720
                     Gradient Boosting RMSE: 39577.2622
                               XGBoost RMSE: 13471.4899
                              LightGBM RMSE: 12839.6276
                              CatBoost RMSE: 11595.8185


In [24]:
for name, model in models.items():
    print(name + " R^2: {:.4f}".format(model.score(X_test, y_test)))

                     Linear Regression R^2: 0.8482
 Linear Regression (L2 Regularization) R^2: 0.8482
 Linear Regression (L1 Regularization) R^2: 0.8482
                   K-Nearest Neighbors R^2: 0.9912
                        Neural Network R^2: -1.3365
Support Vector Machine (Linear Kernel) R^2: -1.8883
   Support Vector Machine (RBF Kernel) R^2: -0.0504
                         Decision Tree R^2: 0.9929
                         Random Forest R^2: 0.9960
                     Gradient Boosting R^2: 0.9545
                               XGBoost R^2: 0.9947
                              LightGBM R^2: 0.9952
                              CatBoost R^2: 0.9961


# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/iPJ0ZgO04m4