In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading London housing prices data

In [2]:
# Monthly London housing variables. The columns used later in this notebook
# are 'date', 'area' and 'average_price'.
houses = pd.read_csv('/kaggle/input/housing-in-london/housing_in_london_monthly_variables.csv')

## Loading London crime data
The London crime data is only accessible as a BigQuery dataset, so it is a bit more difficult to query.

To merge it with the housing data, we sum the number of recorded crimes per year, month and borough. Areas outside of London are filtered out later, when the result is merged with the London housing dataset.

In [3]:
import bq_helper
from bq_helper import BigQueryHelper

# Helper bound to the public `london_crime` BigQuery dataset.
crime_bq = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="london_crime",
)

In [4]:
# Get the number of crimes per year, month and borough, summed into three
# groups of `major_category` values:
#   material: Theft and Handling, Burglary, Robbery, Fraud or Forgery,
#             Criminal Damage, Other Notifiable Offences
#   drug:     Drugs
#   personal: Sexual Offences, Violence Against the Person
#
# The category names are compared case-sensitively, so they must be spelled
# exactly as stored in the table ("Burglary", not "burglary"); otherwise the
# category silently contributes 0 to its group.
crimesPerYearPerBoroughQuery = """
  SELECT
    year,
    month,
    borough,
    SUM(IF(major_category IN ('Sexual Offences', 'Violence Against the Person'), value, 0)) AS no_personal_offenses,
    SUM(IF(major_category = 'Drugs', value, 0)) AS no_drug_offenses,
    SUM(IF(major_category IN ('Theft and Handling', 'Burglary', 'Robbery', 'Fraud or Forgery', 'Criminal Damage', 'Other Notifiable Offences'), value, 0)) AS no_material_offenses
  FROM
    `bigquery-public-data.london_crime.crime_by_lsoa`
  GROUP BY
    year,
    month,
    borough
  ORDER BY
    year
;
        """
crimesPerYearPerBorough = crime_bq.query_to_pandas_safe(crimesPerYearPerBoroughQuery)

# NOTE(review): query_to_pandas_safe is expected to return None instead of a
# frame when the query would scan more than its size limit - fail with a clear
# message rather than with a TypeError on the slice below.
if crimesPerYearPerBorough is None:
    raise RuntimeError("Crime query was not executed (scan limit exceeded?); no data returned")

crimesPerYearPerBorough[100:200].head()

## Let's have a look at the data

First, let's see which boroughs appear in both datasets, to work out how to merge them.

In [5]:
# Distinct borough names present in the crime data.
crimesPerYearPerBorough['borough'].unique()

In [6]:
# Distinct area names present in the housing data.
houses['area'].unique()

As we can see, the houses dataset also contains areas surrounding London and writes all area names in lowercase. Let's check whether merging works as expected after applying some transformations:

In [7]:
# Derive integer 'year' and 'month' columns from the 'date' string so that the
# housing rows can be joined with the monthly crime counts.
# The first dash-separated field is taken as the year and the second as the
# month (assumes dates look like YYYY-MM-DD - TODO confirm against the CSV).
# `n` is passed by keyword: it is keyword-only in pandas 2.0, so the previous
# positional form `split('-', 1, ...)` raises a TypeError there.
date_parts = houses['date'].str.split('-', n=2, expand=True)
houses['year'] = date_parts[0].astype(int)
houses['month'] = date_parts[1].astype(int)
houses.info()

In [8]:
# Lower-case the borough names so they can be matched against houses['area'].
crimesPerYearPerBorough['borough'] = crimesPerYearPerBorough['borough'].str.lower()

In [9]:
# Sanity check: every distinct crime borough should find exactly one matching
# housing area once the names are lower-cased.
crime_boroughs = crimesPerYearPerBorough['borough'].unique()
expectedMergeCount = len(crime_boroughs)

crimesTestFrame = pd.DataFrame(crime_boroughs)
crimesTestFrame[0] = crimesTestFrame[0].str.lower()

# Both single-column frames share the column label 0, which merge() joins on.
house_areas_frame = pd.DataFrame(houses['area'].unique())
matched_boroughs = crimesTestFrame.merge(house_areas_frame)

print(f"Expected: {expectedMergeCount}")
print(f"Actual: {len(matched_boroughs)}")

Nice, it works! Let's merge the frames and look at the columns.

In [10]:
# Join monthly crime counts with the monthly housing variables on
# (year, month, borough/area). merge() defaults to an inner join, so housing
# areas without crime data (and vice versa) are dropped.
df = crimesPerYearPerBorough.merge(
    houses,
    left_on=['year', 'month', 'borough'],
    right_on=['year', 'month', 'area'],
)
# Drop rows in which every column is missing (assignment instead of
# inplace=True keeps the cell safe to re-run).
df = df.dropna(axis=0, how="all")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that adds a stray "None" line).
df.info()
df[100:200].head()

### Split into test and training dataset

In [11]:
# Define the columns we expect as input and output
# Inputs: the three crime-count columns produced by the BigQuery aggregation.
X_cols = ['no_personal_offenses', 'no_drug_offenses', 'no_material_offenses']
# Output: the 'average_price' column of the merged frame.
y_cols = ['average_price']

In [12]:

# Keep only the feature and target columns, without rows that miss any of them.
ml_df = df[X_cols + y_cols].dropna()
# ml_df = ml_df[(ml_df != 0).all(1)]

X = ml_df[X_cols]
# Single target column: flatten the (n, 1) frame into a 1-D array.
y = ml_df[y_cols].values.ravel()

Now we want to randomly split the data into training, validation and test sets.

In [13]:
from sklearn.model_selection import train_test_split

# train_test_split shuffles the data and then splits it according to test_size.
# As we do not have much data, the test size is reduced a bit to 20%; how this
# affects training might be looked at later.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Split 25% of the remaining 80% off as a validation set, which gives a
# 60/20/20 train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [14]:
from matplotlib import pyplot as plt

# Summary statistics of the features in each split. All three are printed
# explicitly: a bare expression in the middle of a cell is not displayed.
print(X_train.describe())
print(X_val.describe())
print(X_test.describe())

# One box plot panel per feature, comparing its distribution across the splits.
# The axes are bound to `ax` (not `plt`) so the pyplot module is not shadowed.
fig, axs = plt.subplots(1, 3, figsize=(12, 3.5), dpi=160)
panel_titles = ['Personal Offenses', 'Drug Offenses', 'Material Offenses']

for ax, col, title in zip(axs, X_cols, panel_titles):
    # Box order must match the tick labels below: train, test, validation.
    ax.boxplot([X_train[col], X_test[col], X_val[col]])
    ax.set_title(title)
    ax.set_xticks([1, 2, 3], ['Train', 'Test', 'Validation'])

# 2 — Training ML Models

In [15]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest with the library's default hyperparameters.
# NOTE(review): no random_state is passed, so results may vary between runs.
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

In [16]:
# Score of the baseline forest on the validation split.
rfr.score(X_val, y_val)

That's already better than a constant output would be, but let's see if we can optimize it.

In [17]:
# Sweep the number of trees (10, 20, ..., 190) and report the validation score
# of each forest.
for n_trees in range(10, 200, 10):
    rfr = RandomForestRegressor(n_estimators=n_trees)
    rfr.fit(X_train, y_train)
    val_score = rfr.score(X_val, y_val)
    print(f"{n_trees} trees: {val_score}")

Looks like 80 trees seems to be a sweet spot, but all pretty similar

In [None]:
# Small grid search over min_samples_leaf, min_samples_split and the number of
# trees, reporting the validation score of every combination.
for min_leaf in range(1,10,2):
    for min_split in range(2,20,2):
        for i in range(50,130,20):
            # `i` is the tree count that gets printed, so it must also be the
            # one the forest is built with (it was hard-coded to 100 before,
            # training the same configuration four times under wrong labels).
            rfr = RandomForestRegressor(n_estimators=i, min_samples_split=min_split, min_samples_leaf=min_leaf)
            rfr.fit(X_train, y_train)
            print(f"{min_leaf} -- {min_split} -- {i} trees: {rfr.score(X_val, y_val)}")

It also looks like a higher min_samples_split improves performance a bit

In [18]:
# Forest with hand-picked hyperparameters, used for the evaluation below.
# NOTE(review): min_samples_leaf=10 is outside the values tried in the search
# cell above (range(1,10,2) stops at 9) - confirm this choice is intended.
rfr = RandomForestRegressor(n_estimators=100, min_samples_split=15, min_samples_leaf=10)
rfr.fit(X_train, y_train)

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate(model, X_t = X_test, y_t = y_test):
    """Print and return a model's score and mean absolute error.

    Parameters
    ----------
    model
        Fitted estimator providing ``predict(X)`` and ``score(X, y)``.
    X_t
        Features to evaluate on. The default is the ``X_test`` object that
        existed when this function was defined; rebinding ``X_test`` later
        does not change the default.
    y_t
        True target values for ``X_t``. The default is bound the same way
        as ``X_t``.

    Returns
    -------
    tuple
        ``(score, mean_absolute_error)``, where ``score`` is whatever
        ``model.score(X_t, y_t)`` returns.
    """
    predictions = model.predict(X_t)
    house_price_e = mean_absolute_error(y_t, predictions)
    # Score once and reuse it for both the printout and the return value.
    score = model.score(X_t, y_t)

    print('Score')
    print(score)
    print('Mean Abs Error')
    print(house_price_e)
    return score, house_price_e

In [20]:
# Evaluate the tuned forest on the test split (evaluate's defaults), then show
# the distribution of the true test prices for comparison with the error.
evaluate(rfr)
plt.boxplot(y_test)

Well that is a high error...

In [21]:
# Linear Regression model
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the same training data for comparison.
lrm = LinearRegression()
lrm.fit(X_train, y_train)

In [22]:
# Score of the linear model on the test split (the forest's first score above
# was taken on the validation split instead).
lrm.score(X_test, y_test)

In [23]:
# Score and mean absolute error of the linear model on the default test split.
evaluate(lrm)

In [24]:
from matplotlib import pyplot as plt
import pandas as pd

# Signed prediction error (prediction - truth) of the chosen model on the test
# split, plotted against each crime-count feature.
model = rfr
y_pred = model.predict(X_test)
y_error = y_pred - y_test

# Work on a copy so the test features themselves stay untouched.
X_all = X_test.copy(deep=True)
X_all['error'] = y_error

# The axes are bound to `ax` (not `plt`) so the pyplot module is not shadowed.
fig, axs = plt.subplots(1, 3, figsize=(12, 3.5), dpi=160)
panel_titles = ['Personal Offenses', 'Drug Offenses', 'Material Offenses']

for ax, col, title in zip(axs, X_cols, panel_titles):
    ax.scatter(X_all[col], X_all['error'])
    ax.set_title(title)
    ax.set_ylabel('Error')
    ax.set_xlabel('Offenses')

# Show a preview only instead of dumping the whole frame.
X_all.head()

In [25]:
import matplotlib.pyplot as plt
# Calculate the linear model's mean absolute error per area on the test split.
ml_df = df[X_cols + y_cols + ['area']]
ml_df = ml_df.dropna()
X, y = ml_df[X_cols + ['area']], ml_df[y_cols]
# Same test_size and random_state as the original split, so the same test rows
# are selected as long as dropna() kept the same rows as before.
_, X_test2, _, y_test2 = train_test_split(X, y, test_size=0.2, random_state=1)

areas = {}
for area in X_test2['area'].unique():
    # Positions of this area's rows inside the test split.
    indices = np.where(X_test2['area'] == area)
    _, areas[area] = evaluate(lrm, X_test2[X_cols].values[indices], y_test2.values[indices])

# (area, error) pairs sorted ascending by error (the dict value, not the key).
lists = sorted(areas.items(), key=lambda item: item[1])
# `x` keeps its name because the following cell reuses this area order. The
# errors get their own name so that the target `y` is not overwritten.
x, lrm_area_errors = zip(*lists)
print(x, lrm_area_errors)

plt.tick_params(bottom=False)
plt.plot(x, lrm_area_errors, 'bo')
plt.xticks(rotation=90)  # area names overlap when drawn horizontally
plt.xlabel('Area')
plt.ylabel('Mean absolute error')
plt.show()

In [26]:
import matplotlib.pyplot as plt
# Per-area mean absolute error of the random forest on the test split.
ml_df = df[X_cols + y_cols + ['area']].dropna()
X = ml_df[X_cols + ['area']]
y = ml_df[y_cols]
# Re-split with the same test_size and random_state as before.
_, X_test2, _, y_test2 = train_test_split(X, y, test_size=0.2, random_state=1)

areas = {}
for area in X_test2['area'].unique():
    # Positions of this area's rows inside the test split.
    indices = np.where(X_test2['area'] == area)
    area_features = X_test2[X_cols].values[indices]
    area_targets = y_test2.values[indices]
    _, areas[area] = evaluate(rfr, area_features, area_targets)

plt.tick_params(bottom=False)
# Keep the area order `x` from the previous cell so that both per-area plots
# share the same x axis order.
lists = {k: areas[k] for k in x}
x2, y2 = zip(*lists.items())
print(x2, y2)
plt.plot(x2, y2, 'bo')
plt.show()

In [27]:
# Shape of the 'area' column after reshaping it to a single-column 2-D array,
# the form passed to the OneHotEncoder in the next section.
X['area'].values.reshape((-1,1)).shape

## Training only on area

In [28]:
from sklearn.preprocessing import OneHotEncoder

# Baseline that ignores crime entirely: predict the price from the one-hot
# encoded area alone.
ml_df = df[y_cols + ['area']]
ml_df = ml_df.dropna()
# Flatten the single target column to 1-D; passing an (n, 1) frame to fit()
# makes scikit-learn emit a column-vector conversion warning.
X, y = ml_df[['area']], ml_df[y_cols].values.ravel()

ohe = OneHotEncoder()
X = ohe.fit_transform(X['area'].values.reshape((-1,1)))
# Same test_size and random_state as the original split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=1)
# Discard a validation share as well, so the training set has the same size as
# for the crime-based models.
X_train2, _, y_train2, _ = train_test_split(X_train2, y_train2, test_size=0.25, random_state=1)

rfr2 = RandomForestRegressor()
rfr2.fit(X_train2, y_train2)
evaluate(rfr2, X_test2, y_test2)

## Leave area out

In [30]:
# Change model / boroughs for different results:
from sklearn.preprocessing import OneHotEncoder

ml_df = df[X_cols + y_cols + ['area']]
ml_df = ml_df.dropna()

X, y = ml_df[X_cols + ['area']], ml_df[y_cols]

# Single target column: flatten the (n, 1) frame into a 1-D array.
y = y.values.ravel()

# Two areas randomly selected from the ones which had no outlying error.
# They are held out completely and used as the test set.
holdout_mask = X['area'].isin(['tower hamlets', 'islington']).values
split_indices = np.where(holdout_mask)
split_indices_anti = np.where(~holdout_mask)

X = X[X_cols]

# Train on every other area and test on the held-out ones. The test rows must
# be selected with `split_indices`; the stale `indices` variable left over
# from an earlier cell's loop refers to rows of a different array.
X_train2, y_train2 = X.values[split_indices_anti], y[split_indices_anti]
X_test2, y_test2 = X.values[split_indices], y[split_indices]
# Discard a validation share so the training procedure matches the other models.
X_train2, _, y_train2, _ = train_test_split(X_train2, y_train2, test_size=0.25, random_state=1)

rfr2 = RandomForestRegressor()
rfr2.fit(X_train2, y_train2)
print(evaluate(rfr2, X_test2, y_test2))

lrm2 = LinearRegression()
lrm2.fit(X_train2, y_train2)
evaluate(lrm2, X_test2, y_test2)