In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

Read in the data:

In [67]:
DATA_RAW = pd.read_csv('../source data/new data/CP ETL output.csv')
DATA_ENG_WAL = DATA_RAW[(DATA_RAW['ONS_Census_2011'].str[0] == 'E') | (DATA_RAW['ONS_Census_2011'].str[0] == 'W')]
data_train = DATA_ENG_WAL[DATA_ENG_WAL['LA_Population'].notnull()]
#data_eng_nonan = data_eng_nonan[data_eng_nonan['Modifed_Total'].notnull()]

#kick out the outlier: Milton Keynes
data_train = data_train[data_train['ONS_Census_2011'] != 'E06000042']

The simplest, dumbest approximation: calculate the average proportion of "Total presented persons" to "Local authority population".

In [68]:
avg_homeless = (data_train['TOTAL_Presented_Young_ppl_2014']/data_train['LA_Population']).mean()
print(avg_homeless)
data_train['Presented_normalised'] = data_train['TOTAL_Presented_Young_ppl_2014']/data_train['LA_Population']
data_train['prediction'] = avg_homeless*data_train['LA_Population']

0.00120562004139


In [69]:
data_notnull = data_train[data_train['TOTAL_Presented_Young_ppl_2014'].notnull()].reset_index()

"Predict" the number of _presented_ persons as the (national) average homeless rate times local authority population: (`notnull()` is there because we are only interested in existing data, as we want to compare our prediction with them)

In [70]:
#define a function that prints model metrics

from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error

def print_metrics(target, prediction):
    print('R^2 = {0}'.format(r2_score(target, prediction)))
    print('Mean error = {0}'.format(mean_absolute_error(target, prediction)))
    print('Median error = {0}'.format(median_absolute_error(target, prediction)))

### Cross-validation

Do the same as the above, but cross-validate the "model":

In [77]:
from sklearn import cross_validation

kf = cross_validation.KFold(len(data_notnull), shuffle=True, n_folds=5)
for i, (train_index, test_index) in enumerate(kf):
    avg_homeless = (data_notnull.ix[train_index]['TOTAL_Presented_Young_ppl_2014']/data_notnull.ix[train_index]['LA_Population']).mean()
    data_notnull.ix[test_index]['prediction'] = avg_homeless*data_notnull.ix[test_index]['LA_Population']
    
    print('Metrics in fold no {0}'.format(i+1))
    print_metrics(data_notnull.ix[test_index]['TOTAL_Presented_Young_ppl_2014'], data_notnull.ix[test_index]['prediction'])
    print()

Metrics in fold no 1
R^2 = -0.95944096524616
Mean error = 115.42844157679406
Median error = 107.50524712329886

Metrics in fold no 2
R^2 = 0.3309889479021969
Mean error = 220.0394192201971
Median error = 132.86275985662448

Metrics in fold no 3
R^2 = 0.3418677193006461
Mean error = 165.91219567801707
Median error = 96.24293974444052

Metrics in fold no 4
R^2 = 0.42637942255718086
Mean error = 130.24175642202022
Median error = 106.39860581493052

Metrics in fold no 5
R^2 = 0.21190162163395132
Mean error = 201.65492139499753
Median error = 98.9767841560721



Now, apply this **very rough** approximation to all English and Welsh districts:

In [78]:
baseline_prediction = DATA_ENG_WAL.copy()
avg_homeless = (data_train['TOTAL_Presented_Young_ppl_2014']/data_train['LA_Population']).mean()
baseline_prediction['predicted_presented'] = avg_homeless*baseline_prediction['LA_Population']

In [79]:
baseline_prediction.to_csv('baseline_prediction.csv', columns=['ONS_Census_2011', 'County_Name', 'predicted_presented'])

And the "predicted" number of homeless young people is...

In [80]:
print(baseline_prediction['predicted_presented'].sum())

67606.2433462
