## Processing house price data

Imputing missing values.

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
# Load the 'Data' sheet of the LSOA house price workbook, skipping the
# first five rows of the sheet.
df_houseprice = pd.read_excel(
    'data/houseprices_median_lsoa.xls',
    sheet_name='Data',
    skiprows=5,
)

In [3]:
# Preview the first rows of the loaded sheet.
df_houseprice.head()

Unnamed: 0,Local authority code,Local authority name,LSOA code,LSOA name,Year ending Dec 1995,Year ending Mar 1996,Year ending Jun 1996,Year ending Sep 1996,Year ending Dec 1996,Year ending Mar 1997,...,Year ending Sep 2018,Year ending Dec 2018,Year ending Mar 2019,Year ending Jun 2019,Year ending Sep 2019,Year ending Dec 2019,Year ending Mar 2020,Year ending Jun 2020,Year ending Sep 2020,Unnamed: 104
0,E06000001,Hartlepool,E01011949,Hartlepool 009A,34750,34500,30500,30000,29950,29000,...,77500,77500,73250,77500,77500,77000.0,89000,84000,88000,
1,E06000001,Hartlepool,E01011950,Hartlepool 008A,25000,25000,25300,25625,25000,24800,...,30000,28000,28000,29500,27000,27500.0,27000,26000,28000,
2,E06000001,Hartlepool,E01011951,Hartlepool 007A,27000,27000,27250,28950,28500,28950,...,49000,50000,50000,46250,42800,39997.5,40000,29425,30000,
3,E06000001,Hartlepool,E01011952,Hartlepool 002A,44500,44500,30000,26675,26000,25500,...,57000,58000,70000,70000,70000,66475.0,70000,66475,85000,
4,E06000001,Hartlepool,E01011953,Hartlepool 002B,22000,27000,27000,20600,20000,19500,...,50000,68000,66000,59000,62000,58000.0,58000,60000,:,


### Keep a single recent period (year ending Dec 2019), and assume prices are constant

In [4]:
# Keep the identifier columns and the single price column used downstream.
# An explicit copy is taken because this frame is written to in place later,
# which on a derived selection raises SettingWithCopyWarning.
df_houseprice = df_houseprice[
    ['Local authority code', 'LSOA code', 'Year ending Dec 2019']
].copy()

## Function definition for converting int to float, and imputing str to mean of float

In [6]:
def func_impute_to_float(df, col):
    """Convert ``df[col]`` to float64, imputing string entries with the mean.

    Every string in the column (e.g. a ':' placeholder) is replaced by the
    mean of the column's non-string values; all other values are cast to
    float. Existing NaN values are left as NaN and are ignored by the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[col]`` stored as float64.
    """
    values = df[col]

    # isinstance (rather than an exact type() comparison) so that only real
    # strings are treated as missing; NumPy ints/floats stored in an object
    # column count as numeric and contribute to the mean.
    is_str = np.array([isinstance(v, str) for v in values], dtype=bool)

    numeric_part = values[~is_str].astype('float64')

    # Start from an array filled with the imputation value, then overwrite the
    # numeric positions. The column is assigned in a single step instead of via
    # chained indexing (df[col][mask] = ...), which does not reliably write
    # back to df.
    floats = np.full(len(values), numeric_part.mean(), dtype='float64')
    floats[~is_str] = numeric_part.to_numpy()

    df[col] = floats

    assert df[col].dtypes == 'float64'

    return df

In [7]:
# Make the retained price column numeric, imputing any string entries.
df_houseprice = func_impute_to_float(
    df=df_houseprice,
    col='Year ending Dec 2019',
)

In [8]:
# Run over every column we want to be numeric.
# The price columns are selected by name rather than by position: the frame
# was reduced to three columns earlier, so the positional slice `columns[4:]`
# was empty and the loop body never executed.
price_cols = [c for c in df_houseprice.columns if str(c).startswith('Year ending')]
for col in price_cols:
    df_houseprice = func_impute_to_float(df_houseprice, col)


In [9]:
# Check that the price column is now float64.
df_houseprice.dtypes

Local authority code     object
LSOA code                object
Year ending Dec 2019    float64
dtype: object

In [10]:
# save as csv
#df_houseprice.to_csv('data/lsoa_houseprices.csv')

### Map to MSOA

In [12]:
# Load the Output Area -> LSOA -> MSOA -> Local Authority District lookup.
msoa_to_lsoa = pd.read_csv(
    'data/Output_Area_to_LSOA_to_MSOA_to_Local_Authority_District_(December_2017)_Lookup_with_Area_Classifications_in_Great_Britain.csv'
)

In [13]:
# Only the LSOA and MSOA codes are needed for the mapping.
msoa_to_lsoa = msoa_to_lsoa.loc[:, ['LSOA11CD', 'MSOA11CD']]

In [14]:
# Row count of the lookup, then the number of distinct LSOA and MSOA codes.
print(msoa_to_lsoa.shape)
for code_col in ('LSOA11CD', 'MSOA11CD'):
    print(msoa_to_lsoa[code_col].nunique())

(227759, 2)
41729
8480


In [15]:
# Collapse repeated (LSOA, MSOA) pairs to a single row each.
# The result is rebound instead of using inplace=True, so the frame derived
# from the earlier column selection is not mutated in place.
msoa_to_lsoa = msoa_to_lsoa.drop_duplicates()
msoa_to_lsoa.shape

(41729, 2)

In [16]:
# Number of distinct LSOA codes; equal to the row count when each LSOA appears once.
msoa_to_lsoa.LSOA11CD.nunique()

41729

In [17]:
# Number of distinct LSOA codes present in the house price data.
df_houseprice['LSOA code'].nunique()

34753

In [18]:
# Drop unnecessary LSOA regions: keep only lookup rows whose LSOA appears in
# the house price data. A boolean filter is used instead of an in-place drop
# by index label, so the result does not depend on the index being unique.
in_houseprice = msoa_to_lsoa['LSOA11CD'].isin(df_houseprice['LSOA code'])
msoa_to_lsoa = msoa_to_lsoa[in_houseprice]

In [19]:
# Distinct LSOA codes left in the lookup after filtering.
msoa_to_lsoa.LSOA11CD.nunique()

34753

In [20]:
# Attach the MSOA code to every LSOA price row.
# validate='many_to_one' makes the merge fail loudly if the lookup ever maps
# one LSOA to several MSOAs, which would otherwise silently duplicate rows.
msoa_houseprice = df_houseprice.merge(
    msoa_to_lsoa,
    left_on='LSOA code',
    right_on='LSOA11CD',
    how='inner',
    validate='many_to_one',
)

In [22]:
# Preview the merged LSOA/MSOA price table.
msoa_houseprice.head()

Unnamed: 0,Local authority code,LSOA code,Year ending Dec 2019,LSOA11CD,MSOA11CD
0,E06000001,E01011949,77000.0,E01011949,E02002491
1,E06000001,E01011950,27500.0,E01011950,E02002490
2,E06000001,E01011951,39997.5,E01011951,E02002489
3,E06000001,E01011952,66475.0,E01011952,E02002484
4,E06000001,E01011953,58000.0,E01011953,E02002484


In [23]:
# Number of distinct MSOA codes in the merged table.
msoa_houseprice.MSOA11CD.nunique()

7201

In [24]:
# Shape of the merged table.
msoa_houseprice.shape

(34753, 5)

In [25]:
# Sum of the LSOA-level prices within each MSOA.
msoa_summed_houseprice = (
    msoa_houseprice
    .groupby('MSOA11CD')[['Year ending Dec 2019']]
    .sum()
)

In [26]:
# Number of LSOA rows within each MSOA.
msoa_lsoa_count = (
    msoa_houseprice
    .groupby('MSOA11CD')[['LSOA11CD']]
    .count()
)

In [27]:
# Unweighted mean of the LSOA-level prices per MSOA: summed price divided by
# the number of LSOAs.
# NOTE(review): sum() skips NaN prices while count() counts every LSOA row, so
# a remaining NaN price would pull the average down; confirm none remain.
msoa_avg_houseprice = pd.merge(msoa_summed_houseprice, msoa_lsoa_count, on='MSOA11CD')
total_price = msoa_avg_houseprice['Year ending Dec 2019']
lsoa_count = msoa_avg_houseprice['LSOA11CD']
msoa_avg_houseprice['houseprice_avg_2019'] = total_price / lsoa_count
msoa_avg_houseprice.head()

Unnamed: 0_level_0,Year ending Dec 2019,LSOA11CD,houseprice_avg_2019
MSOA11CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
E02000001,4838950.0,6,806491.62374
E02000002,1428824.0,4,357206.0
E02000003,2138500.0,6,356416.666667
E02000004,1369000.0,4,342250.0
E02000005,1632500.0,5,326500.0


In [28]:
# Turn the MSOA11CD index back into a column and keep only the code and the
# average price. This is chained without inplace=True, so no frame is mutated
# in place before being rebound.
msoa_avg_houseprice = (
    msoa_avg_houseprice
    .reset_index()
    [['MSOA11CD', 'houseprice_avg_2019']]
)
msoa_avg_houseprice.head()

Unnamed: 0,MSOA11CD,houseprice_avg_2019
0,E02000001,806491.62374
1,E02000002,357206.0
2,E02000003,356416.666667
3,E02000004,342250.0
4,E02000005,326500.0


In [56]:
#msoa_avg_houseprice.to_csv('data/msoa_houseprice.csv')