In [209]:
import numpy as np
import pandas as pd
import requests
import json
import time
import io
import pickle as pkl
from bs4 import BeautifulSoup
import re

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

### Scraping Foodbank Data

I want to get data on Foodbank density for the ~430 LADs. To do this I'll scrape givefood.org.uk's search. Searching for each LAD by name returns the (usually) 10 nearest foodbanks.

First I'll fetch the LAD names from the income df:

<font color='orange'><b>Note:</b> the cell below is wrapped in <code>if False</code> to prevent accidental redownloading. To run, change False to True</font>

In [143]:
if False:
    df = pd.read_pickle('LAD_incomes.pkl')
    regions = set(df['Region name'].values)

    start = time.time()
    for i, region in enumerate(regions):
        r = requests.get(url=f'https://www.givefood.org.uk/needs/?address={region}')
        soup = BeautifulSoup(r.text, 'html.parser')
        r_distances = [tag.text for tag in soup.find_all(class_='distance')[:10]]
        distances[region]=r_distances
        if i%10 == 0 and i!=0:
            print(f'\x1b[31m proccessed: {i} in: {np.round(time.time()-start)} avg_time:{ (np.round(time.time()-start))/i } expected_time: { ((np.round(time.time()-start))/i)*len(regions)} \x1b[0m')
        print(f'{region}: {r_distances}')
        time.sleep(1.25)
    with open('foodbank_distances.pkl', 'wb') as f:
        pkl.dump(distances,f)
else:
    with open('foodbank_distances.pkl', 'rb') as f:
        fb_distances = pkl.load(f)


### ONS Dataset Api

In [11]:
ons_root = "https://api.beta.ons.gov.uk/v1/"

let's get all the datasets. By default, the ons_api will return only 20 per request. There are around 100 datasets. 

To fetch all, I'll keep making requests, incrementing offset by 20 until I receive no further datasets:

In [31]:
datasets = []
for offset in range(0,200,20):
    params = {
        'offset':offset
    }
    r = requests.get(f'{ons_root}datasets', params=params)
    received_datasets = json.loads(r.content)['items']
    if(len(received_datasets)==0):
        break
    datasets += received_datasets
    time.sleep(1)

Checking for duplicates by comparing the length of the set of database titles with the list:

In [36]:
print(f'Unique Datasets: {len(set([dataset["title"] for dataset in datasets]))}')
print(f'Total Datasets: {len(datasets)}')

Unique Datasets: 117
Total Datasets: 117


Looping over names and descriptions to find those of interest:

In [40]:
for dataset in datasets:
    print('____________________________________________________________________________________________________________________________________')
    print('Title: ' + dataset['title'])
    print(dataset['description'])

____________________________________________________________________________________________________________________________________
Title: Quarterly personal well-being estimates
Seasonally and non seasonally-adjusted quarterly estimates of life satisfaction, feeling that the things done in life are worthwhile, happiness and anxiety in the UK.
____________________________________________________________________________________________________________________________________
Title: Personal well-being estimates by local authority
Estimates of life satisfaction, feeling that the things done in life are worthwhile, happiness and anxiety at the UK, country, regional, county, local and unitary authority level.
____________________________________________________________________________________________________________________________________
Title: Deaths registered weekly in England and Wales by region
Provisional counts of the number of deaths registered in England and Wales, by region, i

I'm interested in data at the LAD level. So I'll use:

- 'Life Expectancy by Local Authority'
- 'Personal well-being estimates by local authority'
- 'Households by deprivation dimensions'

I'll filter the datasets by title:

In [41]:
relevant_titles = ['Life Expectancy by Local Authority','Personal well-being estimates by local authority', 'Households by deprivation dimensions']
relevant_datasets = [dataset for dataset in datasets if dataset['title'] in relevant_titles]

In [46]:
relevant_datasets[0]['id']

'wellbeing-local-authority'

Then download the data for each and store it in a dict:
(pd.read_csv should accept urls but yields a 403 response code each time, instead I'll make a request and wrap the resulting string in an StringIO)

In [62]:
if False: #Set to false to prevent accidental downloading
    data = {}
    for dataset in relevant_datasets if False: #Set to false to prevent accidental downloading
        url = dataset['links']['latest_version']['href']
        r = requests.get(url)
        latest_version_page = json.loads(r.content)
        data_url = latest_version_page['downloads']['csv']['href']
        data_r = requests.get(data_url)
        data_io = io.StringIO(data_r.text)
        data[dataset['id']] = pd.read_csv(data_io)
        time.sleep(2)



Now pickling the fetched data to preserve it:

In [66]:
for df in data:
    pd.to_pickle(data[df], f'{df}.pkl')

### Cleaning the fetched Data
first the wellbeing data:

I want only 'Life Satisfaction' and only the average values

In [188]:
df = data['wellbeing-local-authority'].copy() #avoid mutability problems
df = df[df['measure-of-wellbeing']	== 'life-satisfaction'] #only care about one easure
df = df[df['wellbeing-estimate'] == 'average-mean'] # Want the average 
df = df[df['administrative-geography'].str[0] == 'E'] # Only want LADs
df = df[pd.to_numeric(df['v4_3'], errors='coerce').notnull()] # Drop all non_numeric (i.e. missing values)
df = df.sort_values(by='Time', ascending=False).reset_index() # To make drop_duplicates behave
df = df.drop_duplicates(subset=['administrative-geography'], keep='first') #Keep only latest data
df = df[['administrative-geography', 'v4_3']]
df.columns = ['LAD Code', 'Life Satisfaction']
satis_df = df
satis_df

Unnamed: 0,LAD Code,Life Satisfaction
0,E07000163,7.43
1,E11000006,7.54
2,E06000052,7.58
3,E08000028,7.29
4,E06000044,7.36
...,...,...
343,E06000037,7.50
344,E06000043,7.57
345,E07000165,7.86
466,E07000109,6.81


In [92]:
data.keys()

dict_keys(['wellbeing-local-authority', 'life-expectancy-by-local-authority', 'TS011'])

Nex the life expectency data:

In [106]:
df = data['life-expectancy-by-local-authority'].copy() #avoid mutability problems
df = df[df.AgeGroups == '00-01'] # Only care about life expectency at birth
df = df[pd.to_numeric(df['v4_2'], errors='coerce').notnull()] # Drop all non_numeric (i.e. missing values)
df = df.sort_values(by='Time', ascending=False).reset_index() # To make drop_duplicates behave
df = df.drop_duplicates(subset=['administrative-geography'], keep='first') #Keep only latest data
df = df[['administrative-geography', 'v4_2']]
df.columns = ['LAD Code', 'Life Expectency']
expec_df = df
expec_df

Unnamed: 0,LAD Code,Life Expectency
0,E09000027,82.58
1,E08000015,82.27
2,S12000023,79.04
3,E08000022,81.98
4,E07000203,85.00
...,...,...
781,N09000002,83.11
784,S12000034,82.52
796,E07000116,84.56
806,E12000006,83.88


and finally the deprivation dimensions(TS011):

In [139]:
df = data['TS011'].copy() #avoid mutability problems
df = df[pd.to_numeric(df['Observation'], errors='coerce').notnull()] # Drop all non_numeric (i.e. missing values)
df = df[['Lower Tier Local Authorities Code','Household deprivation (6 categories)' , 'Observation']]
df.columns = ['LAD Code', 'Measure', 'Value']

df = df.pivot(index='LAD Code', columns='Measure', values='Value')
df.columns = list(df.columns) #Hacky way of flattening
df = df.reset_index()
df['Total Households'] = df.iloc[:, 1:].sum(axis=1)
df['Deprived Pct'] = (df['Total Households']-df['Household is not deprived in any dimension'])/df['Total Households']
df = df[['LAD Code', 'Deprived Pct']]
depr_df = df
depr_df

Unnamed: 0,LAD Code,Deprived Pct
0,E06000001,0.578256
1,E06000002,0.578391
2,E06000003,0.558844
3,E06000004,0.516494
4,E06000005,0.513973
...,...,...
326,W06000020,0.577148
327,W06000021,0.483348
328,W06000022,0.541390
329,W06000023,0.514423


### ...and processing our foodbank data:

I want to get the average distance to foodbanks for the 5 closest foodbanks:

In [171]:
avg_dist = {}
for region in fb_distances:
    dist_list = fb_distances[region]
    if len(dist_list)<5:
        avg_dist[region] = None
    else:
        dist_list = [float(re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", d)[0]) for d in dist_list] # Regex for properly matching numbers, stolen from https://stackoverflow.com/questions/4289331/how-to-extract-numbers-from-a-string-in-python
        dist_list = [dist*1.60934 for dist in dist_list] #Convert to KM
        avg_dist[region] = np.average(dist_list)

let's make a df of this and drop outliers (some locations were incorrectly found):

In [202]:
df = pd.DataFrame(data={'Region name':avg_dist.keys(), 'Average Distance':avg_dist.values() })
df = df[df['Average Distance'] < 200]
df = df[df['Average Distance'] != None]
df.sort_values(by='Average Distance')
fb_df = df
fb_df

Unnamed: 0,Region name,Average Distance
0,Leicester,2.391479
1,Gloucester,12.820002
3,Harrogate,16.479642
4,Luton,4.612368
5,York,10.455882
...,...,...
369,Liverpool,4.913315
370,Sutton,5.006657
371,Clackmannanshire,17.163611
372,Epsom and Ewell,5.151497


### ... and load our income data:

In [198]:
df = pd.read_pickle('LAD_incomes.pkl')
df = df[['Region name', 'LAD code', '2019']]
df.columns = ['Region name', 'LAD Code', 'Avg Income']
income_df = df
income_df

Unnamed: 0,Region name,LAD Code,Avg Income
0,Hartlepool,E06000001,15962.0
1,Stockton-on-Tees,E06000004,17519.0
2,Middlesbrough,E06000002,15835.0
3,Redcar and Cleveland,E06000003,16882.0
4,Darlington,E06000005,17391.0
...,...,...,...
391,Knowsley,E08000011,16627.0
392,St. Helens,E08000013,17325.0
393,Liverpool,E08000012,15673.0
394,Sefton,E08000014,19325.0


### Merging All Our Data

In [208]:
df = pd.merge(satis_df, expec_df, on=['LAD Code'], how='inner')
df = pd.merge(df, depr_df, on=['LAD Code'], how='inner')
df = pd.merge(df, income_df, on=['LAD Code'], how='inner')
df = pd.merge(df, fb_df, on=['Region name'], how='inner')
df = df[['LAD Code', 'Region name' ,'Life Satisfaction', 'Life Expectency', 'Deprived Pct',
        'Avg Income', 'Average Distance']]
df

Unnamed: 0,LAD Code,Region name,Life Satisfaction,Life Expectency,Deprived Pct,Avg Income,Average Distance
0,E06000052,Cornwall,7.58,83.59,0.538899,18846.0,13.758248
1,E08000028,Sandwell,7.29,77.04,0.620804,14454.0,6.757619
2,E06000044,Portsmouth,7.36,81.93,0.528083,17028.0,14.929847
3,E06000012,North East Lincolnshire,7.38,82.13,0.560321,16569.0,11.065822
4,E07000212,Runnymede,7.37,84.65,0.466014,26925.0,8.070840
...,...,...,...,...,...,...,...
269,E06000037,West Berkshire,7.50,85.17,0.440112,26714.0,12.659068
270,E06000043,Brighton and Hove,7.57,79.10,0.521240,23142.0,8.262352
271,E07000165,Harrogate,7.86,81.05,0.427321,26868.0,16.479642
272,E07000109,Gravesham,6.81,79.54,0.548581,20665.0,10.673143


### and Clustering!

In [210]:
data = df[['Life Satisfaction', 'Life Expectency', 'Deprived Pct',
        'Avg Income', 'Average Distance']]

# Create the scaler
scaler = StandardScaler()

# Normalizing
scaler.fit(data)
normalized_data = scaler.transform(data)

In [211]:
# Create the KMeans model
kmeans = KMeans(n_clusters=4)

# Fit the model to the data
kmeans.fit(normalized_data)

# Predict the clusters
predicted_clusters = kmeans.predict(normalized_data)

adding our clusters back to df:

In [223]:
normalized_data

array([[ 0.03110303,  0.7478929 ,  0.60447071, -0.54478239,  0.06766991],
       [-1.24283861, -2.02791807,  2.16784875, -1.20177596, -0.82394815],
       [-0.93533546,  0.04440493,  0.39802551, -0.81673464,  0.21688783],
       ...,
       [ 1.26111565, -0.32852845, -1.52528712,  0.65521816,  0.41427362],
       [-3.35143168, -0.96844823,  0.78928476, -0.27268054, -0.32525694],
       [-1.59427079,  1.27338994,  0.02952446, -0.57724314, -1.18961404]])

In [225]:
df['cluster'] = predicted_clusters
df.rename(columns={'Average Distance':'Foodbank Distance'})
df[['Life Satisfaction (Normalised)', 'Life Expectency (Normalised)', 'Household Deprivation (Normalised)',  'Average Income (Normalised)', 'Foodbank Distance (Normalised)']] = normalized_data

### exporting our data

In [226]:
df.to_json('clustered_data.json', orient='records')

In [227]:
df

Unnamed: 0,LAD Code,Region name,Life Satisfaction,Life Expectency,Deprived Pct,Avg Income,Average Distance,cluster,Life Satisfaction (Normalised),Life Expectency (Normalised),Household Deprivation (Normalised),Average Income (Normalised),Foodbank Distance (Normalised)
0,E06000052,Cornwall,7.58,83.59,0.538899,18846.0,13.758248,2,0.031103,0.747893,0.604471,-0.544782,0.067670
1,E08000028,Sandwell,7.29,77.04,0.620804,14454.0,6.757619,2,-1.242839,-2.027918,2.167849,-1.201776,-0.823948
2,E06000044,Portsmouth,7.36,81.93,0.528083,17028.0,14.929847,2,-0.935335,0.044405,0.398026,-0.816735,0.216888
3,E06000012,North East Lincolnshire,7.38,82.13,0.560321,16569.0,11.065822,2,-0.847477,0.129163,1.013378,-0.885396,-0.275244
4,E07000212,Runnymede,7.37,84.65,0.466014,26925.0,8.070840,1,-0.891406,1.197108,-0.786725,0.663745,-0.656693
...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,E06000037,West Berkshire,7.50,85.17,0.440112,26714.0,12.659068,1,-0.320329,1.417478,-1.281131,0.632181,-0.072324
270,E06000043,Brighton and Hove,7.57,79.10,0.521240,23142.0,8.262352,2,-0.012826,-1.154915,0.267398,0.097851,-0.632302
271,E07000165,Harrogate,7.86,81.05,0.427321,26868.0,16.479642,0,1.261116,-0.328528,-1.525287,0.655218,0.414274
272,E07000109,Gravesham,6.81,79.54,0.548581,20665.0,10.673143,2,-3.351432,-0.968448,0.789285,-0.272681,-0.325257
