# Analysis of top preventable causes of death by MSOA in England

SDC Coursework 

In this analysis proportions of deaths by preventable causes, as defined by the [Office for National Statistics (ONS)](https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/causesofdeath/bulletins/avoidablemortalityinenglandandwales/2018#measuring-the-data). All data used in this analysis can be found on the relevant [Github Repo](https://github.com/signesw/Seeya_later). The causes of death were sourved from the [Nomis Portal](https://www.nomisweb.co.uk/query/construct/components/simpleapicomponent.aspx?menuopt=1613&subcomp=).

Yasmine Hujair and Emmanuel Farinre (Data team).

Any questions speak to [Yasmine Hujair](yasmine.hujair@gmail.com)

## Read in data 

In [1]:
#Importing packages

import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm                                                                                                                                                     
import matplotlib as mpl
import re
import os

import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from esda.adbscan import ADBSCAN

import random
random.seed(42)    # For reproducibility
np.random.seed(42) # For reproducibility

# Make numeric display a bit neater
pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

In [2]:
# Set download URL
url = 'https://raw.githubusercontent.com/signesw/Seeya_later/main/data/Causes/Deaths-by-preventable-causes-counts.csv'
#read in csv
counts = pd.read_csv(url, low_memory=False)

#set index
counts.set_index(['MSOA Code'], inplace = True)
counts = counts.drop(['MSOA Name'], axis = 1)
#drop first row
counts = counts[1:]
counts.head()

Unnamed: 0_level_0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,Y73,Y74,Y75,Y76,Y77,Y78,Y79,Y80,Y81,Y82
MSOA Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E02001347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E02001348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E02001349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E02001350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
E02001351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
url = 'https://raw.githubusercontent.com/signesw/Seeya_later/main/data/Causes/Deaths-by-preventable-causes.csv'
causes = pd.read_csv(url, low_memory=False)
causes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876 entries, 0 to 875
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Condition Group  876 non-null    object 
 1   Cause            876 non-null    object 
 2   ICD-10 codes     876 non-null    object 
 3   Age              876 non-null    object 
 4   Treatable        876 non-null    float64
 5   Preventable      876 non-null    float64
dtypes: float64(2), object(4)
memory usage: 41.2+ KB


# Data Wrangling

In [4]:
#converty NaN values to 0 in causes
causes = causes.fillna(0)

In [5]:
#convert columns to floats
for col in counts:
    counts[col] = counts[col].astype('float64')

In [6]:
#transpose counts
counts_t = counts.T
msoas = counts_t.columns.values.tolist()
#add total columns
counts_t['Total'] = counts_t.sum(axis=1)
counts_t.head()

MSOA Code,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,E02001354,E02001355,E02001356,...,E02005123,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,Total
A00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [7]:
#sort by top values
counts_t.sort_values(by=['Total'],ascending=False, inplace=True)
counts_t.head(10)

MSOA Code,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,E02001354,E02001355,E02001356,...,E02005123,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,Total
I25,0.0,5.0,6.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0,...,6.0,5.0,5.0,7.0,0.0,5.0,5.0,5.0,5.0,35492.0
C34,7.0,5.0,10.0,6.0,7.0,6.0,5.0,7.0,5.0,6.0,...,7.0,9.0,0.0,0.0,0.0,6.0,6.0,5.0,5.0,29690.0
J44,5.0,9.0,7.0,13.0,5.0,5.0,0.0,7.0,7.0,5.0,...,12.0,5.0,0.0,0.0,0.0,12.0,5.0,8.0,5.0,26424.0
J18,0.0,5.0,0.0,5.0,6.0,5.0,0.0,5.0,5.0,0.0,...,5.0,6.0,10.0,10.0,5.0,6.0,9.0,5.0,0.0,24569.0
I21,0.0,5.0,6.0,5.0,0.0,0.0,5.0,5.0,0.0,5.0,...,5.0,0.0,5.0,5.0,0.0,5.0,5.0,0.0,0.0,17838.0
I64,0.0,0.0,0.0,7.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,5.0,5.0,0.0,0.0,10780.0
C50,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,5.0,0.0,6305.0
C18,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,4482.0
C15,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3259.0
I67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1716.0


In [8]:
#drop rows that aren't preventable
causes = causes[causes.Treatable != 0]
causes = causes.reset_index(drop=True)
counts_t = counts_t.reset_index()
counts_tp = causes.merge(counts_t, how='left', left_on ='ICD-10 codes', right_on = 'index')

In [9]:
#sort dataframe by most prevalant preventable causes
counts_tp['PreventableCount'] = counts_tp['Total']*counts_tp['Preventable']
counts_tp.sort_values(by=['PreventableCount'],ascending=False, inplace=True)
#multiply all msoas by preventable %
counts_tp[msoas] = counts_tp[msoas].multiply(counts_tp['Preventable'], axis = 0)
#drop unneccessary columns
counts_top = counts_tp.drop(['Age','Treatable', 'Preventable', 'Total','index'], axis=1)
counts_top.head(10)

Unnamed: 0,Condition Group,Cause,ICD-10 codes,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,...,E02005123,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,PreventableCount
80,Diseases of the circulatory system,Ischaemic heart diseases,I25,0.0,2.5,3.0,3.0,2.5,2.5,2.5,...,3.0,2.5,2.5,3.5,0.0,2.5,2.5,2.5,2.5,17746.0
76,Diseases of the circulatory system,Ischaemic heart diseases,I21,0.0,2.5,3.0,2.5,0.0,0.0,2.5,...,2.5,0.0,2.5,2.5,0.0,2.5,2.5,0.0,0.0,8919.0
85,Diseases of the circulatory system,Cerebrovascular diseases,I64,0.0,0.0,0.0,3.5,3.0,0.0,0.0,...,0.0,0.0,0.0,2.5,0.0,2.5,2.5,0.0,0.0,5390.0
88,Diseases of the circulatory system,Cerebrovascular diseases,I67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,858.0
82,Diseases of the circulatory system,Cerebrovascular diseases,I61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,769.0
69,Diseases of the circulatory system,Aortic aneurysm,I71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,698.0
52,Endocrine and metabolic diseases,Diabetes mellitus,E11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,426.0
71,Diseases of the circulatory system,Aortic aneurysm,I11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,361.0
84,Diseases of the circulatory system,Cerebrovascular diseases,I63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,260.0
55,Endocrine and metabolic diseases,Diabetes mellitus,E14,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,165.5


In [10]:
#create proportions (%) of deaths per preventable cause
counts_top[msoas] = counts_top[msoas].divide(counts_top["PreventableCount"], axis=0)
counts_top.head(10)

Unnamed: 0,Condition Group,Cause,ICD-10 codes,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,...,E02005123,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,PreventableCount
80,Diseases of the circulatory system,Ischaemic heart diseases,I25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17746.0
76,Diseases of the circulatory system,Ischaemic heart diseases,I21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8919.0
85,Diseases of the circulatory system,Cerebrovascular diseases,I64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5390.0
88,Diseases of the circulatory system,Cerebrovascular diseases,I67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,858.0
82,Diseases of the circulatory system,Cerebrovascular diseases,I61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,769.0
69,Diseases of the circulatory system,Aortic aneurysm,I71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,698.0
52,Endocrine and metabolic diseases,Diabetes mellitus,E11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,426.0
71,Diseases of the circulatory system,Aortic aneurysm,I11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,361.0
84,Diseases of the circulatory system,Cerebrovascular diseases,I63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,260.0
55,Endocrine and metabolic diseases,Diabetes mellitus,E14,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,165.5


### Secondary Analysis - checking causes with aggregated data 

In [11]:
#group data by condition group
counts_tp1 = counts_tp.drop(['Cause','ICD-10 codes', 'Treatable', 'Preventable'], axis=1)
counts_agg_cg = counts_tp1.groupby('Condition Group').sum()
counts_agg_cg.sort_values(by=['PreventableCount'],ascending=False, inplace=True)
print(f"Data frame is {counts_agg_cg.shape[0]:,} x {counts_agg_cg.shape[1]}")

Data frame is 10 x 7150


In [12]:
#group data by Cause group
counts_tp2 = counts_tp.drop(['Condition Group','ICD-10 codes', 'Treatable', 'Preventable'], axis=1)
counts_agg_c = counts_tp2.groupby('Cause').sum()
counts_agg_c.sort_values(by=['PreventableCount'],ascending=False, inplace=True)
print(f"Data frame is {counts_agg_c.shape[0]:,} x {counts_agg_c.shape[1]}")

Data frame is 51 x 7150


## Aggregated by cause group selected

In this section, two alternative iterations were considered.

In [13]:
remaining_causes = counts_agg_c.iloc[5:]
remaining_causes.loc['Other'] = remaining_causes.sum()
remaining_causes.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0_level_0,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,E02001354,E02001355,E02001356,...,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,Total,PreventableCount
Cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Renal colic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Nephritis and nephrosis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Obstructive uropathy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Other diseases of gallbladder or biliary tract,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,0.0
Other diseases of pancreas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Other pleural disorders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Pneumonia, not elsewhere classified or organism unspecified",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24569.0,0.0
"Pregnancy, childbirth and the puerperium",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Prostatic hyperplasia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pulmonary oedema,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
counts_agg_c_top = counts_agg_c.iloc[:6,:]
counts_agg_c_top.head()

Unnamed: 0_level_0,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,E02001354,E02001355,E02001356,...,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,Total,PreventableCount
Cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ischaemic heart diseases,0.0,5.0,6.0,5.5,2.5,2.5,5.0,5.0,2.5,5.0,...,2.5,5.0,6.0,0.0,5.0,5.0,2.5,2.5,53335.0,26667.5
Cerebrovascular diseases,0.0,0.0,0.0,3.5,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.5,0.0,2.5,5.0,0.0,0.0,14959.0,7479.5
Aortic aneurysm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2333.0,1166.5
Diabetes mellitus,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,1188.0,594.0
Cervical cancer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.5


In [15]:
#append 'other' row
df2 = pd.DataFrame(remaining_causes, index=['Other'])
counts_agg_c_top = counts_agg_c_top.append(df2)
counts_agg_c_top.head(10)

Unnamed: 0,E02001347,E02001348,E02001349,E02001350,E02001351,E02001352,E02001353,E02001354,E02001355,E02001356,...,E02005124,E02005125,E02005126,E02005127,E02005128,E02005129,E02005130,E02005131,Total,PreventableCount
Ischaemic heart diseases,0.0,5.0,6.0,5.5,2.5,2.5,5.0,5.0,2.5,5.0,...,2.5,5.0,6.0,0.0,5.0,5.0,2.5,2.5,53335.0,26667.5
Cerebrovascular diseases,0.0,0.0,0.0,3.5,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.5,0.0,2.5,5.0,0.0,0.0,14959.0,7479.5
Aortic aneurysm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2333.0,1166.5
Diabetes mellitus,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.5,0.0,0.0,1188.0,594.0
Cervical cancer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.5
Renal colic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39022.0,0.0


In [16]:
counts_agg_c_top.loc['Total'] = counts_agg_c_top.sum() #create total deaths per msoa row

#transpose df
counts_agg_c_top = counts_agg_c_top.T


counts_agg_c_top

Unnamed: 0,Ischaemic heart diseases,Cerebrovascular diseases,Aortic aneurysm,Diabetes mellitus,Cervical cancer,Renal colic,Other,Total
E02001347,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
E02001348,5.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
E02001349,6.00,0.00,0.00,0.00,0.00,0.00,0.00,6.00
E02001350,5.50,3.50,0.00,2.50,0.00,0.00,0.00,11.50
E02001351,2.50,3.00,0.00,0.00,0.00,0.00,0.00,5.50
...,...,...,...,...,...,...,...,...
E02005129,5.00,5.00,0.00,2.50,0.00,0.00,0.00,12.50
E02005130,2.50,0.00,0.00,0.00,0.00,0.00,0.00,2.50
E02005131,2.50,0.00,0.00,0.00,0.00,0.00,0.00,2.50
Total,53335.00,14959.00,2333.00,1188.00,5.00,0.00,39022.00,110842.00


In [17]:
columns = ['Ischaemic heart diseases', 'Cerebrovascular diseases',
       'Aortic aneurysm', 'Diabetes mellitus', 'Cervical cancer',
       'Renal colic', 'Other']

#calculate proportions of deaths per category per msoa
counts_agg_c_top[columns] = counts_agg_c_top[columns].divide(counts_agg_c_top['Total'], axis=0)
counts_agg_c_top

Unnamed: 0,Ischaemic heart diseases,Cerebrovascular diseases,Aortic aneurysm,Diabetes mellitus,Cervical cancer,Renal colic,Other,Total
E02001347,,,,,,,,0.00
E02001348,1.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
E02001349,1.00,0.00,0.00,0.00,0.00,0.00,0.00,6.00
E02001350,0.48,0.30,0.00,0.22,0.00,0.00,0.00,11.50
E02001351,0.45,0.55,0.00,0.00,0.00,0.00,0.00,5.50
...,...,...,...,...,...,...,...,...
E02005129,0.40,0.40,0.00,0.20,0.00,0.00,0.00,12.50
E02005130,1.00,0.00,0.00,0.00,0.00,0.00,0.00,2.50
E02005131,1.00,0.00,0.00,0.00,0.00,0.00,0.00,2.50
Total,0.48,0.13,0.02,0.01,0.00,0.00,0.35,110842.00


In [18]:
#convert nans to 0s (if there are no deaths there are no deaths!)
counts_agg_c_top = counts_agg_c_top.fillna(0)
counts_agg_c_top

Unnamed: 0,Ischaemic heart diseases,Cerebrovascular diseases,Aortic aneurysm,Diabetes mellitus,Cervical cancer,Renal colic,Other,Total
E02001347,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
E02001348,1.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
E02001349,1.00,0.00,0.00,0.00,0.00,0.00,0.00,6.00
E02001350,0.48,0.30,0.00,0.22,0.00,0.00,0.00,11.50
E02001351,0.45,0.55,0.00,0.00,0.00,0.00,0.00,5.50
...,...,...,...,...,...,...,...,...
E02005129,0.40,0.40,0.00,0.20,0.00,0.00,0.00,12.50
E02005130,1.00,0.00,0.00,0.00,0.00,0.00,0.00,2.50
E02005131,1.00,0.00,0.00,0.00,0.00,0.00,0.00,2.50
Total,0.48,0.13,0.02,0.01,0.00,0.00,0.35,110842.00


In [19]:
#drop total & preventablecount rows
counts_agg_c_top = counts_agg_c_top.drop(['Total','PreventableCount'], axis = 0)
#drop total column
counts_agg_c_top = counts_agg_c_top.drop(['Total'], axis = 1)
#reset index 
counts_agg_c_top = counts_agg_c_top.reset_index()
counts_agg_c_top = counts_agg_c_top.rename(columns = {'index':'MSOACode'})
counts_agg_c_top

Unnamed: 0,MSOACode,Ischaemic heart diseases,Cerebrovascular diseases,Aortic aneurysm,Diabetes mellitus,Cervical cancer,Renal colic,Other
0,E02001347,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,E02001348,1.00,0.00,0.00,0.00,0.00,0.00,0.00
2,E02001349,1.00,0.00,0.00,0.00,0.00,0.00,0.00
3,E02001350,0.48,0.30,0.00,0.22,0.00,0.00,0.00
4,E02001351,0.45,0.55,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...
7143,E02005127,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7144,E02005128,0.50,0.25,0.00,0.25,0.00,0.00,0.00
7145,E02005129,0.40,0.40,0.00,0.20,0.00,0.00,0.00
7146,E02005130,1.00,0.00,0.00,0.00,0.00,0.00,0.00


In [20]:
#export csv
counts_agg_c_top.to_csv('causes-of-preventable-deaths.csv')

## Create geojson

In [21]:
#loading the MSOAs for England and Wales

msoas = gpd.read_file('https://github.com/jreades/i2p/blob/master/data/src/Middle_Layer_Super_Output_Areas__December_2011__EW_BGC_V2-shp.zip?raw=true')

#extract england from msoas
msoas_england = msoas[msoas.MSOA11CD.str.startswith('E')]

#drop unneccesary columns
to_drop = ['MSOA11NMW','LONG','LAT','Shape__Are','Shape__Len']
msoas_england.drop(columns=to_drop, inplace = True)

#tidy up boroughs
msoas_england['borough'] = msoas_england.MSOA11NM.str.replace(r' \d+','',regex=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


In [22]:
#pull in msoa nicknames
msoa_nms = pd.read_csv('https://github.com/jreades/i2p/blob/master/data/src/MSOA-Names-1.8.csv.gz?raw=true', compression='gzip')

#merge dataframes
msoas = pd.merge(msoas_england, msoa_nms, how = 'inner' , left_on = 'MSOA11CD', right_on = 'msoa11cd')
# tidy up
to_drop = ['msoa11cd','msoa11nm','msoa11nmw','Laname','msoa11hclnmw']
msoas.drop(columns=to_drop, inplace=True)
print(msoas.shape)

(6791, 8)


In [23]:
#create geodataframe
gdf = pd.merge(msoas, counts_agg_c_top, left_on = 'MSOA11CD', right_on = 'MSOACode', how = 'inner')

In [24]:
#export to geojson

gdf.to_file(os.path.join('Data','preventabledeathsbycause.geojson'), driver='GeoJSON')

Any questions speak to [Yasmine Hujair](yasmine.hujair@gmail.com)