<a href="https://colab.research.google.com/github/DomMc97/Applied-Maths-MSc-Thesis/blob/master/docs/MSOA%20Data%20Prep/MSOA_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

Prepares all MSOA data.

## Libraries

In [None]:
from numpy import where
import pandas as pd
import numpy as np

In [None]:
%%capture
# Environment setup for Colab; %%capture hides the (long) build/install output.
import os
# Download and unpack the libspatialindex 1.8.5 source tarball.
!curl -L http://download.osgeo.org/libspatialindex/spatialindex-src-1.8.5.tar.gz | tar xz
# NOTE: this changes the kernel's working directory for every later cell.
os.chdir("/content/spatialindex-src-1.8.5")
# Build and install the library from source, then refresh the linker cache.
!./configure
!make
!make install
!ldconfig
# Python packages; presumably rtree is why libspatialindex is built above — TODO confirm.
# NOTE(review): versions are unpinned, so a fresh run installs whatever is latest.
!pip install descartes
!pip install rtree
!pip install geopandas
import geopandas as gpd

## Mount Drive

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive so the project files can be read.
drive.mount('/content/drive')

# Folder that holds every input file loaded by the cells below.
root = '/content/drive/My Drive/Project/MSOA Data Preparation/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## MSOA Data

Loads Shapefile of MSOAs in England and Wales.

In [None]:
# Read the boundary shapefile, then keep only the columns at positions 1, 2, 4 and 6.
boundaries_path = root + 'MSOA_boundaries.shp'
MSOAs = gpd.read_file(boundaries_path).iloc[:, [1, 2, 4, 6]]
print('Shape: ', MSOAs.shape)
MSOAs.head()

Shape:  (7201, 4)


Unnamed: 0,msoa11cd,msoa11nm,st_areasha,geometry
0,E02000001,City of London 001,2983633.0,"POLYGON ((532419.592 181998.305, 532746.814 18..."
1,E02000002,Barking and Dagenham 001,2091907.0,"POLYGON ((548452.001 189024.718, 548235.426 18..."
2,E02000003,Barking and Dagenham 002,2122216.0,"POLYGON ((548954.500 189063.203, 548874.188 18..."
3,E02000004,Barking and Dagenham 003,2569470.0,"POLYGON ((551943.813 186027.672, 551670.812 18..."
4,E02000005,Barking and Dagenham 004,1111109.0,"POLYGON ((549145.624 187383.875, 549128.657 18..."


Loads the list of MSOAs in the cosine similarity matrix.

In [None]:
# The first CSV column is the row index; the single remaining column is named 'ID'.
msoa_list_path = root + 'MSOA LIST.csv'
MSOA_List = pd.read_csv(msoa_list_path, index_col=0)
MSOA_List.columns = ['ID']
print('Shape: ', MSOA_List.shape)
MSOA_List.head()

Shape:  (6791, 1)


Unnamed: 0,ID
0,E02000001
1,E02000002
2,E02000003
3,E02000004
4,E02000005


Removes MSOAs in Wales from MSOAs dataframe.

In [None]:
# Keep only the MSOAs whose code appears in MSOA_List. The explicit .copy()
# makes the result an independent frame, so the correction below edits it
# directly rather than a slice of the originally loaded data.
MSOAs = MSOAs[MSOAs.msoa11cd.isin(MSOA_List.ID)].copy()

# corrects error in dataset: the row labelled 6737 should be named 'Shepway 015'.
# Verify the label exists first; assigning through .at with a missing label
# would silently append a new, mostly-empty row instead of fixing the name.
if 6737 not in MSOAs.index:
    raise KeyError('Row label 6737 not found in MSOAs; cannot apply name correction')
MSOAs.at[6737, 'msoa11nm'] = 'Shepway 015'
print('Shape: ', MSOAs.shape)

Shape:  (6791, 4)


Loads population data.

In [None]:
Population = pd.read_csv(root + 'MSOA_population.csv')

# 'All Ages' is read as text containing commas; strip them, then convert to float.
all_ages_text = Population['All Ages'].str.replace(',', '')
Population['All Ages'] = all_ages_text.astype(float)
Population.head()

Unnamed: 0,Area Codes,Area Names,Unnamed: 2,All Ages
0,E06000047,County Durham,,515923.0
1,E02004297,,County Durham 001,7836.0
2,E02004290,,County Durham 002,5965.0
3,E02004298,,County Durham 003,9688.0
4,E02004299,,County Durham 004,8543.0


Removes population data not needed.

In [None]:
# Rows: only area codes present in MSOA_List. Columns: the code and the population.
in_msoa_list = Population['Area Codes'].isin(MSOA_List.ID)
Population = Population.loc[in_msoa_list, ['Area Codes', 'All Ages']]

# Rename so the code column matches the MSOAs frame's 'msoa11cd' column.
Population.columns = ['msoa11cd', 'pop']
print('Shape: ', Population.shape)
Population.head()

Shape:  (6791, 2)


Unnamed: 0,msoa11cd,pop
1,E02004297,7836.0
2,E02004290,5965.0
3,E02004298,9688.0
4,E02004299,8543.0
5,E02004291,6825.0


Adds population data to MSOAs df.

In [None]:
# Attach each MSOA's population. The join key is named explicitly (it is the
# only column the two frames share), and validate='one_to_one' makes pandas
# raise a MergeError if a code is duplicated on either side instead of silently
# multiplying rows. The merged frame gets a fresh 0..n-1 index.
MSOAs = pd.merge(
    MSOAs,
    Population,
    on='msoa11cd',
    how='inner',
    validate='one_to_one',
)
MSOAs.head()

Unnamed: 0,msoa11cd,msoa11nm,st_areasha,geometry,pop
0,E02000001,City of London 001,2983633.0,"POLYGON ((532419.592 181998.305, 532746.814 18...",6031.0
1,E02000002,Barking and Dagenham 001,2091907.0,"POLYGON ((548452.001 189024.718, 548235.426 18...",7131.0
2,E02000003,Barking and Dagenham 002,2122216.0,"POLYGON ((548954.500 189063.203, 548874.188 18...",10437.0
3,E02000004,Barking and Dagenham 003,2569470.0,"POLYGON ((551943.813 186027.672, 551670.812 18...",6393.0
4,E02000005,Barking and Dagenham 004,1111109.0,"POLYGON ((549145.624 187383.875, 549128.657 18...",9116.0


Changes the CRS of the MSOAs df to EPSG:4326, the same CRS used for the hospital trust points below.

In [None]:
# Wrap the merged frame as a GeoDataFrame, then reproject it to EPSG:4326.
msoa_gdf = gpd.GeoDataFrame(MSOAs)
MSOAs = msoa_gdf.to_crs(epsg=4326)

Loads Hospital Trust location data. 

In [None]:
# hospital trusts and their locations
trust_locations_path = root + 'PROCODE3 POSTCODES.csv'
Pro = pd.read_csv(trust_locations_path)
print('Shape: ', Pro.shape)
Pro.head()

Shape:  (249, 5)


Unnamed: 0,PROCODE5,NAME,POSTCODE,Latitude,Longitude
0,R1A,WORCESTERSHIRE HEALTH AND CARE NHS TRUST,WR4 9RW,52.19506,-2.213008
1,R1C,SOLENT NHS TRUST,SO16 4XE,50.92515,-1.446226
2,R1E,STAFFORDSHIRE AND STOKE ON TRENT PARTNERSHIP N...,ST5 1QG,53.010128,-2.227616
3,R1F,ISLE OF WIGHT NHS TRUST,PO30 5TG,50.710843,-1.30133
4,R1G,TORBAY AND SOUTHERN DEVON HEALTH AND CARE NHS ...,TQ2 7TD,50.48778,-3.552785


Loads the list of Hospital Trusts included in the study.

In [None]:
# The first CSV column is the row index; the single remaining column is named 'ID'.
trust_list_path = root + 'PROLIST.csv'
Pro_List = pd.read_csv(trust_list_path, index_col=0)
Pro_List.columns = ['ID']
print('Shape: ', Pro_List.shape)
Pro_List.head()

Shape:  (140, 1)


Unnamed: 0,ID
0,R1F
1,R1H
2,R1K
3,RA2
4,RA3


Removes data from Pro not used in study.

In [None]:
# Keep only the trusts whose PROCODE5 appears in Pro_List.
trust_in_study = Pro.PROCODE5.isin(Pro_List.ID)
Pro = Pro[trust_in_study]
print('Shape: ', Pro.shape)

Shape:  (140, 5)


Converts the pandas dataframe Pro to a geodataframe.

In [None]:
# One point per trust: x is the Longitude column, y is the Latitude column.
trust_points = gpd.points_from_xy(x=Pro.Longitude, y=Pro.Latitude)

# Keep only the trust name alongside its point geometry, tagged as EPSG:4326.
Pro = gpd.GeoDataFrame(Pro[['NAME']], geometry=trust_points, crs='EPSG:4326')

# reset_index() moves the old row labels into an 'index' column.
Pro = Pro.reset_index()
Pro.head()

Unnamed: 0,index,NAME,geometry
0,3,ISLE OF WIGHT NHS TRUST,POINT (-1.30133 50.71084)
1,5,BARTS HEALTH NHS TRUST,POINT (-0.05813 51.51902)
2,7,ROYAL SURREY COUNTY HOSPITAL NHS FOUNDATION TRUST,POINT (-0.60746 51.24102)
3,8,WESTON AREA HEALTH NHS TRUST,POINT (-2.97140 51.32233)
4,9,YEOVIL DISTRICT HOSPITAL NHS FOUNDATION TRUST,POINT (-2.63471 50.94484)


Adds a binary column to the MSOAs df detailing whether the MSOA contains a hospital trust.

In [None]:
# Spatial join of trust points (left) against MSOA polygons (right).
# how='right' keeps every MSOA row; NAME is missing where no trust point lies
# within the polygon. The result is expected to carry the MSOAs index labels,
# which the index-aligned assignment below relies on.
try:
    trust = gpd.sjoin(Pro, MSOAs, predicate='within', how='right')
except TypeError:
    # geopandas releases before 0.10 only accept the older `op` keyword
    trust = gpd.sjoin(Pro, MSOAs, op='within', how='right')

# 1 where the MSOA matched at least one trust, 0 otherwise. An MSOA containing
# several trusts appears once per trust in the join result, so collapse to a
# single flag per index label; duplicate labels could not be aligned back onto
# MSOAs. groupby(level=0) also returns the labels in sorted order.
contains_trust = trust.NAME.notna().groupby(level=0).any().astype(int)

# index-aligned assignment of the flag onto MSOAs
MSOAs['con_trust'] = contains_trust

# reorders data so 'pop' comes before 'geometry'. The swap is done by column
# name and only when needed, so re-running this cell does not undo it.
cols = MSOAs.columns.tolist()
pop_pos, geom_pos = cols.index('pop'), cols.index('geometry')
if geom_pos < pop_pos:
    cols[geom_pos], cols[pop_pos] = cols[pop_pos], cols[geom_pos]
MSOAs = MSOAs[cols]

MSOAs.head()

Unnamed: 0,msoa11cd,msoa11nm,st_areasha,pop,geometry,con_trust
0,E02000001,City of London 001,2983633.0,6031.0,"POLYGON ((-0.09276 51.52139, -0.08813 51.51941...",0
1,E02000002,Barking and Dagenham 001,2091907.0,7131.0,"POLYGON ((0.14112 51.58054, 0.13788 51.57812, ...",0
2,E02000003,Barking and Dagenham 002,2122216.0,10437.0,"POLYGON ((0.14838 51.58075, 0.14698 51.57568, ...",0
3,E02000004,Barking and Dagenham 003,2569470.0,6393.0,"POLYGON ((0.19018 51.55268, 0.18600 51.54753, ...",0
4,E02000005,Barking and Dagenham 004,1111109.0,9116.0,"POLYGON ((0.15043 51.56561, 0.14998 51.56138, ...",0


Removes Isles of Scilly

In [None]:
# Row label of 'Isles of Scilly 001', printed so the same entry can be removed
# from the stability data.
scilly_rows = MSOAs.loc[MSOAs['msoa11nm'] == 'Isles of Scilly 001']
idx = scilly_rows.index.values[0]
print('Index of Isles of Scilly 001: ', idx)

Index of Isles of Scilly 001:  6639


In [None]:
# Keep every MSOA except 'Isles of Scilly 001'.
not_scilly = MSOAs['msoa11nm'] != 'Isles of Scilly 001'
MSOAs = MSOAs[not_scilly]

Saves MSOAs as shp file.

In [None]:
# Write the prepared MSOA layer to Drive as a .shp file.
msoa_out_path = '/content/drive/My Drive/Project/MSOAs/MSOAs.shp'
MSOAs.to_file(msoa_out_path)

Drops the index column from Pro and saves Pro as a shp file.

In [None]:
# Remove the 'index' column, then write the trust points to Drive as a .shp file.
Pro = Pro.drop(columns='index')
trust_out_path = '/content/drive/My Drive/Project/MSOAs/PRO.shp'
Pro.to_file(trust_out_path)
Pro.head()

Unnamed: 0,NAME,geometry
0,ISLE OF WIGHT NHS TRUST,POINT (-1.30133 50.71084)
1,BARTS HEALTH NHS TRUST,POINT (-0.05813 51.51902)
2,ROYAL SURREY COUNTY HOSPITAL NHS FOUNDATION TRUST,POINT (-0.60746 51.24102)
3,WESTON AREA HEALTH NHS TRUST,POINT (-2.97140 51.32233)
4,YEOVIL DISTRICT HOSPITAL NHS FOUNDATION TRUST,POINT (-2.63471 50.94484)
