## Inequalities in Health in England

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import os

### Data Gathering

In [2]:
# Set to True to (re-)download the raw data files. Left False so that a normal
# run uses the copies already saved next to the notebook.
DOWNLOAD_DATA = False

if DOWNLOAD_DATA:
    # Local file name -> source URL. The names are spelled out explicitly
    # (instead of being sliced out of the URL) because the loading cell reads
    # exactly 'list-manage.csv' and 'service.xlsx'.
    # NOTE(review): the first URL looks like a mailing-list click-tracking link
    # with ".csv" appended rather than a direct data URL - confirm it still resolves.
    data_sources = {
        "list-manage.csv": "https://wpieconomics.us13.list-manage.com/track/click?u=5331abc1c0dacc833dd4e807b&id=645de758bb&e=9f29c59835.csv",
        "service.xlsx": 'https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx',
    }
    for filename, url in data_sources.items():
        # A timeout stops the cell from hanging forever on a stalled connection.
        response = requests.get(url, timeout=60)
        # Fail loudly on an HTTP error instead of saving the error page as data.
        response.raise_for_status()
        with open(filename, 'wb') as file:
            file.write(response.content)

In [3]:
# Read both raw files from the working directory:
#   df  - the CSV saved as 'list-manage.csv'
#   df2 - the "IMD2019" sheet of the workbook saved as 'service.xlsx'
health_csv_path = 'list-manage.csv'
deprivation_xlsx_path = "service.xlsx"

df = pd.read_csv(health_csv_path)
df2 = pd.read_excel(deprivation_xlsx_path, sheet_name="IMD2019")

### Pre-loading dataset 1

In [4]:
# Summarise the raw health data: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6793 entries, 0 to 6792
Data columns (total 4 columns):
 #   Column                                                        Non-Null Count  Dtype 
---  ------                                                        --------------  ----- 
 0   Local Health - Office for Health Improvement and Disparities  6793 non-null   object
 1                                                                 6793 non-null   object
 2    .1                                                           6737 non-null   object
 3    .2                                                           6774 non-null   object
dtypes: object(4)
memory usage: 212.4+ KB


### Pre-loading dataset 2

In [5]:
# Summarise the IMD2019 sheet: column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32844 entries, 0 to 32843
Data columns (total 6 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   LSOA code (2011)                            32844 non-null  object
 1   LSOA name (2011)                            32844 non-null  object
 2   Local Authority District code (2019)        32844 non-null  object
 3   Local Authority District name (2019)        32844 non-null  object
 4   Index of Multiple Deprivation (IMD) Rank    32844 non-null  int64 
 5   Index of Multiple Deprivation (IMD) Decile  32844 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.5+ MB


## Data Assessment

#### Assessing dataset 1

In [6]:
# Preview the first rows: row 0 is a title/description line and row 1 holds
# the real column labels, so the actual data starts at row 2.
df.head()

Unnamed: 0,Local Health - Office for Health Improvement and Disparities,Unnamed: 2,.1,.2
0,Geographical references: MSOA 2011 (Middle lev...,,,
1,Code,Label,Life expectancy at birth for males,Life expectancy at birth for females
2,E02000001,City of London 001,91.1,90.2
3,E02000002,Barking and Dagenham 001,79.5,82.5
4,E02000003,Barking and Dagenham 002,78.1,84.2


In [7]:
# List the raw column names (a title string plus blank placeholders) before renaming.
df.columns

Index(['Local Health - Office for Health Improvement and Disparities', ' ',
       ' .1', ' .2'],
      dtype='object')

##### Outcome of Assessing the Dataset 1
- Remove the title row
- Move the city code to another column
- Rename the columns
- Treat NaN values
- Strip the city code off the city label
- Remove NaN values for life expectancy at birth, since that is the independent variable

### Assessing dataset 2

In [8]:
# Column names, non-null counts and dtypes for dataset 2.
# NOTE(review): this repeats the df2.info() call from the pre-loading section.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32844 entries, 0 to 32843
Data columns (total 6 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   LSOA code (2011)                            32844 non-null  object
 1   LSOA name (2011)                            32844 non-null  object
 2   Local Authority District code (2019)        32844 non-null  object
 3   Local Authority District name (2019)        32844 non-null  object
 4   Index of Multiple Deprivation (IMD) Rank    32844 non-null  int64 
 5   Index of Multiple Deprivation (IMD) Decile  32844 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.5+ MB


In [9]:
# Preview the first rows of the deprivation (IMD2019) table.
df2.head()

Unnamed: 0,LSOA code (2011),LSOA name (2011),Local Authority District code (2019),Local Authority District name (2019),Index of Multiple Deprivation (IMD) Rank,Index of Multiple Deprivation (IMD) Decile
0,E01000001,City of London 001A,E09000001,City of London,29199,9
1,E01000002,City of London 001B,E09000001,City of London,30379,10
2,E01000003,City of London 001C,E09000001,City of London,14915,5
3,E01000005,City of London 001E,E09000001,City of London,8678,3
4,E01000006,Barking and Dagenham 016A,E09000002,Barking and Dagenham,14486,5


##### Outcome of Assessing Dataset 2
- Drop the LSOA code (2011) and LSOA name (2011) columns
- Rename the columns
- Convert IMD rank to integer
- Convert IMD decile to integer

## Data Cleaning

### Dataset 1
 Delete the first two rows and rename columns

In [10]:
# New column names for dataset 1, listed in the same order as the raw columns.
header = [
    "area_code",
    "city",
    "male_life_expectancy",
    "female_life_expectancy",
]
header

['area_code', 'city', 'male_life_expectancy', 'female_life_expectancy']

In [11]:
# Drop the title row (label 0) and the embedded header row (label 1).
# The slice is label-based (.loc) rather than positional (.iloc) so the cell is
# idempotent: on the freshly loaded frame (index 0..N) it removes the first two
# rows, and on a re-run (index already starting at 2) it removes nothing,
# whereas .iloc[2:] would silently drop two more real data rows each time.
# .copy() detaches the result from the raw frame so later column edits do not
# act on a slice of it.
# NOTE(review): relies on the original integer index still being in place when
# this cell runs - do not reset the index before it.
df = df.loc[2:].copy()


In [12]:
# Replace the placeholder column labels with the names held in `header`,
# then display the renamed frame.
renamed_labels = pd.Index(header)
df.columns = renamed_labels
df

Unnamed: 0,area_code,city,male_life_expectancy,female_life_expectancy
2,E02000001,City of London 001,91.1,90.2
3,E02000002,Barking and Dagenham 001,79.5,82.5
4,E02000003,Barking and Dagenham 002,78.1,84.2
5,E02000004,Barking and Dagenham 003,77.6,81.8
6,E02000005,Barking and Dagenham 004,79.2,84.6
...,...,...,...,...
6788,E02006930,Greenwich 037,80.8,82.8
6789,E02006931,Greenwich 038,79.6,83.9
6790,E02006932,Liverpool 060,73.4,78.7
6791,E02006933,Liverpool 061,77,84.3
