### Development in the Global Healthcare Industry

##### Functions for project

In [78]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Allow-list of 'Country Name' values kept by clean(): any row whose name is not
# listed here is filtered out. Presumably this removes the regional/aggregate rows
# of the raw CSVs -- verify against the data files.
wanted_countries=['Aruba', 'Afghanistan', 'Angola', 'Albania', 'Andorra', 'United Arab Emirates', 'Argentina', 'Armenia', 'American Samoa',
                  'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
                  'Bulgaria', 'Bahrain', 'Bahamas, The', 'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda', 'Bolivia', 'Brazil',
                  'Barbados', 'Brunei Darussalam', 'Bhutan', 'Botswana', 'Central African Republic', 'Canada', 'Switzerland', 'Chile',
                  'China', 'Cameroon', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Colombia', 'Comoros', 'Cabo Verde',
                  'Costa Rica', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica',
                  'Denmark', 'Dominican Republic', 'Algeria', 'Ecuador', 'Egypt, Arab Rep.', 'Eritrea', 'Spain', 'Estonia', 'Ethiopia',
                  'Finland', 'Fiji', 'France', 'Faroe Islands', 'Micronesia, Fed. Sts.', 'Gabon', 'United Kingdom', 'Georgia', 'Ghana',
                  'Guinea', 'Gambia, The', 'Guinea-Bissau', 'Equatorial Guinea', 'Greece', 'Grenada', 'Greenland', 'Guatemala', 'Guam',
                  'Guyana', 'Honduras', 'Croatia', 'Haiti', 'Hungary', 'Indonesia', 'India', 'Ireland', 'Iran, Islamic Rep.', 'Iraq',
                  'Iceland', 'Israel', 'Italy', 'Jamaica', 'Jordan', 'Japan', 'Kazakhstan', 'Kenya', 'Kyrgyz Republic', 'Cambodia',
                  'Kiribati', 'St. Kitts and Nevis', 'Korea, Rep.', 'Kuwait', 'Lao PDR', 'Lebanon', 'Liberia', 'Libya', 'St. Lucia',
                  'Liechtenstein', 'Sri Lanka', 'Lesotho', 'Lithuania', 'Luxembourg', 'Latvia', 'Morocco', 'Monaco', 'Moldova', 'Madagascar',
                  'Maldives', 'Mexico', 'Marshall Islands', 'North Macedonia', 'Mali', 'Malta', 'Myanmar', 'Montenegro', 'Mongolia', 'Mozambique',
                  'Mauritania', 'Mauritius', 'Malawi', 'Malaysia', 'Namibia', 'Niger', 'Nigeria', 'Nicaragua', 'Netherlands', 'Norway', 'Nepal',
                  'Nauru', 'New Zealand', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Palau', 'Papua New Guinea', 'Poland', 'Puerto Rico',
                  "Korea, Dem. People's Rep.", 'Portugal', 'Paraguay', 'French Polynesia', 'Qatar', 'Romania', 'Russian Federation', 'Rwanda',
                  'Saudi Arabia', 'Sudan', 'Senegal', 'Singapore', 'Solomon Islands', 'Sierra Leone', 'El Salvador', 'San Marino', 'Somalia',
                  'Serbia', 'South Sudan', 'Sao Tome and Principe', 'Suriname', 'Slovak Republic', 'Slovenia', 'Sweden', 'Eswatini',
                  'Sint Maarten (Dutch part)', 'Seychelles', 'Syrian Arab Republic', 'Chad', 'Togo', 'Thailand', 'Tajikistan', 'Turkmenistan',
                  'Timor-Leste', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Tuvalu', 'Tanzania', 'Uganda', 'Ukraine', 'Uruguay',
                  'United States', 'Uzbekistan', 'Venezuela, RB', 'Vietnam', 'Vanuatu', 'Samoa',
                  'Yemen, Rep.', 'South Africa', 'Zambia', 'Zimbabwe']

# Years covered by the analysis. `dates` holds them as strings (the form used to
# look up year columns in the indicator tables) and `date_nums` holds the same
# years as integers. Both are generated from the endpoints instead of being
# typed out by hand, so the two lists can never drift apart.
FIRST_YEAR, LAST_YEAR = 1960, 2020
date_nums = list(range(FIRST_YEAR, LAST_YEAR + 1))
dates = [str(year) for year in date_nums]

#data clean
def clean(df):
    """Reduce a raw indicator table to the countries of interest, indexed by name.

    Rows whose 'Country Name' is not in ``wanted_countries`` are discarded, the
    'Indicator Name', 'Indicator Code' and 'Country Code' columns are removed,
    and 'Country Name' becomes the index of the returned DataFrame. The input
    frame is not modified.
    """
    is_wanted = df['Country Name'].isin(wanted_countries)
    kept = df[is_wanted].reset_index(drop=True)
    kept = kept.drop(columns=['Indicator Name','Indicator Code','Country Code'])
    return kept.set_index('Country Name')

#make new df out of existing dfs for all data for one country or date (keyword dependent)

def third(d,keyword=''):
    """Slice every table in ``d`` by one country or one year and combine the slices.

    Parameters
    ----------
    d : dict
        Maps a short label to a DataFrame that is indexed by country name and
        has one column per year (as a string).
    keyword : str
        A country name from ``wanted_countries`` or a year string from ``dates``.

    Returns
    -------
    DataFrame or str
        * country keyword: one row per year, one column per label in ``d``,
          plus a 'dates' column filled from ``date_nums``;
        * year keyword: one row per country, one column per label in ``d``;
        * ``'No keyword chosen'`` when ``keyword`` is empty and
          ``'Keyword not found'`` when it matches neither list.
    """
    if keyword=='':
        return 'No keyword chosen'

    by_country = keyword in wanted_countries
    if not by_country and keyword not in dates:
        return 'Keyword not found'

    combined = pd.DataFrame()
    for label, frame in d.items():
        # A country selects a row (years become the index); a year selects a column.
        combined[label] = frame.loc[keyword] if by_country else frame[keyword]
    if by_country:
        combined['dates'] = date_nums
    return combined
    
def newdf(d,columns):
    """Return a new DataFrame holding only the requested columns of ``d``.

    Columns are copied one at a time, in the order given by ``columns``, so the
    result has exactly that column order.
    """
    subset = pd.DataFrame()
    for name in columns:
        subset[name] = d[name]
    return subset

### dataframes

- **We initialize the dataframes for:**

    1. GDP per capita (gdp)
    2. Crude birth rate (cbr)
    3. Crude death rate (cdr)
    4. Rate of tuberculosis incidence (tbc)
    5. Population (pop)
    6. Life expectancy (lfe)
    7. HIV Incidence (hiv)
    8. Undernourishment percentage (unr)<br><br>
    
- We also initialize the dictionary 'dfs' which we will be using to store the dataframes we will use for the analyses. This may be referred to as the "Three Dimensional DataFrame".

In [79]:
# Placeholders for the eight indicator tables; each name gets its own empty
# frame until the CSV data is read in.
gdp, cbr, cdr, tbc, pop, lfe, hiv, unr = (pd.DataFrame() for _ in range(8))
# Registry of the tables that take part in the analysis, keyed by short label.
dfs = dict()

- We then clean the data in order to best fit our use, using the following function:

```
def clean(df):
    return df[df['Country Name'].isin(wanted_countries)].reset_index(drop=True).drop(columns=['Indicator Name','Indicator Code','Country Code']).set_index('Country Name')
    
```

The most notable change we make to the actual data is that we divide every value in the population dataframe by ``1000000`` for simplicity. From this point on, population is measured in millions.

In [80]:
# Divisor applied to the population table so that it is expressed in millions.
POP_SCALE = 1000000


def _load_indicator(path):
    """Read one indicator CSV and return it cleaned, without the stray last column.

    The file is passed through ``clean`` and the 'Unnamed: 65' column is then
    dropped. That name is what pandas assigns to a header-less column at
    position 65 -- presumably left by a trailing delimiter in the CSV header;
    verify against the raw files.
    """
    return clean(pd.read_csv(path)).drop(columns='Unnamed: 65')


gdp = _load_indicator('data/gdp_percap.csv')
cbr = _load_indicator('data/birth_rate.csv')
cdr = _load_indicator('data/death_rate.csv')
tbc = _load_indicator('data/tuberculosis.csv')
pop = _load_indicator('data/pop.csv')/POP_SCALE
lfe = _load_indicator('data/life_expectancy.csv')
hiv = _load_indicator('data/hiv.csv')
unr = _load_indicator('data/undernourishment.csv')

- We construct the three dimensional dataframe by adding the dataframes to the dictionary we previously declared. We omit 'HIV' and undernourishment ('UNR') due to a lack of data.

In [81]:
# Register the indicator tables used in the analysis under short labels.
dfs.update({
    'GDP': gdp,
    'POP': pop,
    'CBR': cbr,
    'CDR': cdr,
    'TBC': tbc,
    'LFE': lfe,
})
# HIV (hiv) and undernourishment (unr) are deliberately not registered: too many
# values are missing in critical countries for us to make any reasonable moves
# towards conclusions, so both are omitted from the analysis.

- We now make a new dictionary, called 'cases'. This will be used to store the cases we use for the case studies in the analysis.
- The cases will be chosen based on each country's mean GDP per capita over the years 1960 to 2020.
- From this, we take:
    1) The top 1 percentile
    2) The middle 1 percentile
    3) The bottom 1 percentile
    
    <br>

- These cases will be representative of all of the data.

In [82]:
# Case-study groups: maps a group label to the list of country names in it.
cases = dict()

In [83]:
# Mean GDP per capita of each country across the year columns, computed once
# instead of being re-evaluated for both the comparison and the threshold.
mean_gdp = gdp.mean(axis=1)
# Countries whose mean is at or above the 0.99 quantile.
cases['Top'] = mean_gdp[mean_gdp >= mean_gdp.quantile(0.99)].index.to_list()

In [84]:
# Mean GDP per capita of each country across the year columns, computed once
# instead of four times within the same expression.
mean_gdp = gdp.mean(axis=1)
# Countries whose mean lies between the 0.495 and 0.505 quantiles (inclusive).
in_middle_band = (mean_gdp >= mean_gdp.quantile(0.495)) & (mean_gdp <= mean_gdp.quantile(0.505))
cases['Middle'] = mean_gdp[in_middle_band].index.to_list()

In [85]:
# Mean GDP per capita of each country across the year columns, computed once
# instead of being re-evaluated for both the comparison and the threshold.
mean_gdp = gdp.mean(axis=1)
# Countries whose mean is at or below the 0.01 quantile.
cases['Bottom'] = mean_gdp[mean_gdp <= mean_gdp.quantile(0.01)].index.to_list()

In [86]:
# Show the chosen cases side by side. Each list is wrapped in a Series so that
# groups of different sizes are padded with NaN; passing the dict of plain lists
# directly raises ValueError whenever the quantile cuts select unequal counts.
pd.DataFrame({label: pd.Series(names) for label, names in cases.items()})

Unnamed: 0,Top,Middle,Bottom
0,Liechtenstein,Botswana,Burundi
1,Monaco,Turkmenistan,Somalia


In [93]:
# Short handles for the six case-study countries, grouped by GDP tier
# (t = top, m = middle, b = bottom).
t1, t2 = "Liechtenstein", "Monaco"
m1, m2 = "Botswana", "Turkmenistan"
b1, b2 = "Burundi", "Somalia"

In [107]:
# All registered indicators for the first top-GDP case, one row per year.
t1df = third(dfs,keyword = t1)
t1df

Unnamed: 0,GDP,POP,CBR,CDR,TBC,LFE,dates
1960,,0.016501,23.1,7.5,,,1960
1961,,0.016894,21.3,7.8,,,1961
1962,,0.017300,20.6,9.5,,,1962
1963,,0.017724,22.0,8.1,,,1963
1964,,0.018170,20.6,7.9,,,1964
...,...,...,...,...,...,...,...
2016,165642.386276,0.037655,10.0,7.2,,82.258537,2016
2017,171253.964254,0.037805,8.9,6.6,,83.746341,2017
2018,180366.715198,0.037918,9.9,7.2,,83.041463,2018
2019,175813.875592,0.038020,9.2,6.8,,83.041463,2019


In [110]:
# Correlation of every column with GDP for the first top-GDP case.
t1df.corr()['GDP']

GDP      1.000000
POP      0.955717
CBR     -0.855147
CDR     -0.128496
TBC           NaN
LFE      0.885644
dates    0.971316
Name: GDP, dtype: float64

In [111]:
# All registered indicators for the second top-GDP case, one row per year.
t2df = third(dfs,keyword=t2)
t2df

Unnamed: 0,GDP,POP,CBR,CDR,TBC,LFE,dates
1960,,0.022461,,,,,1960
1961,,0.022813,,,,,1961
1962,,0.023043,,,,,1962
1963,,0.023165,,,,,1963
1964,,0.023236,,,,,1964
...,...,...,...,...,...,...,...
2016,170028.655718,0.038070,7.8,7.9,6.0,,2016
2017,167517.059728,0.038392,,,0.0,,2017
2018,185978.609251,0.038682,5.9,6.6,3.0,,2018
2019,189487.147128,0.038967,,,0.0,,2019


In [114]:
# Correlation of every column with GDP for the second top-GDP case.
t2df.corr()['GDP']

GDP      1.000000
POP      0.965727
CBR     -0.975807
CDR     -0.976538
TBC      0.269116
LFE           NaN
dates    0.967784
Name: GDP, dtype: float64

In [118]:
# All registered indicators for the first middle-GDP case, one row per year.
m1df = third(dfs,m1)
m1df

Unnamed: 0,GDP,POP,CBR,CDR,TBC,LFE,dates
1960,60.493958,0.502733,47.281,17.687,,49.179,1960
1961,64.176140,0.512688,47.059,17.275,,49.684,1961
1962,68.050349,0.523777,46.824,16.867,,50.171,1962
1963,71.106439,0.535692,46.589,16.464,,50.641,1963
1964,75.955918,0.547870,46.362,16.066,,51.099,1964
...,...,...,...,...,...,...,...
2016,6982.917492,2.159925,25.943,6.000,326.0,68.178,2016
2017,7296.092142,2.205076,25.396,5.834,300.0,68.812,2017
2018,7503.878588,2.254067,24.821,5.742,275.0,69.275,2018
2019,7203.064221,2.303703,24.234,5.708,253.0,69.592,2019


In [120]:
# Correlation of every column with GDP for the first middle-GDP case.
m1df.corr()['GDP']

GDP      1.000000
POP      0.973384
CBR     -0.939469
CDR     -0.614902
TBC     -0.896620
LFE      0.568411
dates    0.960816
Name: GDP, dtype: float64

In [121]:
# All registered indicators for the second middle-GDP case, one row per year.
m2df = third(dfs,m2)
m2df

Unnamed: 0,GDP,POP,CBR,CDR,TBC,LFE,dates
1960,,1.603254,45.710,15.946,,54.471,1960
1961,,1.658364,45.552,15.578,,54.897,1961
1962,,1.715408,45.121,15.173,,55.326,1962
1963,,1.773854,44.433,14.736,,55.757,1963
1964,,1.833065,43.534,14.279,,56.186,1964
...,...,...,...,...,...,...,...
2016,6387.682575,5.662371,25.370,7.076,46.0,67.835,2016
2017,6587.092604,5.757667,24.615,7.057,43.0,67.956,2017
2018,6967.375043,5.850902,23.831,7.045,46.0,68.073,2018
2019,7612.035180,5.942094,23.066,7.040,45.0,68.191,2019


In [122]:
# Correlation of every column with GDP for the second middle-GDP case.
m2df.corr()['GDP']

GDP      1.000000
POP      0.875476
CBR     -0.336663
CDR     -0.771347
TBC     -0.890081
LFE      0.958766
dates    0.891459
Name: GDP, dtype: float64

In [123]:
# All registered indicators for the first bottom-GDP case, one row per year.
b1df = third(dfs,b1)
b1df

Unnamed: 0,GDP,POP,CBR,CDR,TBC,LFE,dates
1960,70.051910,2.797925,48.510,23.226,,41.281,1960
1961,71.167188,2.852438,48.446,22.930,,41.592,1961
1962,73.435331,2.907320,48.391,22.629,,41.907,1962
1963,78.514621,2.964416,48.338,22.328,,42.225,1963
1964,86.161550,3.026292,48.283,22.031,,42.540,1964
...,...,...,...,...,...,...,...
2016,260.565221,10.488002,40.278,8.300,118.0,60.528,2016
2017,253.826354,10.827010,39.646,8.106,114.0,60.898,2017
2018,238.783467,11.175379,39.008,7.929,111.0,61.247,2018
2019,228.213589,11.530577,38.377,7.766,107.0,61.584,2019


In [124]:
# Correlation of every column with GDP for the first bottom-GDP case.
b1df.corr()['GDP']

GDP      1.000000
POP      0.698225
CBR     -0.257508
CDR     -0.715310
TBC     -0.880945
LFE      0.748257
dates    0.724338
Name: GDP, dtype: float64

In [130]:
# All registered indicators for the second bottom-GDP case; only the first
# five rows are displayed, without the helper 'dates' column.
b2df = third(dfs,b2)
b2df.head().drop(columns='dates')

Unnamed: 0,GDP,POP,CBR,CDR,TBC,LFE
1960,65.479716,2.755967,47.63,26.748,,36.976
1961,68.106397,2.814125,47.52,26.367,,37.374
1962,70.813049,2.874215,47.424,25.99,,37.773
1963,73.607204,2.936478,47.339,25.616,,38.175
1964,76.480398,3.00116,47.261,25.246,,38.578


In [140]:
# Correlation of every column with GDP for the second bottom-GDP case.
b2df.corr()['GDP']

GDP      1.000000
POP      0.951175
CBR     -0.826070
CDR     -0.928148
TBC     -0.882634
LFE      0.947697
dates    0.946321
Name: GDP, dtype: float64

In [134]:
# Flatten the case groups into one list of country names, keeping group order.
c = [name for names in cases.values() for name in names]

In [150]:
# Correlation of each indicator with GDP, collected for every case country.
per_country = {}
for country in c:
    gdp_corr = third(dfs,country).corr()['GDP']
    # Remove GDP's self-correlation and the helper 'dates' column by label.
    # A positional slice would silently pick the wrong rows as soon as the
    # number of tables registered in dfs changes.
    per_country[country] = gdp_corr.drop(['GDP', 'dates'])
# Build the frame once, then flip it: one row per country, one column per indicator.
correlations = pd.DataFrame(per_country).transpose()

In [152]:
# Display the per-country correlations with GDP (rows: countries, columns: indicators).
correlations

Unnamed: 0,POP,CBR,CDR,TBC,LFE
Liechtenstein,0.955717,-0.855147,-0.128496,,0.885644
Monaco,0.965727,-0.975807,-0.976538,0.269116,
Botswana,0.973384,-0.939469,-0.614902,-0.89662,0.568411
Turkmenistan,0.875476,-0.336663,-0.771347,-0.890081,0.958766
Burundi,0.698225,-0.257508,-0.71531,-0.880945,0.748257
Somalia,0.951175,-0.82607,-0.928148,-0.882634,0.947697


In [155]:
# Summary statistics of the correlations across the case countries, per indicator.
correlations.describe()

Unnamed: 0,POP,CBR,CDR,TBC,LFE
count,6.0,6.0,6.0,5.0,5.0
mean,0.903284,-0.698444,-0.689123,-0.656233,0.821755
std,0.106469,0.3166,0.305614,0.517324,0.164545
min,0.698225,-0.975807,-0.976538,-0.89662,0.568411
25%,0.894401,-0.918389,-0.888947,-0.890081,0.748257
50%,0.953446,-0.840608,-0.743328,-0.882634,0.885644
75%,0.963225,-0.459015,-0.640004,-0.880945,0.947697
max,0.973384,-0.257508,-0.128496,0.269116,0.958766


In [157]:
# Keep only the 'mean' row of the summary and turn it into a one-column table
# (one row per indicator).
correlations.describe().loc[['mean']].T

Unnamed: 0,mean
POP,0.903284
CBR,-0.698444
CDR,-0.689123
TBC,-0.656233
LFE,0.821755


#### For more details on how this data was used, read the paper written about this, *Development in the Global Healthcare Industry,* by Arjun Naik
- Find it at: [github.com/Aleph-Null-123/Development-in-the-Global-Healthcare-Industry/blob/main/development_in_the_global_healthcare_industry.pdf](https://github.com/Aleph-Null-123/Development-in-the-Global-Healthcare-Industry/blob/main/development_in_the_global_healthcare_industry.pdf)