### Development in the Global Healthcare Industry

##### Functions for project

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#the countries from 'Country Name' that are relevant
#clean() keeps only the rows whose 'Country Name' value is in this list, so each
#entry must match the spelling used in that column exactly (e.g. 'Bahamas, The')
wanted_countries=['Aruba', 'Afghanistan', 'Angola', 'Albania', 'Andorra', 'United Arab Emirates', 'Argentina', 'Armenia', 'American Samoa',
                  'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
                  'Bulgaria', 'Bahrain', 'Bahamas, The', 'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda', 'Bolivia', 'Brazil',
                  'Barbados', 'Brunei Darussalam', 'Bhutan', 'Botswana', 'Central African Republic', 'Canada', 'Switzerland', 'Chile',
                  'China', 'Cameroon', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Colombia', 'Comoros', 'Cabo Verde',
                  'Costa Rica', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica',
                  'Denmark', 'Dominican Republic', 'Algeria', 'Ecuador', 'Egypt, Arab Rep.', 'Eritrea', 'Spain', 'Estonia', 'Ethiopia',
                  'Finland', 'Fiji', 'France', 'Faroe Islands', 'Micronesia, Fed. Sts.', 'Gabon', 'United Kingdom', 'Georgia', 'Ghana',
                  'Guinea', 'Gambia, The', 'Guinea-Bissau', 'Equatorial Guinea', 'Greece', 'Grenada', 'Greenland', 'Guatemala', 'Guam',
                  'Guyana', 'Honduras', 'Croatia', 'Haiti', 'Hungary', 'Indonesia', 'India', 'Ireland', 'Iran, Islamic Rep.', 'Iraq',
                  'Iceland', 'Israel', 'Italy', 'Jamaica', 'Jordan', 'Japan', 'Kazakhstan', 'Kenya', 'Kyrgyz Republic', 'Cambodia',
                  'Kiribati', 'St. Kitts and Nevis', 'Korea, Rep.', 'Kuwait', 'Lao PDR', 'Lebanon', 'Liberia', 'Libya', 'St. Lucia',
                  'Liechtenstein', 'Sri Lanka', 'Lesotho', 'Lithuania', 'Luxembourg', 'Latvia', 'Morocco', 'Monaco', 'Moldova', 'Madagascar',
                  'Maldives', 'Mexico', 'Marshall Islands', 'North Macedonia', 'Mali', 'Malta', 'Myanmar', 'Montenegro', 'Mongolia', 'Mozambique',
                  'Mauritania', 'Mauritius', 'Malawi', 'Malaysia', 'Namibia', 'Niger', 'Nigeria', 'Nicaragua', 'Netherlands', 'Norway', 'Nepal',
                  'Nauru', 'New Zealand', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Palau', 'Papua New Guinea', 'Poland', 'Puerto Rico',
                  "Korea, Dem. People's Rep.", 'Portugal', 'Paraguay', 'French Polynesia', 'Qatar', 'Romania', 'Russian Federation', 'Rwanda',
                  'Saudi Arabia', 'Sudan', 'Senegal', 'Singapore', 'Solomon Islands', 'Sierra Leone', 'El Salvador', 'San Marino', 'Somalia',
                  'Serbia', 'South Sudan', 'Sao Tome and Principe', 'Suriname', 'Slovak Republic', 'Slovenia', 'Sweden', 'Eswatini',
                  'Sint Maarten (Dutch part)', 'Seychelles', 'Syrian Arab Republic', 'Chad', 'Togo', 'Thailand', 'Tajikistan', 'Turkmenistan',
                  'Timor-Leste', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Tuvalu', 'Tanzania', 'Uganda', 'Ukraine', 'Uruguay',
                  'United States', 'Uzbekistan', 'Venezuela, RB', 'Vietnam', 'Vanuatu', 'Samoa',
                  'Yemen, Rep.', 'South Africa', 'Zambia', 'Zimbabwe']

# Year labels '1960' ... '2020' as strings; third() uses them to recognise a
# year keyword and to select that year's column from each indicator frame.
dates = [str(year) for year in range(1960, 2021)]
# The same years as integers.
date_nums = [int(label) for label in dates]

#data clean
def clean(df, countries=None):
    """Keep the rows of the selected countries and index them by country name.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw indicator table. It must contain the columns 'Country Name',
        'Country Code', 'Indicator Name' and 'Indicator Code'.
    countries : iterable of str, optional
        Country names to keep, compared exactly against 'Country Name'.
        Defaults to the notebook-level ``wanted_countries`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'Country Name' with the three metadata
        columns removed; all remaining columns are kept as they are.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    if countries is None:
        countries = wanted_countries
    metadata_columns = ['Indicator Name', 'Indicator Code', 'Country Code']
    selected = df[df['Country Name'].isin(countries)]
    # set_index replaces the row index anyway, so no reset_index is needed first.
    return selected.drop(columns=metadata_columns).set_index('Country Name')

#make a new df out of the existing dfs with all data for one country or one date (keyword dependent)

def third(d, keyword=''):
    """Slice every indicator frame in ``d`` by one country or by one year.

    Parameters
    ----------
    d : dict
        Maps an indicator label (used as the output column name) to a
        DataFrame whose index holds country names and whose columns are
        year labels such as those listed in ``dates``.
    keyword : str or int, default ''
        Either a country name from ``wanted_countries`` or a year from
        ``dates``. Integer years (e.g. ``2000``) are converted to their
        string label before the lookup.

    Returns
    -------
    pandas.DataFrame or str
        * Country keyword: one row per year, one column per indicator, plus
          an integer ``'dates'`` column derived from the year labels.
        * Year keyword: one row per country, one column per indicator.
        * ``'No keyword chosen'`` or ``'Keyword not found'`` when the keyword
          is empty or unknown. These strings are kept for backward
          compatibility, so callers should check the type of the result.
    """
    # Allow third(dfs, 2000) as well as third(dfs, '2000').
    if isinstance(keyword, int):
        keyword = str(keyword)

    if keyword == '':
        return 'No keyword chosen'

    if keyword in wanted_countries:
        temp = pd.DataFrame()
        for kind, dfr in d.items():
            # The first indicator fixes the row index; later ones align to it.
            temp[kind] = dfr.loc[keyword]
        # Take the numeric years from the row labels instead of assigning a
        # fixed global list by position: this stays correct when the frames
        # cover a different (or differently ordered) set of years.
        temp['dates'] = [int(label) for label in temp.index]
        return temp

    if keyword in dates:
        temp = pd.DataFrame()
        for kind, dfr in d.items():
            temp[kind] = dfr[keyword]
        return temp

    return 'Keyword not found'

### dataframes

- **We initialize the dataframes for:**

    1. GDP per capita (gdp)
    2. Crude birth rate (cbr)
    3. Crude death rate (cdr)
    4. Rate of tuberculosis incidence (tbc)
    5. Population (pop)
    6. Life expectancy (lfe)
    7. HIV Incidence (hiv)
    8. Undernourishment percentage (unr)<br><br>
    
- We also initialize the dictionary 'dfs' which we will be using to store the dataframes we will use for the analyses. This may be referred to as the "Three Dimensional DataFrame".

In [2]:
# One empty placeholder frame per indicator; the generator builds a separate
# DataFrame object for every name.
gdp, cbr, cdr, tbc, pop, lfe, hiv, unr = (pd.DataFrame() for _ in range(8))
# Dictionary that will hold the frames used in the analyses.
dfs = dict()

- We then clean the data in order to best fit our use, using the following function:

```
def clean(df):
    return df[df['Country Name'].isin(wanted_countries)].reset_index(drop=True).drop(columns=['Indicator Name','Indicator Code','Country Code']).set_index('Country Name')
    
```

The most notable change we make to the actual data is that we divide all items in the population dataframe by ``1000000`` for simplicity. From this point on, population is measured in millions.

In [3]:
def _read_indicator(path):
    """Read one indicator CSV, pass it through ``clean`` and drop 'Unnamed: 65'."""
    cleaned = clean(pd.read_csv(path))
    # NOTE(review): 'Unnamed: 65' is presumably an empty trailing column in the
    # CSV files; confirm against the raw data.
    return cleaned.drop(columns='Unnamed: 65')


gdp = _read_indicator('data/gdp_percap.csv')
cbr = _read_indicator('data/birth_rate.csv')
cdr = _read_indicator('data/death_rate.csv')
tbc = _read_indicator('data/tuberculosis.csv')
# Population is divided by one million before it is stored.
pop = _read_indicator('data/pop.csv') / 1000000
lfe = _read_indicator('data/life_expectancy.csv')
hiv = _read_indicator('data/hiv.csv')
unr = _read_indicator('data/undernourishment.csv')

- We construct the three dimensional dataframe by adding all of the dataframes into the dictionary we previously declared. We omit 'HIV' due to a lack of data.

In [4]:
# HIV is deliberately left out of the dictionary: too many values are missing
# for critical countries to support any reasonable conclusion, so it is
# omitted from the analysis.
dfs.update({
    'GDP': gdp,
    'POP': pop,
    'CBR': cbr,
    'CDR': cdr,
    'TBC': tbc,
    'LFE': lfe,
    'UNR': unr,
})

- We now make a new dictionary, called 'cases'. This will be used to store the cases we use for the case studies in the analysis.
- The cases are chosen based on each available country's mean GDP per capita over the years 1960 to 2020.
- From this, we take:
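    1) The top 1 percent (countries at or above the 99th percentile)
    2) The middle 1 percent (countries between the 49.5th and 50.5th percentiles)
    3) The bottom 1 percent (countries at or below the 1st percentile)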
    
    <br>

- These cases will be representative of all of the data.

In [5]:
# Group label -> list of country names chosen for the case studies.
cases = dict()

In [6]:
# Countries whose mean GDP per capita (row-wise mean over the year columns)
# is at or above the 0.99 quantile of those means.
cases['Top'] = (
    gdp.mean(axis=1)
    .loc[lambda means: means >= means.quantile(0.99)]
    .index.to_list()
)

In [7]:
# Countries whose mean GDP per capita lies between the 0.495 and 0.505
# quantiles of those means (both bounds included).
cases['Middle'] = (
    gdp.mean(axis=1)
    .loc[lambda means: means.between(means.quantile(0.495), means.quantile(0.505))]
    .index.to_list()
)

In [8]:
# Countries whose mean GDP per capita is at or below the 0.01 quantile of
# those means.
cases['Bottom'] = (
    gdp.mean(axis=1)
    .loc[lambda means: means <= means.quantile(0.01)]
    .index.to_list()
)

In [9]:
# Show the selected countries side by side. Each list is wrapped in a Series
# so that groups of different sizes still fit in one table (shorter groups are
# padded with NaN); passing the raw lists raises ValueError when their lengths
# differ. With equal-length groups the table is the same as before.
pd.DataFrame({group: pd.Series(names, dtype=object) for group, names in cases.items()})

Unnamed: 0,Top,Middle,Bottom
0,Liechtenstein,Botswana,Burundi
1,Monaco,Turkmenistan,Somalia
