In [221]:
import pandas as pd

# Suicide Rates Dataset

Suicide rate: 
Age-standardized suicide rates (per 100 000 population)

In [222]:
# Read the raw suicide-rate export, then drop the first row and keep the first three columns.
# NOTE(review): the dropped row is presumably a secondary header line — confirm against the CSV.
suicides_raw = pd.read_csv('suicide_core.csv')
suicides = suicides_raw.iloc[1:, :3]

In [223]:
# Give the three retained columns short names used by the rest of the notebook.
suicide_column_names = ['country', 'sex', 'suicide_rate']
suicides.columns = suicide_column_names

In [224]:
# Keep only the rows whose sex is 'Female', and only the country and rate columns.
is_female = suicides['sex'] == 'Female'
female_suicides = suicides.loc[is_female, ['country', 'suicide_rate']]

In [225]:
# Trim leading/trailing whitespace from every country name.
trimmed_countries = female_suicides['country'].apply(lambda name: name.strip())
female_suicides = female_suicides.assign(country=trimmed_countries)

In [226]:
# Row count of the female suicide-rate table.
female_suicides.shape[0]

183

In [227]:
# Preview the first five rows.
female_suicides.head(n=5)

Unnamed: 0,country,suicide_rate
3,Afghanistan,3.1
6,Albania,2.4
9,Algeria,1.3
12,Angola,14.3
15,Antigua and Barbuda,0.0


Check if there are null values: 

In [228]:
# Count the missing suicide-rate values.
female_suicides['suicide_rate'].isna().sum()

0

# Drinks

In [234]:
# Load the raw alcohol-consumption table.
drinks_path = 'drinks.csv'
drinks = pd.read_csv(drinks_path)

In [235]:
# Keep only the country name and the total pure-alcohol column.
drinks_columns = ['country', 'total_litres_of_pure_alcohol']
drinks_total = drinks.loc[:, drinks_columns]

In [236]:
# Trim leading/trailing whitespace from every country name.
trimmed_countries = drinks_total['country'].apply(lambda name: name.strip())
drinks_total = drinks_total.assign(country=trimmed_countries)

In [237]:
# Row count of the alcohol-consumption selection.
drinks_total.shape[0]

193

In [238]:
# Preview the first five rows.
drinks_total.head(n=5)

Unnamed: 0,country,total_litres_of_pure_alcohol
0,Afghanistan,0.0
1,Albania,4.9
2,Algeria,0.7
3,Andorra,12.4
4,Angola,5.9


Check if there are null values

In [239]:
# Count missing alcohol values on the derived frame (`drinks_total`) that the rest
# of the analysis uses, matching how the other sections validate their selections
# rather than the raw load.
drinks_total['total_litres_of_pure_alcohol'].isna().sum()

0

# GDI 

In [251]:
# Load the raw GDI workbook.
gdi_path = 'GDI.xlsx'
gdi = pd.read_excel(gdi_path)

In [252]:
# Display the column labels of the loaded GDI table.
gdi.columns

Index(['HDI rank', 'Country', 'GDI Value(2015)', 'GDI Group(2015)',
       'HDI Female(2015)', 'Expected schooling years Female(2015)',
       'Estimated GNI per capita -Female(2015)'],
      dtype='object')

In [253]:
# Columns kept from the GDI table: the country name plus four 2015 indicators.
gdi_source_columns = [
    'Country',
    'GDI Value(2015)',
    'HDI Female(2015)',
    'Expected schooling years Female(2015)',
    'Estimated GNI per capita -Female(2015)',
]
gdi_selection = gdi.loc[:, gdi_source_columns]

In [254]:
# Replace the verbose source headers with short names, in the same column order.
gdi_short_names = ['country', 'gdi', 'hdi', 'schooling_years', 'gni']
gdi_selection.columns = gdi_short_names

In [255]:
# Trim leading/trailing whitespace from every country name.
trimmed_countries = gdi_selection['country'].apply(lambda name: name.strip())
gdi_selection = gdi_selection.assign(country=trimmed_countries)

In [256]:
# Preview the first five rows.
gdi_selection.head(n=5)

Unnamed: 0,country,gdi,hdi,schooling_years,gni
0,South Sudan,0.874,0.37,3.8,1286
1,Eritrea,0.874,0.37,4.4,1286
2,Niger,0.732,0.291,4.7,481
3,Central African Republic,0.776,0.306,5.8,482
4,Chad,0.765,0.34,5.8,1581


Check if there are missing values

In [257]:
# Per-column count of missing values.
gdi_selection.isna().sum()

country            0
gdi                0
hdi                0
schooling_years    0
gni                0
dtype: int64

# Poverty

For the poverty dataset we took the most recent years and imputed the data for 2014 from the years 2011-2013. For countries that have no value in any of those years, we took the column average.
TODO: add rationale (i.e. the remaining countries are intuitively a good fit for the mean)

In [267]:
# Load the poverty table and keep the country name plus the 2014 column only.
poverty = pd.read_excel('poverty.xlsx')
poverty_selection = poverty[['Country', 2014]].rename(
    columns={'Country': 'country', 2014: 'poverty_index'}
)

# Trim leading/trailing whitespace from every country name.
trimmed_countries = poverty_selection['country'].apply(lambda name: name.strip())
poverty_selection = poverty_selection.assign(country=trimmed_countries)

# Country names present in the poverty data; reused later when imputing happiness.
poor_countries = poverty_selection['country']

# Replace missing poverty-index values with the mean of the available ones.
poverty_index_mean = poverty_selection['poverty_index'].mean()
means = poverty_selection['poverty_index'].fillna(poverty_index_mean)
poverty_selection = poverty_selection.assign(poverty_index=means)
poverty_selection.head()

Unnamed: 0,country,poverty_index
0,Afghanistan,0.293
1,Albania,0.156609
2,Argentina,0.156609
3,Armenia,0.156609
4,Azerbaijan,0.156609


# Happiness

Life satisfaction is measured on a scale from 0 to 10 (0 - worst, 10 - best). 

In [184]:
# Load the life-satisfaction table and trim whitespace from the country names.
happiness_raw = pd.read_csv('happiness.csv')
trimmed_countries = happiness_raw['country'].apply(lambda name: name.strip())
happiness = happiness_raw.assign(country=trimmed_countries)
happiness.columns

Index(['country', 'satisfaction_index'], dtype='object')

In [188]:
# Show the first few countries that have no satisfaction index.
missing_satisfaction = happiness['satisfaction_index'].isna()
happiness.loc[missing_satisfaction].head()

Unnamed: 0,country,satisfaction_index
2,Algeria,
3,Angola,
4,Antigua and Barbuda,
5,Arab World,
11,Bahamas,


Rationale for missing values: values are missing only for poor, underdeveloped countries, for example Algeria and Angola. The previous dataset on poverty contains only poor, and hence presumably unhappy, countries (TODO: add link to research that shows that richer -> happier in general). So we select all the countries that are considered poor and use their average satisfaction index for imputation.

In [175]:
# Average satisfaction index over the countries listed in the poverty data.
# The left join keeps every poverty-list country; those without a happiness
# value contribute NaN, which `.mean()` skips.
poor_countries_frame = poor_countries.to_frame()
poor_happiness = poor_countries_frame.merge(happiness, how='left', on='country')
poor_countries_mean = poor_happiness['satisfaction_index'].mean()

In [190]:
# Impute missing satisfaction scores with the poor-country average.
# Only the `satisfaction_index` column is filled, so a missing value in any other
# column can never be silently replaced by a number, and the frame is rebound
# instead of mutated in place so earlier displayed objects are not altered.
filled_satisfaction = happiness['satisfaction_index'].fillna(poor_countries_mean)
happiness = happiness.assign(satisfaction_index=filled_satisfaction)

In [194]:
# Preview the first five rows after imputation.
happiness.head(n=5)

Unnamed: 0,country,satisfaction_index
0,Afghanistan,3.982855
1,Albania,4.606651
2,Algeria,4.792081
3,Angola,4.792081
4,Antigua and Barbuda,4.792081


# Unemployment rates

In [207]:
# Load the raw unemployment-rate workbook.
unempl_path = 'unemployement_rates.xlsx'
unempl = pd.read_excel(unempl_path)

In [210]:
# Keep the country name and the 2015 rate, under short column names.
unempl_selection = unempl[['Country Name', 2015]]
unempl_selection.columns = ['country', 'unemployment_rate']
# Trim whitespace from country names, as done for the other datasets in this
# notebook, so the `country` values are normalised the same way everywhere.
unempl_selection = unempl_selection.assign(country=unempl_selection['country'].str.strip())

In [212]:
# Count the missing unemployment-rate values.
unempl_selection['unemployment_rate'].isna().sum()

0

In [213]:
# Preview the first five rows.
unempl_selection.head(n=5)

Unnamed: 0,country,unemployment_rate
0,Afghanistan,8.600000381
1,Angola,6.199999809
2,Albania,17.10000038
3,United Arab Emirates,4.0
4,Argentina,7.199999809


In [272]:
#unempl_selection.unemployment_rate

# Health services 

In [273]:
# Load the raw health-services table.
health_path = "health_services.csv"
health = pd.read_csv(health_path)

In [276]:
# Keep the first and third columns (by position) and give them short names.
health_selection = health.iloc[:, [0, 2]]
health_selection.columns = ['country', 'access_to_family_planning']
# Trim whitespace from country names, as done for the other datasets in this
# notebook, so the `country` values are normalised the same way everywhere.
health_selection = health_selection.assign(country=health_selection['country'].str.strip())

In [280]:
# Preview the first five rows.
health_selection.head(n=5)

Unnamed: 0,country,access_to_family_planning
0,Afghanistan,47.0
1,Albania,12.9
2,Algeria,77.2
3,Angola,24.2
4,Antigua and Barbuda,58.2
