In [503]:
#Invite everyone to the data science par-tay

import pandas as pd

from ipywidgets import widgets

# Predicting suicide rates based on supposed risk factors

Poor mental health is a prevalent problem in today's world. Being able to predict the percentage of a region's population who will die by suicide could help direct more funding for support to those regions and to the people most at risk.

I have looked at some of the risk factors for suicide listed on mentalhealth.org.uk (https://www.mentalhealth.org.uk/a-to-z/s/suicide) and have chosen to investigate whether unemployment, alcohol consumption, poverty, location and murder rates are able to predict suicide rates.

# Gathering and preprocessing the data

In [504]:
#Load all of the data
# Each CSV is read with pandas' default settings into its own DataFrame.
unemployment_percentage = pd.read_csv("long_term_unemployment_rate_percent (1).csv")
rural_poverty_percentage = pd.read_csv("rural_poverty_percent_rural_people_below_national_rural.csv")
urban_poverty_percentage = pd.read_csv("urban_poverty_percent_urban_people_below_national_urban.csv")
murders_per_100000 = pd.read_csv("murder_per_100000_people.csv")
alcohol_per_capita = pd.read_csv("sh_alc_pcap_li.csv")
# NOTE(review): the file name says "per_100000" while the variable is named
# suicide_per_1000 -- confirm the unit before interpreting these values.
suicide_per_1000 = pd.read_csv("suicide_per_100000_people (1).csv")

In [505]:
# Have a look at some of the data: preview the first rows of the
# unemployment table to see how it is laid out.
unemployment_percentage.head()

Unnamed: 0,country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,Albania,,,,,,,,,,...,4.5,4.91,5.67,5.28,5.3,5.53,5.76,6.09,,
1,Algeria,,,,,,,,,,...,,,,,,,2.15,,,
2,Angola,,,,,,,,,,...,,,,1.56,,,,,,
3,Argentina,,,,,,,,,,...,0.88,0.925,1.22,1.11,1.28,1.04,1.22,,,1.14
4,Armenia,,,,,,,,,,...,,5.69,6.77,6.46,5.8,,7.15,7.11,5.4,


Looking at all the data, it seems as though all of the dataframes are in the same format, with one row per country and one column per year, so I will need to re-arrange them all and combine them. However, before I do this, I am going to look at the missing values in each data file. I predict that certain countries will have lots of missing data, rather than some years having missing data for lots of countries. I will investigate this and, if this is the case, I will drop the countries with a substantial amount of missing data.

## Sort out the missing values in each dataset before combining

In [506]:
def missing_values_columns(data):
    """Summarise the missing values in each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns:
        "Percentage missing" (the fraction of rows that are null, between
        0 and 1) and "Number of missing values" (the count of null rows),
        sorted so the column with the most missing data comes first.
    """
    # Build the boolean null mask once and derive both statistics from it.
    null_mask = data.isnull()
    missing_count = null_mask.sum()
    missing_fraction = missing_count / null_mask.count()

    summary = pd.concat(
        [missing_fraction, missing_count],
        axis=1,
        keys=["Percentage missing", "Number of missing values"],
    )
    return summary.sort_values("Percentage missing", ascending=False)

def missing_values_rows(data):
    """Summarise the missing values in each row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` (same index labels) with two columns:
        "Percentage missing" (the fraction of that row's cells that are
        null, between 0 and 1) and "Number of missing values" (the count of
        null cells), sorted so the row with the most missing data comes first.
    """
    # Build the boolean null mask once and reduce it across the columns.
    null_mask = data.isnull()
    missing_count = null_mask.sum(axis="columns")
    missing_fraction = missing_count / null_mask.count(axis="columns")

    summary = pd.concat(
        [missing_fraction, missing_count],
        axis=1,
        keys=["Percentage missing", "Number of missing values"],
    )
    return summary.sort_values("Percentage missing", ascending=False)

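As predicted, some countries have a substantial amount of missing data and some countries have very little. When the data is broken down by year, every year has a fair amount of missing data because many countries did not submit data for that year. I will therefore first drop the years in which more than half of the countries have no value, and then drop the countries that are missing more than half of the remaining values (the code below uses a 50% threshold for both steps).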

In [507]:
# Missing data for each year: every column of the raw table becomes one row
# of the summary, and "Percentage missing" is a fraction between 0 and 1.
missing_values_columns(unemployment_percentage)

Unnamed: 0,Percentage missing,Number of missing values
1990,0.909091,110
1991,0.892562,108
1992,0.785124,95
1993,0.768595,93
1994,0.760331,92
1995,0.727273,88
1996,0.710744,86
1997,0.710744,86
1998,0.694215,84
1999,0.68595,83


In [508]:
# Missing data for each country: the summary is indexed by the raw table's
# row number (one row per country), most incomplete rows first.
missing_values_rows(unemployment_percentage)

Unnamed: 0,Percentage missing,Number of missing values
120,0.931034,27
17,0.931034,27
70,0.931034,27
73,0.931034,27
37,0.931034,27
76,0.931034,27
61,0.931034,27
90,0.931034,27
1,0.931034,27
92,0.931034,27


In [509]:
# Re-read the raw file so this cell always starts from the unfiltered table.
unemployment_percentage = pd.read_csv("long_term_unemployment_rate_percent (1).csv")

# DataFrame.size is the number of cells (rows x columns) in the raw table.
unemployment_original_size = unemployment_percentage.size

# Step 1: keep only the columns (years) with non-missing values in at least
#         half of the rows.
# Step 2: transpose so every original row (country) becomes a column, then keep
#         only the columns with non-missing values in at least half of the rows.
# The result is left transposed.
unemployment_percentage = (
    unemployment_percentage
    .pipe(lambda frame: frame.dropna(axis="columns", thresh=0.5 * len(frame)))
    .transpose()
    .pipe(lambda frame: frame.dropna(axis="columns", thresh=0.5 * len(frame)))
)

In [510]:
# DataFrame.size counts cells (rows x columns), not countries, so the message
# reports the number of data cells before and after the filtering.
print("Unemployment originally included {0} data cells and after dropping the years and countries with a high percentage of missing values it contains {1} data cells".format(unemployment_original_size, unemployment_percentage.size))

Unemployment orignally included 3509 countries and after dropping those with a high percentage of missing values it contains 836 countries


In [511]:
# The filtered frame is still transposed at this point, so its rows are the
# kept original columns (years) and this summary is per year, not per country.
missing_values_rows(unemployment_percentage)

Unnamed: 0,Percentage missing,Number of missing values
2017,0.236842,18
2008,0.210526,16
2015,0.065789,5
2016,0.052632,4
2009,0.039474,3
2010,0.039474,3
2014,0.039474,3
2012,0.026316,2
2013,0.026316,2
2011,0.013158,1


In [512]:
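# Earlier attempt at reducing missing values in urban poverty, left disabled (urban poverty is filtered together with the other dataframes in the next cell)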
#urban_poverty_percentage = urban_poverty_percentage.dropna(thresh=0.1*len(urban_poverty_percentage), axis="columns")
#urban_poverty_percentage = urban_poverty_percentage.transpose()
#urban_poverty_percentage = urban_poverty_percentage.dropna(thresh=0.6*len(urban_poverty_percentage), axis="columns")

#missing_values_columns(urban_poverty_percentage)

In [513]:
#Repeat with other dataframes
# Transpose each table so that every original row (country) becomes a column,
# then keep only the columns that have non-missing values in at least 30% of
# the rows. The frames are left transposed.
def _transpose_and_drop_sparse(frame):
    """Return ``frame`` transposed, without columns that are under 30% filled."""
    transposed = frame.transpose()
    return transposed.dropna(thresh=0.3*len(transposed), axis="columns")


rural_poverty_percentage = _transpose_and_drop_sparse(rural_poverty_percentage)
urban_poverty_percentage = _transpose_and_drop_sparse(urban_poverty_percentage)
murders_per_100000 = _transpose_and_drop_sparse(murders_per_100000)
alcohol_per_capita = _transpose_and_drop_sparse(alcohol_per_capita)
suicide_per_1000 = _transpose_and_drop_sparse(suicide_per_1000)


## Combining the dataframes

I want the combined dataframe to be in the format:
year - country - rest of variables

so I need to melt each dataframe so that it is of the form 
year - country - variable

and then I will be able to combine them. 

In [514]:
# Undo the earlier transpose so that countries are rows again, then reshape to
# one row per (country, year) with the value in "unemployment_percentage".
unemployment_percentage = unemployment_percentage.transpose()
unemployment_percentage = (
    pd.melt(unemployment_percentage, id_vars=["country"])
    .rename(columns={"variable": "year", "value": "unemployment_percentage"})
    .sort_values(by=["country", "year"])
    .reset_index(drop=True)
)


In [515]:
# Check the melted layout: one row per country and year.
unemployment_percentage.head()

Unnamed: 0,country,year,unemployment_percentage
0,Albania,2008,4.5
1,Albania,2009,4.91
2,Albania,2010,5.67
3,Albania,2011,5.28
4,Albania,2012,5.3


In [516]:
#Melting the rest of the dataframes

def _to_country_year_rows(frame, value_name):
    """Transpose ``frame`` back and melt it to (country, year, value_name) rows.

    The result is sorted by country and year and given a fresh 0..n-1 index.
    """
    countries_as_rows = frame.transpose()
    long_frame = pd.melt(countries_as_rows, id_vars=["country"])
    long_frame = long_frame.rename(columns={"variable": "year", "value": value_name})
    return long_frame.sort_values(by=["country", "year"]).reset_index(drop=True)


rural_poverty_percentage = _to_country_year_rows(rural_poverty_percentage, "rural_poverty_percentage")
urban_poverty_percentage = _to_country_year_rows(urban_poverty_percentage, "urban_poverty_percentage")
murders_per_100000 = _to_country_year_rows(murders_per_100000, "murders_per_100000")
alcohol_per_capita = _to_country_year_rows(alcohol_per_capita, "alcohol_per_capita")
suicide_per_1000 = _to_country_year_rows(suicide_per_1000, "suicide_per_1000")
    
    

In [517]:
# Inspect the last rows of the melted rural poverty table.
rural_poverty_percentage.tail()

Unnamed: 0,country,year,rural_poverty_percentage
491,Uruguay,2011,6.0
492,Uruguay,2012,4.1
493,Uruguay,2013,3.0
494,Uruguay,2014,3.0
495,Uruguay,2015,


In [518]:
#Combine the dataframes where country and year are the same

def _outer_join_on_country_year(left, right):
    """Outer-merge two long tables on country and year, sorted by those keys."""
    joined = pd.merge(left, right, how="outer", on=["country", "year"])
    return joined.sort_values(by=["country", "year"])


# Each step adds one more variable; the outer join keeps every (country, year)
# pair that appears in any of the tables.
merge1 = _outer_join_on_country_year(unemployment_percentage, rural_poverty_percentage)
merge2 = _outer_join_on_country_year(merge1, urban_poverty_percentage)
merge3 = _outer_join_on_country_year(merge2, murders_per_100000)
merge4 = _outer_join_on_country_year(merge3, alcohol_per_capita)
full_data = _outer_join_on_country_year(merge4, suicide_per_1000)


In [519]:
# Check everything is successfully merged: the preview should show country,
# year and one column for each of the six merged variables.

full_data.head()

Unnamed: 0,country,year,unemployment_percentage,rural_poverty_percentage,urban_poverty_percentage,murders_per_100000,alcohol_per_capita,suicide_per_1000
0,Afghanistan,2016,,,,,0.2,
1,Albania,2008,4.5,,,,,
2,Albania,2009,4.91,,,,,
3,Albania,2010,5.67,,,,,
4,Albania,2011,5.28,,,,,


In [520]:
# Sanity check: compare the row counts of two of the inputs with the row count
# of the merged table.

print(
    "The length of the unemployment dataset was {0}, the length of the suicide dataset was {1} and the length of the full dataset is {2}. This seems reasonable.".format(
        len(unemployment_percentage),
        len(suicide_per_1000),
        len(full_data),
    )
)

The length of the unemployment dataset was 760, the length of the suicide dataset was 4087 and the length of the full dataset is 4735. This seems reasonable.


# Look at missing data

In [521]:
def missing_values(data, axis=0):
    """Summarise the missing values of ``data`` along one axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.
    axis : {0, 1, "index", "columns"}, default 0
        Axis to reduce over. The default (0) gives one summary row per column
        of ``data``, which is the original behaviour of this function.
        Passing 1 or "columns" gives one summary row per row of ``data``.

    Returns
    -------
    pandas.DataFrame
        Two columns: "Percentage missing" (the fraction of null entries,
        between 0 and 1) and "Number of missing values" (the count of null
        entries), sorted so the most incomplete entry comes first.
    """
    # Compute the null mask once instead of calling isnull() for every statistic.
    null_mask = data.isnull()
    number_of_missing_values = null_mask.sum(axis=axis)
    percentage_missing = number_of_missing_values / null_mask.count(axis=axis)
    missing_values_table = pd.concat(
        [percentage_missing, number_of_missing_values],
        axis=1,
        keys=["Percentage missing", "Number of missing values"],
    )
    return missing_values_table.sort_values("Percentage missing", ascending=False)

In [522]:
# Fraction and count of missing values for every column of the merged table.
missing_values(full_data)

Unnamed: 0,Percentage missing,Number of missing values
alcohol_per_capita,0.960718,4549
rural_poverty_percentage,0.95924,4542
urban_poverty_percentage,0.956072,4527
unemployment_percentage,0.851531,4032
murders_per_100000,0.443295,2099
suicide_per_1000,0.439493,2081
country,0.0,0
year,0.0,0
