In [78]:
#Invite everyone to the data science par-tay

import pandas as pd

from ipywidgets import widgets

# Predicting suicide rates based on supposed risk factors

Mental health is a prevalent problem in today's world. Being able to predict the percentage of a population in a region who will commit suicide could lead to more money for support being put in place in those regions and for those most at risk.

I have looked at some of the risk factors for suicide listed on mentalhealth.org (https://www.mentalhealth.org.uk/a-to-z/s/suicide) and have chosen to investigate whether unemployment, alcohol consumption, poverty, location and murder rates are able to predict suicide rates. 

# Gathering and preprocessing the data

In [127]:
#Load all of the data
unemployment_percentage = pd.read_csv("long_term_unemployment_rate_percent (1).csv")
rural_poverty_percentage = pd.read_csv("rural_poverty_percent_rural_people_below_national_rural.csv")
urban_poverty_percentage = pd.read_csv("urban_poverty_percent_urban_people_below_national_urban.csv")
murders_per_100000 = pd.read_csv("murder_per_100000_people.csv")
alcohol_per_capita = pd.read_csv("sh_alc_pcap_li.csv")
suicide_per_1000 = pd.read_csv("suicide_per_100000_people (1).csv")

In [128]:
#Have a look at some of the data
unemployment_percentage = unemployment_percentage.transpose()

unemployment_percentage = unemployment_percentage.rename(columns=unemployment_percentage.iloc[0]).drop(unemployment_percentage.index[0])

unemployment_percentage.head()

Unnamed: 0,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bangladesh,Belarus,...,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Vanuatu,Venezuela,Vietnam,Yemen,Zimbabwe
1990,,,,,,1.24,,,,,...,,,,0.278,,,,,,
1991,,,,,,1.94,,,,,...,,,,0.436,,,,,,
1992,,,,,,2.66,,,,,...,,,2.8,0.759,,,,,,
1993,,,,,,2.71,,,,,...,,,3.23,0.691,,,,,,
1994,,,,,,2.37,0.859,,,,...,,,3.01,0.731,,,,,,


In [129]:
missing_values(unemployment_percentage)

Unnamed: 0,Percentage missing,Number of missing values
Zimbabwe,0.964286,27
Cambodia,0.964286,27
Mozambique,0.964286,27
Nepal,0.964286,27
Gambia,0.964286,27
Nicaragua,0.964286,27
Maldives,0.964286,27
Samoa,0.964286,27
Algeria,0.964286,27
Senegal,0.964286,27


In [81]:
murders_per_100000.head()

Unnamed: 0,country,1950,1951,1952,1953,1954,1955,1956,1957,1958,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Albania,,,,,,,,,,...,3.07,2.96,,2.3,,,,,,
1,Antigua and Barbuda,,,,,,,,,,...,,,,,,,,,,
2,Argentina,,,,,,,,,,...,,,,,,,,,,
3,Armenia,,,,,,,,,,...,,1.69,,,,1.51,1.1,0.964,1.12,1.01
4,Australia,1.0,1.33,1.46,1.25,1.3,1.39,1.3,1.34,1.54,...,0.978,1.09,1.17,1.01,1.05,1.23,1.06,1.02,1.11,


It seems as though all of the dataframes are in the sane format with the rows sorted by countries and columns as the years..

## Combining the dataframes

In [82]:
#Melting the dataframes 

unemployment_percentage = pd.melt(unemployment_percentage, id_vars=['country']).rename(columns={'variable':'year','value': "unemployment_percentage"}).sort_values(by=["country", "year"]).reset_index().drop("index", axis="columns")
rural_poverty_percentage = pd.melt(rural_poverty_percentage, id_vars=['country']).rename(columns={'variable':'year','value': "rural_poverty_percentage"}).sort_values(by=["country", "year"]).reset_index().drop("index", axis="columns")
urban_poverty_percentage= pd.melt(urban_poverty_percentage, id_vars=['country']).rename(columns={'variable':'year','value': "urban_poverty_percentage"}).sort_values(by=["country", "year"]).reset_index().drop("index", axis="columns")
murders_per_100000 = pd.melt(murders_per_100000, id_vars=['country']).rename(columns={'variable':'year','value': "murders_per_100000"}).sort_values(by=["country", "year"]).reset_index().drop("index", axis="columns")
alcohol_per_capita = pd.melt(alcohol_per_capita, id_vars=['country']).rename(columns={'variable':'year','value': "alcohol_per_capita"}).sort_values(by=["country", "year"]).reset_index().drop("index", axis="columns")
suicide_per_1000= pd.melt(suicide_per_1000, id_vars=['country']).rename(columns={'variable':'year','value': "suicide_per_1000"}).sort_values(by=["country", "year"]).reset_index().drop("index", axis="columns")
    
    

In [83]:
# Check that the dataframes have been succesfully melted

murders_per_100000.head()

Unnamed: 0,country,year,murders_per_100000
0,Albania,1950,
1,Albania,1951,
2,Albania,1952,
3,Albania,1953,
4,Albania,1954,


In [84]:
suicide_per_1000.head()

Unnamed: 0,country,year,suicide_per_1000
0,Albania,1950,
1,Albania,1951,
2,Albania,1952,
3,Albania,1953,
4,Albania,1954,


In [85]:
#Combine the dataframes where country and year are the same

In [88]:
merge1 = pd.merge(unemployment_percentage, rural_poverty_percentage, on=["country", "year"], how='outer').sort_values(by=["country", "year"])
merge2 = pd.merge(merge1, urban_poverty_percentage, on=["country", "year"], how='outer').sort_values(by=["country", "year"])
merge3 = pd.merge(merge2, murders_per_100000, on=["country", "year"], how='outer').sort_values(by=["country", "year"])
merge4 = pd.merge(merge3, alcohol_per_capita, on=["country", "year"], how='outer').sort_values(by=["country", "year"])
full_data = pd.merge(merge4, suicide_per_1000, on=["country", "year"], how='outer').sort_values(by=["country", "year"])


In [89]:
#Check everything is successfully merged

full_data.head()

Unnamed: 0,country,year,unemployment_percentage,rural_poverty_percentage,urban_poverty_percentage,murders_per_100000,alcohol_per_capita,suicide_per_1000
0,Afghanistan,1985,,,,,,
1,Afghanistan,1986,,,,,,
2,Afghanistan,1987,,,,,,
3,Afghanistan,1988,,,,,,
4,Afghanistan,1989,,,,,,


In [100]:
#Check that the size of the dataset looks reasonable. 

print("The length of the unemployment dataset was {0}, the length of the suicide dataset was {1} and the length of the full dataset is {2}. This seems reasonable.".format(len(unemployment_percentage), len(suicide_per_1000), len(full_data)))

The length of the unemployment dataset was 3388, the length of the suicide dataset was 6968 and the length of the full dataset is 9402. This seems reasonable.


# Look at missing data

In [101]:
def missing_values(data): 
    number_of_missing_values = data.isnull().sum()
    percentage_missing = (data.isnull().sum())/data.isnull().count()
    missing_values_table = pd.concat([percentage_missing, number_of_missing_values], axis=1, keys=["Percentage missing", "Number of missing values"])
    return missing_values_table.sort_values("Percentage missing", ascending=False)

In [102]:
missing_values(full_data)

Unnamed: 0,Percentage missing,Number of missing values
alcohol_per_capita,0.980217,9216
rural_poverty_percentage,0.957456,9002
urban_poverty_percentage,0.95586,8987
unemployment_percentage,0.843331,7929
murders_per_100000,0.684642,6437
suicide_per_1000,0.68177,6410
country,0.0,0
year,0.0,0
