In [132]:
#Invite everyone to the data science par-tay

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from ipywidgets import widgets

ModuleNotFoundError: No module named 'seaborn'

# Predicting suicide rates based on supposed risk factors

Mental health is a prevalent problem in today's world. Being able to predict the percentage of a population in a region who will commit suicide could lead to more money for support being put in place in those regions and for those most at risk.

I have looked at some of the risk factors for suicide listed on mentalhealth.org (https://www.mentalhealth.org.uk/a-to-z/s/suicide) and have chosen to investigate whether unemployment, alcohol consumption, poverty, location and murder rates are able to predict suicide rates. I will take data between 2000 and 2016 for two reasons: 1) it keeps the predictions relevant to this year, and 2) a lot of the data from before 2000 is missing.

# Gathering and preprocessing the data

In [95]:
#Load all of the data
# Every file is read via a relative path, so the CSVs must sit in the notebook's working directory.
unemployment_percentage = pd.read_csv("long_term_unemployment_rate_percent (1).csv")
poverty_percentage = pd.read_csv("alternative_poverty_percent_below_nationally_defined_poverty.csv")
murders_per_100000 = pd.read_csv("murder_per_100000_people.csv")
# The two alcohol files use a different layout from the others and are reshaped further down.
alcohol_per_capita = pd.read_csv("data.csv")
alcohol_per_capita_2 = pd.read_csv("data (1).csv")
# NOTE(review): the variable is called "per_10000" but the file name says "per_100000";
# the name is left as-is because later cells refer to it - confirm which unit is correct.
suicide_per_10000 = pd.read_csv("suicide_per_100000_people (1).csv")


In [96]:
#Have a look at some of the data

unemployment_percentage.head()

Unnamed: 0,country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,Albania,,,,,,,,,,...,4.5,4.91,5.67,5.28,5.3,5.53,5.76,6.09,,
1,Algeria,,,,,,,,,,...,,,,,,,2.15,,,
2,Angola,,,,,,,,,,...,,,,1.56,,,,,,
3,Argentina,,,,,,,,,,...,0.88,0.925,1.22,1.11,1.28,1.04,1.22,,,1.14
4,Armenia,,,,,,,,,,...,,5.69,6.77,6.46,5.8,,7.15,7.11,5.4,


In [97]:
#The alcohol datasets were taken from a different source and so are formatted differently. I'll play around with the 
#csv file to get it into the same format as the others to make things run more smoothly. 
alcohol_per_capita.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,"Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol)","Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol).1","Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol).2","Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol).3","Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol).4","Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol).5","Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol).6"
0,Country,Data Source,Beverage Types,2016.0,2015.0,2014.0,2013.0,2012.0,2011.0,2010.0
1,Afghanistan,Data source,All types,,0.02,0.03,0.03,0.04,0.04,0.03
2,Albania,Data source,All types,5.07,4.77,4.81,5.06,5.43,5.65,5.53
3,Algeria,Data source,All types,0.56,0.56,0.56,0.54,0.49,0.44,0.39
4,Andorra,Data source,All types,10.06,9.97,9.95,9.78,10.06,10.31,10.64


In [98]:
def modify_alcohol_databases(data):
    """Reshape a raw alcohol-consumption frame into the country-by-year layout.

    The raw frame keeps its real header in the first data row: ``Country``,
    ``Data Source``, ``Beverage Types`` and then one year per column. This
    function

    * drops the ``Unnamed: 1`` and ``Unnamed: 2`` columns,
    * relabels the remaining columns as ``"country"`` followed by the years
      as strings (``2016.0`` becomes ``"2016"``) so they match the other
      datasets,
    * removes the header row from the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw alcohol frame as loaded from CSV. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``country, <year>, <year>, ...`` and the
        original row labels of the data rows.

    Raises
    ------
    KeyError
        If ``Unnamed: 1`` or ``Unnamed: 2`` is not a column of ``data``.
    ValueError
        If a year cell in the header row cannot be read as a number.
    """
    data = data.drop(["Unnamed: 1", "Unnamed: 2"], axis="columns")

    header_row = data.iloc[0]
    # Go through float() so that both 2016.0 and the string "2016.0" work.
    headers = ["country"] + [str(int(float(year))) for year in header_row.iloc[1:]]

    # Slice the header row off instead of overwriting it with the new labels:
    # writing strings into numeric columns is deprecated in pandas and turned
    # every year column into object dtype.
    data = data.iloc[1:].copy()

    # Keep the header row's label as the name of the columns index, exactly as
    # the previous implementation did, because code that melts this frame may
    # look the variable column up under that name.
    data.columns = pd.Index(headers, name=header_row.name)
    return data


In [99]:
# Bring both alcohol files into the shared country-by-year layout
alcohol_per_capita = modify_alcohol_databases(alcohol_per_capita)
alcohol_per_capita_2 = modify_alcohol_databases(alcohol_per_capita_2)

# Outer join on country so a country present in only one file is still kept
alcohol_per_capita_full = alcohol_per_capita.merge(
    alcohol_per_capita_2,
    on="country",
    how="outer",
)

# Quick visual check of the merged frame
alcohol_per_capita_full.head()

Unnamed: 0,country,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Afghanistan,,0.02,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,0.02,0.02,0.01,0.01,0.0,0.0
1,Albania,5.07,4.77,4.81,5.06,5.43,5.65,5.53,6.04,5.82,5.76,5.46,5.3,4.67,4.43,4.01,4.71,3.98
2,Algeria,0.56,0.56,0.56,0.54,0.49,0.44,0.39,0.48,0.43,0.39,0.43,0.55,0.52,0.39,0.43,0.27,0.32
3,Andorra,10.06,9.97,9.95,9.78,10.06,10.31,10.64,10.8,10.97,11.53,12.03,11.96,10.8,10.97,11.53,12.03,11.96
4,Angola,4.7,5.65,9.0,8.02,8.14,7.86,7.67,6.89,6.9,6.22,5.44,4.68,3.39,2.88,2.62,2.45,1.77


In [100]:
# Only keep the years from 2000 onwards in each dataset. The alcohol data
# already starts at 2000, so filtering it changes nothing.

def get_pre_2000_indexes(data, cutoff_year=2000):
    """Return a copy of ``data`` without the year columns before ``cutoff_year``.

    Despite its name, this function returns the filtered frame, not a list of
    column indexes; the name is kept so that existing calls keep working.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is the country and whose remaining column
        labels are years (strings such as ``"1999"`` or integers).
    cutoff_year : int, default 2000
        First year to keep; columns with a smaller year are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame with the early year columns removed. ``data`` itself is
        not modified.

    Raises
    ------
    ValueError
        If a column label after the first cannot be converted with ``int``.
    """
    year_columns = data.columns[1:]  # skip the leading 'country' column
    too_early = [column for column in year_columns if int(column) < cutoff_year]
    return data.drop(columns=too_early)

In [101]:
# Apply the same year filter to every dataset in one pass
(
    unemployment_percentage,
    poverty_percentage,
    murders_per_100000,
    alcohol_per_capita_full,
    suicide_per_10000,
) = [
    get_pre_2000_indexes(frame)
    for frame in (
        unemployment_percentage,
        poverty_percentage,
        murders_per_100000,
        alcohol_per_capita_full,
        suicide_per_10000,
    )
]


Looking at all the data, it seems as though all of the dataframes are in the same format, with one row per country and one column per year, so I will need to re-arrange them all and combine them. However, before I do this, I am going to look at the missing values in each data file. I predict that certain countries will have lots of missing data, rather than some years having missing data for lots of countries. I will investigate this and, if this is the case, I will drop the countries with a substantial amount of missing data.

In [102]:
poverty_percentage.head()

Unnamed: 0,country,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,Afghanistan,,,,,,,,36.3,,,,35.8,,,,,,
1,Albania,,,25.4,,,18.5,,,12.4,,,,14.3,,,,,
2,Algeria,,,,,,,,,,,,5.5,,,,,,
3,Angola,,,,,,,,,36.6,,,,,,,,,
4,Argentina,,,,,,,,,,,,,,,,,30.3,


In [103]:
alcohol_per_capita_full.head()

Unnamed: 0,country,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Afghanistan,,0.02,0.03,0.03,0.04,0.04,0.03,0.03,0.03,0.03,0.03,0.02,0.02,0.01,0.01,0.0,0.0
1,Albania,5.07,4.77,4.81,5.06,5.43,5.65,5.53,6.04,5.82,5.76,5.46,5.3,4.67,4.43,4.01,4.71,3.98
2,Algeria,0.56,0.56,0.56,0.54,0.49,0.44,0.39,0.48,0.43,0.39,0.43,0.55,0.52,0.39,0.43,0.27,0.32
3,Andorra,10.06,9.97,9.95,9.78,10.06,10.31,10.64,10.8,10.97,11.53,12.03,11.96,10.8,10.97,11.53,12.03,11.96
4,Angola,4.7,5.65,9.0,8.02,8.14,7.86,7.67,6.89,6.9,6.22,5.44,4.68,3.39,2.88,2.62,2.45,1.77


## Sort out the missing values in each dataset before combining

In [104]:
def missing_values_columns(data):
    """Summarise the missing values of every column of ``data``.

    Returns a frame indexed by the column labels of ``data`` with two columns:
    "Percentage missing" (the share of null cells, expressed as a fraction
    between 0 and 1) and "Number of missing values". Rows are sorted by
    "Percentage missing", largest first.
    """
    null_mask = data.isnull()
    missing_count = null_mask.sum()
    missing_share = missing_count / null_mask.count()

    summary = pd.concat(
        [missing_share, missing_count],
        axis=1,
        keys=["Percentage missing", "Number of missing values"],
    )
    return summary.sort_values("Percentage missing", ascending=False)

def missing_values_rows(data):
    """Summarise the missing values of every row of ``data``.

    Returns a frame indexed by the row labels of ``data`` with two columns:
    "Percentage missing" (the share of null cells, expressed as a fraction
    between 0 and 1) and "Number of missing values". Rows are sorted by
    "Percentage missing", largest first.
    """
    null_mask = data.isnull()
    missing_count = null_mask.sum(axis="columns")
    missing_share = missing_count / null_mask.count(axis="columns")

    summary = pd.concat(
        [missing_share, missing_count],
        axis=1,
        keys=["Percentage missing", "Number of missing values"],
    )
    return summary.sort_values("Percentage missing", ascending=False)

As predicted, some countries have a substantial amount of missing data and some have very little, whereas every year has a fair amount of missing data because many countries did not submit data in that year. I will delete every country-year row that has more than 40% of its values missing.

## Combining the dataframes

I want the dataframes to be able to combine to be in the format:
year - country - rest of variables

so I need to melt each dataframe so that it is of the form 
year - country - variable

and then I will be able to combine them. 

In [105]:
# Reshape from one column per year to one row per (country, year) pair
unemployment_percentage = (
    pd.melt(unemployment_percentage, id_vars=["country"])
    .rename(columns={"variable": "year", "value": "unemployment_percentage"})
    .sort_values(by=["country", "year"])
    .reset_index(drop=True)
)


In [106]:
unemployment_percentage.head()

Unnamed: 0,country,year,unemployment_percentage
0,Albania,2000,
1,Albania,2001,
2,Albania,2002,
3,Albania,2003,
4,Albania,2004,


In [107]:
#Melting the rest of the dataframes

def _melt_country_years(data, value_name):
    """Reshape a country-by-year frame into rows of ``country, year, <value_name>``.

    The year and value columns are named through ``var_name``/``value_name``
    rather than by renaming melt's default output. The default name of the
    variable column depends on the name of the input's columns index (the
    alcohol frame needed ``0`` instead of ``"variable"``), so renaming it
    afterwards is fragile.

    The result is sorted by country and year and gets a fresh 0..n-1 index.
    """
    return (
        pd.melt(data, id_vars=["country"], var_name="year", value_name=value_name)
        .sort_values(by=["country", "year"])
        .reset_index(drop=True)
    )

poverty_percentage = _melt_country_years(poverty_percentage, "poverty_percentage")
murders_per_100000 = _melt_country_years(murders_per_100000, "murders_per_100000")
alcohol_per_capita_full = _melt_country_years(alcohol_per_capita_full, "alcohol_per_capita")
suicide_per_10000 = _melt_country_years(suicide_per_10000, "suicide_per_10000")
    

In [108]:
poverty_percentage.tail()

Unnamed: 0,country,year,poverty_percentage
2425,Zimbabwe,2013,
2426,Zimbabwe,2014,
2427,Zimbabwe,2015,
2428,Zimbabwe,2016,
2429,Zimbabwe,2017,


In [109]:
# Outer-join the five long tables on (country, year), one table at a time,
# keeping each intermediate result sorted by those keys
merge_keys = ["country", "year"]

merge1 = unemployment_percentage.merge(
    poverty_percentage, on=merge_keys, how="outer"
).sort_values(by=merge_keys)
merge2 = merge1.merge(
    murders_per_100000, on=merge_keys, how="outer"
).sort_values(by=merge_keys)
merge3 = merge2.merge(
    alcohol_per_capita_full, on=merge_keys, how="outer"
).sort_values(by=merge_keys)
full_data = merge3.merge(
    suicide_per_10000, on=merge_keys, how="outer"
).sort_values(by=merge_keys)


In [110]:
#Check everything is successfully merged

full_data.head()

Unnamed: 0,country,year,unemployment_percentage,poverty_percentage,murders_per_100000,alcohol_per_capita,suicide_per_10000
0,Afghanistan,2000,,,,0.0,
1,Afghanistan,2001,,,,0.0,
2,Afghanistan,2002,,,,0.01,
3,Afghanistan,2003,,,,0.01,
4,Afghanistan,2004,,,,0.02,


In [111]:
#Check that the size of the dataset looks reasonable. 

print("The length of the unemployment dataset was {0}, the length of the suicide dataset was {1} and the length of the full dataset is {2}. This seems reasonable.".format(len(unemployment_percentage), len(suicide_per_10000), len(full_data)))

The length of the unemployment dataset was 2178, the length of the suicide dataset was 1768 and the length of the full dataset is 3960. This seems reasonable.


# Looking at how much data is missing in the full dataframe

In [112]:
missing_values_columns(full_data)

Unnamed: 0,Percentage missing,Number of missing values
poverty_percentage,0.815152,3228
murders_per_100000,0.80101,3172
suicide_per_10000,0.79798,3160
unemployment_percentage,0.7,2772
alcohol_per_capita,0.209343,829
country,0.0,0
year,0.0,0


In [113]:
len(full_data)

3960

In [114]:
# Work on a transposed copy (one column per country-year record) so that
# full_data itself stays untouched
full_data_copy = full_data.copy().T

In [115]:
# Delete instances which have more than 40% missing data. The frame is
# transposed here, so every column is one instance and len() is the number
# of fields per instance.
min_non_null = 0.6 * len(full_data_copy)
full_data_copy = full_data_copy.dropna(axis="columns", thresh=min_non_null)

In [116]:
missing_values_rows(full_data_copy)

Unnamed: 0,Percentage missing,Number of missing values
poverty_percentage,0.596215,567
murders_per_100000,0.193481,184
suicide_per_10000,0.189274,180
unemployment_percentage,0.126183,120
alcohol_per_capita,0.117771,112
country,0.0,0
year,0.0,0


In [117]:
len(full_data_copy.columns)

951

# Impute the missing values

The poverty percentage feature has a high number of missing data (~60%), so I will drop this feature. The rest of the missing values are between 10% and 20% so I will impute these values using the average value of the feature for that country. 

I am also going to drop all the instances which don't have a suicide rate. Suicide rate is what I'm trying to predict, so it seems counter-intuitive to build a model based on imputed guesses of the suicide rate.

In [118]:
# Flip back to one row per country-year record, then discard the sparse
# poverty feature
full_data_copy = full_data_copy.T.drop(columns="poverty_percentage")
full_data_copy.head()

Unnamed: 0,country,year,unemployment_percentage,murders_per_100000,alcohol_per_capita,suicide_per_10000
18,Albania,2000,,3.96,3.98,1.88
19,Albania,2001,,7.58,4.71,4.04
20,Albania,2002,,7.4,4.01,4.53
21,Albania,2003,,5.53,4.43,4.08
22,Albania,2004,,4.29,4.67,4.72


In [119]:
# Keep only the records that actually have a suicide rate (the target)
full_data_copy = full_data_copy.dropna(subset=["suicide_per_10000"])

In [120]:
full_data_copy.shape

(771, 6)

In [121]:
full_data_copy.groupby("country").nunique().sum()

country                     67
year                       771
unemployment_percentage    626
murders_per_100000         752
alcohol_per_capita         616
suicide_per_10000          702
dtype: int64

67 different countries is too many to encode, so I will separate them into continents. 

In [122]:
# Lookup table used to translate country names into continents
# (the preview below shows a "Continent" and a "Country" column).
countries_to_continents = pd.read_csv("Countries-Continents.csv")

In [123]:
countries_to_continents.head()

Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [124]:
# Build a {country name: continent} lookup from the table
countries_dictionary = (
    countries_to_continents
    .set_index("Country")["Continent"]
    .to_dict()
)

In [125]:
# Translate each country name into its continent. The nested-dict form
# restricts the replacement to the "country" column; passing the flat
# dictionary would scan every column for matching values. Names that are
# missing from the lookup are left unchanged.
full_data_copy = full_data_copy.replace({"country": countries_dictionary})
full_data_copy.tail()

Unnamed: 0,country,year,unemployment_percentage,murders_per_100000,alcohol_per_capita,suicide_per_10000
3784,Asia,2001,,3.78,3.23,8.77
3785,Asia,2002,,3.63,3.12,7.03
3786,Asia,2003,,2.53,3.35,6.27
3787,Asia,2004,,2.78,2.02,5.34
3788,Asia,2005,,3.16,1.88,5.21


Now that the countries have been changed to continents, let's impute the missing data. 

In [126]:
# Separate the target (suicide rate) from the features so that the data can be
# split into training and test sets before imputation
y = pd.DataFrame(full_data_copy["suicide_per_10000"])
X = full_data_copy.drop("suicide_per_10000", axis="columns")


In [127]:
# Hold out a test set. random_state is fixed so the split is reproducible; no
# test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=36)

In [128]:
# No strategy is passed, so SimpleImputer's default applies
# (the mean, as the variable name suggests - see the scikit-learn docs).
meanImputer = SimpleImputer()
# Fills gaps with the most frequent value of each column.
modeImputer = SimpleImputer(strategy="most_frequent")

In [129]:
# Work on copies so the raw train/test splits stay available.
X_train_imputed = X_train.copy()
X_test_imputed = X_test.copy()

# The target needs no imputation: every row without a suicide rate was dropped
# before the split, so the "imputed" targets are plain copies. Previously the
# test target was copied from y_train, and both targets were then overwritten
# with imputed feature columns.
y_train_imputed = y_train.copy()
y_test_imputed = y_test.copy()

# Every column after the first: fill the gaps with the column mean. The
# imputer is fitted on the training data only and then applied to both sets.
X_train_imputed.iloc[:,1:] = meanImputer.fit_transform(X_train.iloc[:,1:])
X_test_imputed.iloc[:,1:] = meanImputer.transform(X_test.iloc[:,1:])

# First column (the categorical one): fill the gaps with the most frequent
# value. The test set is transformed with the imputer fitted on the training
# set; refitting on the test set would leak test information.
X_train_imputed.iloc[:,:1] = modeImputer.fit_transform(X_train.iloc[:,:1])
X_test_imputed.iloc[:,:1] = modeImputer.transform(X_test.iloc[:,:1])


In [130]:
missing_values_columns(X_train_imputed)

Unnamed: 0,Percentage missing,Number of missing values
country,0.0,0
year,0.0,0
unemployment_percentage,0.0,0
murders_per_100000,0.0,0
alcohol_per_capita,0.0,0


No more missing data - perfect!

# Exploring the data

Hypotheses:

    - an increase in alcohol consumption per capita is correlated with increased suicide rates.
    - an increase in unemployment is correlated with increased suicide rates.
    - an increase in murders is correlated with increased suicide rates.

## Heatmap of numerical data