In [55]:
import pandas as pd
import numpy as np

In [416]:
from sklearn import linear_model
from sklearn.preprocessing import normalize
df = pd.read_csv('full_suicide_dataset.csv')
df = df.assign(sunshine_hours = (df.sunshine_hours/24))

# Prepare data set for regression
1. Normalize if necessary 
2. Convert everything to numpy arrays of floats

In [417]:
normalized = False
if normalized:
    # Normalized approach:
    colnames1 = list(set(df.columns) - {'country'})
    just_values = normalize(df[df.columns[1:]])
    X = just_values[:,1:].astype(float)
    Y = just_values[:,0].astype(float)
else:
    # Not normalized approach:
    colnames = list(set(df.columns) - {'country', 'suicide_rate'})
    X = df[colnames].values.astype('float')
    Y = df.suicide_rate.values.astype('float')

# Split into train and test 
We have a rather small dataset and hence we would like to use as much data as possible for training. We decided to use 85% of the dataset for trainig and 15% for testing. The split is random with fixed seed so one cen reproduce this notebook.

In [418]:
# set seed
np.random.seed(129)
# randomly select 85% of the data for the train set and use the rest for the test set
selvec = np.random.rand(len(X)) < 0.85

In [419]:
# split into train and test
train_data_X = X[selvec]
train_data_Y = Y[selvec]
test_data_X = X[~selvec]
test_data_Y = Y[~selvec]

# Build regression model 
For that purpose we will simply take the linear regression class available from scklearn library and fit the dataset into it.

In [None]:
regr = linear_model.LinearRegression()
# feed the linear regression with the train data to obtain a model.
regr.fit(train_data_X, train_data_Y)

# Evaluate model
Now we need to check if our model of any good and if we can predict anything with our data. Two measure for that are: 
1. MSE - Mean Squared Error. The lower the better
2. Variance score. The higher the better (1 is ideal)

In [423]:
mse = np.mean((regr.predict(test_data_X) - test_data_Y) ** 2)
print("Mean squared error: {}".format(np.round(mse,4)))

Mean squared error: 1.7047


In [425]:
print('Variance score: {}'.format(np.round(regr.score(test_data_X, test_data_Y),4)))

Variance score: 0.7104


# Result table
It's great to see the numbers but since each our observation is not anonymous and it's really interesting to look at exact numbers, we can build the table to visually compare actual and predicted rates. 

In [426]:
Y_verb = df[['country', 'suicide_rate']][~selvec]

In [427]:
Y_verb = Y_verb.assign(predicted = regr.predict(test_data_X))

In [430]:
Y_verb

Unnamed: 0,country,suicide_rate,predicted
4,Argentina,4.8,4.265015
6,Australia,5.6,5.626218
16,Benin,8.4,7.948655
32,Colombia,2.1,4.767461
41,Dominican Republic,2.6,3.938574
61,Haiti,6.0,7.671718
64,Iceland,6.3,5.864886
69,Israel,2.3,2.915976
73,Jordan,2.4,3.78795
82,Libya,2.8,2.747905


# Variables impact 
The following table shows how differnt variable affect the resulting suicide rate in our prediction.

In [441]:
pd.DataFrame({'variable': df.columns[2:], 'effect': regr.coef_})

Unnamed: 0,variable,effect
0,total_litres_of_pure_alcohol,0.221639
1,gdi,2.7e-05
2,hdi,-0.379499
3,schooling_years,-0.021041
4,gni,1.6e-05
5,poverty_index,0.005261
6,satisfaction_index,0.007165
7,unemployment_rate,-0.02612
8,access_to_family_planning,17.735389
9,gii,0.004275
