Import packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, plot, iplot
import sqlite3
import Code.Preparation as prep
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
import re
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import itertools
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf, adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries imported
# above - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### Clean and combine happiness reports dataset

Load datasets

In [14]:
# Load the 2021 report and tag every row with its survey year; the file itself
# is read without a year column being assumed, so the constant is added here.
latest_report_csv = prep.path('happy/2021.csv')
df2021 = pd.read_csv(latest_report_csv)
df2021['Year'] = 2021

In [15]:
# Load the multi-year happiness report that the 2021 data is merged into below.
happiness_history_csv = prep.path('happy/world-happiness-report.csv')
dfhappy = pd.read_csv(happiness_history_csv)

Change names to match corresponding columns and merge dataframes

In [17]:
# Columns of the 2021 report that are carried into the combined dataset.
columns_2021 = [
    'Country name',
    'Year',
    'Ladder score',
    'Social support',
    'Healthy life expectancy',
    'Logged GDP per capita',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

# 2021 header -> header used for the merge.
# NOTE(review): presumably these targets are the column names of
# world-happiness-report.csv - verify against that file.
rename_2021 = {
    'Year': 'year',
    'Ladder score': 'Life Ladder',
    'Healthy life expectancy': 'Healthy life expectancy at birth',
    'Logged GDP per capita': 'Log GDP per capita',
}

df2021 = df2021[columns_2021].rename(columns=rename_2021)

# No `on=` is given, so pandas joins on every column name the two frames share;
# how='outer' keeps the rows of both frames.
combined_reports = dfhappy.merge(df2021, how='outer')

# The two affect columns are not kept in the combined dataset.
dfhappy = combined_reports.drop(columns=['Positive affect', 'Negative affect'])

Rename `Country name` to `Country` (null values are handled by interpolation in the next step)

In [20]:
# Shorten the country column header to 'Country'; all other columns are untouched.
dfhappy = dfhappy.rename({'Country name': 'Country'}, axis='columns')

Interpolate null values

In [21]:
# Fill missing values by linear interpolation *within each country*.
#
# A plain `dfhappy.interpolate()` walks straight down the frame, so a gap at the
# start or end of one country's rows is filled from the neighbouring country's
# values. It is also applied to the text column 'Country', which pandas 2.x
# deprecates. Here only numeric columns are interpolated, one country at a time.
numeric_cols = dfhappy.select_dtypes(include='number').columns

# Interpolation is order dependent: make sure each country's rows are in
# chronological order before filling.
dfhappy = dfhappy.sort_values(['Country', 'year']).reset_index(drop=True)

# limit_direction='both' also fills gaps at the start of a country's series
# (with that country's first known value) instead of leaving them empty.
# NOTE: a column that a country never reports stays NaN - it is no longer
# back-filled from a different country, so downstream code must handle NaN.
dfhappy[numeric_cols] = (
    dfhappy
    .groupby('Country', dropna=False)[numeric_cols]
    .transform(lambda series: series.interpolate(limit_direction='both'))
)

Save data to csv for modeling. 

In [23]:
# Export the cleaned, combined happiness dataset to CSV for the modeling step.
# No `index=` argument is passed, so pandas' default applies and the row index
# is written as an unnamed first column of the file.
dfhappy.to_csv('Data/CleanedHappy.csv')

### World Dataset

In [27]:
# Read the three source tables in one pass: the WDI data table, the WDI country
# table, and the World_Development indicators table (in that order).
datadf, countrydf, df = (
    pd.read_csv(prep.path(relative_csv))
    for relative_csv in (
        'WDI/WDIData.csv',
        'WDI/WDICountry.csv',
        'World_Development/Indicators.csv',
    )
)

In [28]:
# Distinct (IndicatorName, IndicatorCode) pairs as a 2-column array; each row is
# one [name, code] pair consumed by the cleaning loop.
indicator_pairs = df[['IndicatorName', 'IndicatorCode']]
distinct_indicator_pairs = indicator_pairs.drop_duplicates()
Indicator_array = distinct_indicator_pairs.values

Search tool for relevant features that could have an impact on suicides.

In [34]:
# Build one cleaned [name, code] row per distinct indicator code.
#
# `modified_indicators` and `unique_indicator_codes` keep their original meaning
# (first-seen order); `seen_codes` is only a fast membership index so the
# duplicate check no longer rescans a list on every iteration.
modified_indicators = []
unique_indicator_codes = []
seen_codes = set()

for indicator, raw_code in Indicator_array:
    indicator_code = raw_code.strip()
    if indicator_code in seen_codes:
        # Only the first name seen for a code is kept.
        continue
    seen_codes.add(indicator_code)
    unique_indicator_codes.append(indicator_code)

    # Delete , ( ) from the indicator name and lower-case it once.
    new_indicator = re.sub('[,()]', "", indicator).lower()
    # Replace every "-" with " to " so hyphenated ranges split into words.
    new_indicator = re.sub('-', " to ", new_indicator)
    modified_indicators.append([new_indicator, indicator_code])

# Codes are unique at this point, so every row is already distinct and no
# further de-duplication is needed.
Indicators = pd.DataFrame(modified_indicators, columns=['IndicatorName', 'IndicatorCode'])

In [30]:
# Feature group -> keywords looked up in the cleaned indicator names.
# The search compares whole, lower-case words, so every keyword must be spelled
# exactly as it appears in an indicator name ('disease' and 'unemployed' were
# previously misspelled and could never match).
key_word_dict = {
    'Food': ['food', 'grain', 'nutrition', 'calories'],
    'Foreign': ['foreign'],
    'Health': ['health', 'disease', 'hospital', 'mortality', 'doctor', 'mental'],
    'Economy': ['income', 'gdp', 'gni', 'deficit', 'budget', 'market', 'stock',
                'bond', 'infrastructure', 'investment'],
    'Education': ['education', 'literacy', 'school', 'college'],
    'Energy': ['fuel', 'energy', 'power', 'emission', 'electric', 'electricity', 'water'],
    'Employment': ['employed', 'employment', 'unemployed', 'unemployment'],
    'Rural': ['rural', 'village'],
    'Urban': ['urban', 'city'],
    'Social Programs': ['social', 'welfare'],
    'Tech': ['technology', 'tech', 'phone', 'mobile', 'broadband', 'cable', 'telephone'],
    'Trade': ['trade', 'import', 'export', 'good', 'shipping', 'shipment'],
    'Water': ['water', 'sanitation', 'sanitary'],
    'Access': ['access'],
}

In [31]:
# Keyword search: print each indicator row whose cleaned name contains, as a
# whole word, one of the keywords of the chosen feature group (or that keyword
# followed by 's'). Each matching row is printed once.
feature = 'Social Programs'
for indicator in Indicators.values:
    name_words = indicator[0].split()
    if any(
        keyword in name_words or keyword + 's' in name_words
        for keyword in key_word_dict[feature]
    ):
        print(indicator)

['social contributions % of revenue' 'GC.REV.SOCL.ZS']
['social contributions current lcu' 'GC.REV.SOCL.CN']
['adequacy of social insurance programs % of total welfare of beneficiary households'
 'per_si_allsi.adq_pop_tot']
['adequacy of social protection and labor programs % of total welfare of beneficiary households'
 'per_allsp.adq_pop_tot']
['adequacy of unemployment benefits and almp % of total welfare of beneficiary households'
 'per_lm_alllm.adq_pop_tot']
['benefits incidence in poorest quintile %  to  all social insurance'
 'per_si_allsi.ben_q1_tot']
['benefits incidence in poorest quintile %  to all social protection and labor'
 'per_allsp.ben_q1_tot']
['coverage %  to  all social insurance' 'per_si_allsi.cov_pop_tot']
['coverage %  to all social protection and labor' 'per_allsp.cov_pop_tot']
['coverage %  to  all social assistance' 'per_sa_allsa.cov_pop_tot']
['adequacy of social safety net programs % of total welfare of beneficiary households'
 'per_sa_allsa.adq_pop_tot']
['

In [32]:
# Reshape the wide data table (one column per year) into a long table with one
# row per (Country, IndicatorName, year) and a single 'Value' column.
#
# numeric_only=True restricts the mean to numeric columns. Without it, pandas
# >= 2.0 raises TypeError if the table has text columns besides the two group
# keys; older versions silently skipped them, which is the behaviour kept here.
indicator_means = (
    datadf
    .groupby(['Country Name', 'Indicator Name'])
    .mean(numeric_only=True)
)

# stack() moves the remaining column labels into a third index level; after
# reset_index() that level is named 'level_2' and the values column is named 0.
worlddf = (
    pd.DataFrame(indicator_means.stack())
    .reset_index()
    .rename(columns={
        0: 'Value',
        'level_2': 'year',
        'Country Name': 'Country',
        'Indicator Name': 'IndicatorName',
    })
)

# 'year' holds the original column labels as strings, so this is a string
# comparison. NOTE(review): it relies on the labels being 4-digit years -
# confirm no other numeric column survives the aggregation.
worlddf = worlddf[worlddf.year >= '1985']

In [33]:
# Write the long-format world indicators table to CSV. No `index=` argument is
# passed, so pandas' default applies and the row index is written as an unnamed
# first column of the file.
worlddf.to_csv('Data/WorldDf.csv')