## Imports - the usual suspects

In [21]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import plotly.express as px

## Load dataset 

Obtained from - https://ourworldindata.org/grapher/suicide-rates-vs-prevalence-of-depression

In [22]:
column_names = {
    "Entity": "country",
    "Code": "code",
    "Year": "year",
    "Deaths - Self-harm - Sex: Both - Age: Age-standardized (Rate)": "suicide_rates",
    "Prevalence - Depressive disorders - Sex: Both - Age: Age-standardized (Rate)": "depression_rates",
    "Population (historical estimates)": "population",
    "Continent": "continent"
}


df_suicide_and_depression_rates = pd.read_csv("../Datasets/suicide-rates-vs-prevalence-of-depression.csv").rename(columns=column_names)
df_suicide_and_depression_rates.head(4)

Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,10.318504,4039.755763,12412311.0,
2,Afghanistan,AFG,1991,10.32701,4046.256034,13299016.0,
3,Afghanistan,AFG,1992,10.271411,4053.709902,14485543.0,


## Data cleaning

There are rows with
1. Years less than zero
2. Regions that are not in the 195 countries in the world
3. NaNs in the continent column

### Case 1: Negative years

In [23]:
df_suicide_and_depression_rates[df_suicide_and_depression_rates['year'] < 1960].head()

Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
29,Afghanistan,AFG,-10000,,,14737.0,
30,Afghanistan,AFG,-9000,,,20405.0,
31,Afghanistan,AFG,-8000,,,28253.0,
32,Afghanistan,AFG,-7000,,,39120.0,
33,Afghanistan,AFG,-6000,,,54166.0,


### Fix: Drop years under 1960

In [24]:
df_suicide_and_depression_rates = df_suicide_and_depression_rates[df_suicide_and_depression_rates['year'] > 1960]

### Case 2: Not all entities are countries - and non-countries rarely have recorded depression and suicide rates

In [25]:
regions = df_suicide_and_depression_rates.query('code.isnull()', engine='python')["country"].unique()
regions

array(['Africa', 'Andean Latin America', 'Asia', 'Australasia',
       'Caribbean', 'Central Asia', 'Central Europe',
       'Central Europe, Eastern Europe, and Central Asia',
       'Central Latin America', 'Central Sub-Saharan Africa', 'East Asia',
       'Eastern Europe', 'Eastern Sub-Saharan Africa', 'England',
       'Europe', 'High SDI', 'High-income', 'High-income Asia Pacific',
       'High-middle SDI', 'Latin America and Caribbean', 'Low SDI',
       'Low-middle SDI', 'Middle SDI', 'North Africa and Middle East',
       'North America', 'Northern Ireland', 'Oceania', 'Saint Barthlemy',
       'Scotland', 'South America', 'South Asia', 'Southeast Asia',
       'Southeast Asia, East Asia, and Oceania', 'Southern Latin America',
       'Southern Sub-Saharan Africa', 'Sub-Saharan Africa',
       'Tropical Latin America', 'Wales', 'Western Europe',
       'Western Sub-Saharan Africa'], dtype=object)

In [26]:
df_suicide_and_depression_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15777 entries, 0 to 56628
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country           15777 non-null  object 
 1   code              14426 non-null  object 
 2   year              15777 non-null  int64  
 3   suicide_rates     6468 non-null   float64
 4   depression_rates  6468 non-null   float64
 5   population        14804 non-null  float64
 6   continent         285 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 986.1+ KB


In [27]:

df_suicide_and_depression_rates.query('suicide_rates.isnull()', engine='python').drop_duplicates(subset=["country"], keep="first")


Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
227,Afghanistan,AFG,1961,,,9169406.0,
458,Africa,,1961,,,290214463.0,
519,Akrotiri and Dhekelia,OWID_AKD,2015,,,,Asia
746,Albania,ALB,1961,,,1685926.0,
...,...,...,...,...,...,...,...
56108,Yugoslavia,OWID_YGS,2015,,,,Europe
56335,Zambia,ZMB,1961,,,3164330.0,
56368,Zanzibar,OWID_ZAN,2015,,,,Africa
56595,Zimbabwe,ZWE,1961,,,3905038.0,


In [28]:

df_suicide_and_depression_rates.query('suicide_rates.isnull()', engine='python').drop_duplicates(subset=["country"], keep="last")


Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
259,Afghanistan,AFG,2021,,,3.983543e+07,
518,Africa,,2021,,,1.373486e+09,
519,Akrotiri and Dhekelia,OWID_AKD,2015,,,,Asia
778,Albania,ALB,2021,,,2.872934e+06,
...,...,...,...,...,...,...,...
56108,Yugoslavia,OWID_YGS,2015,,,,Europe
56367,Zambia,ZMB,2021,,,1.892066e+07,
56368,Zanzibar,OWID_ZAN,2015,,,,Africa
56627,Zimbabwe,ZWE,2021,,,1.509217e+07,


### Fix: Drop rows with NaN values 

Rows without suicide rates, depression rates, or population data will be dropped

This also highlights some problems with the way the data was recorded

Some countries have null records from as far back as 1961, when they only have real values from 1990

An example is Afghanistan as seen below

In [29]:
df_suicide_and_depression_rates = df_suicide_and_depression_rates.dropna(
        subset=[
            "suicide_rates",
            "depression_rates",
            "population"
        ]
    )
df_suicide_and_depression_rates

Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
1,Afghanistan,AFG,1990,10.318504,4039.755763,12412311.0,
2,Afghanistan,AFG,1991,10.327010,4046.256034,13299016.0,
3,Afghanistan,AFG,1992,10.271411,4053.709902,14485543.0,
4,Afghanistan,AFG,1993,10.376123,4060.203474,15816601.0,
5,Afghanistan,AFG,1994,10.575915,4062.290365,17075728.0,
...,...,...,...,...,...,...,...
56392,Zimbabwe,ZWE,2013,28.361200,3048.264249,13350378.0,
56393,Zimbabwe,ZWE,2014,27.605547,3056.996704,13586710.0,
56394,Zimbabwe,ZWE,2015,27.197061,3068.250731,13814642.0,Africa
56395,Zimbabwe,ZWE,2016,26.839591,3081.782858,14030338.0,


### Case 3: NaNs in the continents column

In [30]:
df_suicide_and_depression_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5544 entries, 1 to 56396
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country           5544 non-null   object 
 1   code              5488 non-null   object 
 2   year              5544 non-null   int64  
 3   suicide_rates     5544 non-null   float64
 4   depression_rates  5544 non-null   float64
 5   population        5544 non-null   float64
 6   continent         195 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 346.5+ KB


### Fix: Extract country-to-continent mapping and merge on country

In [31]:
countries_to_continents = df_suicide_and_depression_rates.dropna(subset=["continent"])[["country", "continent"]]
countries_to_continents

Unnamed: 0,country,continent
26,Afghanistan,Asia
545,Albania,Europe
804,Algeria,Africa
1063,American Samoa,Oceania
1205,Andorra,Europe
...,...,...
54824,Venezuela,South America
55083,Vietnam,Asia
55872,Yemen,Asia
56134,Zambia,Africa


In [32]:
df_suicide_and_depression_rates

Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
1,Afghanistan,AFG,1990,10.318504,4039.755763,12412311.0,
2,Afghanistan,AFG,1991,10.327010,4046.256034,13299016.0,
3,Afghanistan,AFG,1992,10.271411,4053.709902,14485543.0,
4,Afghanistan,AFG,1993,10.376123,4060.203474,15816601.0,
5,Afghanistan,AFG,1994,10.575915,4062.290365,17075728.0,
...,...,...,...,...,...,...,...
56392,Zimbabwe,ZWE,2013,28.361200,3048.264249,13350378.0,
56393,Zimbabwe,ZWE,2014,27.605547,3056.996704,13586710.0,
56394,Zimbabwe,ZWE,2015,27.197061,3068.250731,13814642.0,Africa
56395,Zimbabwe,ZWE,2016,26.839591,3081.782858,14030338.0,


In [36]:
df_suicide_and_depression_rates_no_continent = df_suicide_and_depression_rates.drop(columns=["continent"])

In [43]:

df_suicide_and_depression_rates = df_suicide_and_depression_rates_no_continent.merge(countries_to_continents, on='country', how='inner')
df_suicide_and_depression_rates.sample(7)



Unnamed: 0,country,code,year,suicide_rates,depression_rates,population,continent
877,Cameroon,CMR,1999,16.219331,3677.453622,15112598.0,Africa
4544,South Sudan,SSD,1998,13.868189,3604.117933,5661934.0,Africa
3510,Niger,NER,2000,8.370294,3399.695775,11331561.0,Africa
434,Barbados,BRB,2004,4.479657,2581.541365,275283.0,North America
351,Bahamas,BHS,2005,2.916657,2507.00615,324848.0,North America
1993,Guam,GUM,1995,17.924762,3282.333235,145559.0,Oceania
3401,Nepal,NPL,2003,8.596152,3646.489096,25080880.0,Asia


In [45]:
countries = pd.unique(df_suicide_and_depression_rates['country'])
print(len(countries), "\n", countries)

195 
 ['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji' 'Finland' 'France'
 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Greenland'
 'Grenada' 'Guam' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica'

## Plots of suicide rates against depression rates

In [59]:
fig = px.scatter(df_suicide_and_depression_rates, x="depression_rates", y="suicide_rates", animation_frame="year", animation_group="country",
           size="population", color="continent", hover_name="country", labels={
                 "depression_rates": "Prevalence - Depressive disorders (Depression Rate)",
                 "suicide_rates": "Deaths - Self-harm (Suicide Rate)",
                 "population": "Population",
                 "continent": "Continent"
             },
           log_x=False, size_max=55, title="Suicide rates vs. prevalence of depression (all sexes and ages worldwide).")
fig.show()

In [64]:
fig = px.scatter(df_suicide_and_depression_rates, x="depression_rates", y="suicide_rates", animation_frame="year", animation_group="country",
           size="population", color="continent", hover_name="country", facet_col="continent", labels={
                 "depression_rates": "",
                 "suicide_rates": "Deaths - Self-harm (Suicide Rate)",
                 "population": "Population",
                 "continent": "Continent"
             },
           log_x=False, size_max=55, title="Suicide rates vs. prevalence of depression (worldwide).")
fig.show()

In [None]:
df_suicide_and_depression_rates

In [62]:
fig = px.area(df_suicide_and_depression_rates, x="year", y="suicide_rates", color="continent", line_group="country")
fig.show()

In [63]:
fig = px.area(df_suicide_and_depression_rates, x="year", y="depression_rates", color="continent", line_group="country")
fig.show()

In [65]:
fig = px.line(df_suicide_and_depression_rates, x="year", y="depression_rates", color="continent", line_group="country", hover_name="country",
        line_shape="spline", render_mode="svg")
fig.show()

In [66]:
fig = px.line(df_suicide_and_depression_rates, x="year", y="suicide_rates", color="continent", line_group="country", hover_name="country",
        line_shape="spline", render_mode="svg")
fig.show()