In [None]:
import os
from google.colab import drive

# Mount Google Drive so the CSV files stored there are reachable from Colab.
mount_point = '/content/drive'
drive.mount(mount_point)

# Raw Kaggle export (input) and the cleaned copy this notebook writes (output).
dataset = '/content/drive/My Drive/visual/data/kaggle_data.csv'
to_save = '/content/drive/My Drive/visual/data/preprocessed_data.csv'

print(dataset)

Mounted at /content/drive
/content/drive/My Drive/visual/data/kaggle_data.csv


In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Load the raw CSV into a DataFrame and show its first rows as a sanity check.
data_df = pd.read_csv(dataset)
first_rows = data_df.head()
print(first_rows)

   country  year  ... gdp_per_capita ($)       generation
0  Albania  1987  ...                796     Generation X
1  Albania  1987  ...                796           Silent
2  Albania  1987  ...                796     Generation X
3  Albania  1987  ...                796  G.I. Generation
4  Albania  1987  ...                796          Boomers

[5 rows x 12 columns]


Get cleaner names for columns

In [None]:
# Raw CSV header -> cleaned column name. The GDP-per-capita header is listed
# both with and without surrounding spaces so either spelling is handled.
column_renames = {
    "suicides/100k pop": "suicides_pop",
    "HDI for year": "HDI_for_year",
    " gdp_for_year ($) ": "gdp_for_year",
    " gdp_per_capita ($) ": "gdp_per_capita",
    "gdp_per_capita ($)": "gdp_per_capita",
}
data_df = data_df.rename(columns=column_renames)
print(data_df.columns)

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides_pop', 'country-year', 'HDI_for_year', 'gdp_for_year',
       'gdp_per_capita', 'generation'],
      dtype='object')


Check for null values

In [None]:
# Number of missing values in each column (displayed as the cell's result).
data_df.isna().sum()

country               0
year                  0
sex                   0
age                   0
suicides_no           0
population            0
suicides_pop          0
country-year          0
HDI_for_year      19456
gdp_for_year          0
gdp_per_capita        0
generation            0
dtype: int64

HDI_for_year has mostly null values and therefore we will ignore this parameter. We remove also redundant parameters

In [None]:
# Columns removed in one pass: the mostly-null HDI column plus the two
# columns the analysis treats as redundant.
columns_to_drop = ['HDI_for_year', 'country-year', 'generation']
data_df = data_df.drop(columns=columns_to_drop)
print(data_df.columns)

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides_pop', 'gdp_for_year', 'gdp_per_capita'],
      dtype='object')


Better values encoding

In [None]:
# Strip the thousands separators and store GDP as a 64-bit integer.
# Casting to str first keeps this cell re-runnable: once the column is already
# int64, the .str accessor would otherwise raise. regex=False makes the comma a
# literal on every pandas version.
data_df["gdp_for_year"] = (
    data_df["gdp_for_year"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(np.int64)
)

# Shorten the age-group labels with an exact, whole-value mapping.
# The previous chained str.replace passed "75+ years" as the pattern; on pandas
# versions where str.replace defaults to regex=True the "+" is a quantifier, so
# the literal label "75+ years" never matched and was left unchanged.
# Series.replace with a dict compares whole values literally and leaves
# already-shortened labels alone, so re-running the cell is safe.
age_labels = {
    "5-14 years": "05-14",  # zero-padded so the labels sort in age order
    "15-24 years": "15-24",
    "25-34 years": "25-34",
    "35-54 years": "35-54",
    "55-74 years": "55-74",
    "75+ years": "75+",
}
data_df["age"] = data_df["age"].replace(age_labels)

Replace names of countries that are different from the geojson

In [None]:
# Dataset country name -> name used by the GeoJSON.
# Series.replace with a dict matches whole values only. The previous chained
# str.replace did substring substitution, so re-running the cell produced
# "Republic of Republic of Serbia" / "The The Bahamas", and any longer name
# containing one of these keys would have been rewritten as well.
country_renames = {
    "United States": "USA",
    "United Kingdom": "England",
    "Republic of Korea": "South Korea",
    "Russian Federation": "Russia",
    "Serbia": "Republic of Serbia",
    "Bahamas": "The Bahamas",
}
data_df["country"] = data_df["country"].replace(country_renames)

In [None]:
# Spot-check ten randomly chosen rows after the cleaning steps.
random_rows = data_df.sample(n=10)
print(random_rows)

                            country  year  ...    gdp_for_year gdp_per_capita
13931                    Kazakhstan  2006  ...     81003884545           5770
21205                        Russia  2011  ...   2051661732060          15226
26324                       Ukraine  2008  ...    179992405832           4104
21420                   Saint Lucia  1996  ...       662196185           5031
27089                           USA  2005  ...  13093726000000          47423
4577                       Bulgaria  1987  ...     28101000000           3355
26931                           USA  1991  ...   6174043000000          26503
24580                        Sweden  2004  ...    381705425302          44831
21840  Saint Vincent and Grenadines  2008  ...       695428852           6966
14193                        Kuwait  1986  ...     17903681693          11661

[10 rows x 9 columns]


In [None]:
# Non-null entries per column; equal counts mean no column has gaps.
non_null_per_column = data_df.count()
print(non_null_per_column)

country           27820
year              27820
sex               27820
age               27820
suicides_no       27820
population        27820
suicides_pop      27820
gdp_for_year      27820
gdp_per_capita    27820
dtype: int64


In [None]:
# Write the cleaned table to Drive; the row index is not stored in the file.
data_df.to_csv(path_or_buf=to_save, index=False)