In [2]:
import os
from google.colab import drive

# CSV locations inside the mounted Google Drive: the file read later as the
# raw input, and the file the cleaned result is written to.
dataset = '/content/drive/My Drive/visual/data/kaggle_data.csv'
to_save = '/content/drive/My Drive/visual/data/data.csv'

# Mount Google Drive so the paths above resolve, then echo the input path.
drive.mount('/content/drive')
print(dataset)

Mounted at /content/drive
/content/drive/My Drive/visual/data/kaggle_data.csv


In [3]:
import numpy as np 
import pandas as pd

In [4]:
# Read the source CSV into a DataFrame and preview its first five rows.
data_df = pd.read_csv(dataset)
print(data_df.head(n=5))

   country  year  ... gdp_per_capita ($)       generation
0  Albania  1987  ...                796     Generation X
1  Albania  1987  ...                796           Silent
2  Albania  1987  ...                796     Generation X
3  Albania  1987  ...                796  G.I. Generation
4  Albania  1987  ...                796          Boomers

[5 rows x 12 columns]


Get cleaner names for columns

In [5]:
# Map the raw headers to identifier-friendly names. The per-capita GDP header
# is listed both with and without surrounding spaces so either spelling is
# renamed.
column_aliases = {
    "suicides/100k pop": "suicides_pop",
    "HDI for year": "HDI_for_year",
    " gdp_for_year ($) ": "gdp_for_year",
    " gdp_per_capita ($) ": "gdp_per_capita",
    "gdp_per_capita ($)": "gdp_per_capita",
}
data_df.rename(columns=column_aliases, inplace=True)
print(data_df.columns)

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides_pop', 'country-year', 'HDI_for_year', 'gdp_for_year',
       'gdp_per_capita', 'generation'],
      dtype='object')


Check for null values

In [6]:
# Count missing values per column (isna is the canonical name for isnull).
data_df.isna().sum()

country               0
year                  0
sex                   0
age                   0
suicides_no           0
population            0
suicides_pop          0
country-year          0
HDI_for_year      19456
gdp_for_year          0
gdp_per_capita        0
generation            0
dtype: int64

HDI_for_year is null in most rows (19,456 of 27,820), so we drop this column. We also remove the redundant columns country-year and generation.

In [7]:
# Drop the three unwanted columns in a single call and show what remains.
data_df = data_df.drop(columns=['HDI_for_year', 'country-year', 'generation'])
print(data_df.columns)

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides_pop', 'gdp_for_year', 'gdp_per_capita'],
      dtype='object')


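Encode values more cleanly: convert gdp_for_year from a comma-separated string to an integer, and zero-pad the "5-14 years" age label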

In [8]:
# Parse GDP as an integer by stripping the thousands separators. Casting to
# str first keeps this cell re-runnable: after the first run the column is
# already int64 and would no longer expose the .str accessor.
data_df["gdp_for_year"] = (
    data_df["gdp_for_year"]
    .astype(str)
    .str.replace(",", "", regex=False)  # literal comma, independent of pandas' regex default
    .astype(np.int64)
)

# Zero-pad the "5-14 years" label. Anchoring the pattern at the start of the
# string stops a re-run from turning the already-fixed "05-14 years" into
# "005-14 years".
data_df["age"] = data_df["age"].str.replace(r"^5-14 years", "05-14 years", regex=True)

In [9]:
# Spot-check the cleaned values on ten randomly chosen rows.
print(data_df.sample(n=10))

              country  year  ...   gdp_for_year gdp_per_capita
15871           Malta  1990  ...     2547163582           7780
19847     Puerto Rico  1998  ...    54086400000          15610
7135   Czech Republic  1993  ...    40614350197           4186
24694          Sweden  2014  ...   573817719109          62956
3888           Belize  1994  ...      580863700           3399
19982     Puerto Rico  2010  ...    98381268000          28267
6342       Costa Rica  2012  ...    46473128286          10832
25050        Thailand  1994  ...   146683499006           2784
27257         Uruguay  1988  ...     8213515459           2934
23804           Spain  2008  ...  1635015380108          37848

[10 rows x 9 columns]


In [10]:
# Non-null entries per column.
print(data_df.count(axis=0))

country           27820
year              27820
sex               27820
age               27820
suicides_no       27820
population        27820
suicides_pop      27820
gdp_for_year      27820
gdp_per_capita    27820
dtype: int64


In [11]:
# Write the cleaned frame to the output CSV, leaving out the row index.
data_df.to_csv(path_or_buf=to_save, index=False)