### Import dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Import dataset

In [2]:
pop_df = pd.read_csv("Data/population_total.csv")
inet_df = pd.read_csv("Data/internet_users.csv")
cell_df = pd.read_csv("Data/cell_phones_total.csv")
suic_df = pd.read_csv("Data/suicide_total_deaths.csv")
#gini_df = pd.read_csv("Data/inequality_index_gini.csv")
gdp_df = pd.read_csv("Data/gdppercapita.csv")

### Investigate dataset

### Steps:
- read in dataset
- check the following:
    - null values
    - decide what to do with null values (drop, fill with 0, average, etc)
    - what columns to keep (drop unneccessary columns) - keep columns from 1990 - 2016 (decide to investigate those 
        years + data available for these years - the dataset with the least data = suicide data 1990 - 2016)
    - shape
    
### Notes

- see what years to use and drop the rest
- transpose datasets
- merge datasets

In [3]:
# Check imported datasets
pop_df.head(3)

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,Afghanistan,3280000,3280000,3280000,3280000,3280000,3280000,3280000,3280000,3280000,...,76600000,76400000,76300000,76100000,76000000,75800000,75600000,75400000,75200000,74900000
1,Albania,400000,402000,404000,405000,407000,409000,411000,413000,414000,...,1330000,1300000,1270000,1250000,1220000,1190000,1170000,1140000,1110000,1090000
2,Algeria,2500000,2510000,2520000,2530000,2540000,2550000,2560000,2560000,2570000,...,70400000,70500000,70500000,70600000,70700000,70700000,70700000,70700000,70700000,70700000


In [4]:
# Check imported datasets
inet_df.head(3)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,,,,,,,,,,...,4.0,5.0,5.45,5.9,7.0,8.26,,11.4,,
1,Albania,,,,,,,,,,...,45.0,49.0,54.7,57.2,60.1,63.3,66.4,71.8,,69.6
2,Algeria,,,,,,,,,,...,12.5,14.9,18.2,22.5,29.5,38.2,42.9,47.7,49.0,


In [5]:
# Check imported datasets
cell_df.head(3)

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,0.0,,,,,0.0,,,,...,10200000.0,13800000.0,15300000.0,16800000.0,18400000.0,19700000,21600000.0,23900000.0,22000000.0,22600000.0
1,Albania,0.0,,,,,0.0,,,,...,2690000.0,3100000.0,3500000.0,3690000.0,3360000.0,3400000,3370000.0,3630000.0,2710000.0,2630000.0
2,Algeria,0.0,,,,,0.0,,,,...,32800000.0,35600000.0,37500000.0,39500000.0,43300000.0,43200000,47000000.0,45800000.0,47200000.0,47100000.0


In [6]:
# Check imported datasets
suic_df.head(3)

Unnamed: 0,country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,703.0,754.0,820.0,894.0,977.0,1050.0,1100.0,1130.0,1170.0,...,1680.0,1710.0,1750.0,1760.0,1810.0,1870.0,1990.0,2080.0,2170.0,2250.0
1,Albania,127.0,130.0,131.0,135.0,136.0,142.0,150.0,162.0,170.0,...,204.0,205.0,201.0,195.0,191.0,188.0,186.0,184.0,183.0,181.0
2,Algeria,806.0,822.0,843.0,866.0,888.0,912.0,941.0,983.0,1020.0,...,1240.0,1250.0,1270.0,1290.0,1310.0,1340.0,1370.0,1410.0,1420.0,1440.0


In [7]:
# Check imported datasets
gdp_df.head(3)

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040
0,Afghanistan,603,603,603,603,603,603,603,603,603,...,2550,2600,2660,2710,2770,2820,2880,2940,3000,3060
1,Albania,667,667,667,667,667,668,668,668,668,...,19400,19800,20200,20600,21000,21500,21900,22300,22800,23300
2,Algeria,715,716,717,718,719,720,721,722,723,...,14300,14600,14900,15200,15500,15800,16100,16500,16800,17100


### Drop columns

In [8]:
# Keep only the columns 1990 - 2016 - try to do this in the function or for loop!
# suicide dataset already sliced (1990 - 2016)
# QUESTION - is there better way of doing this 
pop_df = pop_df.iloc[:, np.r_[:1, 191:218]]
inet_df = inet_df.iloc[:, np.r_[:1, 31:58]]
cell_df = cell_df.iloc[:, np.r_[:1, 31:58]]
#gini_df = gini_df.iloc[:, np.r_[:1, 24:51]]
gdp_df = gdp_df.iloc[:, np.r_[:1, 191:218]]

## Handling null values

- After investigating datasets the nature of internet use, cell phones - there was increasing trends in all countries - support that with line plots!!! therefore I decided to fill null values with fill forward (research this:  Interpolation here: https://stackabuse.com/python-how-to-handle-missing-dataframe-values-in-pandas/) + investigate how to find average between missing cells and fillna with this.

- GDP: see the trend and fill up with rolling window 5?
- Suicide: see the trend andfill with overall average
- gini: see the trend rolliing window and weight on the previous velue??? suggestion for the window_type!

In [9]:
# fill first colum [1990] with null if empty
inet_df['1990'].fillna(0, inplace=True)
cell_df['1990'].fillna(0, inplace=True)

In [10]:
# fill null values for internet and cell phone
inet_df = inet_df.fillna(method='ffill', axis=1)
cell_df = cell_df.fillna(method='ffill', axis=1)

In [11]:
# Check null values for all dataframe:
#suic_df.isnull().sum()

### Unpivot dataframes

In [12]:
pop_df = pop_df.melt(id_vars = 'country', var_name = 'year', value_name = 'population_total', ignore_index=True)
inet_df = inet_df.melt(id_vars = 'country', var_name = 'year', value_name = 'internet_use', ignore_index=True)
cell_df = cell_df.melt(id_vars = 'country', var_name = 'year', value_name = 'cell_use', ignore_index=True)
suic_df = suic_df.melt(id_vars = 'country', var_name = 'year', value_name = 'suicide_total', ignore_index=True)
#gini_df = gini_df.melt(id_vars = 'country', var_name = 'year', value_name = 'gini_index', ignore_index=True)
gdp_df = gdp_df.melt(id_vars = 'country', var_name = 'year', value_name = 'GDP', ignore_index=True)

### Merge datasets
- questions: can I merge all in one step?
- how is the best way to check that the merges are correct??


In [13]:
merged01 = pd.merge(pop_df, inet_df, how="inner", on=["country", "year"])
merged02 = pd.merge(suic_df, gdp_df, how="inner", on=["country", "year"])

In [14]:
merged03_df = pd.merge(merged01, merged02, how="inner", on=["country", "year"])

In [15]:
merged_df = pd.merge(merged03_df, cell_df, how="inner", on=["country", "year"])

In [16]:
merged_df

Unnamed: 0,country,year,population_total,internet_use,suicide_total,GDP,cell_use
0,Afghanistan,1990,12400000,0,703.00,1860,0
1,Albania,1990,3290000,0,127.00,4460,0
2,Algeria,1990,25800000,0,806.00,10300,470
3,Andorra,1990,54500,0,5.45,28400,0
4,Angola,1990,11800000,0,645.00,4760,0
...,...,...,...,...,...,...,...
5044,Venezuela,2016,29900000,60,2970.00,15200,2.76e+07
5045,Vietnam,2016,93600000,53,7740.00,5900,1.21e+08
5046,Yemen,2016,27200000,24.6,1450.00,2620,1.64e+07
5047,Zambia,2016,16400000,25.5,1380.00,3700,1.2e+07


In [17]:
merged_df.to_csv('Data/merged.csv', index=False)

### Data Wrangling (null, duplicates, etc)

In [18]:
# Shape
merged_df.shape

(5049, 7)

In [19]:
# Duplicate rows
merged_df.duplicated().sum()

0

In [20]:
# Missing values - decide what to do with null values
merged_df.isnull().sum()

country             0
year                0
population_total    0
internet_use        0
suicide_total       0
GDP                 0
cell_use            0
dtype: int64

In [21]:
# Check individual datatypes - convert year to int
merged_df.dtypes

country              object
year                 object
population_total      int64
internet_use         object
suicide_total       float64
GDP                   int64
cell_use             object
dtype: object

In [None]:
## Change datatypes - scientific to float

In [None]:
# Filling the nulls with the rollin average
#merged_df.internet_use = merged_df.internet_use.fillna(merged_df.internet_use.rolling(6, min_periods=1).mean())

In [None]:
#merged_df

In [None]:
merged_df.isnull().sum()

In [None]:
# Export to csv to investigate
#merged_df.to_csv('Data/merged_w_rolling_mean_net.csv', index=False)