Connect to the google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Import Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import the data file

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Time Series/WDIData.csv')

# 1 - Data Analysis

### 1.1 Check how the data looks

In [5]:
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
0,Arab World,ARB,"2005 PPP conversion factor, GDP (LCU per inter...",PA.NUS.PPP.05,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"2005 PPP conversion factor, private consumptio...",PA.NUS.PRVT.PP.05,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.368101,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,
3,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,86.00762,86.428272,87.070576,88.176836,87.342739,89.130121,89.678685,90.273687,,
4,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,73.466653,73.942103,75.244104,77.162305,75.538976,78.741152,79.665635,80.749293,,


### 1.2 - Dataset Size and what the columns are

In [6]:
df.shape

(422136, 64)

In [7]:
df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', 'Unnamed: 63'],
      dtype='object')

### 1.3 - Overview of column data types and missing values

In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422136 entries, 0 to 422135
Data columns (total 64 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Country Name    422136 non-null  object 
 1   Country Code    422136 non-null  object 
 2   Indicator Name  422136 non-null  object 
 3   Indicator Code  422136 non-null  object 
 4   1960            38262 non-null   float64
 5   1961            41977 non-null   float64
 6   1962            44156 non-null   float64
 7   1963            44036 non-null   float64
 8   1964            44564 non-null   float64
 9   1965            47250 non-null   float64
 10  1966            46847 non-null   float64
 11  1967            48579 non-null   float64
 12  1968            48158 non-null   float64
 13  1969            49961 non-null   float64
 14  1970            92948 non-null   float64
 15  1971            99195 non-null   float64
 16  1972            101947 non-null  float64
 17  1973      

### 1.4 - Summary statistics for numerical columns

In [9]:
df.describe()

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
count,38262.0,41977.0,44156.0,44036.0,44564.0,47250.0,46847.0,48579.0,48158.0,49961.0,...,236962.0,227654.0,229276.0,224822.0,230038.0,224078.0,215692.0,189735.0,83318.0,0.0
mean,244222000000.0,241809200000.0,244195100000.0,252034800000.0,269816300000.0,286876600000.0,311035000000.0,325607900000.0,378099500000.0,409261700000.0,...,1648256000000.0,1919604000000.0,2053926000000.0,2330338000000.0,2475555000000.0,2643786000000.0,2990721000000.0,3537043000000.0,5677840000000.0,
std,10016050000000.0,10494080000000.0,10948890000000.0,11512030000000.0,12392870000000.0,13695540000000.0,15141730000000.0,16526510000000.0,19036480000000.0,21523780000000.0,...,70470390000000.0,80391510000000.0,86662810000000.0,99359720000000.0,109046400000000.0,115718600000000.0,130633800000000.0,148620300000000.0,183794300000000.0,
min,-334419100000000.0,-382300000000000.0,-432795500000000.0,-476987500000000.0,-533935800000000.0,-626471600000000.0,-712002400000000.0,-824513400000000.0,-937946700000000.0,-1122499000000000.0,...,-182770900000000.0,-216892700000000.0,-301384500000000.0,-349561700000000.0,-518938500000000.0,-853088200000000.0,-1374105000000000.0,-1345846000000000.0,-459995100000000.0,
25%,4.27207,4.601415,4.53314,4.753457,4.831391,4.644542,4.872674,4.760664,5.040681,4.536045,...,4.914968,5.399331,5.018739,5.158872,5.105746,5.252173,4.976971,4.997969,7.0,
50%,35.285,36.81016,35.84284,35.35911,36.41009,38.25296,39.55614,41.17647,41.97375,39.10783,...,41.978,46.31169,45.36574,45.57322,46.21599,45.868,45.28034,48.97,47.17015,
75%,233968.8,214585.9,139370.8,187005.0,230268.8,360347.1,459350.5,434540.0,652067.5,560000.0,...,235185.8,490217.5,420000.2,612913.2,336900.0,504804.2,976219.7,6664168.0,6596510.0,
max,835372800000000.0,931253600000000.0,1011252000000000.0,1084815000000000.0,1184553000000000.0,1388505000000000.0,1544572000000000.0,1724250000000000.0,1979579000000000.0,2302920000000000.0,...,6864133000000000.0,7831726000000000.0,8676844000000000.0,1.00018e+16,1.155185e+16,1.167182e+16,1.319219e+16,1.534503e+16,1.491231e+16,


#### 1.4 - Check for missing values

In [10]:
# Count missing values in each column
print(df.isnull().sum())

Country Name           0
Country Code           0
Indicator Name         0
Indicator Code         0
1960              383874
                   ...  
2015              198058
2016              206444
2017              232401
2018              338818
Unnamed: 63       422136
Length: 64, dtype: int64


### 1.5 - See how many **unique** *Indicator Name* and *Country Code* are present

In [11]:
df['Indicator Name'].nunique()

1599

In [12]:
df['Country Code'].nunique()

264

### 1.6 - Creating a new dataframe where a Indicator Name matches:

*   GDP per capita (current US$)
*   CO2 emissions (metric tons per capita)





In [13]:
# rows with Indicator name matching with 'GDP per capita (current US$)' will be returned
gdp_per_capita_df = df.loc[df['Indicator Name'] == 'GDP per capita (current US$)']
gdp_per_capita_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
523,Arab World,ARB,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,5945.678558,6889.091806,7503.174184,7551.282834,7497.726925,6458.702258,6202.690406,6284.737723,6625.507319,
2122,Caribbean small states,CSS,GDP per capita (current US$),NY.GDP.PCAP.CD,447.101607,475.671874,493.461134,515.410838,546.537661,579.448366,...,9075.706457,9743.619786,10007.371148,10090.027335,10149.197121,9916.666674,9242.035178,9442.912794,9910.397885,
3721,Central Europe and the Baltics,CEB,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,12567.404996,13872.615410,12998.665269,13710.080616,14152.393835,12467.450176,12773.783836,14167.649103,15902.299611,
5320,Early-demographic dividend,EAR,GDP per capita (current US$),NY.GDP.PCAP.CD,155.827817,152.920933,154.319928,159.763265,179.066848,189.604165,...,2938.879732,3246.211837,3354.503243,3367.696207,3439.546743,3291.528607,3321.457299,3545.022221,3581.455982,
6919,East Asia & Pacific,EAS,GDP per capita (current US$),NY.GDP.PCAP.CD,147.229032,147.154368,148.505207,161.779035,181.616787,197.573087,...,7673.741646,8836.854594,9392.895426,9433.150535,9647.929555,9526.576304,9766.122869,10332.832262,11132.229665,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414664,Virgin Islands (U.S.),VIR,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,40043.190166,39144.165774,37849.728721,34819.147755,33573.097030,34797.140470,35931.541252,35938.024388,,
416263,West Bank and Gaza,PSE,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,2354.125987,2695.193853,2834.021525,3060.312389,3046.821798,2967.851747,3074.291152,3254.485887,3198.866644,
417862,"Yemen, Rep.",YEM,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,1334.784845,1374.621313,1446.536280,1607.152275,1674.002766,1608.744312,1139.870568,963.494721,944.408499,
419461,Zambia,ZMB,GDP per capita (current US$),NY.GDP.PCAP.CD,232.188867,220.042137,212.578123,213.896441,242.384333,303.281910,...,1489.459070,1672.949830,1763.094184,1878.903489,1763.056239,1332.194321,1280.578447,1534.865371,1539.900158,


In [14]:
# rows with Indicator name matching with 'CO2 emissions (metric tons per capita)' will be returned
co2Emi_metricTon_perCapita_df = df.loc[df['Indicator Name'] == 'CO2 emissions (metric tons per capita)']
co2Emi_metricTon_perCapita_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
207,Arab World,ARB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.645736,0.687465,0.763574,0.878238,1.003053,1.170540,...,4.636813,4.559462,4.837780,4.674925,4.886988,,,,,
1806,Caribbean small states,CSS,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.401337,2.059977,2.703738,1.350237,2.362947,2.601916,...,9.291597,9.152374,8.888217,9.132325,8.871220,,,,,
3405,Central Europe and the Baltics,CEB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,5.104206,5.330535,5.658550,6.064745,6.454644,6.604577,...,6.828034,6.864349,6.504367,6.344838,6.148883,,,,,
5004,Early-demographic dividend,EAR,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.594013,0.604162,0.624157,0.638623,0.665061,0.695949,...,2.120645,2.202526,2.297640,2.213076,2.298216,,,,,
6603,East Asia & Pacific,EAS,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.162460,0.996779,0.895692,0.914268,0.942162,0.998188,...,5.702481,6.190753,6.307835,6.331191,6.294256,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414348,Virgin Islands (U.S.),VIR,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,,,,,,,,,,
415947,West Bank and Gaza,PSE,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,0.537533,0.578903,0.552814,0.598168,,,,,,
417546,"Yemen, Rep.",YEM,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.683679,0.494324,0.710130,0.525293,0.644143,0.686312,...,1.011975,0.826044,0.762823,1.013900,0.878996,,,,,
419145,Zambia,ZMB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,0.946606,1.096876,...,0.197823,0.209458,0.253506,0.265078,0.292412,,,,,


In [15]:
print(gdp_per_capita_df.shape)
print(co2Emi_metricTon_perCapita_df.shape)

(264, 64)
(264, 64)


#### 1.7 - Change the indices of these new dataframes and assign them country codes as new idices

In [16]:
gdp_per_capita_df.index = gdp_per_capita_df["Country Code"]
gdp_per_capita_df.index

Index(['ARB', 'CSS', 'CEB', 'EAR', 'EAS', 'EAP', 'TEA', 'EMU', 'ECS', 'ECA',
       ...
       'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'VIR', 'PSE', 'YEM', 'ZMB', 'ZWE'],
      dtype='object', name='Country Code', length=264)

In [17]:
co2Emi_metricTon_perCapita_df.index = co2Emi_metricTon_perCapita_df['Country Code']
co2Emi_metricTon_perCapita_df.index

Index(['ARB', 'CSS', 'CEB', 'EAR', 'EAS', 'EAP', 'TEA', 'EMU', 'ECS', 'ECA',
       ...
       'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'VIR', 'PSE', 'YEM', 'ZMB', 'ZWE'],
      dtype='object', name='Country Code', length=264)

#### 1.8 - Take the data for the year **2014**


##### 1.8.1 - Check if there are missing values from each dataframe for the same year

In [18]:
gdp_per_capita_df['2013'].shape

(264,)

In [19]:
sum(gdp_per_capita_df['2013'].isnull())

12

In [20]:
co2Emi_metricTon_perCapita_df['2013'].shape

(264,)

In [21]:
sum(co2Emi_metricTon_perCapita_df['2013'].isnull())

13

##### 1.8.2 - Remove the missing values from each dataframe

In [22]:
# The values that are NOT null will be saved
gdp_per_capita_df_2014 = gdp_per_capita_df['2013'].loc[~gdp_per_capita_df['2013'].isnull()]
gdp_per_capita_df_2014.shape

(252,)

In [23]:
# The values that are NOT null will be saved
co2Emi_metricTon_perCapita_df_2014 = co2Emi_metricTon_perCapita_df['2013'].loc[~co2Emi_metricTon_perCapita_df['2013'].isnull()]
co2Emi_metricTon_perCapita_df_2014.shape

(251,)

##### 1.8.3 - Concatenate the dataframes

*   Concatenate them column wise
*   Rename the column names
*   Check the inconcistency of the data



In [24]:
# Concatenate them column wise
gdp_co2_concat_df_2014 = pd.concat((gdp_per_capita_df_2014, co2Emi_metricTon_perCapita_df_2014), axis=1)
gdp_co2_concat_df_2014.shape

(260, 2)

In [25]:
gdp_co2_concat_df_2014.head()

Unnamed: 0_level_0,2013,2013
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1
ARB,7551.282834,4.674925
CSS,10090.027335,9.132325
CEB,13710.080616,6.344838
EAR,3367.696207,2.213076
EAS,9433.150535,6.331191


In [26]:
# Rename the column names
gdp_co2_concat_df_2014.columns = ['GDP per capita (current US$)', 'CO2 emissions (metric tons per capita)']
gdp_co2_concat_df_2014.head()

Unnamed: 0_level_0,GDP per capita (current US$),CO2 emissions (metric tons per capita)
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1
ARB,7551.282834,4.674925
CSS,10090.027335,9.132325
CEB,13710.080616,6.344838
EAR,3367.696207,2.213076
EAS,9433.150535,6.331191


For some countries, some data might not be available for both GDP and CO2 emission

In [27]:
# For each row check is any of the column contains null value
# Indicating either of the column for corresponding country is empty
# Only keep the rows that return False
gdp_co2_concat_df_2014_filtered = gdp_co2_concat_df_2014.loc[gdp_co2_concat_df_2014.isnull().any(axis= 1)]
gdp_co2_concat_df_2014_filtered.shape

(17, 2)

# 2 - Time Series

In [35]:
gdp_per_capita_df.loc[gdp_per_capita_df['Country Code'] == 'GBR']

Unnamed: 0_level_0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBR,United Kingdom,GBR,GDP per capita (current US$),NY.GDP.PCAP.CD,1380.306241,1452.544709,1513.651449,1592.614477,1729.399978,1850.954769,...,39079.842606,41652.557085,42018.725464,42938.425176,46967.668135,44472.151701,40539.919607,39932.060292,42491.364435,
