Connect to Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive so files stored there can be read from this Colab runtime.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import the data file

In [3]:
# Path of the WDI CSV inside the mounted Drive; the file is read into `df`.
WDI_CSV_PATH = '/content/drive/MyDrive/Time Series/WDIData.csv'
df = pd.read_csv(WDI_CSV_PATH)

# 1 - Data Analysis

### 1.1 Check how the data looks

In [4]:
# Preview the first rows to see the layout: identifier columns followed by
# one column per year.
df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
0,Arab World,ARB,"2005 PPP conversion factor, GDP (LCU per inter...",PA.NUS.PPP.05,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"2005 PPP conversion factor, private consumptio...",PA.NUS.PRVT.PP.05,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.368101,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,
3,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,86.00762,86.428272,87.070576,88.176836,87.342739,89.130121,89.678685,90.273687,,
4,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,73.466653,73.942103,75.244104,77.162305,75.538976,78.741152,79.665635,80.749293,,


### 1.2 - Dataset Size and what the columns are

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(422136, 64)

In [6]:
# List every column label of the loaded frame.
df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', 'Unnamed: 63'],
      dtype='object')

### 1.3 - Overview of column data types and missing values

In [7]:
# DataFrame.info() writes its report (dtypes and non-null counts) directly to
# stdout and returns None, so it is called on its own: wrapping it in print()
# would append a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422136 entries, 0 to 422135
Data columns (total 64 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Country Name    422136 non-null  object 
 1   Country Code    422136 non-null  object 
 2   Indicator Name  422136 non-null  object 
 3   Indicator Code  422136 non-null  object 
 4   1960            38262 non-null   float64
 5   1961            41977 non-null   float64
 6   1962            44156 non-null   float64
 7   1963            44036 non-null   float64
 8   1964            44564 non-null   float64
 9   1965            47250 non-null   float64
 10  1966            46847 non-null   float64
 11  1967            48579 non-null   float64
 12  1968            48158 non-null   float64
 13  1969            49961 non-null   float64
 14  1970            92948 non-null   float64
 15  1971            99195 non-null   float64
 16  1972            101947 non-null  float64
 17  1973      

### 1.4 - Summary statistics for numerical columns

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each
# numeric column.
df.describe()

Unnamed: 0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
count,38262.0,41977.0,44156.0,44036.0,44564.0,47250.0,46847.0,48579.0,48158.0,49961.0,...,236962.0,227654.0,229276.0,224822.0,230038.0,224078.0,215692.0,189735.0,83318.0,0.0
mean,244222000000.0,241809200000.0,244195100000.0,252034800000.0,269816300000.0,286876600000.0,311035000000.0,325607900000.0,378099500000.0,409261700000.0,...,1648256000000.0,1919604000000.0,2053926000000.0,2330338000000.0,2475555000000.0,2643786000000.0,2990721000000.0,3537043000000.0,5677840000000.0,
std,10016050000000.0,10494080000000.0,10948890000000.0,11512030000000.0,12392870000000.0,13695540000000.0,15141730000000.0,16526510000000.0,19036480000000.0,21523780000000.0,...,70470390000000.0,80391510000000.0,86662810000000.0,99359720000000.0,109046400000000.0,115718600000000.0,130633800000000.0,148620300000000.0,183794300000000.0,
min,-334419100000000.0,-382300000000000.0,-432795500000000.0,-476987500000000.0,-533935800000000.0,-626471600000000.0,-712002400000000.0,-824513400000000.0,-937946700000000.0,-1122499000000000.0,...,-182770900000000.0,-216892700000000.0,-301384500000000.0,-349561700000000.0,-518938500000000.0,-853088200000000.0,-1374105000000000.0,-1345846000000000.0,-459995100000000.0,
25%,4.27207,4.601415,4.53314,4.753457,4.831391,4.644542,4.872674,4.760664,5.040681,4.536045,...,4.914968,5.399331,5.018739,5.158872,5.105746,5.252173,4.976971,4.997969,7.0,
50%,35.285,36.81016,35.84284,35.35911,36.41009,38.25296,39.55614,41.17647,41.97375,39.10783,...,41.978,46.31169,45.36574,45.57322,46.21599,45.868,45.28034,48.97,47.17015,
75%,233968.8,214585.9,139370.8,187005.0,230268.8,360347.1,459350.5,434540.0,652067.5,560000.0,...,235185.8,490217.5,420000.2,612913.2,336900.0,504804.2,976219.7,6664168.0,6596510.0,
max,835372800000000.0,931253600000000.0,1011252000000000.0,1084815000000000.0,1184553000000000.0,1388505000000000.0,1544572000000000.0,1724250000000000.0,1979579000000000.0,2302920000000000.0,...,6864133000000000.0,7831726000000000.0,8676844000000000.0,1.00018e+16,1.155185e+16,1.167182e+16,1.319219e+16,1.534503e+16,1.491231e+16,


### 1.4 - See how many **unique** *Indicator Name* and *Country Code* values are present

In [10]:
# Number of distinct values in the 'Indicator Name' column.
df['Indicator Name'].nunique()

1599

In [11]:
# Number of distinct values in the 'Country Code' column.
df['Country Code'].nunique()

264

### 1.5 - Creating a new DataFrame where an Indicator Name matches a string

In [16]:
# Pick one indicator to work with: build the array of unique indicator names
# (pandas returns them in order of first appearance) and select a single
# entry by position.
# NOTE: a positional index is tied to the row order of this particular CSV;
# it will point at a different indicator if the file's contents change.
INDICATOR_POSITION = 489
indicator_1 = df["Indicator Name"].unique()[INDICATOR_POSITION]
indicator_1  # echo the selected name so the cell shows which indicator was picked

'Firms using banks to finance working capital (% of firms)'