## Steps 1 - 5

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

%matplotlib inline

## Step 6

In [2]:
# Read the raw GDP-per-capita table from the local data directory.
gdp_df = pd.read_csv('data/gdp_per_capita.csv')

In [3]:
# Preview the first ten rows of the raw table.
gdp_df.head(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2017,1758.465636,
1,Afghanistan,2016,1757.02349,
2,Afghanistan,2015,1766.593077,
3,Afghanistan,2014,1795.735834,
4,Afghanistan,2013,1807.762344,
5,Afghanistan,2012,1772.764974,
6,Afghanistan,2011,1626.764793,
7,Afghanistan,2010,1671.581238,
8,Afghanistan,2009,1502.354073,
9,Afghanistan,2008,1267.644939,


In [4]:
# Show the last five rows of the raw table (this is where any trailing non-data rows would appear).
gdp_df.tail()

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
6497,Zimbabwe,1992,2591.007534,
6498,Zimbabwe,1991,2906.272849,
6499,Zimbabwe,1990,2819.549467,
6500,footnoteSeqID,Footnote,,
6501,2,"Excludes South Sudan after July 9, 2011.",,


In [5]:
# Keep only the first 6500 rows (positions 0-6499); the rows after them are
# footnote text from the end of the CSV, not observations.
# Alternative: pass skipfooter to read_csv when loading.
gdp_df = gdp_df.iloc[:6500]

In [6]:
# Re-display the last five rows after trimming.
gdp_df.tail()

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
6495,Zimbabwe,1994,2768.309953,
6496,Zimbabwe,1993,2572.870395,
6497,Zimbabwe,1992,2591.007534,
6498,Zimbabwe,1991,2906.272849,
6499,Zimbabwe,1990,2819.549467,


## Step 7

In [7]:
# (rows, columns) of the trimmed table.
gdp_df.shape

(6500, 4)

In [8]:
# Column dtypes and non-null counts for the trimmed table.
gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 4 columns):
Country or Area    6500 non-null object
Year               6500 non-null object
Value              6500 non-null float64
Value Footnotes    1 non-null float64
dtypes: float64(2), object(2)
memory usage: 203.2+ KB


## Step 8 

In [9]:
# Remove the footnote column, then give the remaining columns shorter names.
gdp_df = (
    gdp_df
    .drop(columns=['Value Footnotes'])
    .rename(columns={'Country or Area': 'Country', 'Value': 'GDP_Per_Capita'})
)

In [10]:
# Preview the first five rows with the renamed columns.
gdp_df.head()

Unnamed: 0,Country,Year,GDP_Per_Capita
0,Afghanistan,2017,1758.465636
1,Afghanistan,2016,1757.02349
2,Afghanistan,2015,1766.593077
3,Afghanistan,2014,1795.735834
4,Afghanistan,2013,1807.762344


## Step 9(a)

In [11]:
# Read the continents table from the local data directory.
continents_df = pd.read_csv('data/continents.csv')

In [12]:
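# Country names to normalise in gdp_df.
# The left-hand spellings are the garbled forms to search for, so they must stay exactly as written:
# * Change "CÃ´te d'Ivoire" to "Ivory Coast"
# * Change "CuraÃ§ao" to "Curaçao"
# * Change "SÃ£o TomÃ© and Principe" to "Sao Tome and Principe"
# * Change "Sint Maarten (Dutch part)" to "Sint Maarten"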

In [13]:
# Column dtypes and non-null counts before the country names are normalised.
gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 3 columns):
Country           6500 non-null object
Year              6500 non-null object
GDP_Per_Capita    6500 non-null float64
dtypes: float64(1), object(2)
memory usage: 152.4+ KB


In [14]:
# Normalise country names (presumably so they line up with continents_df in the
# merge that follows -- confirm against that table's spellings).
# The dict keys are the garbled spellings to search for and must stay exactly as
# written; the values are the clean replacements.
gdp_df = gdp_df.replace({
    "CÃ´te d'Ivoire": 'Ivory Coast',
    'CuraÃ§ao': 'Curaçao',
    'SÃ£o TomÃ© and Principe': 'Sao Tome and Principe',
    # Fixed: the replacement used to be ' "Sint Maarten' (stray leading space and
    # double quote), which could never equal the clean name "Sint Maarten".
    'Sint Maarten (Dutch part)': 'Sint Maarten',
})

In [15]:
# Spot-check that the renamed country now appears under its new spelling.
gdp_df.loc[gdp_df['Country'] == 'Ivory Coast']

Unnamed: 0,Country,Year,GDP_Per_Capita
792,Ivory Coast,2017,3564.595846
793,Ivory Coast,2016,3395.088932
794,Ivory Coast,2015,3225.188859
795,Ivory Coast,2014,3038.844827
796,Ivory Coast,2013,2864.048627
797,Ivory Coast,2012,2696.191639
798,Ivory Coast,2011,2495.495125
799,Ivory Coast,2010,2673.013083
800,Ivory Coast,2009,2682.036638
801,Ivory Coast,2008,2657.672981


## Step 9(b)

In [16]:
# Inner join: rows whose key is missing from either table are dropped.
# No `on=` is passed, so pandas joins on the column(s) the two frames share.
gdp_df = continents_df.merge(gdp_df, how='inner')

In [17]:
# Preview the merged table; show only the first rows rather than dumping the whole frame.
gdp_df.head(10)

Unnamed: 0,Continent,Country,Year,GDP_Per_Capita
0,Asia,Afghanistan,2017,1758.465636
1,Asia,Afghanistan,2016,1757.023490
2,Asia,Afghanistan,2015,1766.593077
3,Asia,Afghanistan,2014,1795.735834
4,Asia,Afghanistan,2013,1807.762344
5,Asia,Afghanistan,2012,1772.764974
6,Asia,Afghanistan,2011,1626.764793
7,Asia,Afghanistan,2010,1671.581238
8,Asia,Afghanistan,2009,1502.354073
9,Asia,Afghanistan,2008,1267.644939


## Step 10 

In [18]:
# Column dtypes, non-null counts and memory usage of the merged table.
gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5217 entries, 0 to 5216
Data columns (total 4 columns):
Continent         5217 non-null object
Country           5217 non-null object
Year              5217 non-null object
GDP_Per_Capita    5217 non-null float64
dtypes: float64(1), object(3)
memory usage: 203.8+ KB


In [19]:
%%time

gdp_df.groupby('Continent').GDP_Per_Capita.mean()

CPU times: user 3.17 ms, sys: 322 µs, total: 3.49 ms
Wall time: 3.73 ms


Continent
Africa            4588.289835
Asia             21174.893215
Europe           26497.245133
North America    19068.562312
Oceania           8818.318792
South America    10989.950644
Name: GDP_Per_Capita, dtype: float64

In [20]:
# Store Continent as a pandas categorical instead of plain objects.
gdp_df = gdp_df.astype({'Continent': 'category'})

In [21]:
# Preview the table after the dtype change; show only the first rows rather than dumping the whole frame.
gdp_df.head(10)

Unnamed: 0,Continent,Country,Year,GDP_Per_Capita
0,Asia,Afghanistan,2017,1758.465636
1,Asia,Afghanistan,2016,1757.023490
2,Asia,Afghanistan,2015,1766.593077
3,Asia,Afghanistan,2014,1795.735834
4,Asia,Afghanistan,2013,1807.762344
5,Asia,Afghanistan,2012,1772.764974
6,Asia,Afghanistan,2011,1626.764793
7,Asia,Afghanistan,2010,1671.581238
8,Asia,Afghanistan,2009,1502.354073
9,Asia,Afghanistan,2008,1267.644939


In [22]:
%%time

gdp_df.groupby('Continent').GDP_Per_Capita.mean()

CPU times: user 2.08 ms, sys: 1.31 ms, total: 3.39 ms
Wall time: 5.64 ms


Continent
Africa            4588.289835
Asia             21174.893215
Europe           26497.245133
North America    19068.562312
Oceania           8818.318792
South America    10989.950644
Name: GDP_Per_Capita, dtype: float64