## A Guided Exploration of UN data (Gross Domestic Product and Internet Usage)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the GDP-per-capita CSV, reading only the first 6729 data rows.
# NOTE(review): presumably the row cap excludes trailing non-data lines
# in the file — confirm against the raw CSV.
gdp_df = pd.read_csv(
    '../data/gdp_percapita.csv',
    nrows=6729,
)

### Question 5: Look at first 6 rows of gdp_df

In [3]:
# Preview the first six rows of the GDP data.
gdp_df.head(6)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2018,1734.723214,
1,Afghanistan,2017,1758.465636,
2,Afghanistan,2016,1757.02349,
3,Afghanistan,2015,1766.593077,
4,Afghanistan,2014,1795.735834,
5,Afghanistan,2013,1807.762344,


In [4]:
# Load the internet-usage CSV, reading only the first 4495 data rows.
# NOTE(review): presumably the row cap excludes trailing non-data lines
# in the file — confirm against the raw CSV.
internet_df = pd.read_csv(
    '../data/internet_use.csv',
    nrows=4495,
)

### Question 6: Look at first 6 rows of internet_df

In [5]:
# Preview the first six rows of the internet-usage data.
internet_df.head(6)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2014,6.39,
1,Afghanistan,2013,5.9,
2,Afghanistan,2012,5.454545,
3,Afghanistan,2011,5.0,
4,Afghanistan,2010,4.0,
5,Afghanistan,2009,3.55,


### Question 7: Look at the shape of each dataframe - how many rows, how many columns.

In [6]:
# .shape is a (rows, columns) tuple.
gdp_df.shape

(6729, 4)

gdp_df has 6729 rows and 4 columns

In [7]:
# .shape is a (rows, columns) tuple.
internet_df.shape

(4495, 4)

internet_df has 4495 rows and 4 columns

### Question 8: Take a look at the data types for the columns in each table.

In [8]:
# List the dtype pandas inferred for each GDP column.
gdp_df.dtypes

Country or Area     object
Year                 int64
Value              float64
Value Footnotes    float64
dtype: object

In [9]:
# List the dtype pandas inferred for each internet-usage column.
internet_df.dtypes

Country or Area     object
Year                 int64
Value              float64
Value Footnotes    float64
dtype: object

### Question 9: Take a look at the last 10 rows of each dataset in turn.

In [10]:
# Show the last ten rows of the GDP data.
gdp_df.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
6719,Zimbabwe,1999,3054.064189,
6720,Zimbabwe,1998,3099.076182,
6721,Zimbabwe,1997,3036.422224,
6722,Zimbabwe,1996,2985.856605,
6723,Zimbabwe,1995,2736.486436,
6724,Zimbabwe,1994,2768.309953,
6725,Zimbabwe,1993,2572.870395,
6726,Zimbabwe,1992,2591.007534,
6727,Zimbabwe,1991,2906.272849,
6728,Zimbabwe,1990,2819.549467,


In [11]:
# Show the last ten rows of the internet-usage data.
internet_df.tail(10)

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
4485,Zimbabwe,2002,3.994356,
4486,Zimbabwe,2001,0.799846,
4487,Zimbabwe,2000,0.401434,
4488,Zimbabwe,1999,0.161676,
4489,Zimbabwe,1998,0.081648,
4490,Zimbabwe,1997,0.03308,
4491,Zimbabwe,1996,0.01679,
4492,Zimbabwe,1995,0.007684,
4493,Zimbabwe,1994,0.001739,
4494,Zimbabwe,1990,0.0,


### Question 10: Drop the 'value footnotes' data (column) from both datasets. Check that this worked as expected.

In [12]:
# Drop the footnotes column. errors='ignore' keeps this cell safe to re-run:
# because the result is assigned back to gdp_df, a second execution would
# otherwise raise KeyError (the column is already gone).
gdp_df = gdp_df.drop(columns=['Value Footnotes'], errors='ignore')

# Verify the drop explicitly rather than relying on the preview alone.
assert 'Value Footnotes' not in gdp_df.columns
gdp_df.head(2)

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2018,1734.723214
1,Afghanistan,2017,1758.465636


In [13]:
# Drop the footnotes column. errors='ignore' keeps this cell safe to re-run:
# because the result is assigned back to internet_df, a second execution
# would otherwise raise KeyError (the column is already gone).
internet_df = internet_df.drop(columns=['Value Footnotes'], errors='ignore')

# Verify the drop explicitly rather than relying on the preview alone.
assert 'Value Footnotes' not in internet_df.columns
internet_df.head(2)

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2014,6.39
1,Afghanistan,2013,5.9


### Question 11: Change the columns for the GDP Per Capita data frame to ‘Country’, ‘Year’, and ‘GDP_Per_Capita’.

In [14]:
# Rename by column name instead of assigning a positional list: a positional
# assignment silently mislabels the data if the column order ever differs.
gdp_df = gdp_df.rename(
    columns={'Country or Area': 'Country', 'Value': 'GDP_Per_Capita'}
)

# Fail fast if the frame does not have exactly the expected schema
# (e.g. the footnotes column was not dropped beforehand).
assert list(gdp_df.columns) == ['Country', 'Year', 'GDP_Per_Capita']
gdp_df.head()

Unnamed: 0,Country,Year,GDP_Per_Capita
0,Afghanistan,2018,1734.723214
1,Afghanistan,2017,1758.465636
2,Afghanistan,2016,1757.02349
3,Afghanistan,2015,1766.593077
4,Afghanistan,2014,1795.735834


### Question 12: Change the columns for the Internet Users data frame to ‘Country’, ‘Year’, and ‘Internet_Users_Pct’.

In [15]:
# Rename by column name instead of assigning a positional list: a positional
# assignment silently mislabels the data if the column order ever differs.
internet_df = internet_df.rename(
    columns={'Country or Area': 'Country', 'Value': 'Internet_Users_Pct'}
)

# Fail fast if the frame does not have exactly the expected schema
# (e.g. the footnotes column was not dropped beforehand).
assert list(internet_df.columns) == ['Country', 'Year', 'Internet_Users_Pct']
internet_df.head()

Unnamed: 0,Country,Year,Internet_Users_Pct
0,Afghanistan,2014,6.39
1,Afghanistan,2013,5.9
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.0
4,Afghanistan,2010,4.0


### Question 13: Merge the two DataFrames to one. Merge all rows from each of the two DataFrames. 
### Call the new DataFrame gdp_and_internet_use. Look at the first five rows of your new data frame to confirm it merged correctly.

In [16]:
# Outer join on (Country, Year): every row from either frame is kept, and
# the column coming from the side without a match is filled with NaN.
gdp_and_internet_use = gdp_df.merge(
    internet_df,
    how='outer',
    on=['Country', 'Year'],
)
gdp_and_internet_use.head(5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
0,Afghanistan,2018,1734.723214,
1,Afghanistan,2017,1758.465636,
2,Afghanistan,2016,1757.02349,
3,Afghanistan,2015,1766.593077,
4,Afghanistan,2014,1795.735834,6.39


### Question 15: Look at the last five rows to make sure the data is clean and as expected.

In [17]:
# Show the last five rows of the merged frame.
gdp_and_internet_use.tail(5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
7700,Virgin Islands (U.S.),1997,,6.948369
7701,Virgin Islands (U.S.),1996,,4.647186
7702,Virgin Islands (U.S.),1995,,2.801958
7703,Virgin Islands (U.S.),1994,,0.940645
7704,Virgin Islands (U.S.),1990,,0.0


### Question 16: Subset the combined data frame to keep only the data for 2004, 2009, and 2014. Check that this happened correctly.