In [None]:
# Data Source: https://www.kaggle.com/worldbank/world-development-indicators
# Folder: 'world-development-indicators'

<br><p style="font-family: Arial; font-size:3.75em;color:purple; font-style:bold">
Matplotlib: Exploring <br> <br> <br>Data Visualization</p><br><br>

<br><br><center><h1 style="font-size:2em;color:#2467C0">World Development Indicators</h1></center>
<br>
<table>
<col width="550">
<col width="450">
<tr>
<td><img src="https://upload.wikimedia.org/wikipedia/commons/4/46/North_South_divide.svg" align="middle" style="width:550px;height:360px;"/></td>
<td>
This week, we will be using an open dataset from <a href="https://www.kaggle.com">Kaggle</a>. It is  <a href="https://www.kaggle.com/worldbank/world-development-indicators">The World Development Indicators</a> dataset obtained from the World Bank containing over a thousand annual indicators of economic development from hundreds of countries around the world.
<br>
<br>
This is a slightly modified version of the original dataset from <a href="http://data.worldbank.org/data-catalog/world-development-indicators">The World Bank</a>
<br>
<br>
List of the <a href="https://www.kaggle.com/benhamner/d/worldbank/world-development-indicators/indicators-in-data">available indicators</a> and a <a href="https://www.kaggle.com/benhamner/d/worldbank/world-development-indicators/countries-in-the-wdi-data">list of the available countries</a>.
</td>
</tr>
</table>

# Step 1: Initial exploration of the Dataset

In [24]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt


In [25]:
Indicators = pd.read_csv('./world-development-indicators/WDIData.csv')

In [26]:
print(Indicators.shape)
print(type(Indicators))
names = Indicators.columns.values
print(names)

(379368, 66)
<class 'pandas.core.frame.DataFrame'>
['Country Name' 'Country Code' 'Indicator Name' 'Indicator Code' '1960'
 '1961' '1962' '1963' '1964' '1965' '1966' '1967' '1968' '1969' '1970'
 '1971' '1972' '1973' '1974' '1975' '1976' '1977' '1978' '1979' '1980'
 '1981' '1982' '1983' '1984' '1985' '1986' '1987' '1988' '1989' '1990'
 '1991' '1992' '1993' '1994' '1995' '1996' '1997' '1998' '1999' '2000'
 '2001' '2002' '2003' '2004' '2005' '2006' '2007' '2008' '2009' '2010'
 '2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 'Unnamed: 65']


In [29]:
Indicators_new = pd.melt(Indicators, id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], var_name="Year", value_name="Value")
Indicators_new

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,1960,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,1960,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,1960,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,1960,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,1960,
...,...,...,...,...,...,...
23520811,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.NEGL.ZS,Unnamed: 65,
23520812,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,Unnamed: 65,
23520813,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,Unnamed: 65,
23520814,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,Unnamed: 65,


In [30]:
Indicators_new.to_csv('Indicators.csv')

This is a really large dataset, at least in terms of the number of rows.  But with 6 columns, what does this hold?

In [7]:
data = pd.read_csv('./world-development-indicators/Indicators.csv')
data.shape

  data = pd.read_csv('./world-development-indicators/Indicators.csv')


(23520816, 7)

In [5]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value
0,0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,1960,
1,1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,1960,
2,2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,1960,
3,3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,1960,
4,4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,1960,
5,5,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.FE.ZS,1960,
6,6,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.MA.ZS,1960,
7,7,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.OL.ZS,1960,
8,8,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.40.ZS,1960,
9,9,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.PL.ZS,1960,


In [53]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Value
count,23520820.0,7578806.0
mean,11760410.0,1377970000000.0
std,6789875.0,72467490000000.0
min,0.0,-9824821000000000.0
25%,5880204.0,5.744782
50%,11760410.0,45.0183
75%,17640610.0,178533.8
max,23520820.0,1.590415e+16


In [55]:
data.index

RangeIndex(start=0, stop=23520816, step=1)

In [61]:
three = data.pop('Year')
three

0                  1960
1                  1960
2                  1960
3                  1960
4                  1960
               ...     
23520811    Unnamed: 65
23520812    Unnamed: 65
23520813    Unnamed: 65
23520814    Unnamed: 65
23520815    Unnamed: 65
Name: Year, Length: 23520816, dtype: object

Looks like it has different indicators for different countries with the year and value of the indicator. 

### How many UNIQUE country names are there ?

In [31]:
countries = data['Country Name'].unique().tolist()
len(countries)

264

### Are there same number of country codes ?

In [32]:
# How many unique country codes are there ? (should be the same #)
countryCodes = data['Country Code'].unique().tolist()
len(countryCodes)

264

### Are there many indicators or few ?

In [33]:
# How many unique indicators are there ? (should be the same #)
indicators = data['Indicator Name'].unique().tolist()
len(indicators)

1437

### How many years of data do we have ?

In [34]:
# How many years of data do we have ?
years = data['Year'].unique().tolist()
len(years)

63

### What's the range of years?

In [47]:
print(min(years) ," to ", max(years))

TypeError: '<' not supported between instances of 'str' and 'int'

<p style="font-family: Arial; font-size:2.5em;color:blue; font-style:bold">
Matplotlib: Basic Plotting, Part 1</p><br>

### Lets pick a country and an indicator to explore: CO2 Emissions per capita and the USA

In [None]:
# select CO2 emissions for the United States
hist_indicator = 'CO2 emissions \(metric'
hist_country = 'USA'

mask1 = data['IndicatorName'].str.contains(hist_indicator) 
mask2 = data['CountryCode'].str.contains(hist_country)

# stage is just those indicators matching the USA for country code and CO2 emissions over time.
stage = data[mask1 & mask2]

In [None]:
stage.head()

### Let's see how emissions have changed over time using MatplotLib

In [None]:
# get the years
years = stage['Year'].values
# get the values 
co2 = stage['Value'].values

# create
plt.bar(years,co2)
plt.show()

Turns out emissions per capita have dropped a bit over time, but let's make this graphic a bit more appealing before we continue to explore it.

In [None]:
# switch to a line plot
plt.plot(stage['Year'].values, stage['Value'].values)

# Label the axes
plt.xlabel('Year')
plt.ylabel(stage['IndicatorName'].iloc[0])

#label the figure
plt.title('CO2 Emissions in USA')

# to make more honest, start they y axis at 0
plt.axis([1959, 2011,0,25])

plt.show()

### Using Histograms to explore the distribution of values
We could also visualize this data as a histogram to better explore the ranges of values in CO2 production per year. 

In [None]:
# If you want to just include those within one standard deviation fo the mean, you could do the following
# lower = stage['Value'].mean() - stage['Value'].std()
# upper = stage['Value'].mean() + stage['Value'].std()
# hist_data = [x for x in stage[:10000]['Value'] if x>lower and x<upper ]

# Otherwise, let's look at all the data
hist_data = stage['Value'].values

In [None]:
print(len(hist_data))

In [None]:
# the histogram of the data
plt.hist(hist_data, 10, density=False, facecolor='green')

plt.xlabel(stage['IndicatorName'].iloc[0])
plt.ylabel('# of Years')
plt.title('Histogram Example')

plt.grid(True)

plt.show()

So the USA has many years where it produced between 19-20 metric tons per capita with outliers on either side.

### But how do the USA's numbers relate to those of other countries?

In [None]:
# select CO2 emissions for all countries in 2011
hist_indicator = 'CO2 emissions \(metric'
hist_year = 2011

mask1 = data['IndicatorName'].str.contains(hist_indicator) 
mask2 = data['Year'].isin([hist_year])

# apply our mask
co2_2011 = data[mask1 & mask2]
co2_2011.head()

For how many countries do we have CO2 per capita emissions data in 2011

In [None]:
print(len(co2_2011))

In [None]:
# let's plot a histogram of the emmissions per capita by country

# subplots returns a touple with the figure, axis attributes.
fig, ax = plt.subplots()

ax.annotate("USA",
            xy=(18, 5), xycoords='data',
            xytext=(18, 30), textcoords='data',
            arrowprops=dict(arrowstyle="->",
                            connectionstyle="arc3"),
            )

plt.hist(co2_2011['Value'], 10, density=False, facecolor='green')

plt.xlabel(stage['IndicatorName'].iloc[0])
plt.ylabel('# of Countries')
plt.title('Histogram of CO2 Emissions Per Capita')

#plt.axis([10, 22, 0, 14])
plt.grid(True)

plt.show()

So the USA, at ~18 CO2 emissions (metric tons per capital) is quite high among all countries.

An interesting next step, which we'll save for you, would be to explore how this relates to other industrialized nations and to look at the outliers with those values in the 40s!

<p style="font-family: Arial; font-size:2.0em;color:blue; font-style:bold">
Matplotlib: Basic Plotting, Part 2</p>

### Relationship between GPD and CO2 Emissions in USA

In [None]:
# select GDP Per capita emissions for the United States
hist_indicator = 'GDP per capita \(constant 2005'
hist_country = 'USA'

mask1 = data['IndicatorName'].str.contains(hist_indicator) 
mask2 = data['CountryCode'].str.contains(hist_country)

# stage is just those indicators matching the USA for country code and CO2 emissions over time.
gdp_stage = data[mask1 & mask2]

#plot gdp_stage vs stage

In [None]:
gdp_stage.head(2)

In [None]:
stage.head(2)

In [None]:
# switch to a line plot
plt.plot(gdp_stage['Year'].values, gdp_stage['Value'].values)

# Label the axes
plt.xlabel('Year')
plt.ylabel(gdp_stage['IndicatorName'].iloc[0])

#label the figure
plt.title('GDP Per Capita USA')

# to make more honest, start they y axis at 0
#plt.axis([1959, 2011,0,25])

plt.show()

So although we've seen a decline in the CO2 emissions per capita, it does not seem to translate to a decline in GDP per capita

### ScatterPlot for comparing GDP against CO2 emissions (per capita)

First, we'll need to make sure we're looking at the same time frames

In [None]:
print("GDP Min Year = ", gdp_stage['Year'].min(), "max: ", gdp_stage['Year'].max())
print("CO2 Min Year = ", stage['Year'].min(), "max: ", stage['Year'].max())

We have 3 extra years of GDP data, so let's trim those off so the scatterplot has equal length arrays to compare (this is actually required by scatterplot)

In [None]:
gdp_stage_trunc = gdp_stage[gdp_stage['Year'] < 2012]
print(len(gdp_stage_trunc))
print(len(stage))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

fig, axis = plt.subplots()
# Grid lines, Xticks, Xlabel, Ylabel

axis.yaxis.grid(True)
axis.set_title('CO2 Emissions vs. GDP \(per capita\)',fontsize=10)
axis.set_xlabel(gdp_stage_trunc['IndicatorName'].iloc[0],fontsize=10)
axis.set_ylabel(stage['IndicatorName'].iloc[0],fontsize=10)

X = gdp_stage_trunc['Value']
Y = stage['Value']

axis.scatter(X, Y)
plt.show()

This doesn't look like a strong relationship.  We can test this by looking at correlation.

In [None]:
np.corrcoef(gdp_stage_trunc['Value'],stage['Value'])

A correlation of 0.07 is pretty weak, but you'll learn more about correlation in the next course.

You could continue to explore this to see if other countries have a closer relationship between CO2 emissions and GDP.  Perhaps it is stronger for developing countries?

## Want more ? 

### Matplotlib Examples Library

http://matplotlib.org/examples/index.html

In [None]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;