# Scratch-pad Data visualisation

In [None]:
# One way to look at data visualisation is through the layers listed here:
#   BACKEND   -- physical -- defines the canvas
#   ARTIST    -- knows how to put ink on the CANVAS
#                (two types: ATOMIC and COMPOSITE)
#   SCRIPTING -- for scientists that are not professionals;
#                essentially this is PYPLOT
# This cell works with the canvas and the artists directly, without pyplot.
import numpy as np
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure  # the Figure artist

# 10000 random samples to put into the histogram
x = np.random.randn(10000)

# the Figure artist, attached to an Agg canvas
fig = Figure()
canvas = FigureCanvas(fig)

# one subplot holding a 100-bin histogram of the samples
ax = fig.add_subplot(1, 1, 1)
ax.hist(x, bins=100)
ax.set_title('Normal distribution')

# write the rendered figure to disk
fig.savefig('first_histogram.jpg')

The code is taken from the online COURSERA lecture

DATA VISUALISATION WITH PYTHON: <br/>
https://www.coursera.org/learn/python-for-data-visualization/

The same examples can be found in the online book <br/>
http://www.aosabook.org/en/matplotlib.html


# Trying out the maps

Since images are worth a thousand words ...

We want a map-based illustration of the education system in CENTRAL AFRICA

In [None]:
# using basemap was a first attempt -- getting the individual countries
# was not feasible using functions -- this is a nice example but the
# newer library will be used instead: CARTOPY

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

In [None]:
# Basemap demo: an 'lcc' projection centred on lat 11 / lon 24.
# width/height are passed as 6E6 x 3E6 -- presumably metres; confirm in the Basemap docs.
plt.figure(figsize=(6, 6))
map_options = dict(
    projection='lcc',
    resolution='c',
    lat_0=11,
    lon_0=24,
    width=6E6,
    height=3E6,
)
m = Basemap(**map_options)
m.bluemarble()  # background image; m.shadedrelief() is kept here as the noted alternative
m.drawcountries(color='black', linewidth=1)
#m.drawrivers(color='aqua')


In [None]:
import cartopy.io.shapereader as shp_reader
import cartopy.crs as ccrs
import cartopy as cartopy
from matplotlib.figure import Figure  # the Figure artist
import numpy as np

In [None]:
# ask cartopy for the Natural Earth 'admin_0_countries' shapefile
# (110m resolution, 'cultural' category) and read all of its records
shp_file_name = shp_reader.natural_earth(
    resolution='110m', category='cultural', name='admin_0_countries')
reader = shp_reader.Reader(shp_file_name)
countries = list(reader.records())

# finding the name: print NAME_EN for every record whose REGION_WB
# attribute equals 'Sub-Saharan Africa'
for country in countries:
    if country.attributes['REGION_WB'] != 'Sub-Saharan Africa':
        continue
    print(country.attributes['NAME_EN'])


In [None]:
# Data for our visualisation: only a small set of countries is of interest.
#
# `colors` maps each COUNTRY name (key) to the RGB colour triple (value)
# used whenever that country is drawn.
colors = {
    "Nigeria":                          (0, 1, 1),
    "Niger":                            (.4, .4, 0),
    "Uganda":                           (0, .7, .3),
    "Rwanda":                           (.3, .5, .8),
    "Central African Republic":         (.2, .3, .4),
    "Republic of the Congo":            (.2, .1, .4),
    "Gabon":                            (.5, .2, .7),
    "Somalia":                          (.1, .8, .3),
    "Kenya":                            (.3, .1, .8),
    "Sudan":                            (.2, .0, .9),
    "Chad":                             (.6, .2, .3),
    "Ethiopia":                         (.3, .7, .5),
    "South Sudan":                      (.2, .9, .8),
    "Cameroon":                         (.3, .7, .3),
    "Democratic Republic of the Congo": (.1, .3, 0),
    "Tanzania":                         (.4, .1, 0),
    "Burundi":                          (.5, 0, .8),
}

# the COUNTRY names alone, in the order they were declared above,
# are needed for further processing
our_country_names = list(colors)


In [None]:
# we define the function we want to be used
# HELPER function to restrict the DATA
def filter_country_attr_values(country_list, attr_name, attr_list):
    """Keep the records whose attribute ``attr_name`` equals one of ``attr_list``.

    Parameters
    ----------
    country_list : iterable
        Records exposing an ``attributes`` mapping.
    attr_name : str
        Key looked up in each record's ``attributes``.
    attr_list : iterable
        Accepted attribute values.  It may contain repeated values and may be
        a one-shot iterable (e.g. a generator).

    Returns
    -------
    list
        The matching records, in the order they appear in ``country_list``.
        Each record appears at most once, even if its value is listed
        several times in ``attr_list``.

    Raises
    ------
    KeyError
        If a record has no ``attr_name`` entry in its ``attributes``.
    """
    # Materialise once: a generator would otherwise be exhausted after the
    # first record and silently match nothing for the remaining ones.
    wanted = tuple(attr_list)
    # Membership on a tuple compares with ==, so unhashable values still work.
    return [country for country in country_list
            if country.attributes[attr_name] in wanted]
# end filter_country_attr_values

In [None]:
# restrict the full record list to the countries whose English name
# is one of our selected country names
small_list = filter_country_attr_values(
    country_list=countries,
    attr_name='NAME_EN',
    attr_list=our_country_names,
)

In [None]:
# sanity check: how many shapefile records were matched by name
# (`colors` has 17 keys -- TODO confirm every one of them exists as NAME_EN in the shapefile)
len(small_list)

In [None]:
# visualisation source: 
# https://gis.stackexchange.com/questions/88209/python-mapping-in-matplotlib-cartopy-color-one-country
#
# Draw a background map, then tint, label and mark every selected country.

plt.figure(figsize=(12, 6))
ax = plt.axes(projection=ccrs.PlateCarree())
ax.stock_img()
ax.add_feature(cartopy.feature.BORDERS, linestyle='-', alpha=.5)
ax.add_feature(cartopy.feature.COASTLINE)

# coordinate system handed to every drawing call for the country shapes,
# their labels and their markers
data_crs = ccrs.PlateCarree()
for cs in small_list:
    cs_name = cs.attributes['NAME_EN']
    cs_col = colors[cs_name]
    # add_geometries expects a collection of geometries, so the single
    # country shape is wrapped in a list instead of being passed bare
    ax.add_geometries([cs.geometry], data_crs,
                      facecolor=cs_col, alpha=0.2,
                      label=cs_name)
    # label and marker are placed at the centroid of the country shape
    (lon, lat) = cs.geometry.centroid.coords[0]
    ax.text(lon, lat, cs_name, horizontalalignment='right', transform=data_crs)
    # color= (not c=): an RGB triple given as c= is ambiguous between
    # "one colour" and "three values to colour-map"
    ax.scatter(lon, lat, marker='o', s=100, color=cs_col, transform=data_crs)
# crop the view to the region of interest: lon -5..48, lat -8..22
ax.set_extent([-5, 48, -8, 22], crs=ccrs.PlateCarree())


## Example using PANDAS

We want to use data from <br/>
**[WorldBank](https://data.worldbank.org/indicator)** <br>
to show different numbers.


In [None]:
# we are using the PANDA dataframe library
import pandas as pd
from __future__ import print_function # for compatibility
!pwd

In [None]:
def read_worldbank_sheet(path):
    """Read the 'Data' sheet of one Excel file, indexed by country name.

    Parameters
    ----------
    path : str
        Location of the ``.xls`` file.

    Returns
    -------
    pandas.DataFrame
        The sheet content with the "Country Name" column as index.

    Notes
    -----
    The first three rows of the sheet are skipped (presumably a preamble
    above the real header row -- TODO confirm against the files).
    ``set_index`` is used without ``inplace`` so a new frame is returned and
    no existing frame is mutated; the step can therefore be repeated freely.
    """
    sheet = pd.read_excel(
        path,
        sheet_name='Data',
        skiprows=range(3)  # enumeration of rows to skip
    )
    # We want to select countries by name, therefore the name becomes the index
    return sheet.set_index("Country Name")


# the four data-sets, all read and indexed the same way
df_gdp         = read_worldbank_sheet('data/countries_gdp.xls')
df_edu_percent = read_worldbank_sheet('data/countries_edu_percent.xls')
df_primary_014 = read_worldbank_sheet('data/countries_primary_pupils_014.xls')
df_pupils_014  = read_worldbank_sheet('data/countries_pupils_014.xls')

# we can confirm the importing
# HERE works
# df_gdp.head()

In [None]:
# column labels for 2010 .. 2017, as strings
years = [str(year) for year in range(2010, 2018)]

# GDP row of Nigeria restricted to those years, drawn as a line
nigeria_gdp = df_gdp.loc['Nigeria', years]
nigeria_gdp.plot(kind='line')
plt.legend(["Nigeria"])
plt.xlabel('Years')
plt.ylabel('GDP')

In [None]:
# Exercise
# Plot the GDPs for all countries in the list

# HIGHLIGHT your country's GDP



In [None]:
# plot the percent of education for the list of countries
#
# All countries go into ONE grouped bar chart: a group per year and, inside
# each group, one bar per country.  Calling `.plot(kind='bar')` once per
# country draws every series at the same x positions on the same axes, so
# the bars cover one another.

# Rows are looked up by the names used as keys of `colors`.  A name without a
# row in the table is reported and skipped instead of aborting the whole
# figure with a KeyError.
# NOTE(review): the table may spell some country names differently -- confirm.
edu_present = [c_name for c_name in our_country_names
               if c_name in df_edu_percent.index]
edu_missing = [c_name for c_name in our_country_names
               if c_name not in df_edu_percent.index]
if edu_missing:
    print("No row in df_edu_percent for:", ", ".join(edu_missing))

if edu_present:
    # transpose: years become the x axis, each country becomes one bar series
    edu_by_year = df_edu_percent.loc[edu_present, years].T
    edu_ax = edu_by_year.plot(
        kind='bar',
        figsize=(14, 6),
        # one colour per country, in the same order as the columns
        color=[colors[c_name] for c_name in edu_present],
    )
    edu_ax.legend(edu_present, loc='upper left', bbox_to_anchor=(1, 1.2))
    edu_ax.set_xlabel('Years')
    edu_ax.set_ylabel('Education %')
    edu_ax.set_title("Education percentages of GDP")
plt.show()

In [None]:
# display the year labels that were used to select columns in the plots above
years