In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#from functools import reduce
from IPython.display import display as dsp

In [20]:
edu = pd.read_csv('data/SYB61_T07_Education.csv', encoding = "ISO-8859-1")
edu_spend = pd.read_csv('data/SYB61_T09_Public Expenditure on Education.csv', encoding = "ISO-8859-1")
labor = pd.read_csv('data/SYB61_T17_Labour Force and Unemployment.csv', encoding = "ISO-8859-1")
pop = pd.read_csv('data/SYB61_T02_Population, Surface Area and Density.csv', encoding = "ISO-8859-1")
#Source: http://data.un.org/

#IMPORT FILES:
#Education Enrollment areas.
edu.rename(index = str, columns = {'T07': 'Region Code', 
                                   'Enrolment in primary, secondary and tertiary education levels' : 'Region',
                                   'Unnamed: 2': 'Year','Unnamed: 3' : 'Series','Unnamed: 4': 'Value',
                                   'Unnamed: 5': 'Footnotes', 'Unnamed: 6': 'Source'}, inplace = True)
edu.drop(edu.index[0], inplace = True)
edu.index = range(len(edu))
#dsp(edu.head(1))

#Education Expenditure
edu_spend.rename(index = str, columns = {'T09': 'Region Code', 'Public expenditure on education': 'Region',
                                        'Unnamed: 2':'Year', 'Unnamed: 3':'Series', 'Unnamed: 4':'Value',
                                        'Unnamed: 5':'Footnotes', 'Unnamed: 6' : 'Source'}, inplace = True)
edu_spend.drop(edu_spend.index[0], inplace = True)
edu_spend.index = range(len(edu_spend))
#dsp(edu_spend.head(1))

#Labor Force
labor.rename(index = str, columns = {'T17':'Region Code','Labour force participation and unemployment': 'Region',
                                    'Unnamed: 2': 'Year', 'Unnamed: 3':'Series', 'Unnamed: 4':'Value','Unnamed: 5':'Footnotes',
                                    'Unnamed: 6': 'Source'}, inplace = True)
labor.drop(labor.index[0], inplace = True)
labor.index = range(len(labor))
#dsp(labor.head(1))

#Population
pop.rename(index = str, columns = {'T02':'Region Code','Population, density and surface area': 'Region',
                                    'Unnamed: 2': 'Year', 'Unnamed: 3':'Series', 'Unnamed: 4':'Value','Unnamed: 5':'Footnotes',
                                    'Unnamed: 6': 'Source'}, inplace = True)
pop.drop(pop.index[0], inplace = True)
pop.index = range(len(pop))
#dsp(pop.head(1))

In [21]:
#Stitch measurements into one dataframe and take a few sample countries
dfs = [edu, edu_spend, labor, pop]
randCodes = ['404', '108', '340', '500', '516', '694', '320', '548', '834', '462']
df = pd.concat(dfs)
df = df.loc[(df['Region Code'].isin(randCodes)), ['Region Code', 'Region', 'Year', 'Series', 'Value']]
dsp(df.head())

print('Measures Represented:\n', df['Series'].unique())
print('Years Represented:\n', df['Year'].unique())


Unnamed: 0,Region Code,Region,Year,Series,Value
1646,108,Burundi,2005,Students enrolled in primary education (thousa...,1037.0
1647,108,Burundi,2005,Gross enrollement ratio - Primary (male),89.7
1648,108,Burundi,2005,Gross enrollment ratio - Primary (female),75.4
1649,108,Burundi,2005,Students enrolled in secondary education (thou...,171.0
1650,108,Burundi,2005,Gross enrollment ratio - Secondary (male),15.8


Measures Represented:
 ['Students enrolled in primary education (thousands)'
 'Gross enrollement ratio - Primary (male)'
 'Gross enrollment ratio - Primary (female)'
 'Students enrolled in secondary education (thousands)'
 'Gross enrollment ratio - Secondary (male)'
 'Gross enrollment ratio - Secondary (female)'
 'Students enrolled in tertiary education (thousands)'
 'Gross enrollment ratio - Tertiary (male)'
 'Gross enrollment ratio - Tertiary (female)'
 'Current expenditure other than staff compensation as % of total expenditure in public institutions (%)'
 'All staff compensation as % of total expenditure in public institutions (%)'
 'Capital expenditure as % of total expenditure in public institutions (%)'
 'Expenditure by level of education: pre-primary (as % of government expenditure)'
 'Expenditure by level of education: primary (as % of government expenditure)'
 'Expenditure by level of education: secondary (as % of government expenditure)'
 'Expenditure by level of education: 

In [28]:
#Select Variables For Features:

#We want to group by year and country, or select a year with the most data
#print(df['Year'].value_counts()) #211 entries for 2010
df2010 = df.loc[df['Year'] ==  '2010']
dsp(df2010.head())

#Select the measures to use:
Select_Ser = ['Region',
              'Gross enrollment ratio - Tertiary (male)', 'Gross enrollment ratio - Tertiary (female)', 
              'Labour force participation - Male' 'Unemployment rate - Male',
              'Labour force participation - Female' 'Unemployment rate - Female',
              'Expenditure by level of education: tertiary (as % of government expenditure)',
              'Population mid-year estimates for males (millions)',
              'Population mid-year estimates for females (millions)',
              'All staff compensation as % of total expenditure in public institutions (%)',
              'Capital expenditure as % of total expenditure in public institutions (%)',
              'All staff compensation as % of total expenditure in public institutions (%)'
             ]

Unnamed: 0,Region Code,Region,Year,Series,Value
1655,108,Burundi,2010,Students enrolled in primary education (thousa...,1850.0
1656,108,Burundi,2010,Gross enrollement ratio - Primary (male),141.8
1657,108,Burundi,2010,Gross enrollment ratio - Primary (female),137.0
1658,108,Burundi,2010,Students enrolled in secondary education (thou...,338.0
1659,108,Burundi,2010,Gross enrollment ratio - Secondary (male),28.7


# Variables:

'Gross enrollment ratio - Tertiary (male)'
'Gross enrollment ratio - Tertiary (female)', 
'Labour force participation - Male' 'Unemployment rate - Male',
'Labour force participation - Female' 'Unemployment rate - Female',
'Expenditure by level of education: tertiary (as % of government expenditure)',
'Population mid-year estimates for males (millions)',
'Population mid-year estimates for females (millions)',
'All staff compensation as % of total expenditure in public institutions (%)',
'Capital expenditure as % of total expenditure in public institutions (%)',
'All staff compensation as % of total expenditure in public institutions (%)'

In [30]:
#Filter for select measurements
df2010sr = df2010.loc[df2010['Series'].isin(Select_Ser)]

#Create Feature Dataframe
feat = pd.get_dummies(df2010sr['Region Code'])

#Add Measures to feat dataframe:
for entry in Select_Ser:
    feat = pd.concat([feat, df2010sr], axis = 1, join_axes = ['Region Code'], join = 'inner')

AttributeError: 'str' object has no attribute 'equals'

In [None]:
#Total, all countries
tot_edu = edu.loc[(edu['Region Code'] == '1'), 
                  ['Year', 'Series', 'Value']]
tot_edu.rename(index = str, columns = {'Series':'Education Demographic','Value':'Edu Measure'}, inplace = True)


tot_labor = labor.loc[(labor['Region Code'] == '1'),
                     ['Year', 'Series', 'Value']]
tot_labor.rename(index = str, columns = {'Series':'Labor Demographic','Value':'Lab Measure'}, inplace = True)


tot_pop = pop.loc[pop['Region Code'] == '1',
                 ['Year','Series', 'Value']]
tot_edu.rename(index = str, columns = {'Series':'Population Demographic','Value':'Pop Measure'}, inplace = True)

#print(tot_pop['Year'].unique())
#print(tot_edu['Year'].unique())
#print(tot_labor['Year'].unique())

#step = pd.merge(tot_labor, tot_pop, on = 'Year', how = 'inner')
#totals = pd.merge(step, tot_edu, on = 'Year', how = 'inner')

#Reset index