# Education
## Purpose
Generates a dataframe containing the information relevant to our research into the effect that education can have on a country's ability to succeed at the Olympics.
## Datasets
<b>education_data.csv</b> - A CSV containing the relevant data, created in the 1000-Joining.ipynb notebook

Import necessary libraries

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os.path
from sklearn.linear_model import LinearRegression 
import math

Load the dataset

In [52]:
filepath = '../../data/analysis/education_data.csv'

# Fail loudly when the input is missing: merely printing a message would leave
# `df` undefined (or, worse, let later cells silently reuse a stale `df` left
# over from an earlier run of this kernel).
if not os.path.exists(filepath):
    raise FileNotFoundError("Missing dataset file: " + filepath)

# NOTE(review): the file is decoded as ISO-8859-1; the "People?s" country name
# shown further down suggests the source may use a different encoding — confirm.
df = pd.read_csv(filepath, encoding="ISO-8859-1")
print("File Read")

File Read


Print the first 5 rows

In [53]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year,Host_Country,Host_City,Summer,Winter,Total_Males,Total_Females,Total_Athletes,Discipline,...,NOC_Females_Sent,NOC_Total_Sent,NOC_Gold,NOC_Silver,NOC_Bronze,NOC_Total_Medals,NOC_Rating,NOC_Rank,Country,Education_Index
0,5618,1980,RUS,Moscow,True,False,4064,1115,5179,Swimming,...,28.0,121.0,5,2,5,12,24,17,Australia,0.869533
1,5619,1980,RUS,Moscow,True,False,4064,1115,5179,Swimming,...,28.0,121.0,5,2,5,12,24,17,Australia,0.869533
2,5620,1980,RUS,Moscow,True,False,4064,1115,5179,Swimming,...,28.0,121.0,5,2,5,12,24,17,Australia,0.869533
3,5621,1980,RUS,Moscow,True,False,4064,1115,5179,Swimming,...,28.0,121.0,5,2,5,12,24,17,Australia,0.869533
4,5622,1980,RUS,Moscow,True,False,4064,1115,5179,Swimming,...,28.0,121.0,5,2,5,12,24,17,Australia,0.869533


In [54]:
# Show every column label of the loaded frame (the preview above elides the middle columns).
df.columns

Index(['Unnamed: 0', 'Year', 'Host_Country', 'Host_City', 'Summer', 'Winter',
       'Total_Males', 'Total_Females', 'Total_Athletes', 'Discipline', 'Sport',
       'Ath_Name', 'Gender', 'Home_Adv', 'Gold', 'Silver', 'Bronze',
       'Total_Medals', 'Ath_Rating', 'Ath_Rank', 'NOC', 'NOC_Males_Sent',
       'NOC_Females_Sent', 'NOC_Total_Sent', 'NOC_Gold', 'NOC_Silver',
       'NOC_Bronze', 'NOC_Total_Medals', 'NOC_Rating', 'NOC_Rank', 'Country',
       'Education_Index'],
      dtype='object')

## Check the percentage of null values present in the Education Index

In [55]:
# Share of rows that have no Education_Index value, expressed as a percentage.
missing_rows = int(df["Education_Index"].isnull().sum())
missing_pct = (missing_rows / len(df)) * 100
f"{missing_pct}% Null Values for Education Index"

'2.8451061865189287% Null Values for Education Index'

In [56]:
# Distinct countries that appear on rows with a missing Education_Index.
has_no_index = df["Education_Index"].isnull()
df.loc[has_no_index, "Country"].unique()

array(['Ethiopia', nan, 'Korea, Dem. People?s Rep.', 'Czechoslovakia',
       'Yugoslavia', 'Liechtenstein', 'Nigeria', 'Puerto Rico', 'Taiwan',
       'Djibouti', 'Virgin Islands (U.S.)', 'Suriname', 'Bahamas, The',
       'Belarus', 'Uzbekistan', 'Azerbaijan', 'Georgia', 'Moldova',
       'Macedonia, FYR', 'Eritrea', 'Samoa'], dtype=object)

The groupby statement below will exclude these rows, so it is important to note their existence before this happens. The medals and ratings of these countries are still taken into account when calculating the medal and rating shares of the remaining countries.

### Group the dataframe by countries for each games they have competed in

In [57]:
# Sum each country's medals and athlete rating for every games it competed in.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple (['Gold', 'Silver', ...] without the inner
# brackets) is deprecated and raises an error in pandas 2.x.
# Rows with a null grouping key (e.g. a missing Education_Index) are excluded
# from the result by groupby.
medal_columns = ['Gold', 'Silver', 'Bronze', 'Total_Medals', 'Ath_Rating']
grouped = df.groupby(['Country', 'Host_City', 'Year', 'Education_Index'])[medal_columns].sum()
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Gold,Silver,Bronze,Total_Medals,Ath_Rating
Country,Host_City,Year,Education_Index,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,Beijing,2008,0.33216,0,0,1,1,1
Afghanistan,London,2012,0.365333,0,0,1,1,1
Algeria,Atlanta,1996,0.438191,2,0,1,3,7
Algeria,Barcelone,1992,0.39918,1,0,1,2,4
Algeria,Beijing,2008,0.59808,0,1,1,2,3


Reset index to facilitate further operations

In [58]:
# Turn the group keys back into ordinary columns.
# Guarded so the cell is safe to re-run: calling reset_index() again on the
# already-flattened frame would add a spurious 'index' column that would then
# end up in the saved CSV.
if isinstance(grouped.index, pd.MultiIndex):
    grouped = grouped.reset_index()
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1
1,Afghanistan,London,2012,0.365333,0,0,1,1,1
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3


## Group the original dataframe by games (Year and Host City) and sum the Golds, Medals and Ratings to work out the totals awarded at each games

In [59]:
# Totals awarded at each games, computed from the full dataframe `df`
# (one row per Year / Host_City pair).
per_games = df.groupby(['Year', 'Host_City'])
totMeds = per_games[['Gold', 'Total_Medals', 'Ath_Rating']].sum()
totMeds.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,Gold,Total_Medals,Ath_Rating
Year,Host_City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980,Lake Placid,72,218,436
1980,Moscow,454,1373,2732
1984,Los Angeles,496,1474,2942
1984,Sarajevo,74,222,444
1988,Calgary,88,264,528
1988,Seoul,506,1546,3063
1992,Albertville,107,324,648
1992,Barcelone,562,1712,3390
1994,Lillehammer,114,343,686
1996,Atlanta,615,1857,3696


# Adding Columns for Medal, Gold and Rating share
These columns facilitate comparisons between different games, in which varying numbers of medals were awarded

### Gold Share

In [60]:
# Fraction of all golds awarded at a games that each country won.
# The per-games total is looked up with a single left join on
# (Year, Host_City) instead of a chained .loc lookup inside a row-wise apply;
# the join keeps grouped's row order and index, so the division lines up.
gold_awarded = grouped[['Year', 'Host_City']].join(
    totMeds['Gold'].rename('Gold_Awarded'), on=['Year', 'Host_City']
)['Gold_Awarded']
grouped['Gold_Share'] = grouped['Gold'] / gold_awarded
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1,0.0
1,Afghanistan,London,2012,0.365333,0,0,1,1,1,0.0
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7,0.003252
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4,0.001779
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3,0.0


### Medal Share

In [61]:
# Fraction of all medals awarded at a games that each country won.
# The per-games total is looked up with a single left join on
# (Year, Host_City) instead of a chained .loc lookup inside a row-wise apply;
# the join keeps grouped's row order and index, so the division lines up.
medals_awarded = grouped[['Year', 'Host_City']].join(
    totMeds['Total_Medals'].rename('Medals_Awarded'), on=['Year', 'Host_City']
)['Medals_Awarded']
grouped['Medal_Share'] = grouped['Total_Medals'] / medals_awarded
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1,0.0,0.00049
1,Afghanistan,London,2012,0.365333,0,0,1,1,1,0.0,0.000514
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7,0.003252,0.001616
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4,0.001779,0.001168
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3,0.0,0.000981


### Rating Share

In [62]:
# Fraction of the total athlete rating at a games that each country earned.
# The per-games total is looked up with a single left join on
# (Year, Host_City) instead of a chained .loc lookup inside a row-wise apply;
# the join keeps grouped's row order and index, so the division lines up.
rating_awarded = grouped[['Year', 'Host_City']].join(
    totMeds['Ath_Rating'].rename('Rating_Awarded'), on=['Year', 'Host_City']
)['Rating_Awarded']
grouped['Rating_Share'] = grouped['Ath_Rating'] / rating_awarded
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1,0.0,0.00049,0.000248
1,Afghanistan,London,2012,0.365333,0,0,1,1,1,0.0,0.000514,0.00026
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7,0.003252,0.001616,0.001894
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4,0.001779,0.001168,0.00118
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3,0.0,0.000981,0.000743


# Normalising the data
Now that we have the metrics by which we will quantify success, we normalise them so that all of our data is on the same scale.<br>
The formula we will use for normalisation is min-max: $x' = \frac{x - \min}{\max - \min}$

In [63]:
def minMaxNorm(mn, mx, val):
    """Rescale ``val`` with min-max normalisation: ``(val - mn) / (mx - mn)``.

    Parameters
    ----------
    mn : number
        Value that maps to 0 (the minimum of the data being normalised).
    mx : number
        Value that maps to 1 (the maximum of the data being normalised).
    val : number or array-like
        Value(s) to rescale.

    Returns
    -------
    The rescaled value(s). When ``mn`` and ``mx`` are equal scalars the range
    is empty, so zeros are returned instead of dividing by zero.
    """
    span = mx - mn
    # A constant column has no spread; multiplying by 0.0 yields zeros while
    # keeping the shape and type of `val`.
    if np.isscalar(span) and span == 0:
        return (val - mn) * 0.0
    return (val - mn) / span

### Normalising Gold Share

In [64]:
# Rescale Gold_Share to the 0-1 range across all country/games rows.
gold_share = grouped['Gold_Share']
mn, mx = gold_share.min(), gold_share.max()
# minMaxNorm is plain arithmetic, so it can be applied to the whole column at once.
grouped['norm_Gold_Share'] = minMaxNorm(mn, mx, gold_share)
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1,0.0,0.00049,0.000248,0.0
1,Afghanistan,London,2012,0.365333,0,0,1,1,1,0.0,0.000514,0.00026,0.0
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7,0.003252,0.001616,0.001894,0.00698
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4,0.001779,0.001168,0.00118,0.003819
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3,0.0,0.000981,0.000743,0.0


### Normalising Medal Share

In [65]:
# Rescale Medal_Share to the 0-1 range across all country/games rows.
medal_share = grouped['Medal_Share']
mn, mx = medal_share.min(), medal_share.max()
# minMaxNorm is plain arithmetic, so it can be applied to the whole column at once.
grouped['norm_Medal_Share'] = minMaxNorm(mn, mx, medal_share)
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share,norm_Medal_Share
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1,0.0,0.00049,0.000248,0.0,0.0
1,Afghanistan,London,2012,0.365333,0,0,1,1,1,0.0,0.000514,0.00026,0.0,7.6e-05
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7,0.003252,0.001616,0.001894,0.00698,0.003573
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4,0.001779,0.001168,0.00118,0.003819,0.002153
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3,0.0,0.000981,0.000743,0.0,0.001558


### Normalising Rating Share

In [66]:
# Rescale Rating_Share to the 0-1 range across all country/games rows.
rating_share = grouped['Rating_Share']
mn, mx = rating_share.min(), rating_share.max()
# minMaxNorm is plain arithmetic, so it can be applied to the whole column at once.
grouped['norm_Rating_Share'] = minMaxNorm(mn, mx, rating_share)
grouped.head()

Unnamed: 0,Country,Host_City,Year,Education_Index,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share
0,Afghanistan,Beijing,2008,0.33216,0,0,1,1,1,0.0,0.00049,0.000248,0.0,0.0,0.0
1,Afghanistan,London,2012,0.365333,0,0,1,1,1,0.0,0.000514,0.00026,0.0,7.6e-05,3.6e-05
2,Algeria,Atlanta,1996,0.438191,2,0,1,3,7,0.003252,0.001616,0.001894,0.00698,0.003573,0.004839
3,Algeria,Barcelone,1992,0.39918,1,0,1,2,4,0.001779,0.001168,0.00118,0.003819,0.002153,0.00274
4,Algeria,Beijing,2008,0.59808,0,1,1,2,3,0.0,0.000981,0.000743,0.0,0.001558,0.001457


# Output

Now that we have a dataframe with the relevant information, it is saved to a CSV file to be used to generate the results

In [67]:
# Write the per-country, per-games table (without the row index) for the results notebooks.
output_path = '../../data/final/Education.csv'
grouped.to_csv(output_path, index=False)