# Elevation<br>
## Purpose
Evaluates the effect of elevation on a country's ability to win medals
## Datasets
<b>Elevation.csv</b>

Import Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os.path
from sklearn.linear_model import LinearRegression 
import math

Loads in dataset

In [3]:
# Path to the input CSV, relative to the notebook's working directory.
filepath = '../../data/analysis/elevation_data.csv'

# Fail fast: every later cell needs `df`, so merely printing a message and
# carrying on would only surface as an unrelated NameError further down.
if not os.path.exists(filepath):
    raise FileNotFoundError("Missing dataset file: " + filepath)

# The encoding is pinned explicitly rather than relying on pandas' UTF-8 default
# (presumably the file contains non-UTF-8 characters -- TODO confirm).
df = pd.read_csv(filepath, encoding="ISO-8859-1")
print("File Read")

File Read


In [4]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Host_Country,Host_City,Summer,Winter,Total_Males,Total_Females,Total_Athletes,Discipline,...,CO2_Emissions,Methane_Emissions,Nitrous_Oxide_Emisions,Total_Emissions,Emmisions_per_Capita,Code,Lowest_Point,Highest_Point,Elevation_Range,Education_Index
0,0,1960,ITA,Rome,True,False,4727,611,5338,Sailing,...,48815.104,,,48815.104,,ARG,-105.0,6962.0,7067.0,
1,1,1960,ITA,Rome,True,False,4727,611,5338,Boxing,...,48815.104,,,48815.104,,ARG,-105.0,6962.0,7067.0,
2,2,1960,ITA,Rome,True,False,4727,611,5338,Swimming,...,88202.351,,,88202.351,,AUS,-15.0,2228.0,2243.0,
3,3,1960,ITA,Rome,True,False,4727,611,5338,Eventing,...,88202.351,,,88202.351,,AUS,-15.0,2228.0,2243.0,
4,4,1960,ITA,Rome,True,False,4727,611,5338,Swimming,...,88202.351,,,88202.351,,AUS,-15.0,2228.0,2243.0,


Some of the columns in the dataframe are not useful for research into elevation

In [5]:
# List every column name so the ones relevant to elevation can be picked out.
df.columns

Index(['Unnamed: 0', 'Year', 'Host_Country', 'Host_City', 'Summer', 'Winter',
       'Total_Males', 'Total_Females', 'Total_Athletes', 'Discipline', 'Sport',
       'Ath_Name', 'Gender', 'Home_Adv', 'Gold', 'Silver', 'Bronze',
       'Total_Medals', 'Ath_Rating', 'Ath_Rank', 'NOC', 'NOC_Males_Sent',
       'NOC_Females_Sent', 'NOC_Total_Sent', 'NOC_Gold', 'NOC_Silver',
       'NOC_Bronze', 'NOC_Total_Medals', 'NOC_Rating', 'NOC_Rank', 'Country',
       'Population', 'Males', 'Females', 'Life_Expectancy', 'GDP', 'Region',
       'Elevation', 'Area_SqKM', 'Centroid_Longitude', 'Centroid_Latitude',
       'Population_Density', 'CO2_Emissions', 'Methane_Emissions',
       'Nitrous_Oxide_Emisions', 'Total_Emissions', 'Emmisions_per_Capita',
       'Code', 'Lowest_Point', 'Highest_Point', 'Elevation_Range',
       'Education_Index'],
      dtype='object')

We decided that the following columns were relevant to our study

In [6]:
# Columns kept for the elevation study: games and athlete details, medal
# counts, NOC delegation sizes, and the country's geography/elevation fields.
elevation_columns = [
    'Year', 'Host_Country', 'Host_City', 'Summer', 'Winter',
    'Total_Males', 'Total_Females', 'Total_Athletes',
    'Discipline', 'Sport', 'Ath_Name', 'Gender', 'Home_Adv',
    'Gold', 'Silver', 'Bronze', 'Total_Medals', 'Ath_Rating', 'Ath_Rank',
    'NOC', 'NOC_Males_Sent', 'NOC_Females_Sent', 'NOC_Total_Sent',
    'Country', 'Region',
    'Elevation', 'Area_SqKM', 'Centroid_Longitude', 'Centroid_Latitude',
    'Lowest_Point', 'Highest_Point', 'Elevation_Range',
]

# Narrow the frame to those columns (same order as listed) and preview it.
df = df.loc[:, elevation_columns]
df.head()

Unnamed: 0,Year,Host_Country,Host_City,Summer,Winter,Total_Males,Total_Females,Total_Athletes,Discipline,Sport,...,NOC_Total_Sent,Country,Region,Elevation,Area_SqKM,Centroid_Longitude,Centroid_Latitude,Lowest_Point,Highest_Point,Elevation_Range
0,1960,ITA,Rome,True,False,4727,611,5338,Sailing,Sailing,...,91.0,Argentina,South America,595.62,2775401.0,-64.92097,-35.38706,-105.0,6962.0,7067.0
1,1960,ITA,Rome,True,False,4727,611,5338,Boxing,Boxing,...,91.0,Argentina,South America,595.62,2775401.0,-64.92097,-35.38706,-105.0,6962.0,7067.0
2,1960,ITA,Rome,True,False,4727,611,5338,Swimming,Aquatics,...,194.0,Australia,Oceania,272.4731,7662592.0,134.6131,-25.84766,-15.0,2228.0,2243.0
3,1960,ITA,Rome,True,False,4727,611,5338,Eventing,Equestrian,...,194.0,Australia,Oceania,272.4731,7662592.0,134.6131,-25.84766,-15.0,2228.0,2243.0
4,1960,ITA,Rome,True,False,4727,611,5338,Swimming,Aquatics,...,194.0,Australia,Oceania,272.4731,7662592.0,134.6131,-25.84766,-15.0,2228.0,2243.0


Checks to see what null values exist for elevation

In [7]:
# Show the rows that have no elevation value.
df.loc[df['Elevation'].isna()]

Unnamed: 0,Year,Host_Country,Host_City,Summer,Winter,Total_Males,Total_Females,Total_Athletes,Discipline,Sport,...,NOC_Total_Sent,Country,Region,Elevation,Area_SqKM,Centroid_Longitude,Centroid_Latitude,Lowest_Point,Highest_Point,Elevation_Range
1811,1972,DEU,Munich,True,False,6075,1059,7134,Weightlifting,Weightlifting,...,19.0,,,,,,,,,
3048,1980,RUS,Moscow,True,False,4064,1115,5179,Wrestling Greco-Roman,Wrestling,...,16.0,,,,,,,,,
4200,1988,KOR,Seoul,True,False,6197,2194,8391,Sailing,Sailing,...,2.0,,,,,,,,,
5273,1992,ESP,Barcelone,True,False,6652,2704,9356,Shooting,Shooting,...,59.0,,,,,,,,,
5274,1992,ESP,Barcelone,True,False,6652,2704,9356,Shooting,Shooting,...,59.0,,,,,,,,,
5275,1992,ESP,Barcelone,True,False,6652,2704,9356,Shooting,Shooting,...,59.0,,,,,,,,,
11369,2016,BRA,Rio,True,False,6173,5055,11228,Shooting,Shooting,...,9.0,,,,,,,,,
11370,2016,BRA,Rio,True,False,6173,5055,11228,Shooting,Shooting,...,9.0,,,,,,,,,


In [8]:
# Number of rows that have no elevation value.
int(df['Elevation'].isna().sum())

8

The data above relates to eight medal-winning athletes from NOCs for which we were unable to obtain country data. We have made the decision to remove these rows, as they would skew our results.

In [9]:
# Discard the rows that have no elevation value, then report how many remain.
df = df.dropna(subset=['Elevation'])
df.shape[0]

11814

## Group the data by Country
To be able to examine what effect elevation has, we had to group the data by country

In [10]:
# Aggregate the athlete-level rows into one row per distinct combination of
# country, elevation fields and sport, summing the medal counts and ratings.
# The elevation columns are part of the key so they are carried into the result.
group_keys = ['Country', 'Elevation', 'Lowest_Point', 'Highest_Point',
              'Elevation_Range', 'Sport']
medal_columns = ['Gold', 'Silver', 'Bronze', 'Total_Medals', 'Ath_Rating']

# The columns must be selected with a *list*: indexing a groupby with a bare
# tuple of names (`[...]['Gold', 'Silver', ...]`) is deprecated and raises on
# pandas 2.x.
# NOTE(review): groupby drops rows whose key columns contain NaN by default --
# confirm that is intended for countries missing Lowest_Point/Highest_Point.
countryDF = df.groupby(group_keys)[medal_columns].sum()

# Turn the group keys back into ordinary columns.
countryDF = countryDF.reset_index()
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1


## Finding Medal Share

The first step is to establish how many medals have been awarded in each sport (summed over all games in the dataset)

In [11]:
# Per-sport totals of gold medals, all medals and athlete rating; these are
# the denominators used when computing each country's share.
total_columns = ['Gold', 'Total_Medals', 'Ath_Rating']
totMeds = df.groupby('Sport')[total_columns].agg('sum')
totMeds.head(40)

Unnamed: 0_level_0,Gold,Total_Medals,Ath_Rating
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aquatics,693,1911,3918
Archery,48,131,267
Athletics,649,1888,3812
Badminton,29,94,184
Baseball,5,15,30
Basketball,20,71,136
Biathlon,97,274,563
Bobsleigh,34,109,215
Boxing,172,687,1202
Canoe,226,651,1320


### Gold Share
Evaluating what fraction of the gold medals were awarded to a given country in a given event

In [12]:
# Fraction of each sport's gold medals won by the row's country.
# Vectorised: look up every row's per-sport total in one pass with `map`
# instead of a Python-level `apply(axis=1)` doing a `.loc` lookup per row.
sport_gold_total = countryDF['Sport'].map(totMeds['Gold'])
countryDF['Gold_Share'] = countryDF['Gold'] / sport_gold_total
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0


### Medal Share
Evaluating what fraction of the medals were awarded to a given country in a given event

In [13]:
# Fraction of each sport's total medals won by the row's country.
# Vectorised: look up every row's per-sport total in one pass with `map`
# instead of a Python-level `apply(axis=1)` doing a `.loc` lookup per row.
sport_medal_total = countryDF['Sport'].map(totMeds['Total_Medals'])
countryDF['Medal_Share'] = countryDF['Total_Medals'] / sport_medal_total
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0,0.013889
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163,0.004767
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814,0.008734
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0,0.00367
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0,0.000523


### Rating Share
Evaluating what fraction of the total athlete rating was earned by a given country in a given event

In [14]:
# Fraction of each sport's summed athlete rating earned by the row's country.
# Vectorised: look up every row's per-sport total in one pass with `map`
# instead of a Python-level `apply(axis=1)` doing a `.loc` lookup per row.
sport_rating_total = countryDF['Sport'].map(totMeds['Ath_Rating'])
countryDF['Rating_Share'] = countryDF['Ath_Rating'] / sport_rating_total
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0,0.013889,0.007576
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163,0.004767,0.005247
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814,0.008734,0.006656
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0,0.00367,0.003141
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0,0.000523,0.000255


# Normalising the data

In [15]:
def minMaxNorm(mn,mx,val):
    """Min-max rescale ``val`` relative to the range ``[mn, mx]``.

    Parameters
    ----------
    mn : number
        Lower end of the source range; a ``val`` equal to ``mn`` maps to 0.
    mx : number
        Upper end of the source range; a ``val`` equal to ``mx`` maps to 1.
    val : number or array-like
        Value(s) to rescale. Anything supporting ``-``, ``*`` and ``/`` works,
        including a whole pandas Series.

    Returns
    -------
    ``(val - mn) / (mx - mn)``. When ``mn`` and ``mx`` are equal scalars the
    range has no width, so zero(s) shaped like ``val`` are returned instead
    of dividing by zero.
    """
    span = mx - mn
    # Only the scalar case is guarded; array-valued bounds keep the original
    # element-wise division semantics.
    if np.ndim(span) == 0 and span == 0:
        # Multiplying (rather than returning a literal 0.0) keeps the shape of
        # `val` and lets NaN inputs stay NaN.
        return (val - mn) * 0.0
    return (val - mn) / span

### Gold_Share

In [16]:
# Rescale Gold_Share so its smallest value maps to 0 and its largest to 1.
mn = countryDF['Gold_Share'].min()
mx = countryDF['Gold_Share'].max()
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once
# rather than element by element through `apply` and a lambda.
countryDF['norm_Gold_Share'] = minMaxNorm(mn, mx, countryDF['Gold_Share'])
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0,0.013889,0.007576,0.0
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163,0.004767,0.005247,0.007088
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814,0.008734,0.006656,0.006686
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0,0.00367,0.003141,0.0
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0,0.000523,0.000255,0.0


### Medal_Share

In [17]:
# Rescale Medal_Share so its smallest value maps to 0 and its largest to 1.
mn = countryDF['Medal_Share'].min()
mx = countryDF['Medal_Share'].max()
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once
# rather than element by element through `apply` and a lambda.
countryDF['norm_Medal_Share'] = minMaxNorm(mn, mx, countryDF['Medal_Share'])
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share,norm_Medal_Share
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0,0.013889,0.007576,0.0,0.030305
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163,0.004767,0.005247,0.007088,0.009622
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814,0.008734,0.006656,0.006686,0.018616
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0,0.00367,0.003141,0.0,0.007134
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0,0.000523,0.000255,0.0,0.0


### Rating_Share

In [18]:
# Rescale Rating_Share so its smallest value maps to 0 and its largest to 1.
mn = countryDF['Rating_Share'].min()
mx = countryDF['Rating_Share'].max()
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once
# rather than element by element through `apply` and a lambda.
countryDF['norm_Rating_Share'] = minMaxNorm(mn, mx, countryDF['Rating_Share'])
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0,0.013889,0.007576,0.0,0.030305,0.012991
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163,0.004767,0.005247,0.007088,0.009622,0.008858
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814,0.008734,0.006656,0.006686,0.018616,0.011358
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0,0.00367,0.003141,0.0,0.007134,0.005122
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0,0.000523,0.000255,0.0,0.0,0.0


### Average Elevation

In [19]:
# Rescale Elevation so its smallest value maps to 0 and its largest to 1.
mn = countryDF['Elevation'].min()
mx = countryDF['Elevation'].max()
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once
# rather than element by element through `apply` and a lambda.
countryDF['norm_Elevation'] = minMaxNorm(mn, mx, countryDF['Elevation'])
countryDF.head()

Unnamed: 0,Country,Elevation,Lowest_Point,Highest_Point,Elevation_Range,Sport,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share,norm_Elevation
0,Afghanistan,1884.71,258.0,7492.0,7234.0,Taekwondo,0,0,2,2,2,0.0,0.013889,0.007576,0.0,0.030305,0.012991,0.591575
1,Algeria,572.8028,-40.0,2908.0,2948.0,Athletics,4,3,2,9,20,0.006163,0.004767,0.005247,0.007088,0.009622,0.008858,0.179792
2,Algeria,572.8028,-40.0,2908.0,2948.0,Boxing,1,0,5,6,8,0.005814,0.008734,0.006656,0.006686,0.018616,0.011358,0.179792
3,Algeria,572.8028,-40.0,2908.0,2948.0,Judo,0,1,1,2,3,0.0,0.00367,0.003141,0.0,0.007134,0.005122,0.179792
4,Argentina,595.62,-105.0,6962.0,7067.0,Aquatics,0,0,1,1,1,0.0,0.000523,0.000255,0.0,0.0,0.0,0.186954


# Output

In [20]:
# Persist the per-country, per-sport table (without the row index).
# This notebook analyses elevation, so the result goes to Elevation.csv.
# NOTE(review): the original target was Education.csv, which looks like a
# copy-paste slip that would overwrite the education analysis' output --
# confirm no downstream step expects this table under the old name.
countryDF.to_csv('../../data/final/Elevation.csv', index=False)