# Population
## Purpose
Generates a dataframe containing the information relevant to our research into the effect that population can have on a country's ability to succeed at the Olympics.
## Datasets
<b>population_data.csv</b> - A csv containing the relevant data, created in the 1000-Joining.ipynb notebook

Import necessary libraries

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os.path
from sklearn.linear_model import LinearRegression 
import math

Loads in the dataset

In [64]:
# Relative path of the population dataset CSV that the rest of the notebook loads
filepath = '../../data/analysis/population_data.csv'

In [65]:
# Read the dataset into `df` when the file is present; otherwise only report
# that it is missing (no exception is raised and `df` is left unset).
if os.path.exists(filepath):
    df = pd.read_csv(filepath, encoding="ISO-8859-1")
    print("File Read")
else:
    print("Missing dataset file")

File Read


In [66]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year,Host_City,Host_Country,Total_Males,Total_Females,Total_Athletes,Summer,Winter,Discipline,...,Country,Population,Males,Females,Life_Expectancy,Region,Elevation,Area_SqKM,Population_Density,Code
0,0,1960,Rome,ITA,4727,611,5338,True,False,Sailing,...,Argentina,20619075.0,10471009.0,10148066.0,65.215537,South America,595.62,2775401.0,7.429224,ARG
1,1,1960,Rome,ITA,4727,611,5338,True,False,Sailing,...,Argentina,20619075.0,10471009.0,10148066.0,65.215537,South America,595.62,2775401.0,7.429224,ARG
2,2,1960,Rome,ITA,4727,611,5338,True,False,Sailing,...,Argentina,20619075.0,10471009.0,10148066.0,65.215537,South America,595.62,2775401.0,7.429224,ARG
3,3,1960,Rome,ITA,4727,611,5338,True,False,Boxing,...,Argentina,20619075.0,10471009.0,10148066.0,65.215537,South America,595.62,2775401.0,7.429224,ARG
4,4,1960,Rome,ITA,4727,611,5338,True,False,Swimming,...,Australia,10276477.0,5191453.0,5085024.0,70.817073,Oceania,272.4731,7662592.0,1.341123,AUS


Print the column names

In [67]:
# Display the labels of every column in the loaded dataframe
df.columns

Index(['Unnamed: 0', 'Year', 'Host_City', 'Host_Country', 'Total_Males',
       'Total_Females', 'Total_Athletes', 'Summer', 'Winter', 'Discipline',
       'Sport', 'Ath_Name', 'Gender', 'NOC', 'NOC_Males_Sent',
       'NOC_Females_Sent', 'NOC_Total_Sent', 'Home_Adv', 'Gold', 'Silver',
       'Bronze', 'Total_Medals', 'NOC_Gold', 'NOC_Silver', 'NOC_Bronze',
       'NOC_Total_Medals', 'NOC_Rating', 'NOC_Rank', 'Ath_Rating', 'Ath_Rank',
       'Country', 'Population', 'Males', 'Females', 'Life_Expectancy',
       'Region', 'Elevation', 'Area_SqKM', 'Population_Density', 'Code'],
      dtype='object')

## Adding Column for Share of Medals and Rating won at a single games 
To quantify how successful countries are across different Games, in which different numbers of medals are awarded, we decided the best method of comparison was to compare countries based on the share of medals or ratings they won at a given Olympics. This gives us a method of comparison between different Games.<br>
To work out the fraction of medals and ratings taken home by each country we must first work out the total number of medals and total ratings awarded at each Games

In [334]:
# Totals per Games: one row per (Year, Host_City) pair holding the summed
# golds, medals and athlete ratings across every row of df for that Games.
totMeds = (
    df.groupby(['Year', 'Host_City'])[['Gold', 'Total_Medals', 'Ath_Rating']]
    .sum()
)
totMeds.head(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Gold,Total_Medals,Ath_Rating
Year,Host_City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1960,Rome,295,882,1758
1960,Squaw Valley,50,147,295
1964,Innsbruck,60,185,372
1964,Tokyo,331,1013,2013
1968,Grenoble,66,199,401
1968,Mexico,341,1030,2051
1972,Munich,387,1184,2343
1972,Sapporo,70,200,404
1976,Innsbruck,70,210,420
1976,Montréal,429,1304,2591


The next stage is to work out the number of gold, silver and bronze medals and medals overall won by each country (split by gender) at every Games, along with each country's total rating.

In [69]:
# Sum medals and athlete ratings for each Country / Host_City / Year / Gender
# combination, then turn the group keys back into ordinary columns.
countryDF = (
    df.groupby(['Country', 'Host_City', 'Year', 'Gender'])[
        ['Gold', 'Silver', 'Bronze', 'Total_Medals', 'Ath_Rating']
    ]
    .sum()
    .reset_index()
)
countryDF.head(n=10)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1
1,Afghanistan,London,2012,Men,0,0,1,1,1
2,Algeria,Atlanta,1996,Men,2,0,1,3,7
3,Algeria,Barcelone,1992,Men,0,0,1,1,1
4,Algeria,Barcelone,1992,Women,1,0,0,1,3
5,Algeria,Beijing,2008,Men,0,1,0,1,2
6,Algeria,Beijing,2008,Women,0,0,1,1,1
7,Algeria,London,2012,Men,1,0,0,1,3
8,Algeria,Los Angeles,1984,Men,0,0,2,2,2
9,Algeria,Rio,2016,Men,0,2,0,2,4


### Share of Golds

In [70]:
# Fraction of the Games' golds held by each row: the row's Gold divided by the
# Gold total that totMeds stores for the same (Year, Host_City).
countryDF['Gold_Share'] = countryDF.apply(
    lambda row: row['Gold'] / totMeds.loc[(row['Year'], row['Host_City']), 'Gold'],
    axis=1,
)
countryDF.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779


### Share of Total Medals

In [71]:
# Fraction of the Games' medals held by each row: the row's Total_Medals divided
# by the Total_Medals figure that totMeds stores for the same (Year, Host_City).
countryDF['Medal_Share'] = countryDF.apply(
    lambda row: row['Total_Medals']
    / totMeds.loc[(row['Year'], row['Host_City']), 'Total_Medals'],
    axis=1,
)
countryDF.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,0.00049
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,0.000514
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,0.001616
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,0.000584
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,0.000584


### Share of Ratings

In [72]:
# Fraction of the Games' rating held by each row: the row's Ath_Rating divided
# by the Ath_Rating total that totMeds stores for the same (Year, Host_City).
countryDF['Rating_Share'] = countryDF.apply(
    lambda row: row['Ath_Rating']
    / totMeds.loc[(row['Year'], row['Host_City']), 'Ath_Rating'],
    axis=1,
)
countryDF.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,Medal_Share,Rating_Share
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,0.00049,0.000248
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,0.000514,0.00026
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,0.001616,0.001894
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,0.000584,0.000295
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,0.000584,0.000885


# Adding in relevant Population Data
The data for the population, the number of males and females, the population density and how many athletes each country sent to the Games will now be added to countryDF

Select the relevant data from the dataframe

In [73]:
# Keep only the population / delegation-size columns and collapse the repeated
# rows of df down to the distinct combinations of those columns.
popDF = df.loc[:, [
    'Year', 'Host_City', 'Country', 'Males', 'Females', 'Summer', 'Winter',
    'Population_Density', 'Population',
    'NOC_Males_Sent', 'NOC_Females_Sent', 'NOC_Total_Sent',
]].drop_duplicates()
popDF.head(n=5)

Unnamed: 0,Year,Host_City,Country,Males,Females,Summer,Winter,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent
0,1960,Rome,Argentina,10471009.0,10148066.0,True,False,7.429224,20619075.0,91.0,0.0,91.0
4,1960,Rome,Australia,5191453.0,5085024.0,True,False,1.341123,10276477.0,165.0,29.0,194.0
28,1960,Rome,Austria,3282797.0,3764742.0,True,False,83.004368,7047539.0,83.0,21.0,104.0
31,1960,Rome,Belgium,4503524.0,4649965.0,True,False,292.272576,9153489.0,86.0,8.0,94.0
35,1960,Rome,Brazil,35952077.0,36255477.0,True,False,8.575067,72207554.0,70.0,1.0,71.0


The next step was to join the two dataframes together on Country, Year and Host_City columns

In [74]:
# Left-join the population columns onto the per-country medal table, matching
# rows on Country, Host_City and Year.
# NOTE(review): popDF is de-duplicated on all of its columns, not just these
# three keys; if any key combination has more than one distinct row in popDF,
# this merge repeats the matching countryDF rows -- verify.
new_df = countryDF.merge(popDF, how='left', on=['Country', 'Host_City', 'Year'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Rating_Share,Males,Females,Summer,Winter,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,0.000248,14054874.0,13239157.0,True,False,42.236961,27294031.0,3.0,1.0,4.0
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,0.00026,15784301.0,14912657.0,True,False,47.502922,30696958.0,5.0,1.0,6.0
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,0.001894,14898762.0,14512653.0,True,False,12.735059,29411415.0,38.0,6.0,44.0
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,0.000295,13764639.0,13416455.0,True,False,11.769336,27181094.0,33.0,2.0,35.0
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,0.000885,13764639.0,13416455.0,True,False,11.769336,27181094.0,33.0,2.0,35.0


# Population Representation
Examine the representation of each country in terms of the fraction of their population who compete at the games

### Female

In [75]:
# Female athletes sent, as a fraction of the country's female population
new_df['Female_Representation'] = new_df['NOC_Females_Sent'].div(new_df['Females'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Males,Females,Summer,Winter,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,14054874.0,13239157.0,True,False,42.236961,27294031.0,3.0,1.0,4.0,7.553351e-08
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,15784301.0,14912657.0,True,False,47.502922,30696958.0,5.0,1.0,6.0,6.705713e-08
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,14898762.0,14512653.0,True,False,12.735059,29411415.0,38.0,6.0,44.0,4.134323e-07
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,13764639.0,13416455.0,True,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,13764639.0,13416455.0,True,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07


### Male

In [76]:
# Male athletes sent, as a fraction of the country's male population
new_df['Male_Representation'] = new_df['NOC_Males_Sent'].div(new_df['Males'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Females,Summer,Winter,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,13239157.0,True,False,42.236961,27294031.0,3.0,1.0,4.0,7.553351e-08,2.134491e-07
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,14912657.0,True,False,47.502922,30696958.0,5.0,1.0,6.0,6.705713e-08,3.167704e-07
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,14512653.0,True,False,12.735059,29411415.0,38.0,6.0,44.0,4.134323e-07,2.550547e-06
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,13416455.0,True,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,13416455.0,True,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06


### Overall 

In [77]:
# All athletes sent, as a fraction of the country's total population
new_df['Population_Representation'] = new_df['NOC_Total_Sent'].div(new_df['Population'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Summer,Winter,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation,Population_Representation
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,True,False,42.236961,27294031.0,3.0,1.0,4.0,7.553351e-08,2.134491e-07,1.465522e-07
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,True,False,47.502922,30696958.0,5.0,1.0,6.0,6.705713e-08,3.167704e-07,1.954591e-07
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,True,False,12.735059,29411415.0,38.0,6.0,44.0,4.134323e-07,2.550547e-06,1.496018e-06
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,True,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,True,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06


# Normalising Data
Now that all of the relevant data is stored within a single dataframe the next stage is to normalise it so that it is all within the same scale (0 - 1)<br>

To normalise the data we decided to use the min-max formula

In [78]:
def minMaxNorm(mn, mx, val):
    """Rescale ``val`` from the range [mn, mx] onto [0, 1] (min-max normalisation).

    Parameters
    ----------
    mn : scalar
        Lower bound of the original range; maps to 0.
    mx : scalar
        Upper bound of the original range; maps to 1.
    val : scalar or array-like supporting arithmetic
        Value(s) to rescale.

    Returns
    -------
    ``(val - mn) / (mx - mn)``. When ``mx == mn`` the range has zero width and
    the division is undefined, so zero is returned instead (missing values in
    ``val`` stay missing).
    """
    span = mx - mn
    if span == 0:
        # Constant data: every value sits at the minimum, so it normalises to 0.
        # Multiplying (rather than returning a literal 0.0) keeps the shape of
        # array-like input and lets NaN propagate.
        return (val - mn) * 0.0
    return (val - mn) / span

### Gold_Share

In [79]:
# Min-max scale Gold_Share using the column's own minimum and maximum.
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once.
mn, mx = new_df['Gold_Share'].min(), new_df['Gold_Share'].max()
new_df['norm_Gold_Share'] = minMaxNorm(mn, mx, new_df['Gold_Share'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Winter,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation,Population_Representation,norm_Gold_Share
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,False,42.236961,27294031.0,3.0,1.0,4.0,7.553351e-08,2.134491e-07,1.465522e-07,0.0
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,False,47.502922,30696958.0,5.0,1.0,6.0,6.705713e-08,3.167704e-07,1.954591e-07,0.0
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,False,12.735059,29411415.0,38.0,6.0,44.0,4.134323e-07,2.550547e-06,1.496018e-06,0.007588
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.0
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,False,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.004152


### Medal_Share

In [80]:
# Min-max scale Medal_Share using the column's own minimum and maximum.
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once.
mn, mx = new_df['Medal_Share'].min(), new_df['Medal_Share'].max()
new_df['norm_Medal_Share'] = minMaxNorm(mn, mx, new_df['Medal_Share'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Population_Density,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation,Population_Representation,norm_Gold_Share,norm_Medal_Share
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,42.236961,27294031.0,3.0,1.0,4.0,7.553351e-08,2.134491e-07,1.465522e-07,0.0,0.0
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,47.502922,30696958.0,5.0,1.0,6.0,6.705713e-08,3.167704e-07,1.954591e-07,0.0,0.000101
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,12.735059,29411415.0,38.0,6.0,44.0,4.134323e-07,2.550547e-06,1.496018e-06,0.007588,0.004735
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.0,0.000394
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,11.769336,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.004152,0.000394


### Rating_Share

In [81]:
# Min-max scale Rating_Share using the column's own minimum and maximum.
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once.
mn, mx = new_df['Rating_Share'].min(), new_df['Rating_Share'].max()
new_df['norm_Rating_Share'] = minMaxNorm(mn, mx, new_df['Rating_Share'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Population,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation,Population_Representation,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,27294031.0,3.0,1.0,4.0,7.553351e-08,2.134491e-07,1.465522e-07,0.0,0.0,0.0
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,30696958.0,5.0,1.0,6.0,6.705713e-08,3.167704e-07,1.954591e-07,0.0,0.000101,4.8e-05
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,29411415.0,38.0,6.0,44.0,4.134323e-07,2.550547e-06,1.496018e-06,0.007588,0.004735,0.006418
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.0,0.000394,0.000184
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,27181094.0,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.004152,0.000394,0.002484


### Population 

In [82]:
# Min-max scale Population using the column's own minimum and maximum.
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once.
mn, mx = new_df['Population'].min(), new_df['Population'].max()
new_df['norm_Population'] = minMaxNorm(mn, mx, new_df['Population'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,NOC_Males_Sent,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation,Population_Representation,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share,norm_Population
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,3.0,1.0,4.0,7.553351e-08,2.134491e-07,1.465522e-07,0.0,0.0,0.0,0.01978
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,5.0,1.0,6.0,6.705713e-08,3.167704e-07,1.954591e-07,0.0,0.000101,4.8e-05,0.022249
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,38.0,6.0,44.0,4.134323e-07,2.550547e-06,1.496018e-06,0.007588,0.004735,0.006418,0.021316
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.0,0.000394,0.000184,0.019699
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,33.0,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.004152,0.000394,0.002484,0.019699


### Population Density

In [83]:
# Min-max scale Population_Density using the column's own minimum and maximum.
# minMaxNorm is plain arithmetic, so it is applied to the whole column at once.
mn, mx = new_df['Population_Density'].min(), new_df['Population_Density'].max()
new_df['norm_Population_Density'] = minMaxNorm(mn, mx, new_df['Population_Density'])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,NOC_Females_Sent,NOC_Total_Sent,Female_Representation,Male_Representation,Population_Representation,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share,norm_Population,norm_Population_Density
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,1.0,4.0,7.553351e-08,2.134491e-07,1.465522e-07,0.0,0.0,0.0,0.01978,0.00508
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,1.0,6.0,6.705713e-08,3.167704e-07,1.954591e-07,0.0,0.000101,4.8e-05,0.022249,0.005726
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,6.0,44.0,4.134323e-07,2.550547e-06,1.496018e-06,0.007588,0.004735,0.006418,0.021316,0.001466
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.0,0.000394,0.000184,0.019699,0.001347
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,2.0,35.0,1.490707e-07,2.397448e-06,1.28766e-06,0.004152,0.000394,0.002484,0.019699,0.001347


### Male, Female and Overall Representation

In [84]:
# Min-max scale each representation column against its own minimum and maximum,
# storing the result in a new column prefixed with "norm_".
columns = ['Male_Representation', 'Female_Representation', 'Population_Representation']
for col in columns:
    mn, mx = new_df[col].min(), new_df[col].max()
    new_df['norm_{}'.format(col)] = minMaxNorm(mn, mx, new_df[col])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,Male_Representation,Population_Representation,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share,norm_Population,norm_Population_Density,norm_Male_Representation,norm_Female_Representation,norm_Population_Representation
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,2.134491e-07,1.465522e-07,0.0,0.0,0.0,0.01978,0.00508,0.000246,0.000315,0.000273
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,3.167704e-07,1.954591e-07,0.0,0.000101,4.8e-05,0.022249,0.005726,0.000372,0.00028,0.000379
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,2.550547e-06,1.496018e-06,0.007588,0.004735,0.006418,0.021316,0.001466,0.00309,0.001726,0.00319
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,2.397448e-06,1.28766e-06,0.0,0.000394,0.000184,0.019699,0.001347,0.002904,0.000622,0.002739
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,2.397448e-06,1.28766e-06,0.004152,0.000394,0.002484,0.019699,0.001347,0.002904,0.000622,0.002739


### Males and Females

In [85]:
# Min-max scale the male and female population columns against their own
# minimum and maximum, storing the result in "norm_"-prefixed columns.
columns = ['Males', 'Females']
for col in columns:
    mn, mx = new_df[col].min(), new_df[col].max()
    new_df['norm_{}'.format(col)] = minMaxNorm(mn, mx, new_df[col])
new_df.head(n=5)

Unnamed: 0,Country,Host_City,Year,Gender,Gold,Silver,Bronze,Total_Medals,Ath_Rating,Gold_Share,...,norm_Gold_Share,norm_Medal_Share,norm_Rating_Share,norm_Population,norm_Population_Density,norm_Male_Representation,norm_Female_Representation,norm_Population_Representation,norm_Males,norm_Females
0,Afghanistan,Beijing,2008,Men,0,0,1,1,1,0.0,...,0.0,0.0,0.0,0.01978,0.00508,0.000246,0.000315,0.000273,0.019769,0.019792
1,Afghanistan,London,2012,Men,0,0,1,1,1,0.0,...,0.0,0.000101,4.8e-05,0.022249,0.005726,0.000372,0.00028,0.000379,0.022204,0.022296
2,Algeria,Atlanta,1996,Men,2,0,1,3,7,0.003252,...,0.007588,0.004735,0.006418,0.021316,0.001466,0.00309,0.001726,0.00319,0.020957,0.021698
3,Algeria,Barcelone,1992,Men,0,0,1,1,1,0.0,...,0.0,0.000394,0.000184,0.019699,0.001347,0.002904,0.000622,0.002739,0.019361,0.020057
4,Algeria,Barcelone,1992,Women,1,0,0,1,3,0.001779,...,0.004152,0.000394,0.002484,0.019699,0.001347,0.002904,0.000622,0.002739,0.019361,0.020057


# Output Data for Final Analysis

In [96]:
# Write the finished dataframe to CSV, leaving out the row index
new_df.to_csv(path_or_buf='../../data/final/Population.csv', index=False)