In [1]:
# Import Dependencies
from pymongo import MongoClient
from password import connection_string
import pandas as pd
import numpy as np

In [2]:
# Import datasets. All raw files sit in one folder, so the folder name is
# defined once here and reused for every path.
DATA_DIR = 'Data Sources'

athlete_events = pd.read_csv(f'{DATA_DIR}/athlete_events.csv')
noc_regions = pd.read_csv(f'{DATA_DIR}/noc_regions.csv')
gdp = pd.read_excel(f'{DATA_DIR}/gdp.xlsx')
coordinates = pd.read_csv(f'{DATA_DIR}/coordinates.csv')
hosts = pd.read_csv(f'{DATA_DIR}/hosts.csv')
population = pd.read_excel(f'{DATA_DIR}/population.xlsx')

In [3]:
# Build the working DataFrame for the athlete_events table
athlete_events_df = pd.DataFrame(data=athlete_events)

In [4]:
# Build the working DataFrame for the noc_regions table
noc_regions_df = pd.DataFrame(data=noc_regions)

In [5]:
# Build the working DataFrame for the GDP table
gdp_df = pd.DataFrame(data=gdp)

In [6]:
# Build the working DataFrame for the coordinates table
coordinates_df = pd.DataFrame(data=coordinates)

In [7]:
# Build the working DataFrame for the hosts table
hosts_df = pd.DataFrame(data=hosts)

In [8]:
# Build the working DataFrame for the population table
population_df = pd.DataFrame(data=population)

Connecting to the Database

In [9]:
# # connect to MongoDB
# myclient = MongoClient(connection_string) 
   
# # database
# db = myclient["test_database"]
   
# # collection
# collection = db["test_data2"]
  
# # change format of data
# data = noc_regions_df.to_dict('records')

# # add data to MongoDB
# collection.insert_many(data)

In [10]:
# # databases that are on MongoDB
# for db in myclient.list_databases():
#     print(db)

In [11]:
# # Take data off of MongoDB
# df = pd.DataFrame(list(collection.find()))

In [12]:
# df.head()

Clean the data

In [13]:
# Return first 5 rows of athlete_events_df
athlete_events_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [14]:
# Return first 5 rows of noc_regions_df
noc_regions_df.head()

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,ALB,Albania,
2,ALG,Algeria,
3,ASA,American Samoa,
4,AND,Andorra,


In [15]:
# Left-join the region lookup onto every athlete row via the shared 'NOC' code.
# Athlete rows whose NOC has no entry in noc_regions_df are kept with NaN region/notes.
olympic_data_df = pd.merge(athlete_events_df, noc_regions_df, how='left', on='NOC')

In [16]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,region,notes
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,Denmark,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,Denmark,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,,Netherlands,


In [17]:
# Check for 'NaN'
olympic_data_df.isna().sum()

ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
region       370
notes     266077
dtype: int64

In [18]:
# Checking if there are any rows where the 'NOC' didn't have a 'region' to match it
olympic_data_df.loc[olympic_data_df['region'].isnull(),['NOC', 'Team']].drop_duplicates()

Unnamed: 0,NOC,Team
578,SGP,Singapore
6267,ROT,Refugee Olympic Athletes
44376,SGP,June Climene
61080,UNK,Unknown
64674,TUV,Tuvalu
80986,SGP,Rika II
108582,SGP,Singapore-2
235895,SGP,Singapore-1


In [19]:
# "Manually" fixing the values above
# NOC code -> region name to write for every row carrying that code
region_by_noc = {
    'SGP': 'Singapore',
    'ROT': 'Refugee Olympic Athletes',
    'TUV': 'Tuvalu',
    'UNK': 'Unknown',
}
for noc_code, region_name in region_by_noc.items():
    olympic_data_df.loc[olympic_data_df['NOC'] == noc_code, 'region'] = region_name

In [20]:
# Check for 'NaN'
olympic_data_df.isna().sum()

ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
region         0
notes     266077
dtype: int64

In [21]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,region,notes
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,Denmark,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,Denmark,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,,Netherlands,


In [22]:
# Drop the columns that are not used further on.
# The original 'Team' goes too; 'region' is kept and stands in for it.
olympic_data_df = olympic_data_df.drop(
    columns=['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team',
             'Games', 'City', 'Sport', 'notes']
)

In [23]:
# 'region' becomes the new 'Team' column
olympic_data_df = olympic_data_df.rename(columns={'region': 'Team'})

In [24]:
# Put the columns in a fixed, readable order
olympic_data_df = olympic_data_df.loc[:, ['Team', 'NOC', 'Year', 'Season', 'Event', 'Medal']]

In [25]:
# Find unique values in the medal column
olympic_data_df["Medal"].unique()

array([nan, 'Gold', 'Bronze', 'Silver'], dtype=object)

In [26]:
# Replace 'NaN' in the 'Medal' column with 'DNW' (Did not win). This way, these rows won't be dropped
# if 'dropna' is used later. Even though those athletes didn't win a medal, the more data to train on, the better.
# The fill is limited to 'Medal' so a missing value in any other column can never be
# overwritten with the string 'DNW'. A new frame is bound instead of filling in place.
olympic_data_df = olympic_data_df.fillna({'Medal': 'DNW'})

In [27]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW


In [28]:
# Return first 5 rows of coordinates_df
coordinates_df.head()

Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
1,AL,41.153332,20.168331,Albania,CO,39.550051,-105.782067,Colorado
2,DZ,28.033886,1.659626,Algeria,,,,
3,AS,-14.270972,-170.132217,American Samoa,HI,19.898682,-155.665857,Hawaii
4,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska


In [29]:
# Drop the US-state columns; only the country-level columns are kept
coordinates_df = coordinates_df.drop(
    columns=['usa_state_code', 'usa_state_latitude', 'usa_state_longitude', 'usa_state']
)

In [30]:
# Return first 5 rows of coordinates_df
coordinates_df.head()

Unnamed: 0,country_code,latitude,longitude,country
0,AF,33.93911,67.709953,Afghanistan
1,AL,41.153332,20.168331,Albania
2,DZ,28.033886,1.659626,Algeria
3,AS,-14.270972,-170.132217,American Samoa
4,AD,42.546245,1.601554,Andorra


In [31]:
# Check for 'NaN'
coordinates_df.isna().sum()

country_code    2
latitude        1
longitude       1
country         0
dtype: int64

In [32]:
# Showing 'NaN' 'country_code'
coordinates_df.loc[coordinates_df['country_code'].isnull(),['latitude', 'longitude', 'country']].drop_duplicates()

Unnamed: 0,latitude,longitude,country
150,-22.95764,18.49041,Namibia
204,6.877,31.307,South Sudan


In [33]:
# Showing 'NaN' 'latitude' and 'longitude'
coordinates_df.loc[coordinates_df['latitude'].isnull(),['country_code', 'longitude', 'country']].drop_duplicates()

Unnamed: 0,country_code,longitude,country
228,UM,,U.S. Minor Outlying Islands


In [34]:
# Attach coordinates to every olympic row by matching 'Team' against 'country'.
# It is a left join, so teams without a matching country keep NaN coordinates.
olympic_coordinate_df = pd.merge(
    olympic_data_df,
    coordinates_df,
    how='left',
    left_on='Team',
    right_on='country',
)

In [35]:
# Return first 5 rows of olympic_coordinate_df
olympic_coordinate_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,country_code,latitude,longitude,country
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,CN,35.86166,104.195397,China
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,CN,35.86166,104.195397,China
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,DK,56.26392,9.501785,Denmark
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,DK,56.26392,9.501785,Denmark
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,NL,52.132633,5.291266,Netherlands


In [36]:
# Checking 'NaN' 'country' after the merge
olympic_coordinate_df.loc[olympic_coordinate_df['country'].isnull(),['Team', 'NOC']].drop_duplicates()

Unnamed: 0,Team,NOC
3165,Individual Olympic Athletes,IOA
6267,Refugee Olympic Athletes,ROT
61080,Unknown,UNK


In [37]:
# Drop the lookup columns brought in by the coordinates merge
olympic_coordinate_df = olympic_coordinate_df.drop(columns=['country_code', 'country'])

In [38]:
# Return first 5 rows of olympic_coordinate_df
olympic_coordinate_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266


In [39]:
# Return first 5 rows of hosts_df
hosts_df.head()

Unnamed: 0,Type,GamesUrl,Disciplines,DisciplinesList,Country,Date,Athletes,Countries,Events,City,Year,Host
0,Summer,https://www.olympic.org/athens-1896,10,"['Athletics', 'Cycling Road', 'Cycling Track',...",Greece,06 Apr - 15 Apr,241.0,14.0,43.0,Athens,1896,1
1,Summer,https://www.olympic.org/paris-1900,20,"['Archery', 'Athletics', 'Basque Pelota', 'Cri...",France,14 May - 28 Oct,997.0,24.0,95.0,Paris,1900,1
2,Summer,https://www.olympic.org/st-louis-1904,19,"['Archery', 'Athletics', 'Basketball', 'Boxing...",USA,01 Jul - 23 Nov,651.0,12.0,95.0,St Louis,1904,1
3,Summer,https://www.olympic.org/london-1908,25,"['Archery', 'Athletics', 'Boxing', 'Cycling Tr...",UK,27 Apr - 31 Oct,2008.0,22.0,110.0,London,1908,1
4,Summer,https://www.olympic.org/stockholm-1912,18,"['Athletics', 'Cycling Road', 'Diving', 'Eques...",Sweden,05 May - 27 Jul,2407.0,28.0,102.0,Stockholm,1912,1


In [40]:
# Drop the host columns that are not needed
hosts_df = hosts_df.drop(columns=['GamesUrl', 'Date', 'Events', 'Countries', 'City'])

In [41]:
# Return first 5 rows of hosts_df
hosts_df.head()

Unnamed: 0,Type,Disciplines,DisciplinesList,Country,Athletes,Year,Host
0,Summer,10,"['Athletics', 'Cycling Road', 'Cycling Track',...",Greece,241.0,1896,1
1,Summer,20,"['Archery', 'Athletics', 'Basque Pelota', 'Cri...",France,997.0,1900,1
2,Summer,19,"['Archery', 'Athletics', 'Basketball', 'Boxing...",USA,651.0,1904,1
3,Summer,25,"['Archery', 'Athletics', 'Boxing', 'Cycling Tr...",UK,2008.0,1908,1
4,Summer,18,"['Athletics', 'Cycling Road', 'Diving', 'Eques...",Sweden,2407.0,1912,1


In [42]:
# Attach the host record to olympic rows where year, season and team all match
# the host's year, type and country. Left join: every other row gets NaN host columns.
olympic_coordinate_host_df = pd.merge(
    olympic_coordinate_df,
    hosts_df,
    how='left',
    left_on=['Year', 'Season', 'Team'],
    right_on=['Year', 'Type', 'Country'],
)

In [43]:
# Return first 5 rows of olympic_coordinate_host_df
olympic_coordinate_host_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude,Type,Disciplines,DisciplinesList,Country,Athletes,Host
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397,,,,,,
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397,,,,,,
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785,,,,,,
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785,,,,,,
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266,,,,,,


In [44]:
# Drop the right-hand join keys left over from the hosts merge
olympic_coordinate_host_df = olympic_coordinate_host_df.drop(columns=['Type', 'Country'])

In [45]:
# NEW
# This code will remove the [] from the DisciplinesList column. Leave this commented out for now as it may interfere with the ML model
# olympic_coordinate_host_df['DisciplinesList'] = olympic_coordinate_host_df['DisciplinesList'].str.strip('[]')

In [46]:
# Rows that matched no host record have 'NaN' in 'Host'; set those to 0
# (1 means host and 0 means not host). Only the 'Host' column is filled.
values = dict(Host=0)
olympic_coordinate_host_df = olympic_coordinate_host_df.fillna(values)

In [47]:
# Check for 'NaN' values
olympic_coordinate_host_df.isna().sum()

Team                    0
NOC                     0
Year                    0
Season                  0
Event                   0
Medal                   0
latitude              108
longitude             108
Disciplines        250424
DisciplinesList    250424
Athletes           250424
Host                    0
dtype: int64

In [48]:
# Return first 5 rows of gdp_df
gdp_df.head()

Unnamed: 0,geo,name,time,Income per person,GDP total
0,afg,Afghanistan,1896,1014,4782510168
1,afg,Afghanistan,1897,1033,4893829692
2,afg,Afghanistan,1898,1051,5006665488
3,afg,Afghanistan,1899,1069,5122735743
4,afg,Afghanistan,1900,1088,5255457328


In [49]:
# Attach the GDP figures for the same year and country name.
# Left join: rows with no GDP record for that (year, team) get NaN.
olympic_coordinate_host_gdp_df = pd.merge(
    olympic_coordinate_host_df,
    gdp_df,
    how='left',
    left_on=['Year', 'Team'],
    right_on=['time', 'name'],
)

In [50]:
# Return first 5 rows of olympic_coordinate_host_gdp_df
olympic_coordinate_host_gdp_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude,Disciplines,DisciplinesList,Athletes,Host,geo,name,time,Income per person,GDP total
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397,,,,0.0,chn,China,1992.0,1851.0,2234003000000.0
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397,,,,0.0,chn,China,2012.0,11115.0,15385610000000.0
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785,,,,0.0,dnk,Denmark,1920.0,7173.0,23253480000.0
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785,,,,0.0,dnk,Denmark,1900.0,5420.0,13905540000.0
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266,,,,0.0,nld,Netherlands,1988.0,29947.0,442443700000.0


In [51]:
# Checking which 'name' rows contain 'NaN'
olympic_coordinate_host_gdp_df.loc[olympic_coordinate_host_gdp_df['name'].isnull(),
                                      ['Team']].drop_duplicates()

Unnamed: 0,Team
734,"Virgin Islands, US"
1162,Puerto Rico
1858,Netherlands Antilles
1891,American Samoa
2757,Bermuda
3165,Individual Olympic Athletes
4499,"Virgin Islands, British"
4775,Guam
4925,Cayman Islands
6267,Refugee Olympic Athletes


In [52]:
# Drop the GDP table's key columns; they duplicate 'Year'/'Team'
olympic_coordinate_host_gdp_df = olympic_coordinate_host_gdp_df.drop(columns=['geo', 'name', 'time'])

In [53]:
# Return first 5 rows of olympic_coordinate_host_gdp_df
olympic_coordinate_host_gdp_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude,Disciplines,DisciplinesList,Athletes,Host,Income per person,GDP total
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397,,,,0.0,1851.0,2234003000000.0
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397,,,,0.0,11115.0,15385610000000.0
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785,,,,0.0,7173.0,23253480000.0
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785,,,,0.0,5420.0,13905540000.0
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266,,,,0.0,29947.0,442443700000.0


In [54]:
# Return first 5 rows of population_df
population_df.head()

Unnamed: 0,geo,name,time,Population
0,afg,Afghanistan,1896,4714779
1,afg,Afghanistan,1897,4738246
2,afg,Afghanistan,1898,4761826
3,afg,Afghanistan,1899,4793171
4,afg,Afghanistan,1900,4832414


In [55]:
# Attach the population for the same year and country name.
# Left join: rows with no population record for that (year, team) get NaN.
olympic_coordinate_host_gdp_pop_df = pd.merge(
    olympic_coordinate_host_gdp_df,
    population_df,
    how='left',
    left_on=['Year', 'Team'],
    right_on=['time', 'name'],
)

In [56]:
# Return first 5 rows of olympic_coordinate_host_gdp_pop_df
olympic_coordinate_host_gdp_pop_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude,Disciplines,DisciplinesList,Athletes,Host,Income per person,GDP total,geo,name,time,Population
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397,,,,0.0,1851.0,2234003000000.0,chn,China,1992.0,1206711000.0
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397,,,,0.0,11115.0,15385610000000.0,chn,China,2012.0,1384206000.0
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785,,,,0.0,7173.0,23253480000.0,dnk,Denmark,1920.0,3241807.0
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785,,,,0.0,5420.0,13905540000.0,dnk,Denmark,1900.0,2565597.0
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266,,,,0.0,29947.0,442443700000.0,nld,Netherlands,1988.0,14774030.0


In [57]:
# Checking which 'name' rows contain 'NaN'
olympic_coordinate_host_gdp_pop_df.loc[olympic_coordinate_host_gdp_pop_df['name'].isnull(),
                                      ['Team']].drop_duplicates()

Unnamed: 0,Team
734,"Virgin Islands, US"
1162,Puerto Rico
1858,Netherlands Antilles
1891,American Samoa
2757,Bermuda
3165,Individual Olympic Athletes
4499,"Virgin Islands, British"
4775,Guam
4925,Cayman Islands
6267,Refugee Olympic Athletes


In [58]:
# Drop the population table's key columns; they duplicate 'Year'/'Team'
olympic_coordinate_host_gdp_pop_df = olympic_coordinate_host_gdp_pop_df.drop(columns=['geo', 'name', 'time'])

In [59]:
# Creating the 'GDP per capita' column.
# A plain division is all that is needed: a NaN in 'GDP total' or 'Population'
# already produces NaN in the result. No explicit NaN guard is used because an
# equality test against np.nan is never True (NaN does not compare equal to anything).
olympic_coordinate_host_gdp_pop_df['GDP per capita'] = (
    olympic_coordinate_host_gdp_pop_df['GDP total']
    / olympic_coordinate_host_gdp_pop_df['Population']
)

In [60]:
# Return first 5 rows of olympic_coordinate_host_gdp_pop_df
olympic_coordinate_host_gdp_pop_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude,Disciplines,DisciplinesList,Athletes,Host,Income per person,GDP total,Population,GDP per capita
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397,,,,0.0,1851.0,2234003000000.0,1206711000.0,1851.315492
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397,,,,0.0,11115.0,15385610000000.0,1384206000.0,11115.110675
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785,,,,0.0,7173.0,23253480000.0,3241807.0,7173.0
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785,,,,0.0,5420.0,13905540000.0,2565597.0,5420.0
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266,,,,0.0,29947.0,442443700000.0,14774030.0,29947.389734


In [61]:
# 'Medal Won' flag: 0 for rows marked 'DNW', 1 for every other value of 'Medal'
olympic_coordinate_host_gdp_pop_df['Medal Won'] = (
    olympic_coordinate_host_gdp_pop_df['Medal'] != 'DNW'
).astype(int)

In [62]:
olympic_coordinate_host_gdp_pop_df.head()

Unnamed: 0,Team,NOC,Year,Season,Event,Medal,latitude,longitude,Disciplines,DisciplinesList,Athletes,Host,Income per person,GDP total,Population,GDP per capita,Medal Won
0,China,CHN,1992,Summer,Basketball Men's Basketball,DNW,35.86166,104.195397,,,,0.0,1851.0,2234003000000.0,1206711000.0,1851.315492,0
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,DNW,35.86166,104.195397,,,,0.0,11115.0,15385610000000.0,1384206000.0,11115.110675,0
2,Denmark,DEN,1920,Summer,Football Men's Football,DNW,56.26392,9.501785,,,,0.0,7173.0,23253480000.0,3241807.0,7173.0,0
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold,56.26392,9.501785,,,,0.0,5420.0,13905540000.0,2565597.0,5420.0,1
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,DNW,52.132633,5.291266,,,,0.0,29947.0,442443700000.0,14774030.0,29947.389734,0


In [63]:
# Sum 'Medal Won' per (Team, Year, Season, Event) with one column per medal type.
# More than one medal of the same type for one team in one event is taken as the
# sign of a team event. The 'DNW' column is dropped since it carries no medals.
identify_team_events = (
    olympic_coordinate_host_gdp_pop_df
    .pivot_table(index=['Team', 'Year', 'Season', 'Event'],
                 columns='Medal',
                 values='Medal Won',
                 aggfunc='sum',
                 fill_value=0)
    .drop(columns='DNW')
    .reset_index()
)

# Rows where a single team collected more than one medal of the given type
identify_team_events_gold = identify_team_events[identify_team_events['Gold'].gt(1)]
identify_team_events_silver = identify_team_events[identify_team_events['Silver'].gt(1)]
identify_team_events_bronze = identify_team_events[identify_team_events['Bronze'].gt(1)]

# Distinct event names behind each of those selections
team_sports_gold = identify_team_events_gold['Event'].unique()
team_sports_silver = identify_team_events_silver['Event'].unique()
team_sports_bronze = identify_team_events_bronze['Event'].unique()

In [64]:
# Stack the bronze, gold and silver event-name arrays end to end (duplicates are kept)
team_sports_total = np.concatenate(
    (team_sports_bronze, team_sports_gold, team_sports_silver),
    axis=0,
)

In [65]:
len(team_sports_total)

672

In [66]:
team_sports_total

array(["Fencing Men's Foil, Team", "Rowing Men's Coxless Pairs",
       "Volleyball Men's Volleyball", "Tennis Men's Doubles",
       "Sailing Men's Two Person Dinghy", "Hockey Women's Hockey",
       'Sailing Mixed Multihull', "Tennis Women's Doubles",
       "Basketball Men's Basketball", "Rowing Men's Coxed Eights",
       "Canoeing Men's Kayak Doubles, 10,000 metres",
       "Rowing Men's Double Sculls", 'Sailing Mixed 5.5 metres',
       "Swimming Men's 4 x 200 metres Freestyle Relay",
       "Hockey Men's Hockey",
       "Swimming Men's 4 x 100 metres Freestyle Relay",
       "Swimming Men's 4 x 100 metres Medley Relay",
       'Equestrianism Mixed Three-Day Event, Team',
       'Sailing Mixed Two Person Dinghy',
       "Canoeing Men's Kayak Doubles, 1,000 metres",
       "Rowing Women's Coxed Fours",
       "Cycling Men's Team Pursuit, 4,000 metres",
       "Canoeing Men's Kayak Fours, 1,000 metres",
       "Short Track Speed Skating Men's 5,000 metres Relay",
       "Basketball

In [71]:
# Event names to exclude from the detected team-event list
remove_sports = [
    "Gymnastics Women's Balance Beam",
    "Gymnastics Men's Horizontal Bar",
    "Swimming Women's 100 metres Freestyle",
    "Swimming Men's 50 metres Freestyle",
]

# De-duplicate the detected names and take out the excluded ones
team_sports = list(set(team_sports_total).difference(set(remove_sports)))

In [72]:
len(team_sports)

249

In [None]:
# If an event name matches one in team_sports, it is a team event. Others are singles events.
# .isin does the membership test in one vectorized call.
team_event_mask = olympic_coordinate_host_gdp_pop_df['Event'].isin(team_sports)
# Negating the Series keeps it a boolean, index-aligned Series, so it can be
# combined with other masks using & (a plain Python list is not a safe operand there).
single_event_mask = ~team_event_mask

# rows where 'Medal Won' is 1
medal_mask = olympic_coordinate_host_gdp_pop_df['Medal Won'] == 1

# Put 1 under team event if medal is won and event in team event list
olympic_coordinate_host_gdp_pop_df['Team_Event'] = np.where(team_event_mask & medal_mask, 1, 0)

# Put 1 under singles event if medal is won and event not in team event list
olympic_coordinate_host_gdp_pop_df['Single_Event'] = np.where(single_event_mask & medal_mask, 1, 0)

# Add an identifier for team/single event (1 when either flag is set, otherwise 0)
olympic_coordinate_host_gdp_pop_df['Event_Category'] = (
    olympic_coordinate_host_gdp_pop_df['Single_Event']
    + olympic_coordinate_host_gdp_pop_df['Team_Event']
)

In [None]:
# Total 'Medal Won' and 'Event_Category' per (Year, Team, Event, Medal)
medal_tally_agnostic = (
    olympic_coordinate_host_gdp_pop_df
    .groupby(['Year', 'Team', 'Event', 'Medal'])[['Medal Won', 'Event_Category']]
    .sum()
    .reset_index()
)

# NOTE(review): 'Event_Category' is Single_Event + Team_Event, which is 1 exactly on
# rows where 'Medal Won' is 1, so this ratio looks like it can only be 1 (or NaN
# from 0/0 on 'DNW' groups) - confirm that is the intended "corrected" count.
medal_tally_agnostic['Medal_Won_Corrected'] = medal_tally_agnostic['Medal Won'].div(
    medal_tally_agnostic['Event_Category']
)

In [None]:
medal_tally_agnostic.head()

In [None]:
# Drop duplicates to avoid things like adding 12 medals to a country that won basketball
# (one for each player) instead of just adding 1 (one for the whole team)
olympic_data_df = olympic_data_df.drop_duplicates(
    subset=['NOC', 'Year', 'Season', 'Event', 'Medal', 'Team']
)

In [None]:
# Summing medals by 'NOC', 'Year', 'Season', and 'Team'.
# Only Gold/Silver/Bronze rows count as medals; any other 'Medal' value (such as
# the 'DNW' placeholder) adds 0. A team with no medals in a given games therefore
# keeps its row with a 'Medal Count' of 0 instead of being counted as a winner.
olympic_data_df = (
    olympic_data_df
    .assign(**{'Medal Count': lambda frame: frame['Medal'].isin(['Gold', 'Silver', 'Bronze']).astype(int)})
    .groupby(['NOC', 'Year', 'Season', 'Team'])['Medal Count']
    .sum()
    .reset_index()
)

In [None]:
# # Export olympic_coordinate_host_gdp_merge to MS Excel - final.xlsx
# olympic_coordinate_new_host_gdp_df.to_excel('final.xlsx')

In [None]:
# # Drop missing values & Export olympic_coordinate_new_host_gdp_df to MS Excel - tableau_final.xlsx
# olympic_coordinate_new_host_gdp_df.dropna().to_excel('tableau_final.xlsx')