In [4]:
# Import dependencies
from pymongo import MongoClient
from password import connection_string
import pandas as pd
import numpy as np

## Working with MongoDB

In [5]:
# connect to MongoDB
myclient = MongoClient(connection_string)

# database
db = myclient['Olympics_Machine_Learning_Project_Data']

In [6]:
# Hosts collection
hosts_collection = db["hosts"]

# Take data off of MongoDB
hosts_df = pd.DataFrame(list(hosts_collection.find()))

In [7]:
# GDP collection
gdp_collection = db["gdp"]

# Take data off of MongoDB
gdp_df = pd.DataFrame(list(gdp_collection.find()))

In [8]:
# Population collection
population_collection = db["population"]

# Take data off of MongoDB
population_df = pd.DataFrame(list(population_collection.find()))

In [9]:
# Coordinates collection
coordinates_collection = db["coordinates"]

# Take data off of MongoDB
coordinates_df = pd.DataFrame(list(coordinates_collection.find()))

In [10]:
# NOC regions collection
noc_regions_collection = db["noc_regions"]

# Take data off of MongoDB
noc_regions_df = pd.DataFrame(list(noc_regions_collection.find()))

In [11]:
# Athlete events collection
athlete_events_collection = db["athlete_events"]

# Take data off of MongoDB
athlete_events_df = pd.DataFrame(list(athlete_events_collection.find()))

## Cleaning and Merging the Data

In [12]:
# Return first 5 rows of athlete_events_df
athlete_events_df.head()

Unnamed: 0,_id,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,6094127943a40af1a4be5bbd,15,Arvo Ossian Aaltonen,M,22.0,,,Finland,FIN,1912 Summer,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,
1,6094127943a40af1a4be5ba7,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,
2,6094127943a40af1a4be5bd9,20,Kjetil Andr Aamodt,M,22.0,176.0,85.0,Norway,NOR,1994 Winter,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver
3,6094127943a40af1a4be5bee,24,Nils Egil Aaness,M,24.0,,,Norway,NOR,1960 Winter,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",
4,6094127943a40af1a4be5bfe,36,Stefan Remco Aartsen,M,21.0,194.0,78.0,Netherlands,NED,1996 Summer,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,


In [13]:
# Return first 5 rows of noc_regions_df
noc_regions_df.head()

Unnamed: 0,_id,NOC,region,notes
0,6094127543a40af1a4be5acc,BOL,Bolivia,
1,6094127543a40af1a4be5add,CHN,China,
2,6094127543a40af1a4be5aea,COD,Democratic Republic of the Congo,
3,6094127543a40af1a4be5aee,DOM,Dominican Republic,
4,6094127543a40af1a4be5b0e,IOA,Individual Olympic Athletes,Individual Olympic Athletes


In [14]:
# Merge the athlete_events_df with the noc_regions_df
olympic_data_df = athlete_events_df.merge(noc_regions_df, left_on = 'NOC', right_on = 'NOC', how = 'left')

In [15]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,_id_x,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,_id_y,region,notes
0,6094127943a40af1a4be5bbd,15,Arvo Ossian Aaltonen,M,22.0,,,Finland,FIN,1912 Summer,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,6094127543a40af1a4be5af7,Finland,
1,6094127943a40af1a4be5ba7,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,6094127543a40af1a4be5b8c,USA,
2,6094127943a40af1a4be5bd9,20,Kjetil Andr Aamodt,M,22.0,176.0,85.0,Norway,NOR,1994 Winter,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,6094127543a40af1a4be5b49,Norway,
3,6094127943a40af1a4be5bee,24,Nils Egil Aaness,M,24.0,,,Norway,NOR,1960 Winter,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,6094127543a40af1a4be5b49,Norway,
4,6094127943a40af1a4be5bfe,36,Stefan Remco Aartsen,M,21.0,194.0,78.0,Netherlands,NED,1996 Summer,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,6094127543a40af1a4be5b43,Netherlands,


In [16]:
# Check for 'NaN'
olympic_data_df.isna().sum()

_id_x          0
ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
_id_y        349
region       370
notes     266077
dtype: int64

In [17]:
# Showing rows where 'region is 'NaN'
olympic_data_df.loc[olympic_data_df['region'].isnull(),['NOC', 'Team']].drop_duplicates()

Unnamed: 0,NOC,Team
506,SGP,Singapore
6354,ROT,Refugee Olympic Athletes
44391,SGP,June Climene
61106,UNK,Unknown
64269,TUV,Tuvalu
82050,SGP,Rika II
108281,SGP,Singapore-2
235299,SGP,Singapore-1


In [18]:
# "Manually" fixing the values above
olympic_data_df['region'] = np.where(olympic_data_df['NOC']=='SGP', 'Singapore', olympic_data_df['region'])
olympic_data_df['region'] = np.where(olympic_data_df['NOC']=='ROT', 'Refugee Olympic Athletes', olympic_data_df['region'])
olympic_data_df['region'] = np.where(olympic_data_df['NOC']=='TUV', 'Tuvalu', olympic_data_df['region'])
olympic_data_df['region'] = np.where(olympic_data_df['NOC']=='UNK', 'Unknown', olympic_data_df['region'])

In [19]:
# Check for 'NaN'
olympic_data_df.isna().sum()

_id_x          0
ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
_id_y        349
region         0
notes     266077
dtype: int64

In [20]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,_id_x,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,_id_y,region,notes
0,6094127943a40af1a4be5bbd,15,Arvo Ossian Aaltonen,M,22.0,,,Finland,FIN,1912 Summer,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,6094127543a40af1a4be5af7,Finland,
1,6094127943a40af1a4be5ba7,6,Per Knut Aaland,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,6094127543a40af1a4be5b8c,USA,
2,6094127943a40af1a4be5bd9,20,Kjetil Andr Aamodt,M,22.0,176.0,85.0,Norway,NOR,1994 Winter,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,6094127543a40af1a4be5b49,Norway,
3,6094127943a40af1a4be5bee,24,Nils Egil Aaness,M,24.0,,,Norway,NOR,1960 Winter,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,6094127543a40af1a4be5b49,Norway,
4,6094127943a40af1a4be5bfe,36,Stefan Remco Aartsen,M,21.0,194.0,78.0,Netherlands,NED,1996 Summer,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,6094127543a40af1a4be5b43,Netherlands,


In [21]:
# Dropping uneccessary columns
olympic_data_df.drop(['_id_x', 'Name', 'Age', 'Height', 'Weight', 'Team', 'NOC',
                         'Games', '_id_y', 'notes'], axis = 1, inplace = True)

In [22]:
# Change column names
olympic_data_df.rename(columns = {'region': 'Team', 'City': 'Host City'}, inplace = True)

In [23]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands


In [24]:
# Creating copies of the olympic_data_df to help get the discipline and event count per team
discipline_helper_df = olympic_data_df
event_helper_df = olympic_data_df

In [25]:
# Adding a column to each of the helper DataFrames that only contains ones
discipline_helper_df['discipline helper'] = 1
event_helper_df['event helper'] = 1

In [26]:
# Dropping duplicates so we can get a unique count
discipline_helper_df = discipline_helper_df.drop_duplicates(['Year', 'Season', 'Team', 'Sport'])

In [27]:
# Dropping duplicates so we can get a unique count
event_helper_df = event_helper_df.drop_duplicates(['Year', 'Season', 'Team', 'Event'])

In [28]:
# Getting a count of the unique disciplines per team
discipline_helper_df = discipline_helper_df.groupby(['Year', 'Season', 'Team'])[['discipline helper']]\
.agg('sum').reset_index()

In [29]:
# Getting a count of the unique events per team
event_helper_df = event_helper_df.groupby(['Year', 'Season', 'Team'])[['event helper']].agg('sum').reset_index()

In [30]:
# Merging the olympic_data_df with the discipline_helper_df
olympic_data_df = olympic_data_df.merge(discipline_helper_df, left_on = ['Year', 'Season', 'Team'], 
                                        right_on = ['Year', 'Season', 'Team'], how = 'left')

In [31]:
# Merging the olympic_data_df with the event_helper_df
olympic_data_df = olympic_data_df.merge(event_helper_df, left_on = ['Year', 'Season', 'Team'], 
                                        right_on = ['Year', 'Season', 'Team'], how = 'left')

In [32]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team,discipline helper_x,event helper_x,discipline helper_y,event helper_y
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,1,1,10,49
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,1,1,12,56
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,1,1,10,48
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,1,1,6,17
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,1,1,19,94


In [33]:
# Dropping the helper columns
olympic_data_df.drop(['discipline helper_x', 'event helper_x'], axis = 1, inplace = True)

In [34]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team,discipline helper_y,event helper_y
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,10,49
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,12,56
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,6,17
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,19,94


In [35]:
# Renaming columns
olympic_data_df.rename(columns = {'discipline helper_y': 'Team Disciplines', 'event helper_y': 'Team Events'}, inplace = True)

In [36]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,10,49
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,12,56
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,6,17
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,19,94


In [37]:
# Dropping duplicates and getting the gender count per team
gender_count = olympic_data_df.drop_duplicates(['Year', 'Team', 'ID', 'Season']).groupby(['Team', 'Year', 'Season', 'Sex']).size()

In [38]:
# Making gender_count a DataFrame
gender_count_df = pd.DataFrame(gender_count)

In [39]:
# Return first 5 rows of gender_count_df
gender_count_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
Team,Year,Season,Sex,Unnamed: 4_level_1
Afghanistan,1936,Summer,M,15
Afghanistan,1948,Summer,M,25
Afghanistan,1956,Summer,M,12
Afghanistan,1960,Summer,M,12
Afghanistan,1964,Summer,M,8


In [40]:
# Merging the olympic_data_df with the gender_count_df
olympic_data_df = olympic_data_df.merge(gender_count_df, left_on = ['Team', 'Year', 'Season', 'Sex'], 
                                        right_on = ['Team', 'Year', 'Season', 'Sex'], how = 'left')

In [41]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,0
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,10,49,162
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,12,56,98
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,66
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,6,17,25
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,19,94,137


In [42]:
# Renaming columns
olympic_data_df.rename(columns = {0: 'Gender Count'}, inplace = True)

In [43]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Gender Count
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,10,49,162
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,12,56,98
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,66
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,6,17,25
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,19,94,137


In [44]:
# Creating a helper DataFrame and dropping the duplicates
helper_df = olympic_data_df.drop_duplicates(['Sex', 'Team', 'Season', 'Year'])

In [45]:
# Pivoting the helper_df to add the gender columns
helper_df = helper_df.pivot(index=['Year', 'Season', 'Team'], columns='Sex', values='Gender Count')

In [46]:
# Return first 5 rows of helper_df
helper_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sex,F,M
Year,Season,Team,Unnamed: 3_level_1,Unnamed: 4_level_1
1896,Summer,Australia,,1.0
1896,Summer,Austria,,3.0
1896,Summer,Denmark,,3.0
1896,Summer,France,,12.0
1896,Summer,Germany,,19.0


In [47]:
# Merging the olympic_data_df with the helper_df
olympic_data_df = olympic_data_df.merge(helper_df, left_on = ['Team', 'Year', 'Season'], 
                                        right_on = ['Team', 'Year', 'Season'], how = 'left')

In [48]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Sex,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Gender Count,F,M
0,15,M,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,10,49,162,2.0,162.0
1,6,M,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,12,56,98,50.0,98.0
2,20,M,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,66,21.0,66.0
3,24,M,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,6,17,25,4.0,25.0
4,36,M,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,19,94,137,102.0,137.0


In [49]:
# Dropping columns
olympic_data_df.drop(['Gender Count', 'Sex'], axis = 1, inplace = True)

In [50]:
# Renaming columns
olympic_data_df.rename(columns = {'F': 'Females', 'M': 'Males'}, inplace = True)

In [51]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,,Finland,10,49,2.0,162.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,,USA,12,56,50.0,98.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",,Norway,6,17,4.0,25.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,,Netherlands,19,94,102.0,137.0


In [52]:
# Find unique values in the medal column
olympic_data_df["Medal"].unique()

array([nan, 'Silver', 'Gold', 'Bronze'], dtype=object)

In [53]:
# Replace rows with 'NaN' for medal with 'DNW' (Did not win). This way, these rows won't be dropped if use the 'dropna'
# function later. Even though those athletes didn't win a medal, the more data to train on, the better.
olympic_data_df['Medal'].fillna('DNW', inplace=True)

In [54]:
# Return first 5 rows of olympic_data_df
olympic_data_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0


In [55]:
# Check for 'NaN'
olympic_data_df.isna().sum()

ID                      0
Year                    0
Season                  0
Host City               0
Sport                   0
Event                   0
Medal                   0
Team                    0
Team Disciplines        0
Team Events             0
Females             18861
Males                  72
dtype: int64

In [56]:
# Fill 'NaN' with zeros
olympic_data_df = olympic_data_df.fillna(0)

In [57]:
# Check for 'NaN'
olympic_data_df.isna().sum()

ID                  0
Year                0
Season              0
Host City           0
Sport               0
Event               0
Medal               0
Team                0
Team Disciplines    0
Team Events         0
Females             0
Males               0
dtype: int64

In [58]:
# Return first 5 rows of coordinates_df
coordinates_df.head()

Unnamed: 0,_id,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,6094127243a40af1a4be59e2,CI,7.539989,-5.54708,Ivory Coast,SC,33.836081,-81.163725,South Carolina
1,6094127243a40af1a4be59dc,IO,-6.343194,71.876519,British Indian Ocean Territory,,,,
2,6094127243a40af1a4be59e9,TD,15.454166,18.732207,Chad,,,,
3,6094127243a40af1a4be5a03,FK,-51.796253,-59.523613,Falkland Islands [Islas Malvinas],,,,
4,6094127243a40af1a4be5a0c,GM,13.443182,-15.310139,Gambia,,,,


In [59]:
# Dropping uneccessary columns
coordinates_df.drop(['usa_state_code', 'usa_state_latitude', 'usa_state_longitude', 'usa_state', '_id'], axis=1, inplace=True)

In [60]:
# Return first 5 rows of coordinates_df
coordinates_df.head()

Unnamed: 0,country_code,latitude,longitude,country
0,CI,7.539989,-5.54708,Ivory Coast
1,IO,-6.343194,71.876519,British Indian Ocean Territory
2,TD,15.454166,18.732207,Chad
3,FK,-51.796253,-59.523613,Falkland Islands [Islas Malvinas]
4,GM,13.443182,-15.310139,Gambia


In [61]:
# Check for 'NaN'
coordinates_df.isna().sum()

country_code    2
latitude        1
longitude       1
country         0
dtype: int64

In [62]:
# Showing 'NaN' 'country_code'
coordinates_df.loc[coordinates_df['country_code'].isnull(),['latitude', 'longitude', 'country']].drop_duplicates()

Unnamed: 0,latitude,longitude,country
19,6.877,31.307,South Sudan
157,-22.95764,18.49041,Namibia


In [63]:
# Showing 'NaN' 'latitude' and 'longitude'
coordinates_df.loc[coordinates_df['latitude'].isnull(),['country_code', 'longitude', 'country']].drop_duplicates()

Unnamed: 0,country_code,longitude,country
177,UM,,U.S. Minor Outlying Islands


In [64]:
# Merging olympic and coordinate data
olympic_coordinate_df = olympic_data_df.merge(coordinates_df, left_on = 'Team', right_on = 'country', how = 'left')

In [65]:
# Return first 5 rows of olympic_coordinate_df
olympic_coordinate_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males,country_code,latitude,longitude,country
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0,FI,61.92411,25.748151,Finland
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0,US,37.09024,-95.712891,USA
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0,NO,60.472024,8.468946,Norway
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0,NO,60.472024,8.468946,Norway
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0,NL,52.132633,5.291266,Netherlands


In [66]:
# Checking 'NaN' 'country' after the merge
olympic_coordinate_df.loc[olympic_coordinate_df['country'].isnull(),['Team']].drop_duplicates()

Unnamed: 0,Team
2316,Individual Olympic Athletes
6354,Refugee Olympic Athletes
61106,Unknown


In [67]:
# Dropping uneccessary columns
olympic_coordinate_df.drop(['country_code', 'country'], axis=1, inplace=True)

In [68]:
# Return first 5 rows of olympic_coordinate_df
olympic_coordinate_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males,latitude,longitude
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0,61.92411,25.748151
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0,37.09024,-95.712891
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0,60.472024,8.468946
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0,60.472024,8.468946
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0,52.132633,5.291266


In [69]:
# Return first 5 rows of hosts_df
hosts_df.head()

Unnamed: 0,_id,Type,GamesUrl,Disciplines,DisciplinesList,Country,Date,Athletes,Countries,Events,City,Year,Host
0,6094126943a40af1a4bd4d5e,Summer,https://www.olympic.org/melbourne-stockholm-1956,23,"['Athletics', 'Basketball', 'Boxing', 'Canoe S...","Australia, Sweden",22 Nov - 08 Dec,3314.0,72.0,151.0,Melbourne - Stockholm,1956,1
1,6094126943a40af1a4bd4d63,Summer,https://www.olympic.org/montreal-1976,27,"['Archery', 'Athletics', 'Basketball', 'Boxing...",Canada,17 Jul - 01 Aug,6084.0,92.0,198.0,Montreal,1976,1
2,6094126943a40af1a4bd4d6c,Summer,https://www.olympic.org/london-2012,36,"['Archery', 'Athletics', 'Badminton', 'Basketb...",UK,27 Jul - 12 Aug,10568.0,204.0,302.0,London,2012,1
3,6094126943a40af1a4bd4d80,Winter,https://www.olympic.org/albertville-1992,12,"['Alpine Skiing', 'Biathlon', 'Bobsleigh', 'Cr...",France,08 Feb - 23 Feb,1801.0,64.0,57.0,Albertville,1992,1
4,6094126943a40af1a4bd4d54,Summer,https://www.olympic.org/st-louis-1904,19,"['Archery', 'Athletics', 'Basketball', 'Boxing...",USA,01 Jul - 23 Nov,651.0,12.0,95.0,St Louis,1904,1


In [70]:
# Dropping uneccessary columns
hosts_df.drop(['GamesUrl', 'Date', 'City', '_id', 'DisciplinesList'], axis=1, inplace=True)

In [71]:
# Return first 5 rows of hosts_df
hosts_df.head()

Unnamed: 0,Type,Disciplines,Country,Athletes,Countries,Events,Year,Host
0,Summer,23,"Australia, Sweden",3314.0,72.0,151.0,1956,1
1,Summer,27,Canada,6084.0,92.0,198.0,1976,1
2,Summer,36,UK,10568.0,204.0,302.0,2012,1
3,Winter,12,France,1801.0,64.0,57.0,1992,1
4,Summer,19,USA,651.0,12.0,95.0,1904,1


In [72]:
# Checking for 'NaN'. Those five are from years 2020 and beyond so we don't need to worry about them.
hosts_df.isna().sum()

Type           0
Disciplines    0
Country        0
Athletes       5
Countries      5
Events         5
Year           0
Host           0
dtype: int64

In [73]:
# Merging the olympic + coordinate data with certain columns of the host data
olympic_coordinate_host_df = olympic_coordinate_df.merge(hosts_df[['Host', 'Year', 'Type', 'Country']],
                                                         left_on = ['Year', 'Season', 'Team'], 
                                                         right_on = ['Year', 'Type', 'Country'], how = 'left')

In [74]:
# Return first 5 rows of olympic_coordinate_host_df
olympic_coordinate_host_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males,latitude,longitude,Host,Type,Country
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0,61.92411,25.748151,,,
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0,37.09024,-95.712891,,,
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0,60.472024,8.468946,1.0,Winter,Norway
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0,60.472024,8.468946,,,
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0,52.132633,5.291266,,,


In [75]:
# Dropping uneccessary columns
olympic_coordinate_host_df.drop(['Type', 'Country'], axis=1, inplace=True)

In [76]:
# Merging the olympic + coordinate data with certain columns of the host data
olympic_coordinate_host_df = olympic_coordinate_host_df.merge(hosts_df[['Year', 'Type', 'Disciplines', 'Athletes', 'Countries', 'Events']],
                                                         left_on = ['Year', 'Season'], 
                                                         right_on = ['Year', 'Type'], how = 'left')

In [77]:
# Return first 5 rows of olympic_coordinate_host_df
olympic_coordinate_host_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males,latitude,longitude,Host,Type,Disciplines,Athletes,Countries,Events
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0,61.92411,25.748151,,Summer,18.0,2407.0,28.0,102.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0,37.09024,-95.712891,,Winter,12.0,1801.0,64.0,57.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0,60.472024,8.468946,1.0,Winter,12.0,1737.0,67.0,61.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0,60.472024,8.468946,,Winter,8.0,665.0,30.0,27.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0,52.132633,5.291266,,Summer,37.0,10318.0,197.0,271.0


In [78]:
# Dropping uneccessary columns
olympic_coordinate_host_df.drop(['Type'], axis=1, inplace=True)

In [79]:
# Replacing 'NaN' values in the host column with 0 (1 means host and 0 means not host)
values = {'Host':0}
olympic_coordinate_host_df = olympic_coordinate_host_df.fillna(value=values)

In [80]:
# Check for 'NaN' values
olympic_coordinate_host_df.isna().sum()

ID                     0
Year                   0
Season                 0
Host City              0
Sport                  0
Event                  0
Medal                  0
Team                   0
Team Disciplines       0
Team Events            0
Females                0
Males                  0
latitude             108
longitude            108
Host                   0
Disciplines         1733
Athletes            1733
Countries           1733
Events              1733
dtype: int64

In [81]:
# Changing floats in the 'Host' column to int
olympic_coordinate_host_df['Host'] = olympic_coordinate_host_df['Host'].astype(int)

In [82]:
# Return first 5 rows of olympic_coordinate_host_df
olympic_coordinate_host_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males,latitude,longitude,Host,Disciplines,Athletes,Countries,Events
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0,61.92411,25.748151,0,18.0,2407.0,28.0,102.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0,37.09024,-95.712891,0,12.0,1801.0,64.0,57.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0,60.472024,8.468946,1,12.0,1737.0,67.0,61.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0,60.472024,8.468946,0,8.0,665.0,30.0,27.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0,52.132633,5.291266,0,37.0,10318.0,197.0,271.0


In [83]:
# Return first 5 rows of gdp_df
gdp_df.head()

Unnamed: 0,_id,geo,name,time,Income per person,GDP total
0,6094126c43a40af1a4bd4d98,afg,Afghanistan,1910,1277,7254447258
1,6094126c43a40af1a4bd4d99,afg,Afghanistan,1911,1297,7751814270
2,6094126c43a40af1a4bd4da6,afg,Afghanistan,1924,1599,14719115468
3,6094126c43a40af1a4bd4dc5,afg,Afghanistan,1955,2579,21330888368
4,6094126c43a40af1a4bd4de6,afg,Afghanistan,1988,2140,24858545881


In [84]:
# Merging the olympic + coordinate + host data with the gdp data
olympic_coordinate_host_gdp_df = olympic_coordinate_host_df.merge(gdp_df, left_on = ['Year', 'Team'], 
                                                               right_on = ['time', 'name'], how = 'left')

In [85]:
# Return first 5 rows of olympic_coordinate_host_gdp_df
olympic_coordinate_host_gdp_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,...,Disciplines,Athletes,Countries,Events,_id,geo,name,time,Income per person,GDP total
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,...,18.0,2407.0,28.0,102.0,6094126c43a40af1a4bd6e74,fin,Finland,1912.0,3274.0,9707610000.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,...,12.0,1801.0,64.0,57.0,6094126c43a40af1a4bdb6b3,usa,USA,1992.0,37045.0,9520302000000.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,...,12.0,1737.0,67.0,61.0,6094126c43a40af1a4bd966c,nor,Norway,1994.0,48306.0,209725200000.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,...,8.0,665.0,30.0,27.0,6094126c43a40af1a4bd964a,nor,Norway,1960.0,21705.0,77758340000.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,...,37.0,10318.0,197.0,271.0,6094126c43a40af1a4bd9399,nld,Netherlands,1996.0,36053.0,561108600000.0


In [86]:
# Checking which 'name' rows contain 'NaN'
olympic_coordinate_host_gdp_df.loc[olympic_coordinate_host_gdp_df['name'].isnull(),
                                      ['Team']].drop_duplicates()

Unnamed: 0,Team
581,"Virgin Islands, US"
1046,Puerto Rico
1226,American Samoa
1508,Netherlands Antilles
2102,Bermuda
2316,Individual Olympic Athletes
4860,Guam
5086,"Virgin Islands, British"
5111,Cayman Islands
6354,Refugee Olympic Athletes


In [87]:
# Dropping uneccessary columns
olympic_coordinate_host_gdp_df.drop(['geo', 'name', 'time', '_id', 'Income per person'], axis=1, inplace=True)

In [88]:
# Return first 5 rows of olympic_coordinate_host_gdp_df
olympic_coordinate_host_gdp_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,Females,Males,latitude,longitude,Host,Disciplines,Athletes,Countries,Events,GDP total
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,2.0,162.0,61.92411,25.748151,0,18.0,2407.0,28.0,102.0,9707610000.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,50.0,98.0,37.09024,-95.712891,0,12.0,1801.0,64.0,57.0,9520302000000.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,21.0,66.0,60.472024,8.468946,1,12.0,1737.0,67.0,61.0,209725200000.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,4.0,25.0,60.472024,8.468946,0,8.0,665.0,30.0,27.0,77758340000.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,102.0,137.0,52.132633,5.291266,0,37.0,10318.0,197.0,271.0,561108600000.0


In [89]:
# Return first 5 rows of population_df
population_df.head()

Unnamed: 0,_id,geo,name,time,Population
0,6094126f43a40af1a4bdbbfd,afg,Afghanistan,1896,4714779
1,6094126f43a40af1a4bdbc02,afg,Afghanistan,1901,4879685
2,6094126f43a40af1a4bdbc08,afg,Afghanistan,1907,5262120
3,6094126f43a40af1a4bdbc16,afg,Afghanistan,1921,10531818
4,6094126f43a40af1a4bdbc33,afg,Afghanistan,1950,7752117


In [90]:
# Merging the olympic + coordinate + host + gdp data with the population data
olympic_coordinate_host_gdp_pop_df = olympic_coordinate_host_gdp_df.merge(population_df, left_on = ['Year', 'Team'], 
                                                               right_on = ['time', 'name'], how = 'left')

In [91]:
# Return first 5 rows of olympic_coordinate_host_gdp_pop_df
olympic_coordinate_host_gdp_pop_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,...,Disciplines,Athletes,Countries,Events,GDP total,_id,geo,name,time,Population
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,...,18.0,2407.0,28.0,102.0,9707610000.0,6094126f43a40af1a4bdea7f,fin,Finland,1912.0,2965061.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,...,12.0,1801.0,64.0,57.0,9520302000000.0,6094126f43a40af1a4be521c,usa,USA,1992.0,256990608.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,...,12.0,1737.0,67.0,61.0,209725200000.0,6094126f43a40af1a4be2479,nor,Norway,1994.0,4341616.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,...,8.0,665.0,30.0,27.0,77758340000.0,6094126f43a40af1a4be2457,nor,Norway,1960.0,3582508.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,...,37.0,10318.0,197.0,271.0,561108600000.0,6094126f43a40af1a4be207a,nld,Netherlands,1996.0,15563252.0


In [92]:
# Checking which 'name' rows contain 'NaN'
olympic_coordinate_host_gdp_pop_df.loc[olympic_coordinate_host_gdp_pop_df['name'].isnull(),
                                      ['Team']].drop_duplicates()

Unnamed: 0,Team
581,"Virgin Islands, US"
1046,Puerto Rico
1226,American Samoa
1508,Netherlands Antilles
2102,Bermuda
2316,Individual Olympic Athletes
4860,Guam
5086,"Virgin Islands, British"
5111,Cayman Islands
6354,Refugee Olympic Athletes


In [93]:
# Dropping uneccessary columns
olympic_coordinate_host_gdp_pop_df.drop(['geo', 'name', 'time', '_id'], axis=1, inplace=True)

In [94]:
# Return first 5 rows of olympic_coordinate_host_gdp_pop_df
olympic_coordinate_host_gdp_pop_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,...,Males,latitude,longitude,Host,Disciplines,Athletes,Countries,Events,GDP total,Population
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,...,162.0,61.92411,25.748151,0,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,...,98.0,37.09024,-95.712891,0,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,...,66.0,60.472024,8.468946,1,12.0,1737.0,67.0,61.0,209725200000.0,4341616.0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,...,25.0,60.472024,8.468946,0,8.0,665.0,30.0,27.0,77758340000.0,3582508.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,...,137.0,52.132633,5.291266,0,37.0,10318.0,197.0,271.0,561108600000.0,15563252.0


In [95]:
# Creating the 'GDP per capita' column
olympic_coordinate_host_gdp_pop_df['GDP per capita'] = np.where(olympic_coordinate_host_gdp_pop_df['Population'] == np.nan, np.nan,
                    olympic_coordinate_host_gdp_pop_df['GDP total']
                    /olympic_coordinate_host_gdp_pop_df['Population'])

In [96]:
# Return first 5 rows of olympic_coordinate_host_gdp_pop_df
olympic_coordinate_host_gdp_pop_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,...,latitude,longitude,Host,Disciplines,Athletes,Countries,Events,GDP total,Population,GDP per capita
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,...,61.92411,25.748151,0,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,...,37.09024,-95.712891,0,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,...,60.472024,8.468946,1,12.0,1737.0,67.0,61.0,209725200000.0,4341616.0,48305.780704
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,...,60.472024,8.468946,0,8.0,665.0,30.0,27.0,77758340000.0,3582508.0,21705.0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,...,52.132633,5.291266,0,37.0,10318.0,197.0,271.0,561108600000.0,15563252.0,36053.428086


In [97]:
# Check for 'NaN' values
olympic_coordinate_host_gdp_pop_df.isna().sum()

ID                     0
Year                   0
Season                 0
Host City              0
Sport                  0
Event                  0
Medal                  0
Team                   0
Team Disciplines       0
Team Events            0
Females                0
Males                  0
latitude             108
longitude            108
Host                   0
Disciplines         1733
Athletes            1733
Countries           1733
Events              1733
GDP total           2356
Population          1987
GDP per capita      2356
dtype: int64

In [98]:
# Checking the length of olympic_coordinate_host_gdp_pop_df
len(olympic_coordinate_host_gdp_pop_df)

271116

In [99]:
# Dropping duplicates from olympic_coordinate_host_gdp_pop_df and making a new DataFrame. We're not
# doing this to olympic_coordinate_host_gdp_pop_df because by dropping duplicate ID (an ID represents an athlete),
# we can no longer track athletes who won a medal in multiple events. However, we can now track the team size (just because
# an athlete competed in multiple events doesn't mean they count for more than 1 person in the team's size)
drop_duplicates_df = olympic_coordinate_host_gdp_pop_df.drop_duplicates(['Year', 'Team', 'ID', 'Season'])

In [100]:
# Creating a column of zeros called 'Team Size' in drop_duplicates_df
drop_duplicates_df['Team Size'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop_duplicates_df['Team Size'] = 0


In [101]:
# Getting the fisrt 5 rows of drop_duplicates_df
drop_duplicates_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,...,longitude,Host,Disciplines,Athletes,Countries,Events,GDP total,Population,GDP per capita,Team Size
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,...,25.748151,0,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,0
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,...,-95.712891,0,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,0
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,...,8.468946,1,12.0,1737.0,67.0,61.0,209725200000.0,4341616.0,48305.780704,0
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,...,8.468946,0,8.0,665.0,30.0,27.0,77758340000.0,3582508.0,21705.0,0
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,...,5.291266,0,37.0,10318.0,197.0,271.0,561108600000.0,15563252.0,36053.428086,0


In [102]:
# Getting the size of each team using the groupby and count functions
size_by_team = drop_duplicates_df.groupby(['Year', 'Team', 'Season']).count()['Team Size']

In [103]:
# Checking the data type of size_by_team
size_by_team.dtype

dtype('int64')

In [104]:
# Making size_by_team a DataFrame
size_by_team_df = pd.DataFrame(size_by_team)

In [105]:
# Merging the olympic_coordinate_host_gdp_pop_df with the size_by_team_df
final_df = olympic_coordinate_host_gdp_pop_df.merge(size_by_team_df, left_on = ['Year', 'Team', 'Season'], 
                                                    right_on = ['Year', 'Team', 'Season'], how = 'left')

In [106]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,ID,Year,Season,Host City,Sport,Event,Medal,Team,Team Disciplines,Team Events,...,longitude,Host,Disciplines,Athletes,Countries,Events,GDP total,Population,GDP per capita,Team Size
0,15,1912,Summer,Stockholm,Swimming,Swimming Men's 200 metres Breaststroke,DNW,Finland,10,49,...,25.748151,0,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164
1,6,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,DNW,USA,12,56,...,-95.712891,0,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148
2,20,1994,Winter,Lillehammer,Alpine Skiing,Alpine Skiing Men's Downhill,Silver,Norway,10,48,...,8.468946,1,12.0,1737.0,67.0,61.0,209725200000.0,4341616.0,48305.780704,87
3,24,1960,Winter,Squaw Valley,Speed Skating,"Speed Skating Men's 1,500 metres",DNW,Norway,6,17,...,8.468946,0,8.0,665.0,30.0,27.0,77758340000.0,3582508.0,21705.0,29
4,36,1996,Summer,Atlanta,Swimming,Swimming Men's 100 metres Butterfly,DNW,Netherlands,19,94,...,5.291266,0,37.0,10318.0,197.0,271.0,561108600000.0,15563252.0,36053.428086,239


In [107]:
# We no longer need the 'ID' clumn
final_df.drop('ID', axis=1, inplace=True)

In [108]:
# Drop duplicates to aovid things like adding 12 medals to a country that won basketball (one for each player)
# instead of just adding 1 (one for whole team)
final_df = final_df.drop_duplicates(['Year', 'Season', 'Event', 'Medal', 'Team'])

In [109]:
# We no longer need the 'Event' column
final_df.drop('Event', axis=1, inplace=True)

In [110]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Sport,Medal,Team,Team Disciplines,Team Events,Females,Males,...,longitude,Host,Disciplines,Athletes,Countries,Events,GDP total,Population,GDP per capita,Team Size
0,1912,Summer,Stockholm,Swimming,DNW,Finland,10,49,2.0,162.0,...,25.748151,0,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164
1,1992,Winter,Albertville,Cross Country Skiing,DNW,USA,12,56,50.0,98.0,...,-95.712891,0,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148
2,1994,Winter,Lillehammer,Alpine Skiing,Silver,Norway,10,48,21.0,66.0,...,8.468946,1,12.0,1737.0,67.0,61.0,209725200000.0,4341616.0,48305.780704,87
3,1960,Winter,Squaw Valley,Speed Skating,DNW,Norway,6,17,4.0,25.0,...,8.468946,0,8.0,665.0,30.0,27.0,77758340000.0,3582508.0,21705.0,29
4,1996,Summer,Atlanta,Swimming,DNW,Netherlands,19,94,102.0,137.0,...,5.291266,0,37.0,10318.0,197.0,271.0,561108600000.0,15563252.0,36053.428086,239


In [111]:
# Summing medals by 'Medal', 'Year', 'Season', and 'Team'. Doing this in a new DataFrame because otherwise we will lose
# the other columns
final2_df = pd.DataFrame({'Medal Count' : final_df.groupby( ['Year', 'Season', 'Team', 'Medal'] ).size()}).reset_index()

In [112]:
# Getting the first 5 rows of final2_df
final2_df.head()

Unnamed: 0,Year,Season,Team,Medal,Medal Count
0,1896,Summer,Australia,Bronze,1
1,1896,Summer,Australia,DNW,2
2,1896,Summer,Australia,Gold,2
3,1896,Summer,Austria,Bronze,2
4,1896,Summer,Austria,DNW,3


In [113]:
# Dropping rows with 'DNW' so these aren't summed
final2_df = final2_df[final2_df.Medal != 'DNW']

In [114]:
# Getting the first 5 rows of final2_df
final2_df.head()

Unnamed: 0,Year,Season,Team,Medal,Medal Count
0,1896,Summer,Australia,Bronze,1
2,1896,Summer,Australia,Gold,2
3,1896,Summer,Austria,Bronze,2
5,1896,Summer,Austria,Gold,2
6,1896,Summer,Austria,Silver,1


In [115]:
# Merging final_df and final2_df to merge the medal totals
final_df = final_df.merge(final2_df, left_on = ['Year', 'Team', 'Season'], 
                                                right_on = ['Year', 'Team', 'Season'], how = 'left')

In [116]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Sport,Medal_x,Team,Team Disciplines,Team Events,Females,Males,...,Disciplines,Athletes,Countries,Events,GDP total,Population,GDP per capita,Team Size,Medal_y,Medal Count
0,1912,Summer,Stockholm,Swimming,DNW,Finland,10,49,2.0,162.0,...,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,Bronze,9.0
1,1912,Summer,Stockholm,Swimming,DNW,Finland,10,49,2.0,162.0,...,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,Gold,9.0
2,1912,Summer,Stockholm,Swimming,DNW,Finland,10,49,2.0,162.0,...,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,Silver,8.0
3,1992,Winter,Albertville,Cross Country Skiing,DNW,USA,12,56,50.0,98.0,...,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,Bronze,2.0
4,1992,Winter,Albertville,Cross Country Skiing,DNW,USA,12,56,50.0,98.0,...,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,Gold,5.0


In [117]:
# Dropping uneccessary columns
final_df.drop(['Medal_x'], axis=1, inplace=True)

In [118]:
# Change column names
final_df.rename(columns = {'Medal_y': 'Medal Type'}, inplace = True)

In [119]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Sport,Team,Team Disciplines,Team Events,Females,Males,latitude,...,Disciplines,Athletes,Countries,Events,GDP total,Population,GDP per capita,Team Size,Medal Type,Medal Count
0,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,Bronze,9.0
1,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,Gold,9.0
2,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,18.0,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,Silver,8.0
3,1992,Winter,Albertville,Cross Country Skiing,USA,12,56,50.0,98.0,37.09024,...,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,Bronze,2.0
4,1992,Winter,Albertville,Cross Country Skiing,USA,12,56,50.0,98.0,37.09024,...,12.0,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,Gold,5.0


In [120]:
# Have to drop duplicates for some reason. Will figure this out later to cut down on confusion and steps but for now
# this way is fine
final_df = final_df.drop_duplicates(['Year', 'Season', 'Medal Type', 'Team'])

In [121]:
# Creating a DataFrame that pivots the 'Medal Type' to columns
pivot_df = final_df.pivot(index=['Team', 'Year', 'Season'], columns='Medal Type', values='Medal Count')

In [122]:
# Getting the first 5 rows of pivot_df
pivot_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Medal Type,NaN,Bronze,Gold,Silver
Team,Year,Season,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,1936,Summer,,,,
Afghanistan,1948,Summer,,,,
Afghanistan,1956,Summer,,,,
Afghanistan,1960,Summer,,,,
Afghanistan,1964,Summer,,,,


In [123]:
# Filling 'NaN' with zeros
pivot_df = pivot_df.fillna(0)

In [124]:
# Making pivot_df a DataFrame
pivot_df = pd.DataFrame(pivot_df)

In [125]:
# Merging final_df with pivot_df
final_df = final_df.merge(pivot_df, left_on = ['Year', 'Team', 'Season'], 
                                                right_on = ['Year', 'Team', 'Season'], how = 'left')

In [126]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Sport,Team,Team Disciplines,Team Events,Females,Males,latitude,...,GDP total,Population,GDP per capita,Team Size,Medal Type,Medal Count,NaN,Bronze,Gold,Silver
0,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,9707610000.0,2965061.0,3274.0,164,Bronze,9.0,0.0,9.0,9.0,8.0
1,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,9707610000.0,2965061.0,3274.0,164,Gold,9.0,0.0,9.0,9.0,8.0
2,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,9707610000.0,2965061.0,3274.0,164,Silver,8.0,0.0,9.0,9.0,8.0
3,1992,Winter,Albertville,Cross Country Skiing,USA,12,56,50.0,98.0,37.09024,...,9520302000000.0,256990608.0,37045.33012,148,Bronze,2.0,0.0,2.0,5.0,4.0
4,1992,Winter,Albertville,Cross Country Skiing,USA,12,56,50.0,98.0,37.09024,...,9520302000000.0,256990608.0,37045.33012,148,Gold,5.0,0.0,2.0,5.0,4.0


In [127]:
# Dropping columns
final_df.drop(['Medal Type', 'Medal Count', np.nan], axis=1, inplace=True)

In [128]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Sport,Team,Team Disciplines,Team Events,Females,Males,latitude,...,Athletes,Countries,Events,GDP total,Population,GDP per capita,Team Size,Bronze,Gold,Silver
0,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,9.0,9.0,8.0
1,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,9.0,9.0,8.0
2,1912,Summer,Stockholm,Swimming,Finland,10,49,2.0,162.0,61.92411,...,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,9.0,9.0,8.0
3,1992,Winter,Albertville,Cross Country Skiing,USA,12,56,50.0,98.0,37.09024,...,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,2.0,5.0,4.0
4,1992,Winter,Albertville,Cross Country Skiing,USA,12,56,50.0,98.0,37.09024,...,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,2.0,5.0,4.0


In [129]:
# Dropping columns
final_df.drop(['Sport'], axis=1, inplace=True)

In [130]:
# Dropping duplicate rows
final_df = final_df.drop_duplicates(['Host City', 'Year', 'Team', 'Season', 'Females', 'Males', 'latitude',
                                     'longitude', 'Host', 'Disciplines', 'Athletes',
                                     'Countries', 'Events', 'GDP total',
                                     'Population', 'GDP per capita', 'Team Size', 'Bronze', 'Silver', 'Gold', 'Team Events',
                                    'Team Disciplines'])

In [131]:
# Change column names
final_df.rename(columns = {'latitude': 'Latitude', 'longitude': 'Longitude', 'GDP total': 'GDP Total',
                          'GDP per capita': 'GDP per Capita', 'Disciplines': 'Total Disciplines',
                          'Athletes': 'Total Athletes', 'Countries': 'Total Countries',
                           'Events': 'Total Events'}, inplace = True)

In [132]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Team,Team Disciplines,Team Events,Females,Males,Latitude,Longitude,...,Total Athletes,Total Countries,Total Events,GDP Total,Population,GDP per Capita,Team Size,Bronze,Gold,Silver
0,1912,Summer,Stockholm,Finland,10,49,2.0,162.0,61.92411,25.748151,...,2407.0,28.0,102.0,9707610000.0,2965061.0,3274.0,164,9.0,9.0,8.0
3,1992,Winter,Albertville,USA,12,56,50.0,98.0,37.09024,-95.712891,...,1801.0,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,2.0,5.0,4.0
6,1994,Winter,Lillehammer,Norway,10,48,21.0,66.0,60.472024,8.468946,...,1737.0,67.0,61.0,209725200000.0,4341616.0,48305.780704,87,5.0,8.0,10.0
9,1960,Winter,Squaw Valley,Norway,6,17,4.0,25.0,60.472024,8.468946,...,665.0,30.0,27.0,77758340000.0,3582508.0,21705.0,29,0.0,3.0,3.0
11,1996,Summer,Atlanta,Netherlands,19,94,102.0,137.0,52.132633,5.291266,...,10318.0,197.0,271.0,561108600000.0,15563252.0,36053.428086,239,10.0,4.0,5.0


In [133]:
# Drop rows with 'NaN'
final_df = final_df.dropna()

In [134]:
# Creating a column that sums the three types of medals
final_df['Total Medals'] = final_df.iloc[:, 19:].sum(axis=1)

In [135]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Team,Team Disciplines,Team Events,Females,Males,Latitude,Longitude,...,Total Countries,Total Events,GDP Total,Population,GDP per Capita,Team Size,Bronze,Gold,Silver,Total Medals
0,1912,Summer,Stockholm,Finland,10,49,2.0,162.0,61.92411,25.748151,...,28.0,102.0,9707610000.0,2965061.0,3274.0,164,9.0,9.0,8.0,26.0
3,1992,Winter,Albertville,USA,12,56,50.0,98.0,37.09024,-95.712891,...,64.0,57.0,9520302000000.0,256990608.0,37045.33012,148,2.0,5.0,4.0,11.0
6,1994,Winter,Lillehammer,Norway,10,48,21.0,66.0,60.472024,8.468946,...,67.0,61.0,209725200000.0,4341616.0,48305.780704,87,5.0,8.0,10.0,23.0
9,1960,Winter,Squaw Valley,Norway,6,17,4.0,25.0,60.472024,8.468946,...,30.0,27.0,77758340000.0,3582508.0,21705.0,29,0.0,3.0,3.0,6.0
11,1996,Summer,Atlanta,Netherlands,19,94,102.0,137.0,52.132633,5.291266,...,197.0,271.0,561108600000.0,15563252.0,36053.428086,239,10.0,4.0,5.0,19.0


In [136]:
# Make a column to represent the seasons as a number to help with machine learning
final_df['Season (Binary)'] = np.where(final_df.loc[:,'Season'] == 'Summer', 1, 0)

In [137]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Year,Season,Host City,Team,Team Disciplines,Team Events,Females,Males,Latitude,Longitude,...,Total Events,GDP Total,Population,GDP per Capita,Team Size,Bronze,Gold,Silver,Total Medals,Season (Binary)
0,1912,Summer,Stockholm,Finland,10,49,2.0,162.0,61.92411,25.748151,...,102.0,9707610000.0,2965061.0,3274.0,164,9.0,9.0,8.0,26.0,1
3,1992,Winter,Albertville,USA,12,56,50.0,98.0,37.09024,-95.712891,...,57.0,9520302000000.0,256990608.0,37045.33012,148,2.0,5.0,4.0,11.0,0
6,1994,Winter,Lillehammer,Norway,10,48,21.0,66.0,60.472024,8.468946,...,61.0,209725200000.0,4341616.0,48305.780704,87,5.0,8.0,10.0,23.0,0
9,1960,Winter,Squaw Valley,Norway,6,17,4.0,25.0,60.472024,8.468946,...,27.0,77758340000.0,3582508.0,21705.0,29,0.0,3.0,3.0,6.0,0
11,1996,Summer,Atlanta,Netherlands,19,94,102.0,137.0,52.132633,5.291266,...,271.0,561108600000.0,15563252.0,36053.428086,239,10.0,4.0,5.0,19.0,1


In [138]:
# Check for 'NaN'
final_df.isna().sum()

Year                 0
Season               0
Host City            0
Team                 0
Team Disciplines     0
Team Events          0
Females              0
Males                0
Latitude             0
Longitude            0
Host                 0
Total Disciplines    0
Total Athletes       0
Total Countries      0
Total Events         0
GDP Total            0
Population           0
GDP per Capita       0
Team Size            0
Bronze               0
Gold                 0
Silver               0
Total Medals         0
Season (Binary)      0
dtype: int64

In [139]:
# Get the length of final_df
len(final_df)

3606

In [140]:
# Changing order of the columns
final_df = final_df[['Host City', 'Year', 'Team', 'Season', 'Season (Binary)', 'Latitude', 'Longitude', 'Host', 'GDP Total',
                     'Population', 'Total Disciplines', 'Males', 'Females', 'Total Athletes', 'Total Events', 'Total Countries', 
                     'GDP per Capita', 'Team Disciplines', 'Team Events', 'Team Size', 'Bronze', 'Silver', 'Gold', 'Total Medals']]

In [141]:
# Getting the first 5 rows of final_df
final_df.head()

Unnamed: 0,Host City,Year,Team,Season,Season (Binary),Latitude,Longitude,Host,GDP Total,Population,...,Total Events,Total Countries,GDP per Capita,Team Disciplines,Team Events,Team Size,Bronze,Silver,Gold,Total Medals
0,Stockholm,1912,Finland,Summer,1,61.92411,25.748151,0,9707610000.0,2965061.0,...,102.0,28.0,3274.0,10,49,164,9.0,8.0,9.0,26.0
3,Albertville,1992,USA,Winter,0,37.09024,-95.712891,0,9520302000000.0,256990608.0,...,57.0,64.0,37045.33012,12,56,148,2.0,4.0,5.0,11.0
6,Lillehammer,1994,Norway,Winter,0,60.472024,8.468946,1,209725200000.0,4341616.0,...,61.0,67.0,48305.780704,10,48,87,5.0,10.0,8.0,23.0
9,Squaw Valley,1960,Norway,Winter,0,60.472024,8.468946,0,77758340000.0,3582508.0,...,27.0,30.0,21705.0,6,17,29,0.0,3.0,3.0,6.0
11,Atlanta,1996,Netherlands,Summer,1,52.132633,5.291266,0,561108600000.0,15563252.0,...,271.0,197.0,36053.428086,19,94,239,10.0,5.0,4.0,19.0


In [None]:
# # Export final_df to MS Excel - tableau_final.xlsx
# final_df.to_excel('tableau_final.xlsx')

In [142]:
# # change format of data
# data = final_df.to_dict('records')

# # Create a collection
# final_df_collection = db["final_df"]

# # add data to MongoDB
# final_df_collection.insert_many(data)

<pymongo.results.InsertManyResult at 0x23e02fc7a00>

## Visualization

In [None]:
# Import dependencies
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Plot to compare two continuous variables
sns.jointplot(x="Latitude", y="Total Medals", data=final_df)

In [None]:
# Dropping rows with 'Winter' for the season so these aren't included in the graph
summer_df = final_df[final_df.Season != 'Winter']

In [None]:
# Dropping rows with 'Summer' for the season so these aren't included in the graph
winter_df = final_df[final_df.Season != 'Summer']

In [None]:
# Plot to compare two continuous variables with the summer_df
sns.jointplot(x="Latitude", y="Total Medals", data=summer_df)

In [None]:
# Plot to compare two continuous variables with the winter_df
sns.jointplot(x="Latitude", y="Total Medals", data=winter_df)

In [None]:
# Representing correlations between various features in the summer_df as a heatmap
corrmat = summer_df[['Latitude', 'Longitude', 'Host', 'GDP Total', 'Males', 'Females', 'Team Events', 'Team Disciplines',
                     'Population', 'GDP per Capita',
                     'Team Size', 'Total Medals']].corr()
f, ax = plt.subplots(figsize=(10,10))
# annot controls annotations, square=True outputs squares as correlation representing figures, cmap represents color map
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt=".2f", cmap="summer")

In [None]:
# Representing correlations between various features in the winter_df as a heatmap
corrmat = winter_df[['Latitude', 'Longitude', 'Host', 'GDP Total', 'Males', 'Females', 'Team Events', 'Team Disciplines',
                     'Population', 'GDP per Capita',
                     'Team Size', 'Total Medals']].corr()
f, ax = plt.subplots(figsize=(10,10))
# annot controls annotations, square=True outputs squares as correlation representing figures, cmap represents color map
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt=".2f", cmap="winter")

In [None]:
# Plot to compare two continuous variables with the final_df
sns.jointplot(x="Team Size", y="Total Medals", data=final_df)

In [None]:
# Plot to compare two continuous variables with the final_df
sns.jointplot(x="GDP Total", y="Total Medals", data=final_df)

In [None]:
# Plot to compare two continuous variables with the final_df
sns.jointplot(x="GDP per Capita", y="Total Medals", data=final_df)

In [None]:
# Import dependency
from scipy.stats import linregress

# Create a function to create perform linear regression on the data
# and plot a regression line and the equation with the data
def plot_linear_regression(x_values, y_values, title, y_label, text_coordinates):
    
    # Run regression on data
    (slope, intercept, r_value, p_value, std_err) = linregress(x_values, y_values)
    
    # Calculate the regression line "y values" from the slope and intercept
    regress_values = x_values * slope + intercept
    
    # Get the equation of the line
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    
    # Create a scatter plot and plot the regression line
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r")
    
    # Annotate the text for the line equation
#     plt.annotate(line_eq, text_coordinates, fontsize=15, color="red")
    plt.xlabel('Team Size')
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Assigning values to the axis
x_values = final_df['Team Size']
y_values = final_df['Total Medals']

# Giving the graph a title
plt.title('Team Size versus Total Medals')

# Plotting the data
plot_linear_regression(x_values, y_values, 'Linear Regression for Team Size versus Total Medals', 'Total Medals', (5, 33))

# # Save the figure
# plt.savefig("Figures/teamsize_totalmedals.png")