In [1]:
# Import Dependencies
from pymongo import MongoClient
from pymongo.errors import OperationFailure
from password import connection_string
import pandas as pd
import numpy as np

import seaborn
import matplotlib.pyplot as matplotlib
 
from matplotlib.lines import Line2D
from scipy.stats import pearsonr
 
# Seed for reproducible results. In the visible cells it is referenced only as
# `random_state` in the commented-out train/test split and random forest code.
rf_seed = 5

In [2]:
# Load the raw source files from the 'Data Sources' folder.
# CSV inputs
athlete_events = pd.read_csv('Data Sources/athlete_events.csv')
noc_regions = pd.read_csv('Data Sources/noc_regions.csv')
coordinates = pd.read_csv('Data Sources/coordinates.csv')
hosts = pd.read_csv('Data Sources/hosts.csv')
# Excel input
gdp = pd.read_excel('Data Sources/gdp.xlsx')

In [3]:
# Wrap the loaded athlete events data in its own DataFrame
athlete_events_df = pd.DataFrame(data=athlete_events)

In [4]:
# Wrap the loaded NOC-to-region lookup in its own DataFrame
noc_regions_df = pd.DataFrame(data=noc_regions)

In [5]:
# Wrap the loaded GDP data in its own DataFrame
gdp_df = pd.DataFrame(data=gdp)

In [6]:
# Wrap the loaded coordinates data in its own DataFrame
coordinates_df = pd.DataFrame(data=coordinates)

In [7]:
# Wrap the loaded hosts data in its own DataFrame
hosts_df = pd.DataFrame(data=hosts)

### Connecting to the Database

In [8]:
# # connect to MongoDB
# myclient = MongoClient(connection_string) 
   
# # database
# db = myclient["test_database"]
   
# # collection
# collection = db["test_data2"]
  
# # change format of data
# data = noc_regions_df.to_dict('records')

# # add data to MongoDB
# collection.insert_many(data)

In [9]:
# # databases that are on MongoDB
# for db in myclient.list_databases():
#     print(db)

In [10]:
# # take data off of MongoDB
# df = pd.DataFrame(list(collection.find()))

In [11]:
# df.head()

### Clean the Data

In [12]:
# Preview the first five rows of athlete_events_df
athlete_events_df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [13]:
# Preview the first five rows of noc_regions_df
noc_regions_df.head(n=5)

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,ALB,Albania,
2,ALG,Algeria,
3,ASA,American Samoa,
4,AND,Andorra,


In [14]:
# Left-join the region lookup onto the athlete events by NOC code,
# keeping every athlete row even when its NOC has no matching region
olympic_data_merge = pd.merge(athlete_events_df, noc_regions_df, on='NOC', how='left')

In [15]:
# Preview the first five rows of olympic_data_merge
olympic_data_merge.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,region,notes
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,Denmark,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,Denmark,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,,Netherlands,


In [16]:
# Count the missing ('NaN') values in each column of the merged data
olympic_data_merge.isna().sum(axis=0)

ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
region       370
notes     266077
dtype: int64

In [17]:
# List the distinct (NOC, Team) pairs whose 'NOC' found no matching 'region' in the merge
no_region = olympic_data_merge['region'].isnull()
olympic_data_merge.loc[no_region, ['NOC', 'Team']].drop_duplicates()

Unnamed: 0,NOC,Team
578,SGP,Singapore
6267,ROT,Refugee Olympic Athletes
44376,SGP,June Climene
61080,UNK,Unknown
64674,TUV,Tuvalu
80986,SGP,Rika II
108582,SGP,Singapore-2
235895,SGP,Singapore-1


In [18]:
# "Manually" fixing the values above
# Region names for the NOC codes that had no match in the region lookup
region_fixes = {
    'SGP': 'Singapore',
    'ROT': 'Refugee Olympic Athletes',
    'TUV': 'Tuvalu',
    'UNK': 'Unknown',
}
# Overwrite 'region' on every row carrying one of those NOC codes
for noc_code, region_name in region_fixes.items():
    olympic_data_merge.loc[olympic_data_merge['NOC'] == noc_code, 'region'] = region_name

In [19]:
# Re-count the missing ('NaN') values per column after the manual region fixes
olympic_data_merge.isna().sum(axis=0)

ID             0
Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
region         0
notes     266077
dtype: int64

In [20]:
# Drop the columns that are not needed from here on; this includes the original
# 'Team' column, since 'region' is renamed to 'Team' in the following cell
unused_columns = ['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team',
                  'Games', 'City', 'Sport', 'notes']
olympic_data_merge = olympic_data_merge.drop(columns=unused_columns)

In [21]:
# Rename 'region' to 'Team'
olympic_data_merge = olympic_data_merge.rename(columns={'region': 'Team'})

In [22]:
# Reorder columns. Selecting columns returns a new DataFrame, so the result is
# assigned back; otherwise the new order would only be displayed and not kept.
olympic_data_merge = olympic_data_merge[['Team', 'NOC', 'Year', 'Season', 'Event', 'Medal']]
# Display the reordered frame
olympic_data_merge

Unnamed: 0,Team,NOC,Year,Season,Event,Medal
0,China,CHN,1992,Summer,Basketball Men's Basketball,
1,China,CHN,2012,Summer,Judo Men's Extra-Lightweight,
2,Denmark,DEN,1920,Summer,Football Men's Football,
3,Denmark,DEN,1900,Summer,Tug-Of-War Men's Tug-Of-War,Gold
4,Netherlands,NED,1988,Winter,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...
271111,Poland,POL,1976,Winter,Luge Mixed (Men)'s Doubles,
271112,Poland,POL,2014,Winter,"Ski Jumping Men's Large Hill, Individual",
271113,Poland,POL,2014,Winter,"Ski Jumping Men's Large Hill, Team",
271114,Poland,POL,1998,Winter,Bobsleigh Men's Four,


In [23]:
# List the distinct values that appear in the 'Medal' column
olympic_data_merge['Medal'].unique()

array([nan, 'Gold', 'Bronze', 'Silver'], dtype=object)

In [24]:
# Drop the rows with 'NaN' in the 'Medal' column (entries that won no medal).
# `subset` restricts the check to 'Medal'; a bare dropna() would also discard
# rows that happen to have a missing value in any other column.
olympic_data_merge = olympic_data_merge.dropna(subset=['Medal'])

In [25]:
# Collapse team events to a single medal per country: without this, a basketball
# win would add 12 medals (one for each player) when it should add just 1
dedupe_keys = ['NOC', 'Year', 'Season', 'Event', 'Medal', 'Team']
olympic_data_merge = olympic_data_merge.drop_duplicates(subset=dedupe_keys)

In [26]:
# Count the medals per ('NOC', 'Year', 'Season', 'Team') group and store the
# group sizes in a 'Medal Count' column
olympic_data_merge = (
    olympic_data_merge
    .groupby(['NOC', 'Year', 'Season', 'Team'])
    .size()
    .reset_index(name='Medal Count')
)

In [27]:
# olympic_data_merge.to_csv('test.csv')

In [28]:
# Preview the first five rows of coordinates_df
coordinates_df.head(n=5)

Unnamed: 0,country_code,latitude,longitude,country,usa_state_code,usa_state_latitude,usa_state_longitude,usa_state
0,AF,33.93911,67.709953,Afghanistan,AR,35.20105,-91.831833,Arkansas
1,AL,41.153332,20.168331,Albania,CO,39.550051,-105.782067,Colorado
2,DZ,28.033886,1.659626,Algeria,,,,
3,AS,-14.270972,-170.132217,American Samoa,HI,19.898682,-155.665857,Hawaii
4,AD,42.546245,1.601554,Andorra,AK,63.588753,-154.493062,Alaska


In [29]:
# Drop the unnecessary US-state columns
usa_state_columns = ['usa_state_code', 'usa_state_latitude', 'usa_state_longitude', 'usa_state']
coordinates_df = coordinates_df.drop(columns=usa_state_columns)

In [30]:
# Preview the first five rows of coordinates_df after dropping the US-state columns
coordinates_df.head(n=5)

Unnamed: 0,country_code,latitude,longitude,country
0,AF,33.93911,67.709953,Afghanistan
1,AL,41.153332,20.168331,Albania
2,DZ,28.033886,1.659626,Algeria
3,AS,-14.270972,-170.132217,American Samoa
4,AD,42.546245,1.601554,Andorra


In [31]:
# Count the missing ('NaN') values in each column of coordinates_df
coordinates_df.isna().sum(axis=0)

country_code    1
latitude        1
longitude       1
country         0
dtype: int64

In [32]:
# Show the rows whose 'country_code' is 'NaN'
no_country_code = coordinates_df['country_code'].isnull()
coordinates_df.loc[no_country_code, ['latitude', 'longitude', 'country']].drop_duplicates()

Unnamed: 0,latitude,longitude,country
150,-22.95764,18.49041,Namibia


In [33]:
# Show the rows whose 'latitude' is 'NaN' (the filter checks 'latitude' only;
# 'longitude' is displayed next to it)
no_latitude = coordinates_df['latitude'].isnull()
coordinates_df.loc[no_latitude, ['country_code', 'longitude', 'country']].drop_duplicates()

Unnamed: 0,country_code,longitude,country
227,UM,,U.S. Minor Outlying Islands


In [34]:
# Left-join the coordinates onto the medal counts, matching 'Team' against 'country'
olympic_coordinate_merge = pd.merge(
    olympic_data_merge,
    coordinates_df,
    how='left',
    left_on='Team',
    right_on='country',
)

In [35]:
# Preview the first five rows of olympic_coordinate_merge
olympic_coordinate_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,country_code,latitude,longitude,country
0,AFG,2008,Summer,Afghanistan,1,AF,33.93911,67.709953,Afghanistan
1,AFG,2012,Summer,Afghanistan,1,AF,33.93911,67.709953,Afghanistan
2,AHO,1988,Summer,Netherlands Antilles,1,AN,12.226079,-69.060087,Netherlands Antilles
3,ALG,1984,Summer,Algeria,2,DZ,28.033886,1.659626,Algeria
4,ALG,1992,Summer,Algeria,2,DZ,28.033886,1.659626,Algeria


In [36]:
# List the distinct (Team, NOC) pairs that found no matching 'country' in the merge
no_coordinates_match = olympic_coordinate_merge['country'].isnull()
olympic_coordinate_merge.loc[no_coordinates_match, ['Team', 'NOC']].drop_duplicates()

Unnamed: 0,Team,NOC
751,Individual Olympic Athletes,IOA
1635,West Indies Federation,WIF


In [37]:
# NEW
# Drop the lookup columns 'country_code' and 'country' brought in by the coordinates merge.
# 'longitude' is kept; an earlier version of this cell dropped it as well:
# olympic_coordinate_merge.drop(['longitude', 'country_code', 'country'], axis=1, inplace=True)
olympic_coordinate_merge.drop(['country_code', 'country'], axis=1, inplace=True)

In [38]:
# Preview the first five rows of olympic_coordinate_merge after dropping the lookup columns
olympic_coordinate_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626


In [39]:
# Preview the first five rows of hosts_df
hosts_df.head(n=5)

Unnamed: 0,Type,GamesUrl,Disciplines,DisciplinesList,Country,Date,Athletes,Countries,Events,City,Year,Host
0,Summer,https://www.olympic.org/athens-1896,10,"['Athletics', 'Cycling Road', 'Cycling Track',...",Greece,06 Apr - 15 Apr,241.0,14.0,43.0,Athens,1896,1
1,Summer,https://www.olympic.org/paris-1900,20,"['Archery', 'Athletics', 'Basque Pelota', 'Cri...",France,14 May - 28 Oct,997.0,24.0,95.0,Paris,1900,1
2,Summer,https://www.olympic.org/st-louis-1904,19,"['Archery', 'Athletics', 'Basketball', 'Boxing...",USA,01 Jul - 23 Nov,651.0,12.0,95.0,St Louis,1904,1
3,Summer,https://www.olympic.org/london-1908,25,"['Archery', 'Athletics', 'Boxing', 'Cycling Tr...",UK,27 Apr - 31 Oct,2008.0,22.0,110.0,London,1908,1
4,Summer,https://www.olympic.org/stockholm-1912,18,"['Athletics', 'Cycling Road', 'Diving', 'Eques...",Sweden,05 May - 27 Jul,2407.0,28.0,102.0,Stockholm,1912,1


In [40]:
# NEW
# Take an independent copy of hosts_df before its columns are dropped below,
# so the per-Games detail columns stay available in hosts_new_df
hosts_new_df = hosts_df.copy(deep=True)

In [41]:
# Drop the columns we don't need from hosts_df, leaving 'Type', 'Country', 'Year' and 'Host'
hosts_unused_columns = ['GamesUrl', 'Date', 'Disciplines', 'DisciplinesList',
                        'Athletes', 'Events', 'Countries', 'City']
hosts_df = hosts_df.drop(columns=hosts_unused_columns)

In [42]:
# NEW
# Drop 'GamesUrl', 'Date' and 'Host' from hosts_new_df, keeping the per-Games detail columns
hosts_new_df = hosts_new_df.drop(columns=['GamesUrl', 'Date', 'Host'])

In [43]:
# Preview the first five rows of hosts_df after dropping the unused columns
hosts_df.head(n=5)

Unnamed: 0,Type,Country,Year,Host
0,Summer,Greece,1896,1
1,Summer,France,1900,1
2,Summer,USA,1904,1
3,Summer,UK,1908,1
4,Summer,Sweden,1912,1


In [44]:
# NEW
# Preview the first five rows of hosts_new_df
hosts_new_df.head(n=5)

Unnamed: 0,Type,Disciplines,DisciplinesList,Country,Athletes,Countries,Events,City,Year
0,Summer,10,"['Athletics', 'Cycling Road', 'Cycling Track',...",Greece,241.0,14.0,43.0,Athens,1896
1,Summer,20,"['Archery', 'Athletics', 'Basque Pelota', 'Cri...",France,997.0,24.0,95.0,Paris,1900
2,Summer,19,"['Archery', 'Athletics', 'Basketball', 'Boxing...",USA,651.0,12.0,95.0,St Louis,1904
3,Summer,25,"['Archery', 'Athletics', 'Boxing', 'Cycling Tr...",UK,2008.0,22.0,110.0,London,1908
4,Summer,18,"['Athletics', 'Cycling Road', 'Diving', 'Eques...",Sweden,2407.0,28.0,102.0,Stockholm,1912


In [45]:
# Rename 'Country' to 'Host Country' in hosts_df
hosts_df = hosts_df.rename(columns={'Country': 'Host Country'})

In [46]:
# Preview the first five rows of olympic_coordinate_merge before the host merge
olympic_coordinate_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626


In [47]:
# Left-join the host table onto the olympic + coordinate data on (Year, Season, Team),
# so only rows whose team matches the host country of that Games pick up a 'Host' value
olympic_coordinate_host_merge = pd.merge(
    olympic_coordinate_merge,
    hosts_df,
    how='left',
    left_on=['Year', 'Season', 'Team'],
    right_on=['Year', 'Type', 'Host Country'],
)

In [48]:
# NEW
# Count how many rows carry each non-missing value of 'Host'
olympic_coordinate_host_merge['Host'].value_counts()

1.0    48
Name: Host, dtype: int64

In [49]:
# NEW
# Left-join the per-Games detail columns from hosts_new_df onto every row,
# matching on year and season ('Season' on the left, 'Type' on the right)
# NOTE(review): assumes hosts_new_df has one row per (Year, Type); duplicates
# would multiply the medal rows - confirm
olympic_coordinate_host_merge = pd.merge(
    olympic_coordinate_host_merge,
    hosts_new_df,
    how='left',
    left_on=['Year', 'Season'],
    right_on=['Year', 'Type'],
)

In [50]:
# Preview the first five rows of olympic_coordinate_host_merge after both host merges
olympic_coordinate_host_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude,Type_x,Host Country,Host,Type_y,Disciplines,DisciplinesList,Country,Athletes,Countries,Events,City
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953,,,,Summer,41.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",China,10942.0,204.0,302.0,Beijing
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953,,,,Summer,36.0,"['Archery', 'Athletics', 'Badminton', 'Basketb...",UK,10568.0,204.0,302.0,London
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087,,,,Summer,31.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",South Korea,8397.0,159.0,237.0,Seoul
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626,,,,Summer,29.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",USA,6829.0,140.0,221.0,Los Angeles
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626,,,,Summer,34.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",Spain,9356.0,169.0,257.0,Barcelona


In [51]:
# NEW
# Drop the leftover columns from the two host merges: the suffixed 'Type_x' / 'Type_y'
# join keys plus 'Host Country' and 'Country'
host_merge_leftovers = ['Type_x', 'Host Country', 'Type_y', 'Country']
olympic_coordinate_host_merge = olympic_coordinate_host_merge.drop(columns=host_merge_leftovers)

In [52]:
# NEW
# Preview the first five rows of olympic_coordinate_host_merge after dropping the leftover columns
olympic_coordinate_host_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude,Host,Disciplines,DisciplinesList,Athletes,Countries,Events,City
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953,,41.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",10942.0,204.0,302.0,Beijing
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953,,36.0,"['Archery', 'Athletics', 'Badminton', 'Basketb...",10568.0,204.0,302.0,London
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087,,31.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",8397.0,159.0,237.0,Seoul
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626,,29.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",6829.0,140.0,221.0,Los Angeles
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626,,34.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",9356.0,169.0,257.0,Barcelona


In [53]:
# NEW - Delete Entirely
# Dropping duplicate columns
# olympic_coordinate_host_merge.drop(['Type', 'Host Country'], axis=1, inplace=True)

In [54]:
# NEW
# This code will remove the [] from the Disciplines List. Leave this commented out for now as it may interfere with the ML model
# olympic_coordinate_host_merge['DisciplinesList'] = olympic_coordinate_host_merge['DisciplinesList'].str.strip('[]')

In [55]:
# Preview the first five rows of olympic_coordinate_host_merge
olympic_coordinate_host_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude,Host,Disciplines,DisciplinesList,Athletes,Countries,Events,City
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953,,41.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",10942.0,204.0,302.0,Beijing
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953,,36.0,"['Archery', 'Athletics', 'Badminton', 'Basketb...",10568.0,204.0,302.0,London
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087,,31.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",8397.0,159.0,237.0,Seoul
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626,,29.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",6829.0,140.0,221.0,Los Angeles
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626,,34.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",9356.0,169.0,257.0,Barcelona


In [56]:
# List the distinct values of 'Host'; rows that matched no host country are 'NaN'
olympic_coordinate_host_merge['Host'].unique()

array([nan,  1.])

In [57]:
# Count the missing ('NaN') values in each column of olympic_coordinate_host_merge
olympic_coordinate_host_merge.isna().sum(axis=0)

NOC                   0
Year                  0
Season                0
Team                  0
Medal Count           0
latitude              3
longitude             3
Host               1609
Disciplines          18
DisciplinesList      18
Athletes             18
Countries            18
Events               18
City                 18
dtype: int64

In [58]:
# Fill the 'NaN' entries of the 'Host' column with 0 (1 means host and 0 means not host);
# the fill is limited to that one column by passing a column -> value mapping
values = dict(Host=0)
olympic_coordinate_host_merge = olympic_coordinate_host_merge.fillna(values)

In [59]:
# Preview the first five rows of olympic_coordinate_host_merge after filling 'Host'
olympic_coordinate_host_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude,Host,Disciplines,DisciplinesList,Athletes,Countries,Events,City
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953,0.0,41.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",10942.0,204.0,302.0,Beijing
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953,0.0,36.0,"['Archery', 'Athletics', 'Badminton', 'Basketb...",10568.0,204.0,302.0,London
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087,0.0,31.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",8397.0,159.0,237.0,Seoul
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626,0.0,29.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",6829.0,140.0,221.0,Los Angeles
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626,0.0,34.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",9356.0,169.0,257.0,Barcelona


In [60]:
# Preview the first five rows of gdp_df
gdp_df.head(n=5)

Unnamed: 0,geo,name,time,Income per person,GDP total
0,afg,Afghanistan,1800,603,1977840000
1,afg,Afghanistan,1801,603,1977840000
2,afg,Afghanistan,1802,603,1977840000
3,afg,Afghanistan,1803,603,1977840000
4,afg,Afghanistan,1804,603,1977840000


In [61]:
# Left-join the GDP data onto the olympic + coordinate + host data, matching
# ('Year', 'Team') on the left against ('time', 'name') on the right
olympic_coordinate_host_gdp_merge = pd.merge(
    olympic_coordinate_host_merge,
    gdp_df,
    how='left',
    left_on=['Year', 'Team'],
    right_on=['time', 'name'],
)

In [62]:
# Preview the first five rows of olympic_coordinate_host_gdp_merge
olympic_coordinate_host_gdp_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude,Host,Disciplines,DisciplinesList,Athletes,Countries,Events,City,geo,name,time,Income per person,GDP total
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953,0.0,41.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",10942.0,204.0,302.0,Beijing,afg,Afghanistan,2008.0,1268.0,35142010000.0
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953,0.0,36.0,"['Archery', 'Athletics', 'Badminton', 'Basketb...",10568.0,204.0,302.0,London,afg,Afghanistan,2012.0,1773.0,55241800000.0
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087,0.0,31.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",8397.0,159.0,237.0,Seoul,,,,,
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626,0.0,29.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",6829.0,140.0,221.0,Los Angeles,dza,Algeria,1984.0,11848.0,257859500000.0
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626,0.0,34.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",9356.0,169.0,257.0,Barcelona,dza,Algeria,1992.0,9871.0,266795100000.0


In [63]:
# List the distinct (Team, Year) pairs that found no matching GDP row ('name' is 'NaN')
no_gdp_match = olympic_coordinate_host_gdp_merge['name'].isnull()
olympic_coordinate_host_gdp_merge.loc[no_gdp_match, ['Team', 'Year']].drop_duplicates()

Unnamed: 0,Team,Year
2,Netherlands Antilles,1988
167,Bermuda,1976
751,Individual Olympic Athletes,1992
752,Individual Olympic Athletes,2016
795,"Virgin Islands, US",1988
937,Kosovo,2016
956,Liechtenstein,1976
957,Liechtenstein,1980
958,Liechtenstein,1984
959,Liechtenstein,1988


In [64]:
# Drop the GDP columns we don't need, keeping only 'GDP total' from the GDP data
gdp_unused_columns = ['geo', 'name', 'time', 'Income per person']
olympic_coordinate_host_gdp_merge = olympic_coordinate_host_gdp_merge.drop(columns=gdp_unused_columns)

In [65]:
# Preview the first five rows of olympic_coordinate_host_gdp_merge after dropping the GDP lookup columns
olympic_coordinate_host_gdp_merge.head(n=5)

Unnamed: 0,NOC,Year,Season,Team,Medal Count,latitude,longitude,Host,Disciplines,DisciplinesList,Athletes,Countries,Events,City,GDP total
0,AFG,2008,Summer,Afghanistan,1,33.93911,67.709953,0.0,41.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",10942.0,204.0,302.0,Beijing,35142010000.0
1,AFG,2012,Summer,Afghanistan,1,33.93911,67.709953,0.0,36.0,"['Archery', 'Athletics', 'Badminton', 'Basketb...",10568.0,204.0,302.0,London,55241800000.0
2,AHO,1988,Summer,Netherlands Antilles,1,12.226079,-69.060087,0.0,31.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",8397.0,159.0,237.0,Seoul,
3,ALG,1984,Summer,Algeria,2,28.033886,1.659626,0.0,29.0,"['Archery', 'Athletics', 'Basketball', 'Boxing...",6829.0,140.0,221.0,Los Angeles,257859500000.0
4,ALG,1992,Summer,Algeria,2,28.033886,1.659626,0.0,34.0,"['Archery', 'Athletics', 'Badminton', 'Basebal...",9356.0,169.0,257.0,Barcelona,266795100000.0


### Exporting Data

In [66]:
# Export olympic_coordinate_host_gdp_merge to MS Excel - final3.xlsx
olympic_coordinate_host_gdp_merge.to_excel('final3.xlsx')

In [67]:
# Drop rows with any missing value & export olympic_coordinate_host_gdp_merge to CSV - tableau_final3.csv
olympic_coordinate_host_gdp_merge.dropna().to_csv('tableau_final3.csv')

In [68]:
# # output to match
# labels = np.array(final_merged_df['Medal'])

In [69]:
# # input used to train model to match output
# parameters = np.array(final_merged_df[['Population', 'GDP total']])

In [70]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

In [71]:
# # Use the train_test_split function to create training and testing subsets 
# X_train, X_test, y_train, y_test = train_test_split(parameters, labels, test_size=0.2, random_state=rf_seed)

In [72]:
# X_train[:5]

In [73]:
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [74]:
# X_train[:5]

In [75]:
# maybe just decision tree first. if that makes sense then try random forest

In [76]:
# from sklearn.ensemble import RandomForestRegressor

# regressor = RandomForestRegressor(n_estimators=1000, random_state=rf_seed)
# regressor.fit(X_train, y_train)
# y_pred = regressor.predict(X_test)

In [77]:
# from sklearn import metrics

# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))