In [1]:
from pymongo import MongoClient
from pymongo.errors import OperationFailure
from password import connection_string
import pandas as pd
import numpy as np

import seaborn
import matplotlib.pyplot as matplotlib
 
from matplotlib.lines import Line2D
from scipy.stats import pearsonr
 
# set seed to make results reproducible
rf_seed = 5

In [2]:
# import datasets
athlete_events_df = pd.read_csv('Data Sources/athlete_events.csv')
noc_regions_df = pd.read_csv('Data Sources/noc_regions.csv')
population_by_country_df = pd.read_excel('Data Sources/population_by_country.xlsx')
gdp_df = pd.read_excel('Data Sources/gdp.xlsx')

In [3]:
# # connect to MongoDB
# myclient = MongoClient(connection_string) 
   
# # database
# db = myclient["test_database"]
   
# # collection
# collection = db["test_data"]
  
# # change format of data
# data = athlete_events_df.to_dict('records')

# # add data to MongoDB
# # collection.insert_many(data)

In [4]:
# # databases that are on MongoDB
# for db in myclient.list_databases():
#         print(db)

In [5]:
# # take data off of MongoDB
# df = pd.DataFrame(list(collection.find()))

In [6]:
# df.head()

In [7]:
athlete_events_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [8]:
# Since not using gold, silver, etc., make "Medal" column either a 0 or 1
athlete_events_df['Medal'] = athlete_events_df['Medal'].apply(lambda x: 1 if str(x) != 'nan' else 0)

In [9]:
# Drop the columns we don't need
athlete_events_df = athlete_events_df.drop(['Name', 'Age', 'Height', 'Weight', 'Games', 'ID', 'City'], axis=1)

In [10]:
# consolidate so only 1 medal counts towards countries count(example: everyone in basketball gets a medal but USA only gets 1)
consolidate_athletes_df = athlete_events_df.groupby(["Team", "NOC", "Year"]).Medal.sum().reset_index()

In [11]:
len(consolidate_athletes_df)

5204

In [12]:
# merge the two dataframes
merged_df = pd.merge(consolidate_athletes_df, population_by_country_df, how='left', left_on=['Team', 'Year'], right_on=['name', 'time'])

In [13]:
merged_df.head()

Unnamed: 0,Team,NOC,Year,Medal,geo,name,time,Population
0,30. Februar,AUT,1952,0,,,,
1,A North American Team,MEX,1900,3,,,,
2,A North American Team,USA,1900,1,,,,
3,Acipactli,MEX,1964,0,,,,
4,Acturus,ARG,1948,0,,,,


In [14]:
# drop duplicate columns
merged_df = merged_df.drop(['geo', 'name', 'time'], axis=1)

In [15]:
merged_df.isna().sum()

Team             0
NOC              0
Year             0
Medal            0
Population    2302
dtype: int64

In [16]:
# drop rows with nan in it (for now)
merged_df = merged_df.dropna()

In [17]:
# check that rows with nan were dropped
merged_df.isna().sum()

Team          0
NOC           0
Year          0
Medal         0
Population    0
dtype: int64

In [18]:
merged_df.head()

Unnamed: 0,Team,NOC,Year,Medal,Population
5,Afghanistan,AFG,1936,0,6642004.0
6,Afghanistan,AFG,1948,0,7503485.0
7,Afghanistan,AFG,1956,0,8398873.0
8,Afghanistan,AFG,1960,0,8996967.0
9,Afghanistan,AFG,1964,0,9744772.0


In [19]:
# make population column dtype int
merged_df['Population'] = merged_df['Population'].astype(int)

In [20]:
merged_df.head()

Unnamed: 0,Team,NOC,Year,Medal,Population
5,Afghanistan,AFG,1936,0,6642004
6,Afghanistan,AFG,1948,0,7503485
7,Afghanistan,AFG,1956,0,8398873
8,Afghanistan,AFG,1960,0,8996967
9,Afghanistan,AFG,1964,0,9744772


In [21]:
gdp_df.head()

Unnamed: 0,geo,name,time,Income per person,GDP total
0,afg,Afghanistan,1800,603,1977840000
1,afg,Afghanistan,1801,603,1977840000
2,afg,Afghanistan,1802,603,1977840000
3,afg,Afghanistan,1803,603,1977840000
4,afg,Afghanistan,1804,603,1977840000


In [22]:
# merge the athletes + population data with the gdp data
final_merged_df = pd.merge(merged_df, gdp_df, how='left', left_on=['Team', 'Year'], right_on=['name', 'time'])

In [23]:
final_merged_df.head()

Unnamed: 0,Team,NOC,Year,Medal,Population,geo,name,time,Income per person,GDP total
0,Afghanistan,AFG,1936,0,6642004,afg,Afghanistan,1936.0,1940.0,12885750000.0
1,Afghanistan,AFG,1948,0,7503485,afg,Afghanistan,1948.0,2324.0,17437760000.0
2,Afghanistan,AFG,1956,0,8398873,afg,Afghanistan,1956.0,2644.0,22206620000.0
3,Afghanistan,AFG,1960,0,8996967,afg,Afghanistan,1960.0,2744.0,24687680000.0
4,Afghanistan,AFG,1964,0,9744772,afg,Afghanistan,1964.0,2649.0,25818730000.0


In [24]:
# drop duplicate columns
final_merged_df = final_merged_df.drop(['name', 'time', 'geo'], axis=1)

In [25]:
final_merged_df.head()

Unnamed: 0,Team,NOC,Year,Medal,Population,Income per person,GDP total
0,Afghanistan,AFG,1936,0,6642004,1940.0,12885750000.0
1,Afghanistan,AFG,1948,0,7503485,2324.0,17437760000.0
2,Afghanistan,AFG,1956,0,8398873,2644.0,22206620000.0
3,Afghanistan,AFG,1960,0,8996967,2744.0,24687680000.0
4,Afghanistan,AFG,1964,0,9744772,2649.0,25818730000.0


In [26]:
final_merged_df.isna().sum()

Team                  0
NOC                   0
Year                  0
Medal                 0
Population            0
Income per person    25
GDP total            25
dtype: int64

In [27]:
final_merged_df = final_merged_df.dropna()

In [28]:
final_merged_df.isna().sum()

Team                 0
NOC                  0
Year                 0
Medal                0
Population           0
Income per person    0
GDP total            0
dtype: int64

In [29]:
len(final_merged_df)

2877

In [30]:
len(pd.unique(final_merged_df['Medal']))

137

In [31]:
# output to match
labels = np.array(final_merged_df['Medal'])

In [32]:
# input used to train model to match output
parameters = np.array(final_merged_df[['Population', 'GDP total']])

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [34]:
# Use the train_test_split function to create training and testing subsets 
X_train, X_test, y_train, y_test = train_test_split(parameters, labels, test_size=0.2, random_state=rf_seed)

In [35]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [36]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=1000, random_state=rf_seed)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [37]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 11.737413194444445
Mean Squared Error: 610.5000411354166
Root Mean Squared Error: 24.70829903363274
