In [25]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [26]:
# Load the data
athlete_df = pd.read_csv('../Resources/clean_athletes.csv')
athlete_df = athlete_df.drop('Unnamed: 0', axis=1)
athlete_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Country_Name,Country_Code,Games,Year,Season,City,Sport,Event,Medal
0,22,Andreea Aanei,F,22.0,170.0,125.0,Romania,ROU,2016 Summer,2016,Summer,Rio de Janeiro,Weightlifting,Weightlifting Women's Super-Heavyweight,
1,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Individual All-Around,
2,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Floor Exercise,
3,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Parallel Bars,
4,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Horizontal Bar,


In [27]:
# Fill in rows with missing medals
athlete_df['Medal'] = athlete_df['Medal'].fillna('No_Medals')
athlete_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Country_Name,Country_Code,Games,Year,Season,City,Sport,Event,Medal
0,22,Andreea Aanei,F,22.0,170.0,125.0,Romania,ROU,2016 Summer,2016,Summer,Rio de Janeiro,Weightlifting,Weightlifting Women's Super-Heavyweight,No_Medals
1,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Individual All-Around,No_Medals
2,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Floor Exercise,No_Medals
3,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Parallel Bars,No_Medals
4,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Horizontal Bar,No_Medals


In [28]:
# Drop the non-beneficial column 'Name'
athlete_df = athlete_df.drop(['ID', 'Name', 'Country_Name', 'Games', 'Year', 'City', 'Event'],1)
athlete_df.head()

  


Unnamed: 0,Sex,Age,Height,Weight,Country_Code,Season,Sport,Medal
0,F,22.0,170.0,125.0,ROU,Summer,Weightlifting,No_Medals
1,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,No_Medals
2,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,No_Medals
3,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,No_Medals
4,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,No_Medals


In [29]:
# Encode Medals
medals = {'Gold': 3, 'Silver': 2, 'Bronze': 1, 'No_Medals':0}
athlete_df['Medal'] = athlete_df['Medal'].map(medals)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Country_Code,Season,Sport,Medal
0,F,22.0,170.0,125.0,ROU,Summer,Weightlifting,0
1,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,0
2,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,0
3,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,0
4,M,23.0,167.0,64.0,ESP,Summer,Gymnastics,0


In [30]:
athlete_df.columns

Index(['Sex', 'Age', 'Height', 'Weight', 'Country_Code', 'Season', 'Sport',
       'Medal'],
      dtype='object')

In [31]:
# Sum number of Medals per Country
medal_count_by_country = athlete_df.loc[:, ['Sport', 'Country_Code', 'Medal']]
medal_count_by_country

Unnamed: 0,Sport,Country_Code,Medal
0,Weightlifting,ROU,0
1,Gymnastics,ESP,0
2,Gymnastics,ESP,0
3,Gymnastics,ESP,0
4,Gymnastics,ESP,0
...,...,...,...
271111,Athletics,GRE,0
271112,Gymnastics,SUI,2
271113,Gymnastics,SUI,2
271114,Gymnastics,SUI,0


In [32]:
# Group the data by Country_Code and Sport and calculate the sum of the medals columns
medals = medal_count_by_country.groupby(['Sport', 'Country_Code', 'Medal']).size().reset_index(name='Count')

medals.head()



Unnamed: 0,Sport,Country_Code,Medal,Count
0,Aeronautics,SUI,3,1
1,Alpine Skiing,ALB,0,7
2,Alpine Skiing,ALG,0,10
3,Alpine Skiing,AND,0,105
4,Alpine Skiing,ARG,0,262


In [33]:
SUI_test = medals[((medals['Sport'] == 'Curling') & (medals['Country_Code'] == 'SUI'))]
SUI_test.head()

Unnamed: 0,Sport,Country_Code,Medal,Count
2047,Curling,SUI,0,20
2048,Curling,SUI,1,9
2049,Curling,SUI,2,9
2050,Curling,SUI,3,5


In [34]:
bronze_total = medals['Bronze'].sum()
silver_total = medals['Silver'].sum()
gold_total = medals['Gold'].sum()
no_medal_total = medals['No_Medals'].sum()
print(no_medal_total)
print(gold_total)
print(silver_total)
print(bronze_total)

KeyError: 'Bronze'

In [None]:
medals

In [None]:
# Calculate the total number of medals for each country and add a column for the probability of each medal type
medals['Bronze_Prob'] = medals['Bronze']/bronze_total
medals['Silver_Prob'] = medals['Silver']/silver_total
medals['Gold_Prob'] = medals['Gold']/gold_total
medals['No_Medals_Prob'] = medals['No_Medals']/no_medal_total


# Print the result
medals = medals.drop(['Bronze', 'Silver', 'Gold', 'No_Medals'], axis=1)
medals.head()

In [None]:
# Preprocess the data

In [None]:
merged_df = pd.merge(athlete_df, medals, on =['Sport', 'Country_Code'])
#merged_df = merged_df.drop('Country_Code', axis=1)

merged_df.head()

In [None]:
# Drop the rows with missing values in the 'Height' column
#athlete_df.dropna(subset=['Height'], inplace=True)
#athlete_df.head()

In [None]:
# Drop the rows with missing values in the 'Weight' column
#athlete_df.dropna(subset=['Weight'], inplace=True)
#athlete_df.head()

In [None]:
print("Number of rows:", merged_df.shape[0])

In [None]:
# Convert the 'Sex' column to binary values
merged_df['Sex'] = merged_df['Sex'].map({'M':1, 'F':0})
merged_df.head()

In [None]:
# Convert the 'Season' column to binary values
merged_df['Season'] = merged_df['Season'].map({'Summer':1, 'Winter':0})
merged_df.head()

In [None]:
# Encode the 'Sport' column using one-hot encoding
merged_df = pd.get_dummies(merged_df, columns=['Sport'])
merged_df = merged_df.drop('Country_Code', axis=1)
merged_df.head()

In [None]:
# Print the names of the new columns
print(merged_df.columns)

In [None]:
# Print the index of a new sport column - 'Sport_Swimming' for example
#index = athlete_df.columns.get_loc('Sport_Swimming')
#print(index)

In [None]:
# Remove any remaining rows with missing values
merged_df.dropna(inplace=True)
merged_df.tail()

In [None]:
# Split the data into features and target
X= merged_df.drop('Age', axis=1)
y= merged_df['Age']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(merged_df.drop('Age', axis=1), merged_df['Age'], test_size=0.2, random_state=42)

In [None]:
# Define the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the model
rf.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

In [None]:
merged_df.columns

In [None]:
# Predicting an athelete's potential medal win

In [None]:
# Define the input data for a new athlete to predict which medal_category, if any, they will win
new_athlete_data = pd.DataFrame({'Sex': ['0'], 'Age': [25], 'Height':[170], 'Weight':[110],'Season':[0],'Gold':[0],
       'No_Medals':[0], 'Silver':[0], 'Bronze_Prob':[0], 'Silver_Prob':[0], 'Gold_Prob':[0],
       'No_Medals_Prob':[0], 'Sport_Aeronautics': [0],
       'Sport_Alpine Skiing': [0], 'Sport_Alpinism': [0], 'Sport_Archery': [0],
       'Sport_Art Competitions': [0], 'Sport_Athletics': [0], 'Sport_Badminton': [0],
       'Sport_Baseball': [0], 'Sport_Basketball': [0], 'Sport_Basque Pelota': [0],
       'Sport_Beach Volleyball': [0], 'Sport_Biathlon': [0], 'Sport_Bobsleigh': [0],
       'Sport_Boxing': [0], 'Sport_Canoeing': [0], 'Sport_Cricket': [0], 'Sport_Croquet': [0],
       'Sport_Cross Country Skiing': [0], 'Sport_Curling': [0], 'Sport_Cycling': [0],
       'Sport_Diving': [0], 'Sport_Equestrianism': [0], 'Sport_Fencing': [0],
       'Sport_Figure Skating': [0], 'Sport_Football': [0], 'Sport_Freestyle Skiing': [0],
       'Sport_Golf': [0], 'Sport_Gymnastics': [0], 'Sport_Handball': [0], 'Sport_Hockey': [0],
       'Sport_Ice Hockey': [0], 'Sport_Jeu De Paume': [0], 'Sport_Judo': [0],
       'Sport_Lacrosse': [0], 'Sport_Luge': [0], 'Sport_Military Ski Patrol': [0],
       'Sport_Modern Pentathlon': [0], 'Sport_Motorboating': [0],
       'Sport_Nordic Combined': [0], 'Sport_Polo': [0], 'Sport_Racquets': [0],
       'Sport_Rhythmic Gymnastics': [0], 'Sport_Roque': [0], 'Sport_Rowing': [0],
       'Sport_Rugby': [0], 'Sport_Rugby Sevens': [0], 'Sport_Sailing': [0], 'Sport_Shooting': [0],
       'Sport_Short Track Speed Skating': [0], 'Sport_Skeleton': [0],
       'Sport_Ski Jumping': [0], 'Sport_Snowboarding': [0], 'Sport_Softball': [0],
       'Sport_Speed Skating': [0], 'Sport_Swimming': [0], 'Sport_Synchronized Swimming': [0],
       'Sport_Table Tennis': [0], 'Sport_Taekwondo': [0], 'Sport_Tennis': [0],
       'Sport_Trampolining': [0], 'Sport_Triathlon': [0], 'Sport_Tug-Of-War': [0],
       'Sport_Volleyball': [0], 'Sport_Water Polo': [0], 'Sport_Weightlifting': [0],
       'Sport_Wrestling': [0]})

In [None]:
# Use the trained model to predict the medal category for the new athlete
prediction = rf.predict(new_athlete_data)

# Print the predicted medal category
print(f"Medal prediction for new athlete: {prediction}")