In [97]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [98]:
# Load the data
# Reads 'athlete_events.csv' from the current working directory into a
# DataFrame, then previews the first five rows as the cell output.
athlete_df = pd.read_csv('athlete_events.csv')
athlete_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [99]:
# Preprocess the data

In [100]:
# Drop the columns that are not used as model features: identifiers and
# descriptive fields ('ID', 'Name', 'Team', 'NOC', 'Games', 'Year', 'City',
# 'Event'). The columns are named via the `columns=` keyword because passing
# the axis positionally (`drop(labels, 1)`) was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
athlete_df = athlete_df.drop(
    columns=['ID', 'Name', 'Team', 'NOC', 'Games', 'Year', 'City', 'Event']
)
athlete_df.head()

  


Unnamed: 0,Sex,Age,Height,Weight,Season,Sport,Medal
0,M,24.0,180.0,80.0,Summer,Basketball,
1,M,23.0,170.0,60.0,Summer,Judo,
2,M,24.0,,,Summer,Football,
3,M,34.0,,,Summer,Tug-Of-War,Gold
4,F,21.0,185.0,82.0,Winter,Speed Skating,


In [101]:
# Keep only the rows that have a recorded medal. After this cell every
# remaining row is a medal winner; rows without a medal are gone from
# athlete_df and are not seen by any later step.
athlete_df = athlete_df.dropna(subset=['Medal'])
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Sport,Medal
3,M,34.0,,,Summer,Tug-Of-War,Gold
37,M,30.0,,,Summer,Swimming,Bronze
38,M,30.0,,,Summer,Swimming,Bronze
40,M,28.0,184.0,85.0,Winter,Ice Hockey,Bronze
41,M,28.0,175.0,64.0,Summer,Gymnastics,Bronze


In [102]:
# Discard rows whose 'Height' is missing, then preview the result.
athlete_df = athlete_df.dropna(subset=['Height'])
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Sport,Medal
40,M,28.0,184.0,85.0,Winter,Ice Hockey,Bronze
41,M,28.0,175.0,64.0,Summer,Gymnastics,Bronze
42,M,28.0,175.0,64.0,Summer,Gymnastics,Gold
44,M,28.0,175.0,64.0,Summer,Gymnastics,Gold
48,M,28.0,175.0,64.0,Summer,Gymnastics,Gold


In [103]:
# Discard rows whose 'Weight' is missing, then preview the result.
athlete_df = athlete_df.dropna(subset=['Weight'])
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Sport,Medal
40,M,28.0,184.0,85.0,Winter,Ice Hockey,Bronze
41,M,28.0,175.0,64.0,Summer,Gymnastics,Bronze
42,M,28.0,175.0,64.0,Summer,Gymnastics,Gold
44,M,28.0,175.0,64.0,Summer,Gymnastics,Gold
48,M,28.0,175.0,64.0,Summer,Gymnastics,Gold


In [104]:
# Report how many rows survive the missing-value filters above.
print("Number of rows:", len(athlete_df))

Number of rows: 30196


In [105]:
# Encode 'Sex' as a number: 'M' -> 1, 'F' -> 0.
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-encoded data would blank out the column.
sex_codes = {'M':1, 'F':0}
athlete_df['Sex'] = athlete_df['Sex'].map(sex_codes)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Sport,Medal
40,1,28.0,184.0,85.0,Winter,Ice Hockey,Bronze
41,1,28.0,175.0,64.0,Summer,Gymnastics,Bronze
42,1,28.0,175.0,64.0,Summer,Gymnastics,Gold
44,1,28.0,175.0,64.0,Summer,Gymnastics,Gold
48,1,28.0,175.0,64.0,Summer,Gymnastics,Gold


In [106]:
# Encode 'Season' as a number: 'Summer' -> 1, 'Winter' -> 0.
# As with any dict-based Series.map, values that are not keys become NaN, so
# this cell must only be run once on the raw string column.
season_codes = {'Summer':1, 'Winter':0}
athlete_df['Season'] = athlete_df['Season'].map(season_codes)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Sport,Medal
40,1,28.0,184.0,85.0,0,Ice Hockey,Bronze
41,1,28.0,175.0,64.0,1,Gymnastics,Bronze
42,1,28.0,175.0,64.0,1,Gymnastics,Gold
44,1,28.0,175.0,64.0,1,Gymnastics,Gold
48,1,28.0,175.0,64.0,1,Gymnastics,Gold


In [107]:
# Encode the 'Sport' column using one-hot encoding
# 'Sport' is replaced by one indicator column per sport present in the data,
# each named 'Sport_<sport name>'; all other columns are kept as they are.
# NOTE(review): the indicator dtype depends on the pandas version (the output
# below shows 0/1 integers; newer pandas releases default to booleans) - confirm
# if downstream code relies on the integer form.
athlete_df = pd.get_dummies(athlete_df, columns=['Sport'])
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Medal,Sport_Alpine Skiing,Sport_Archery,Sport_Art Competitions,Sport_Athletics,...,Sport_Table Tennis,Sport_Taekwondo,Sport_Tennis,Sport_Trampolining,Sport_Triathlon,Sport_Tug-Of-War,Sport_Volleyball,Sport_Water Polo,Sport_Weightlifting,Sport_Wrestling
40,1,28.0,184.0,85.0,0,Bronze,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,1,28.0,175.0,64.0,1,Bronze,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,1,28.0,175.0,64.0,1,Gold,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44,1,28.0,175.0,64.0,1,Gold,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48,1,28.0,175.0,64.0,1,Gold,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
# Print the names of the new columns
# Shows the full column index after one-hot encoding: the original numeric
# columns, 'Medal', and one 'Sport_*' indicator per sport.
print(athlete_df.columns)

Index(['Sex', 'Age', 'Height', 'Weight', 'Season', 'Medal',
       'Sport_Alpine Skiing', 'Sport_Archery', 'Sport_Art Competitions',
       'Sport_Athletics', 'Sport_Badminton', 'Sport_Baseball',
       'Sport_Basketball', 'Sport_Beach Volleyball', 'Sport_Biathlon',
       'Sport_Bobsleigh', 'Sport_Boxing', 'Sport_Canoeing',
       'Sport_Cross Country Skiing', 'Sport_Curling', 'Sport_Cycling',
       'Sport_Diving', 'Sport_Equestrianism', 'Sport_Fencing',
       'Sport_Figure Skating', 'Sport_Football', 'Sport_Freestyle Skiing',
       'Sport_Golf', 'Sport_Gymnastics', 'Sport_Handball', 'Sport_Hockey',
       'Sport_Ice Hockey', 'Sport_Judo', 'Sport_Lacrosse', 'Sport_Luge',
       'Sport_Modern Pentathlon', 'Sport_Nordic Combined',
       'Sport_Rhythmic Gymnastics', 'Sport_Rowing', 'Sport_Rugby',
       'Sport_Rugby Sevens', 'Sport_Sailing', 'Sport_Shooting',
       'Sport_Short Track Speed Skating', 'Sport_Skeleton',
       'Sport_Ski Jumping', 'Sport_Snowboarding', 'Sport_Softball'

In [109]:
# Print the index of a new sport column - 'Sport_Swimming' for example
# Index.get_loc returns the zero-based position of the label within
# athlete_df.columns (this position still counts the 'Medal' column).
index = athlete_df.columns.get_loc('Sport_Swimming')
print(index)

49


In [110]:
# Convert the 'Medal' column to numerical class labels:
#   Gold -> 3, Silver -> 2, Bronze -> 1.
# There is deliberately no "no medal" entry: rows with a missing medal were
# removed earlier, and a missing value is a float NaN rather than the string
# 'Nan', so such a key could never match anything. The target therefore has
# exactly three classes.
# Series.map yields NaN for any value that is not a key, so this cell must be
# run only once on the original string labels.
medals = {'Gold': 3, 'Silver': 2, 'Bronze': 1}
athlete_df['Medal'] = athlete_df['Medal'].map(medals)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Season,Medal,Sport_Alpine Skiing,Sport_Archery,Sport_Art Competitions,Sport_Athletics,...,Sport_Table Tennis,Sport_Taekwondo,Sport_Tennis,Sport_Trampolining,Sport_Triathlon,Sport_Tug-Of-War,Sport_Volleyball,Sport_Water Polo,Sport_Weightlifting,Sport_Wrestling
40,1,28.0,184.0,85.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,1,28.0,175.0,64.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,1,28.0,175.0,64.0,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44,1,28.0,175.0,64.0,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48,1,28.0,175.0,64.0,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [130]:
# Drop every row that still has a missing value in any column (the earlier
# filters only covered 'Medal', 'Height' and 'Weight'), then show the last rows.
athlete_df = athlete_df.dropna()
athlete_df.tail()

Unnamed: 0,Sex,Age,Height,Weight,Season,Medal,Sport_Alpine Skiing,Sport_Archery,Sport_Art Competitions,Sport_Athletics,...,Sport_Table Tennis,Sport_Taekwondo,Sport_Tennis,Sport_Trampolining,Sport_Triathlon,Sport_Tug-Of-War,Sport_Volleyball,Sport_Water Polo,Sport_Weightlifting,Sport_Wrestling
271078,0,25.0,168.0,80.0,1,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
271080,0,33.0,168.0,80.0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
271082,1,28.0,182.0,82.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271102,0,19.0,171.0,64.0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
271103,0,23.0,171.0,64.0,1,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Separate the prediction target from the inputs:
#   y - the encoded 'Medal' column
#   X - every other column of athlete_df
y = athlete_df['Medal']
X = athlete_df.drop(columns='Medal')

In [113]:
# Split the data into training and testing sets.
# Uses the X / y prepared in the previous cell instead of recomputing the same
# expressions from athlete_df, so there is a single definition of "features"
# and "target". 20% of the rows are held out for testing; random_state fixes
# the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [114]:
# Define the model
# A random forest with 100 trees; random_state is fixed so that repeated runs
# build the same forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [115]:
# Train the model
# Fits the forest on the training split only; the test split stays unseen
# until evaluation.
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [116]:
# Make predictions on the test set
# y_pred holds one predicted medal code (1, 2 or 3) per row of X_test.
y_pred = rf.predict(X_test)

In [117]:
# Evaluate the model
# accuracy_score is the fraction of test rows whose predicted medal code equals
# the true one; it is printed as a percentage with two decimals.
# NOTE(review): the target has three classes (Bronze/Silver/Gold), so the
# reported figure should be read against a roughly one-in-three baseline -
# confirm the class balance before drawing conclusions.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 36.89%


In [131]:
# Define the input data for a new athlete whose medal category is to be predicted.
#
# Only the values that describe this athlete are listed. Every feature not
# listed (all the other one-hot 'Sport_*' columns) is filled with 0, and the
# column set and order are taken from X_train, so the frame always matches what
# the model was fitted on, even if the set of sports in the data changes.
new_athlete_values = {
    'Sex': 1,       # encoded as in training: 1 = male, 0 = female (an int, not the string '1')
    'Age': 28,
    'Height': 182,
    'Weight': 82,
    # NOTE(review): 0 encodes Winter, yet the Athletics rows displayed earlier
    # in this notebook all have Season == 1 (Summer). Confirm whether this
    # should be 1; the value is left unchanged here.
    'Season': 0,
    'Sport_Athletics': 1,
}

# Fail loudly on a misspelled feature name instead of silently dropping it
# during the reindex below.
unknown_features = sorted(set(new_athlete_values) - set(X_train.columns))
if unknown_features:
    raise KeyError(f"Not a feature the model was trained on: {unknown_features}")

new_athlete_data = pd.DataFrame([new_athlete_values]).reindex(
    columns=X_train.columns, fill_value=0
)

In [133]:
# Use the trained model to predict the medal category for the new athlete
# The result is an array with one encoded label per input row, using the
# encoding applied to 'Medal' above: 3 = Gold, 2 = Silver, 1 = Bronze.
# The model was trained on medal winners only, so it always returns one of
# these three codes and cannot answer "no medal".
prediction = rf.predict(new_athlete_data)

# Print the predicted medal category
print(f"Medal prediction for new athlete: {prediction}")

Medal prediction for new athlete: [3]
