In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Read the cleaned athlete records from disk and preview the first rows.
# The path is relative to the notebook's working directory.
athlete_df = pd.read_csv(filepath_or_buffer='../Resources/clean_athletes.csv')
athlete_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Country_Name,Country_Code,Games,Year,Season,City,Sport,Event,Medal
0,0,22,Andreea Aanei,F,22.0,170.0,125.0,Romania,ROU,2016 Summer,2016,Summer,Rio de Janeiro,Weightlifting,Weightlifting Women's Super-Heavyweight,
1,1,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Individual All-Around,
2,2,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Floor Exercise,
3,3,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Parallel Bars,
4,4,51,Nstor Abad Sanjun,M,23.0,167.0,64.0,Spain,ESP,2016 Summer,2016,Summer,Rio de Janeiro,Gymnastics,Gymnastics Men's Horizontal Bar,


In [3]:
# Preprocess the data

In [4]:
# Drop the columns that are not used as model inputs: the leftover
# 'Unnamed: 0' column (presumably an index saved into the CSV), the athlete
# identifiers ('ID', 'Name') and the country-code / games / year / city /
# event descriptors.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
athlete_df = athlete_df.drop(
    columns=['ID', 'Unnamed: 0', 'Name', 'Country_Code', 'Games', 'Year', 'City', 'Event']
)
athlete_df.head()

  


Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Sport,Medal
0,F,22.0,170.0,125.0,Romania,Summer,Weightlifting,
1,M,23.0,167.0,64.0,Spain,Summer,Gymnastics,
2,M,23.0,167.0,64.0,Spain,Summer,Gymnastics,
3,M,23.0,167.0,64.0,Spain,Summer,Gymnastics,
4,M,23.0,167.0,64.0,Spain,Summer,Gymnastics,


In [5]:
# Keep only medal-winning entries: every row whose 'Medal' is missing is removed
# in place. From here on the frame contains no "no medal" rows, so anything
# trained on it can only distinguish between medal types.
athlete_df.dropna(axis='index', subset=['Medal'], inplace=True)
athlete_df.head(n=5)

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Sport,Medal
8,M,21.0,198.0,90.0,Italy,Summer,Rowing,Bronze
9,F,21.0,165.0,49.0,Azerbaijan,Summer,Taekwondo,Bronze
10,M,31.0,182.0,86.0,France,Summer,Handball,Silver
33,M,26.0,170.0,80.0,Iran,Summer,Wrestling,Bronze
58,M,24.0,161.0,62.0,Russia,Summer,Gymnastics,Silver


In [6]:
# Remove, in place, every row that has no recorded 'Height'
athlete_df.dropna(axis='index', subset=['Height'], inplace=True)
athlete_df.head(n=5)

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Sport,Medal
8,M,21.0,198.0,90.0,Italy,Summer,Rowing,Bronze
9,F,21.0,165.0,49.0,Azerbaijan,Summer,Taekwondo,Bronze
10,M,31.0,182.0,86.0,France,Summer,Handball,Silver
33,M,26.0,170.0,80.0,Iran,Summer,Wrestling,Bronze
58,M,24.0,161.0,62.0,Russia,Summer,Gymnastics,Silver


In [7]:
# Remove, in place, every row that has no recorded 'Weight'
athlete_df.dropna(axis='index', subset=['Weight'], inplace=True)
athlete_df.head(n=5)

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Sport,Medal
8,M,21.0,198.0,90.0,Italy,Summer,Rowing,Bronze
9,F,21.0,165.0,49.0,Azerbaijan,Summer,Taekwondo,Bronze
10,M,31.0,182.0,86.0,France,Summer,Handball,Silver
33,M,26.0,170.0,80.0,Iran,Summer,Wrestling,Bronze
58,M,24.0,161.0,62.0,Russia,Summer,Gymnastics,Silver


In [8]:
# Report how many rows remain in the frame at this point
print("Number of rows:", len(athlete_df))

Number of rows: 30196


In [9]:
# Encode 'Sex' as an integer flag: 'M' -> 1, 'F' -> 0.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would blank it
# out (and the blanket dropna further down would then delete every row).
# The dtype guard makes the cell safe to execute more than once.
sex_codes = {'M': 1, 'F': 0}
if not pd.api.types.is_numeric_dtype(athlete_df['Sex']):
    athlete_df['Sex'] = athlete_df['Sex'].map(sex_codes)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Sport,Medal
8,1,21.0,198.0,90.0,Italy,Summer,Rowing,Bronze
9,0,21.0,165.0,49.0,Azerbaijan,Summer,Taekwondo,Bronze
10,1,31.0,182.0,86.0,France,Summer,Handball,Silver
33,1,26.0,170.0,80.0,Iran,Summer,Wrestling,Bronze
58,1,24.0,161.0,62.0,Russia,Summer,Gymnastics,Silver


In [10]:
# Encode 'Season' as an integer flag: 'Summer' -> 1, 'Winter' -> 0.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would blank it
# out (and the blanket dropna further down would then delete every row).
# The dtype guard makes the cell safe to execute more than once.
season_codes = {'Summer': 1, 'Winter': 0}
if not pd.api.types.is_numeric_dtype(athlete_df['Season']):
    athlete_df['Season'] = athlete_df['Season'].map(season_codes)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Sport,Medal
8,1,21.0,198.0,90.0,Italy,1,Rowing,Bronze
9,0,21.0,165.0,49.0,Azerbaijan,1,Taekwondo,Bronze
10,1,31.0,182.0,86.0,France,1,Handball,Silver
33,1,26.0,170.0,80.0,Iran,1,Wrestling,Bronze
58,1,24.0,161.0,62.0,Russia,1,Gymnastics,Silver


In [11]:
# One-hot encode 'Sport': the column is replaced by one 'Sport_<name>'
# indicator column per distinct sport present in the data
athlete_df = pd.get_dummies(data=athlete_df, columns=['Sport'])
athlete_df.head(n=5)

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Medal,Sport_Alpine Skiing,Sport_Archery,Sport_Art Competitions,...,Sport_Table Tennis,Sport_Taekwondo,Sport_Tennis,Sport_Trampolining,Sport_Triathlon,Sport_Tug-Of-War,Sport_Volleyball,Sport_Water Polo,Sport_Weightlifting,Sport_Wrestling
8,1,21.0,198.0,90.0,Italy,1,Bronze,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,21.0,165.0,49.0,Azerbaijan,1,Bronze,0,0,0,...,0,1,0,0,0,0,0,0,0,0
10,1,31.0,182.0,86.0,France,1,Silver,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,1,26.0,170.0,80.0,Iran,1,Bronze,0,0,0,...,0,0,0,0,0,0,0,0,0,1
58,1,24.0,161.0,62.0,Russia,1,Silver,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Print the column labels of the encoded frame: the remaining original columns
# followed by the 'Sport_*' indicator columns created by the one-hot encoding
print(athlete_df.columns)

Index(['Sex', 'Age', 'Height', 'Weight', 'Country_Name', 'Season', 'Medal',
       'Sport_Alpine Skiing', 'Sport_Archery', 'Sport_Art Competitions',
       'Sport_Athletics', 'Sport_Badminton', 'Sport_Baseball',
       'Sport_Basketball', 'Sport_Beach Volleyball', 'Sport_Biathlon',
       'Sport_Bobsleigh', 'Sport_Boxing', 'Sport_Canoeing',
       'Sport_Cross Country Skiing', 'Sport_Curling', 'Sport_Cycling',
       'Sport_Diving', 'Sport_Equestrianism', 'Sport_Fencing',
       'Sport_Figure Skating', 'Sport_Football', 'Sport_Freestyle Skiing',
       'Sport_Golf', 'Sport_Gymnastics', 'Sport_Handball', 'Sport_Hockey',
       'Sport_Ice Hockey', 'Sport_Judo', 'Sport_Lacrosse', 'Sport_Luge',
       'Sport_Modern Pentathlon', 'Sport_Nordic Combined',
       'Sport_Rhythmic Gymnastics', 'Sport_Rowing', 'Sport_Rugby',
       'Sport_Rugby Sevens', 'Sport_Sailing', 'Sport_Shooting',
       'Sport_Short Track Speed Skating', 'Sport_Skeleton',
       'Sport_Ski Jumping', 'Sport_Snowboarding', 

In [13]:
# Look up the integer position of one indicator column ('Sport_Swimming' as an
# example) within athlete_df.columns and print it.
# Note: the position is counted in athlete_df, which at this point still
# contains 'Country_Name' and 'Medal'; it is therefore not the column's
# position in the feature matrix that is passed to the model later.
index = athlete_df.columns.get_loc('Sport_Swimming')
print(index)

50


In [14]:
# Encode 'Medal' as an ordinal target: Gold -> 3, Silver -> 2, Bronze -> 1.
# There is deliberately no code for "no medal": rows without a medal hold a
# real missing value (not the string 'Nan') and were already dropped earlier,
# so such a key could never match anything.
# Series.map turns unmapped values into NaN, so the dtype guard keeps a second
# execution of this cell from erasing the already-encoded column.
medals = {'Gold': 3, 'Silver': 2, 'Bronze': 1}
if not pd.api.types.is_numeric_dtype(athlete_df['Medal']):
    athlete_df['Medal'] = athlete_df['Medal'].map(medals)
athlete_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Medal,Sport_Alpine Skiing,Sport_Archery,Sport_Art Competitions,...,Sport_Table Tennis,Sport_Taekwondo,Sport_Tennis,Sport_Trampolining,Sport_Triathlon,Sport_Tug-Of-War,Sport_Volleyball,Sport_Water Polo,Sport_Weightlifting,Sport_Wrestling
8,1,21.0,198.0,90.0,Italy,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,21.0,165.0,49.0,Azerbaijan,1,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
10,1,31.0,182.0,86.0,France,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,1,26.0,170.0,80.0,Iran,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
58,1,24.0,161.0,62.0,Russia,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Discard, in place, every row that still has a missing value in any column,
# then preview the last rows of what is left
athlete_df.dropna(axis='index', how='any', inplace=True)
athlete_df.tail(n=5)

Unnamed: 0,Sex,Age,Height,Weight,Country_Name,Season,Medal,Sport_Alpine Skiing,Sport_Archery,Sport_Art Competitions,...,Sport_Table Tennis,Sport_Taekwondo,Sport_Tennis,Sport_Trampolining,Sport_Triathlon,Sport_Tug-Of-War,Sport_Volleyball,Sport_Water Polo,Sport_Weightlifting,Sport_Wrestling
270916,1,21.0,170.0,69.0,United States,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271040,1,26.0,159.0,70.0,Germany,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271042,1,26.0,159.0,70.0,Germany,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271045,1,26.0,159.0,70.0,Germany,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
271046,1,26.0,159.0,70.0,Germany,1,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Separate the model's feature matrix from its target.
# 'Country_Name' is still a raw string column, which the classifier cannot
# consume, and the train/test split excludes it as well; it is dropped here so
# that X is the actual, fully numeric feature matrix.
X = athlete_df.drop(columns=['Medal', 'Country_Name'])
# Target: the encoded medal (3 = Gold, 2 = Silver, 1 = Bronze)
y = athlete_df['Medal']

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. 'Country_Name' is excluded from the features
# (errors='ignore' simply tolerates the column not being present in X).
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns=['Country_Name'], errors='ignore'),
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Configure a random forest of 100 trees; the fixed seed makes training repeatable
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [19]:
# Fit the forest on the training portion of the data
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [20]:
# Predict the medal code for every held-out row
y_pred = rf.predict(X=X_test)

In [21]:
# Score the predictions: the share of held-out rows whose medal code was
# predicted exactly, reported as a percentage.
# NOTE(review): the target has three medal classes, so an accuracy close to
# one third would be roughly what guessing achieves if the classes are about
# equally frequent - worth checking against the class distribution.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 35.35%


In [22]:
# Describe a hypothetical athlete as a single-row frame to predict which medal
# type the model assigns to them. The columns are intended to match, in name
# and order, the features the model was trained on: the five base features
# followed by one 'Sport_*' indicator per sport.
#
# 'Sex' is the integer 1 (not the string '1') so the column has the same
# numeric type as the encoded training column ('M' -> 1).
#
# The training data contains medal winners only, so the model always answers
# with one of the three medal codes; it cannot predict "no medal".
#
# NOTE(review): 'Season' is 0, which is Winter in the encoding used for
# training, while the chosen sport is Athletics - confirm this combination is
# what was intended.
sport_columns = [
    'Sport_Alpine Skiing', 'Sport_Archery', 'Sport_Art Competitions',
    'Sport_Athletics', 'Sport_Badminton', 'Sport_Baseball',
    'Sport_Basketball', 'Sport_Beach Volleyball', 'Sport_Biathlon',
    'Sport_Bobsleigh', 'Sport_Boxing', 'Sport_Canoeing',
    'Sport_Cross Country Skiing', 'Sport_Curling', 'Sport_Cycling',
    'Sport_Diving', 'Sport_Equestrianism', 'Sport_Fencing',
    'Sport_Figure Skating', 'Sport_Football', 'Sport_Freestyle Skiing',
    'Sport_Golf', 'Sport_Gymnastics', 'Sport_Handball', 'Sport_Hockey',
    'Sport_Ice Hockey', 'Sport_Judo', 'Sport_Lacrosse', 'Sport_Luge',
    'Sport_Modern Pentathlon', 'Sport_Nordic Combined',
    'Sport_Rhythmic Gymnastics', 'Sport_Rowing', 'Sport_Rugby',
    'Sport_Rugby Sevens', 'Sport_Sailing', 'Sport_Shooting',
    'Sport_Short Track Speed Skating', 'Sport_Skeleton',
    'Sport_Ski Jumping', 'Sport_Snowboarding', 'Sport_Softball',
    'Sport_Speed Skating', 'Sport_Swimming', 'Sport_Synchronized Swimming',
    'Sport_Table Tennis', 'Sport_Taekwondo', 'Sport_Tennis',
    'Sport_Trampolining', 'Sport_Triathlon', 'Sport_Tug-Of-War',
    'Sport_Volleyball', 'Sport_Water Polo', 'Sport_Weightlifting',
    'Sport_Wrestling',
]
# The one sport this athlete competes in; every other indicator stays 0
athlete_sport = 'Sport_Athletics'
new_athlete_data = pd.DataFrame({
    'Sex': [1], 'Age': [28], 'Height': [182], 'Weight': [82], 'Season': [0],
    # dict order is insertion order, so the indicators follow sport_columns
    **{sport: [int(sport == athlete_sport)] for sport in sport_columns},
})

In [23]:
# Ask the fitted forest for the new athlete's medal code
# (encoding used for the target: 3 = Gold, 2 = Silver, 1 = Bronze)
prediction = rf.predict(X=new_athlete_data)

# Report the predicted code (an array with one entry for the single input row)
print("Medal prediction for new athlete: {}".format(prediction))

Medal prediction for new athlete: [3]
