# Data exploration about the recent history of the Olympic Games


# 1. Importing the modules 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

import os
print(os.listdir("./input"))

ModuleNotFoundError: No module named 'seaborn'

# 2. Data Importing

In [None]:
data = pd.read_csv('./input/athlete_events.csv')
regions = pd.read_csv('./input/noc_regions.csv')

# 3. Collecting information about the two datasets

In [None]:
data.head(5)

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
regions.head(5)

# 4. Joining the dataframes

In [None]:
merged = pd.merge(data, regions, on='NOC', how='left')

In [None]:
merged.head()

# 5. Distribution of the age of gold medalists

In [None]:
goldMedals = merged[(merged.Medal == 'Gold')]
goldMedals.head()

In [None]:
goldMedals.isnull().any()

In [None]:
goldMedals = goldMedals[np.isfinite(goldMedals['Age'])]

In [None]:
goldMedals['ID'][goldMedals['Age'] > 50].count()

In [None]:
masterDisciplines = goldMedals['Sport'][goldMedals['Age'] > 50]

In [None]:
print(masterDisciplines)
plt.figure(figsize=(20, 10))
# Count gold medals per sport. The Series index is only the row label carried
# over from the merged table, so it must not be used as a plot axis.
sns.countplot(x=masterDisciplines)
plt.title('Gold Medals for Athletes Over 50')
# tight_layout only has an effect once the axes exist, so it is called last.
plt.tight_layout()

# 6. Women in Athletics

In [None]:
womenInOlympics = merged[(merged.Sex == 'F') & (merged.Season == 'Summer')]

In [None]:
womenInOlympics.head(10)

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
# womenInOlympics is filtered by Sex and Season only (not by Medal), so the
# bars count every entry per edition, i.e. participation rather than medals.
sns.countplot(x='Year', data=womenInOlympics)
plt.title('Women participation per edition of the Games')

In [None]:
womenInOlympics.loc[womenInOlympics['Year'] == 1900].head(10)

In [None]:
womenInOlympics['ID'].loc[womenInOlympics['Year'] == 1900].count()

# 7. Medals per country

In [None]:
goldMedals.region.value_counts().reset_index(name='Medal').head(5)

In [None]:
# Top five regions by number of gold medals.
# Naming the axis before reset_index pins the label column to "index" on every
# pandas version; pandas >= 2.0 would otherwise name it "region" and the
# x="index" lookup below would fail.
totalGoldMedals = (
    goldMedals.region.value_counts()
    .head(5)
    .rename_axis('index')
    .reset_index(name='Medal')
)
g = sns.catplot(x="index", y="Medal", data=totalGoldMedals,
                height=6, kind="bar", palette="muted")
g.despine(left=True)
g.set_xlabels("Top 5 countries")
g.set_ylabels("Number of Medals")
plt.title('Medals per Country')

# 8. Disciplines with the greatest number of Gold Medals for India

In [None]:
goldMedalsIND = goldMedals.loc[goldMedals['NOC'] == 'IND']

In [None]:
goldMedalsIND.Event.value_counts().reset_index(name='Medal').head(20)

In [None]:
goldMedalsIND['ID'].count()

# 9. What is the median height/weight of an Olympic medalist? 

In [None]:
goldMedals.head()

In [None]:
notNullMedals = goldMedals[(goldMedals['Height'].notnull()) & (goldMedals['Weight'].notnull())]

In [None]:
notNullMedals.head()

In [None]:
notNullMedals.info()

In [None]:
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(x="Height", y="Weight", data=notNullMedals)
plt.title('Height vs Weight of Olympic Medalists')

In [None]:
notNullMedals.loc[notNullMedals['Weight'] > 160]

# 10. Evolution of the Olympics over time

We will now create two dataframes by splitting the dataset on Sex, keeping only the Summer Games (we only want to review the summer editions).

In [None]:
MenOverTime = merged[(merged.Sex == 'M') & (merged.Season == 'Summer')]
WomenOverTime = merged[(merged.Sex == 'F') & (merged.Season == 'Summer')]

In [None]:
MenOverTime.head()

In [None]:
part = MenOverTime.groupby('Year')['Sex'].value_counts()
plt.figure(figsize=(20, 10))
part.loc[:,'M'].plot()
plt.title('Variation of Male Athletes over time')

In [None]:
part = WomenOverTime.groupby('Year')['Sex'].value_counts()
plt.figure(figsize=(20, 10))
part.loc[:,'F'].plot()
plt.title('Variation of Female Athletes over time')

In [None]:
plt.figure(figsize=(20, 10))
sns.boxplot(x='Year', y='Age', data=MenOverTime)
plt.title('Variation of Age for Male Athletes over time')

In [None]:
MenOverTime.loc[MenOverTime['Age'] > 80].head(10)

In [None]:
plt.figure(figsize=(20, 10))
sns.boxplot(x='Year', y='Age', data=WomenOverTime)
plt.title('Variation of Age for Female Athletes over time')

In [None]:
WomenOverTime.loc[WomenOverTime['Year'] == 1904]

In [None]:
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Weight', data=MenOverTime)
plt.title('Variation of Weight for Male Athletes over time')

In [None]:
import warnings
warnings.filterwarnings('ignore')
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Weight', data=WomenOverTime)
plt.title('Variation of Weight for Female Athletes over time')

In [None]:
womenInOlympics.loc[womenInOlympics['Year'] < 1924].head(20)

***10.4 Variation of height over time***

In [None]:
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Height', data=MenOverTime, palette='Set2')
plt.title('Variation of Height for Male Athletes over time')

In [None]:
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Height', data=WomenOverTime, palette='Set2')
plt.title('Variation of Height for Female Athletes over time')

In [None]:
WomenOverTime.loc[(WomenOverTime['Year'] > 1924) & (WomenOverTime['Year'] < 1952)].head(10)

In [None]:
MenOverTime.head(5)

In [None]:
inMenOverTime = MenOverTime.loc[MenOverTime['region'] == 'India']

In [None]:
inMenOverTime.head(5)

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
# countplot tallies the rows per Year, so this shows how many entries Indian
# male athletes had at each edition; Age is not plotted here.
sns.countplot(x='Year', data=inMenOverTime, palette='Set2')
plt.title('Variation of Indian Male Athletes over time')

In [None]:
inWomenOverTime = WomenOverTime.loc[WomenOverTime['region'] == 'India']

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(20, 10))
# countplot tallies the rows per Year, so this shows how many entries Indian
# female athletes had at each edition; Age is not plotted here.
sns.countplot(x='Year', data=inWomenOverTime, palette='Set2')
plt.title('Variation of Indian Female Athletes over time')

In [None]:
import pygwalker as pyg
pyg.walk(merged[(merged.Year > 2000)])

# 11. Model Building

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Take an explicit copy so the assignment below writes to an independent
# frame instead of a slice of `merged`.
data = merged[['Sex', 'Age', 'Height', 'Weight', 'Medal']].copy()
# Binary target: 1 when a medal is recorded, 0 when the Medal field is missing.
data['Medal'] = data['Medal'].notna().astype(int)


def binary_encoder(data, columns, positive_values):
    """Return a copy of ``data`` with the given columns encoded as 0/1 integers.

    Args:
        data: Source DataFrame; it is not modified.
        columns: Names of the columns to encode.
        positive_values: For each entry of ``columns``, in the same order, the
            value that maps to 1. Every other value (missing ones included)
            maps to 0.

    Returns:
        A new DataFrame with the listed columns replaced by 0/1 integers.

    Raises:
        ValueError: If ``columns`` and ``positive_values`` differ in length.
    """
    columns = list(columns)
    positive_values = list(positive_values)
    # zip() would silently drop the unmatched tail, leaving columns unencoded.
    if len(columns) != len(positive_values):
        raise ValueError(
            "columns and positive_values must have the same length "
            f"(got {len(columns)} and {len(positive_values)})"
        )

    df = data.copy()
    for col, positive_value in zip(columns, positive_values):
        # Vectorised comparison; a missing value compares unequal and becomes 0.
        df[col] = (df[col] == positive_value).astype(int)
    return df


data = binary_encoder(
    data,
    columns=['Sex'],
    positive_values=['M']
)


In [None]:
def fill_missing_values(data, missing_values_columns):
    """Return a copy of ``data`` with gaps in the given columns filled by group means.

    A missing entry is replaced by the mean of the rows that share its
    ('Medal', 'Sex') pair -- for example, a female medalist without a recorded
    height receives the mean height of the female medalists. When a whole
    group has no observed value, the overall column mean is used instead.
    Each processed column is finally cast to ``int``, which truncates any
    fractional part.

    Args:
        data: DataFrame holding 'Medal', 'Sex' and every listed column.
        missing_values_columns: Names of the numeric columns to fill.

    Returns:
        A new DataFrame; ``data`` itself is not modified.

    Raises:
        ValueError: If a listed column contains no observed value at all.
    """
    df = data.copy()
    for col in missing_values_columns:
        # transform('mean') yields one mean per row, aligned with the original
        # index, so it can be fed straight into fillna.
        group_means = data.groupby(['Medal', 'Sex'])[col].transform('mean')
        filled = df[col].fillna(group_means)
        if filled.isna().any():
            # Some group had nothing to average: fall back to the column mean.
            filled = filled.fillna(data[col].mean())
        if filled.isna().any():
            raise ValueError(f"column {col!r} has no values to impute from")
        df[col] = filled.astype(int)

    return df

# lets use this function to fill the missing values
data = fill_missing_values(data,['Age','Height','Weight'])
print("Total missing values:", data.isna().sum().sum())


In [None]:

X = data.drop('Medal', axis=1)
y = data['Medal']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2022)


In [None]:
sc = StandardScaler()
numerical_columns = ['Age', 'Height', 'Weight']
X_train[numerical_columns] = sc.fit_transform(
    X_train[numerical_columns])
X_test[numerical_columns] = sc.transform(
    X_test[numerical_columns])


In [None]:
# Seed the forest so the fitted model and the reported metrics are
# reproducible, in line with the seeded train/test split.
rf = RandomForestClassifier(random_state=2022)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)


In [None]:
print("Classification Report:\n\n", classification_report(y_test, y_pred))


In [None]:
def predict(Sex, Age, Height, Weight, threshold=0.2):
    """Tell whether an athlete with the given attributes is predicted to medal.

    Uses the module-level fitted scaler ``sc`` and classifier ``rf``.

    Args:
        Sex: 'female' is encoded as 0; any other value is encoded as 1.
        Age: Numeric age, scaled with ``sc`` before prediction.
        Height: Numeric height, scaled with ``sc`` before prediction.
        Weight: Numeric weight, scaled with ``sc`` before prediction.
        threshold: Predicted probability of the positive class above which
            the athlete is reported as a medal winner. Defaults to 0.2.

    Returns:
        'Wins Medal' or 'Does not win medal'.
    """
    sex_bin = 0 if Sex == 'female' else 1
    # Pass named columns in the same order used when fitting the scaler and
    # the classifier, so scikit-learn can match features by name.
    numeric = pd.DataFrame([[Age, Height, Weight]],
                           columns=['Age', 'Height', 'Weight'])
    scaled = sc.transform(numeric)
    features = pd.DataFrame([[sex_bin] + scaled[0].tolist()],
                            columns=['Sex', 'Age', 'Height', 'Weight'])
    prediction = rf.predict_proba(features)
    return 'Wins Medal' if prediction[0][1] > threshold else 'Does not win medal'


In [None]:
import gradio as gr
gr.Interface(fn=predict,inputs=[gr.Radio(["male","female"]),gr.Number(),gr.Number(),gr.Number()],outputs=[gr.Text()]).launch()

# 12. Conclusions