# Fantastic Stats and Where to Find Them: Unleashing Machine Learning on Sports

Postgraduate Program in Statistical Systems, with a specialization in Central Banks' Statistics

Predictive Data Mining - Final Project

2023/2024


##### Import used packages and set environment

In [None]:
#Import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import graphviz
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

#from google.colab import files
#Set environment
%matplotlib inline
pd.set_option('display.max_columns',None)

#Set directory
#os.chdir(r'C:\Users\Documents\2nd semester\Predictive Data Mining\Datasets')
os.chdir(r'C:\Users\danny\Documents\Pós Graduação\2nd semester\Predictive Data Mining\Datasets')

##### Define Functions

In [None]:
def _column_kind(df, name, data_types):
    """Classify a column as 'dummy' (exactly two distinct non-null values), 'int' or 'str'."""
    if df[name].dropna().nunique() == 2:
        return 'dummy'
    return 'int' if data_types[name] == int else 'str'


def _plot_spec(df, col, col2, col_type, col2_type):
    """Return (graph, var1, var2) for plotting ``col`` against ``col2``.

    graph is 'boxplot' as soon as one of the two columns is 'int', otherwise
    'histplot'. var1 goes on the x axis; var2 is the boxplot y axis or the
    histplot hue.
    """
    if col_type == 'int' and col2_type == 'int':
        #int vs int: the column with fewer distinct values goes on x (tie -> col2)
        graph = 'boxplot'
        col_on_x = len(df[col].unique()) < len(df[col2].unique())
    elif 'int' in (col_type, col2_type):
        #dummy/str vs int: the non-int column goes on x, the int on y
        graph = 'boxplot'
        col_on_x = col_type != 'int'
    elif col_type == 'str' and col2_type == 'str':
        #str vs str: the column with more distinct values goes on x (tie -> col2)
        graph = 'histplot'
        col_on_x = len(df[col].unique()) > len(df[col2].unique())
    elif col_type == 'dummy' and col2_type == 'dummy':
        #dummy vs dummy: keep the analysed column on x
        graph = 'histplot'
        col_on_x = True
    else:
        #dummy vs str: the str goes on x, the dummy becomes the hue
        graph = 'histplot'
        col_on_x = col_type == 'str'
    return (graph, col, col2) if col_on_x else (graph, col2, col)


def gen_graphs(df, col, data_types):
    """Plot ``col`` on its own and against every other column of ``data_types``.

    One figure is created with a single column of subplots: the first one holds
    a 20-bin histogram of ``col``; subplot ``i + 1`` holds the plot of ``col``
    against the i-th key of ``data_types`` (the slots of ``col`` itself,
    'Athlete Id' and 'RecordID' are left empty). Rows with a missing value in
    either plotted column are dropped for that subplot.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    col : name of the column being analysed.
    data_types : dict mapping column name -> expected Python type (int or str).

    Returns
    -------
    None. The figure is drawn through matplotlib / seaborn.
    """
    columns = list(data_types)

    #One row for the histogram plus one per column. The height keeps the original
    #15x150 canvas for the 30-column dataset and scales with the number of rows.
    n_rows = len(columns) + 1
    fig, axes = plt.subplots(n_rows, 1, figsize=(15, 150 * n_rows / 31), sharey=False)
    fig.suptitle('')

    #start with histogram (best effort: some column types cannot be binned)
    try:
        df[col].dropna().hist(ax=axes[0], bins=20)
    except Exception:
        pass

    #The kind of the analysed column does not change while looping
    col_type = _column_kind(df, col, data_types)

    #plot against all variables
    for i, col2 in enumerate(columns):
        if col2 in (col, 'Athlete Id', 'RecordID'):
            continue

        col2_type = _column_kind(df, col2, data_types)
        graph, var1, var2 = _plot_spec(df, col, col2, col_type, col2_type)

        ax = axes[i + 1]
        data = df.dropna(subset=[var1, var2])
        if graph == 'boxplot':
            sns.boxplot(ax=ax, x=var1, y=var2, data=data)
        else:
            sns.histplot(ax=ax, x=var1, hue=var2, multiple='stack', data=data)

        ax.set_title(col2)

##### Import and describe the Raw data

In [None]:
#Import raw DF - Train
df_raw=pd.read_csv(r'train.csv')

#Import Test DF
df_test=pd.read_csv(r'test.csv')

In [None]:
#Brief view of DF
df_raw.head()

In [None]:
#Shape of DF
df_raw.shape

In [None]:
#Explore the columns of DF
df_raw.info()

In [None]:
#Explore the values of columns
df_raw.describe()

In [None]:
df_raw.describe(include = ['O'])

In [None]:
{i: list(df_raw[i].unique()) for i in df_raw.describe(include = ['O']).columns}

### Metadata
After analysing the data types and field lists, together with a brief glimpse of the data, we came up with the following metadata:



| Variable | Description | Type | Field List | Comments |
| --- | --- | --- | --- | --- |
| Athlete Id | ID | INT | [0, ∞[ |  |
| Age group | Athlete age range | STRING | ['0-35', '35-55', '55<='] |  |
| Athlete score | Athlete score from previous competitions | INT | [0, ∞[ |  |
| Cancelled enrollment | Athlete cancelled the competition enrollment | BOOLEAN | [True, False] |  |
| Cardiovascular training | Number of training sessions such as running, cycling, or swimming | INT | [0, ∞[ |  |
| Competition | Type of competition | STRING | ['Federation League', 'Regional Tournament', 'Olympic Games', 'National Cup', 'Local Match', 'Continental Championship', 'World Championship'] |  |
| Disability | Athlete with disability | BOOLEAN | [True, False] |  |
| Edition | The year of the edition competition | STRING |['2019', '2020', '2021', '2022'] |  |
| Education | Athlete education level | STRING | ['High school', 'University Degree', 'Middle school', 'Elementary school', 'Post Graduate'] |  |
| Income | Athlete income level | STRING | ['Middle', 'Low', 'Middle-High', 'High', 'Middle-Low'] |  |
| Late enrollment | Athlete enrolled in the competition belatedly | BOOLEAN | [True, False] |  |
| Mental preparation | Athlete has developed strategies for handling with stress and pressure | BOOLEAN | [True, False] |  |
| No coach | Athlete does not have a coach | BOOLEAN | [True, False] |  |
| Other training | Number of training sessions using non-standard approaches | INT | [0, ∞[ |  |
| Outcome | Competition result | INT | [0, 1] |  |
| Outdoor Workout | Training conducted outdoors in parks or forests | BOOLEAN | [True, False] |  |
| Past injuries | Athlete had sport injuries | BOOLEAN | [True, False] |  |
| Physiotherapy | Number of physiotherapy sessions | INT | [0, ∞[ |  |
| Plyometric training | Number of training sessions involving explosive, high-intensity movements | INT | [0, ∞[ |  |
| Previous attempts | Number of previous competitions attempts | INT | [0, 6] |  |
| RecordID | ID of the registration of one athlete into an edition of a given competition | INT | [0, ∞[ |  |
| Recovery | Number of recovery sessions using stretching and massages techniques | INT | [0, ∞[ |  |
| Region | Athlete region | STRING | ['Southern Africa', 'Oceania', 'Middle East', 'Southern Europe', 'Western Europe', 'East Asia', 'North America', 'Central Asia', 'Northern Africa', 'South Asia', 'South America', 'Central America', 'Eastern Europe'] |  |
| Sand training | Number of training sessions involving sand drills | INT | [0, ∞[ |  |
| Sex | Athlete sex | STRING | ['M', 'F'] |  |
| Sport-specific training | Number of training sessions that mimic competition scenarios | INT | [0, ∞[ |  |
| Squad training | Number of training sessions that involve a group of athletes working together to prepare for competition | INT | [0, ∞[ |  |
| Strength training | Number of training sessions using weightlifting and bodyweight exercises | INT | [0, ∞[ |  |
| Supplements | Number of nutritional supplements taken to aid performance | INT | [0, ∞[ |  |
| Train bf competition | Number of pre-competition preparation sessions | INT | [0, ∞[ |  |

First observations on data quality
- Missing athlete ID
- Cells with NaN's
- Cells with values outside the field list
    - negative values for field lists [0, ∞[ (Physiotherapy, athlete score)
    - typos (e.g. 'FASE')
    - wrong data type (e.g. 'FALSE' instead of False)
- Need to convert Boolean variables to [0, 1]
- Need to convert Categorical variables into dummies.


### **Data Cleaning - Train Dataset**

Based on the first observation we will first solve the following issues:
- Age group cannot be '0'. Set it to NaN so that it can be imputed later.
- Correct detected typos: FASE->FALSE in Mental preparation.
- Correct wrong data types: STRING->BOOLEAN in Mental preparation (FALSE -> False and TRUE -> True).
- Convert boolean to [0, 1].
- Convert negative values to absolute value for variables with field list [0, ∞[ (assume it is a recording error)
- Edition was stored as a float. Since the variable represents a year, it does not make sense for it to have decimal places.
- Enforce expected data types in columns.

Later we will proceed with cleaning variable by variable

In [None]:
#Set age group 0 to 'nan'
df_raw.loc[df_raw['Age group']=='0','Age group']=np.nan

In [None]:
#Correct detected typos: FASE->FALSE in Mental preparation
df_raw.replace({'Mental preparation': {'FASE': 'FALSE'}}, inplace=True)

In [None]:
#Correct wrong data types: STRING->BOOLEAN in Mental preparation
df_raw.replace({'Mental preparation': {'FALSE': False}}, inplace=True)
df_raw.replace({'Mental preparation': {'TRUE': True}}, inplace=True)

In [None]:
#Convert boolean to [0, 1]
for col in ['Cancelled enrollment','Disability','Late enrollment','Mental preparation','No coach','Outdoor Workout','Past injuries']:
    df_raw.replace({col: {False: 0}}, inplace=True)
    df_raw.replace({col: {True: 1}}, inplace=True)

In [None]:
#Convert negative values to absolute value for variables with field list [0, ∞[ (assume it is a recording error)
for col in ['Athlete score','Physiotherapy']:
    df_raw[col] = df_raw[col].abs()

In [None]:
#Delete the decimals from the year
df_raw['Edition']=df_raw['Edition'].astype('Int64',errors='ignore')

In [None]:
#Enforce expected data types in columns
data_types = {'Athlete Id': int,'Age group': str,'Athlete score': int,'Cancelled enrollment': int,'Cardiovascular training': int,'Competition': str,'Disability': int,'Edition': str,'Education': str,'Income': str,'Late enrollment': int,'Mental preparation': int,'No coach': int,'Other training': int,'Outdoor Workout': int,'Past injuries': int,'Physiotherapy': int,'Plyometric training': int,'Previous attempts': int,'RecordID': int,'Recovery': int,'Region': str,'Sand training': int,'Sex': str,'Sport-specific training': int,'Squad training': int,'Strength training': int,'Supplements': int,'Train bf competition': int, 'Outcome': int}
#errors='ignore' leaves a column untouched when the cast is not possible
#(e.g. an int column that still holds missing values).
for col, expected_type in data_types.items():
    df_raw[col] = df_raw[col].astype(expected_type, errors='ignore')
    if expected_type is str:
        #Casting to str turns missing values into the literal strings 'nan' / '<NA>';
        #turn them back into real missing values.
        df_raw.loc[df_raw[col].isin(['nan', '<NA>']), col] = np.nan
    #Nothing to do for int columns: the cells that are missing are already NaN.

In [None]:
#Correlation of the numeric variables
#numeric_only=True is required since pandas 2.0: without it, corr() raises on the
#string columns (Age group, Competition, Region, ...) that are still in the frame.
corr = df_raw.corr(numeric_only=True)
corr.style.background_gradient(cmap='coolwarm')

Variable 1: Age group

In [None]:
col='Age group'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Physiotherapy','Recovery','Sport-specific training', 'Squad training', 'Supplements', 'Train bf competition','Other training','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 800)


In [None]:
#For athletes that have more than one observation, and one of them has a non-null Age group, it is possible to infer the Age group for the null observations.
#Lookup table: Athlete Id -> first non-null Age group recorded for that athlete (rows without an Athlete Id are ignored).
known_by_athlete = df_raw.dropna(subset=['Athlete Id', col]).drop_duplicates('Athlete Id').set_index('Athlete Id')[col]
#Only the missing cells are filled; athletes that are not in the lookup table stay missing.
df_raw[col] = df_raw[col].fillna(df_raw['Athlete Id'].map(known_by_athlete))

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is higher than 40, the Age group is likely to be 0-35
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]='0-35'

In [None]:
#From the graphs, it is possible to infere that when the Education level is elementary scholl, the Age group is likely to be 0-35
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Education']=='Elementary school'),col]='0-35'

In [None]:
#Every rule below only fills rows whose Age group is still missing, so when several rules
#match the same row the first one in this cell wins. Do not reorder.
#From the graphs, it is possible to infer that when the Physiotherapy is >=40, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Physiotherapy']>=40),col]='55<='
#From the graphs, it is possible to infer that when the Recovery is >=500, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']>=500),col]='55<='
#From the graphs, it is possible to infer that when the Supplements is >=250, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=250),col]='55<='
#From the graphs, it is possible to infer that when the Train bf competition is >=450, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=450),col]='55<='
#From the graphs, it is possible to infer that when the Strength training is <=75, the Age group is likely to be 0-35
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<=75),col]='0-35'
#From the graphs, it is possible to infer that when the Strength training is >=750, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=750),col]='55<='
#From the graphs, it is possible to infer that when the Sport-specific training is >=50, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Sport-specific training']>=50),col]='55<='
#From the graphs, it is possible to infer that when the Squad training is >=5, the Age group is likely to be 55<=
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Squad training']>=5),col]='55<='

Variable 2:Competition

In [None]:
col='Competition'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Physiotherapy','Recovery','Sport-specific training', 'Supplements', 'Train bf competition','Other training','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 1100)

In [None]:
#Every rule below only fills rows whose Competition is still missing, so when several rules
#match the same row the first one in this cell wins.
#From the graphs, it is possible to infer that when there is late enrollment, competition is a Regional Tournament
#(the category is spelled 'Regional Tournament' in the data; 'Regional Match' would create a new, spurious category)
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Late enrollment']==1),col]='Regional Tournament'
#From the graphs, it is possible to infer that when there is Mental preparation, competition is a Local Match
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Mental preparation']==1),col]='Local Match'
#From the graphs, it is possible to infer that when there is Outdoor Workout, competition is a Local Match
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Outdoor Workout']==1),col]='Local Match'
#From the graphs, it is possible to infer that when there is Plyometric training, competition is a Local Match
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Plyometric training']>0),col]='Local Match'

In [None]:
#From the graphs, it is possible to infer that when the Physiotherapy is >=100 and the edition is not 2020, competition is a Continental Championship
#Edition holds strings at this point (it was cast to str when the data types were enforced),
#so it has to be compared with '2020': comparing with the integer 2020 is always True.
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Physiotherapy']>=100) & (df_raw['Edition']!='2020'),col]='Continental Championship'
#From the graphs, it is possible to infer that when the Strength training is >=1100, competition is a Local Match
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=1100),col]='Local Match'

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
pd.crosstab(df_raw['Edition'].fillna('Missing'),df_raw[col].fillna('Missing'),margins=True)

In [None]:
#For each edition, what is the most frequent competition? -> Apply that competition to the missing values.
competition_mapping = {'2019':'Local Match', '2020':'Local Match', '2021':'National Cup', '2022':'National Cup'}

# Replace missing values in 'competition' column based on 'edition' column
df_raw[col] = df_raw[col].fillna(df_raw['Edition'].map(competition_mapping))

pd.crosstab(df_raw['Edition'].fillna('Missing'),df_raw[col].fillna('Missing'),margins=True)

Variable 3:Education

In [None]:
col='Education'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Cardiovascular training','Physiotherapy','Recovery','Sport-specific training', 'Supplements', 'Train bf competition','Other training','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 650)

In [None]:
#For athletes that have more than one observation, and one of them has a non-null Education, it is possible to infer the Education level for the null observations.
#Lookup table: Athlete Id -> first non-null Education recorded for that athlete (rows without an Athlete Id are ignored).
known_by_athlete = df_raw.dropna(subset=['Athlete Id', col]).drop_duplicates('Athlete Id').set_index('Athlete Id')[col]
#Only the missing cells are filled; athletes that are not in the lookup table stay missing.
df_raw[col] = df_raw[col].fillna(df_raw['Athlete Id'].map(known_by_athlete))

In [None]:
#Every rule below only fills rows whose Education is still missing, so when several rules
#match the same row the first one in this cell wins. Do not reorder.
#From the graphs, it is possible to infer that when the Cardiovascular training is >=500, Education is likely Post Graduate
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']>=500), col]='Post Graduate'
#From the graphs, it is possible to infer that when the Supplements is >=250, Education is likely Post Graduate
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=250),col]='Post Graduate'
#From the graphs, it is possible to infer that when the Strength training is >=700, Education is likely University Degree
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=700),col]='University Degree'

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 4: Income

In [None]:
col='Income'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Physiotherapy','Recovery','Sport-specific training', 'Squad training', 'Supplements', 'Train bf competition','Other training','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 650)

In [None]:
#For athletes that have more than one observation, and one of them has a non-null Income, it is possible to infer the Income level for the null observations.
#Lookup table: Athlete Id -> first non-null Income recorded for that athlete (rows without an Athlete Id are ignored).
known_by_athlete = df_raw.dropna(subset=['Athlete Id', col]).drop_duplicates('Athlete Id').set_index('Athlete Id')[col]
#Only the missing cells are filled; athletes that are not in the lookup table stay missing.
df_raw[col] = df_raw[col].fillna(df_raw['Athlete Id'].map(known_by_athlete))

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is higher than 40, the Income is likely to be Low
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]='Low'

In [None]:
#From the graphs, it is possible to infer that when the Strength training is >=650, the Income level is likely to be High
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=650),col]='High'

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 5: Sex

In [None]:
col='Sex'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Physiotherapy','Recovery','Sport-specific training', 'Cardiovascular training', 'Supplements', 'Train bf competition','Other training','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 600)

In [None]:
#For athletes that have more than one observation, and one of them has a non-null Sex, it is possible to infer the Sex for the null observations.
#Lookup table: Athlete Id -> first non-null Sex recorded for that athlete (rows without an Athlete Id are ignored).
known_by_athlete = df_raw.dropna(subset=['Athlete Id', col]).drop_duplicates('Athlete Id').set_index('Athlete Id')[col]
#Only the missing cells are filled; athletes that are not in the lookup table stay missing.
df_raw[col] = df_raw[col].fillna(df_raw['Athlete Id'].map(known_by_athlete))

In [None]:
#Every rule below only fills rows whose Sex is still missing, so when several rules
#match the same row the first one in this cell wins. Do not reorder.
#From the graphs, it is possible to infer that when the Physiotherapy is >0, the Sex is likely to be M
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Physiotherapy']>0),col]='M'
#From the graphs, it is possible to infer that when the Cardiovascular training is >=150, the Sex is likely to be M
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']>=150),col]='M'
#From the graphs, it is possible to infer that when the Supplements is >=150, the Sex is likely to be M
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=150),col]='M'
#From the graphs, it is possible to infer that when the Train bf competition is >=350, the Sex is likely to be M
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=350),col]='M'
#From the graphs, it is possible to infer that when the Strength training is <=100, the Sex is likely to be F
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<=100),col]='F'
#From the graphs, it is possible to infer that when the Strength training is >=400, the Sex is likely to be M
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=400),col]='M'

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 6: Region

In [None]:
col='Region'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Recovery', 'Supplements', 'Train bf competition','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 700)

In [None]:
#For athletes that have more than one observation, and one of them has a non-null Region, it is possible to infer the Region for the null observations.
#Lookup table: Athlete Id -> first non-null Region recorded for that athlete (rows without an Athlete Id are ignored).
known_by_athlete = df_raw.dropna(subset=['Athlete Id', col]).drop_duplicates('Athlete Id').set_index('Athlete Id')[col]
#Only the missing cells are filled; athletes that are not in the lookup table stay missing.
df_raw[col] = df_raw[col].fillna(df_raw['Athlete Id'].map(known_by_athlete))

In [None]:
#From the graphs, it is possible to infer that when the athlete's Education is Elementary school, the Region is likely to be South America
#(the category is spelled 'Elementary school' in the data; 'Elementary School' never matches)
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Education']=='Elementary school'),col]='South America'

Variable 7: Cancelled enrollment

In [None]:
col='Cancelled enrollment'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Physiotherapy','Recovery','Sport-specific training','Cardiovascular training','Supplements', 'Train bf competition','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 300)

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is >=60, Cancelled enrollment is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=60),col]=1
#From the graphs, it is possible to infere that when the Outcome is 1, Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Outcome']==1),col]=0

In [None]:
#Every rule below only fills rows whose Cancelled enrollment is still missing, so when several
#rules match the same row the first one in this cell wins. Do not reorder.
#From the graphs, it is possible to infer that when the Physiotherapy is >=20, the Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Physiotherapy']>=20),col]=0
#From the graphs, it is possible to infer that when the Recovery is <=40, the Cancelled enrollment is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']<=40),col]=1
#From the graphs, it is possible to infer that when the Recovery is >=175, the Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']>=175),col]=0
#From the graphs, it is possible to infer that when the Cardiovascular training is >=125, the Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']>=125),col]=0
#From the graphs, it is possible to infer that when the Supplements is >=100, Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=100),col]=0
#From the graphs, it is possible to infer that when the Train bf competition is <=100, Cancelled enrollment is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']<=100),col]=1
#From the graphs, it is possible to infer that when the Train bf competition is >=175, Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=175),col]=0
#From the graphs, it is possible to infer that when the Strength training is <=70, Cancelled enrollment is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<=70),col]=1
#From the graphs, it is possible to infer that when the Strength training is >=250, Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=250),col]=0
#From the graphs, it is possible to infer that when the Sport-specific training is >=20, the Cancelled enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Sport-specific training']>=20),col]=0

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 8: Disability

In [None]:
col='Disability'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Supplements', 'Train bf competition','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 800)

In [None]:
#For athletes that have more than one observation, and one of them has a non-null Disability, it is possible to infer the Disability for the null observations.
#Lookup table: Athlete Id -> first non-null Disability recorded for that athlete (rows without an Athlete Id are ignored).
known_by_athlete = df_raw.dropna(subset=['Athlete Id', col]).drop_duplicates('Athlete Id').set_index('Athlete Id')[col]
#Only the missing cells are filled; athletes that are not in the lookup table stay missing.
df_raw[col] = df_raw[col].fillna(df_raw['Athlete Id'].map(known_by_athlete))

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is >=60, Disability is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=60),col]=1
#From the graphs, it is possible to infere that when the Competition is Olympic Games, Disability is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Competition']=='Olympic Games'),col]=0
#From the graphs, it is possible to infere that when the Late Enrollment is True, Disability is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Late enrollment']==1),col]=0
#From the graphs, it is possible to infere that when the Outdoor Workout is True, Disability is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Outdoor Workout']==1),col]=0

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 9: Late enrollment

In [None]:
col='Late enrollment'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#For the boxplots that appear to be relevant, we zoom in to take a closer look at the data.
#(named zoom_vars so that the builtin vars() is not shadowed)
zoom_vars = ['Physiotherapy','Recovery','Cardiovascular training', 'Supplements', 'Train bf competition','Strength training']
fig, axes = plt.subplots(len(zoom_vars), 1, figsize=(15, len(zoom_vars)*5), sharey=False)
#One subplot per variable, same y-range on all of them
for ax, zoom_var in zip(axes, zoom_vars):
    sns.boxplot(ax=ax, x=col, y=zoom_var, data=df_raw.dropna(subset=[col, zoom_var]))
    ax.set_ylim(0, 700)

In [None]:
#Every rule below only fills rows whose Late enrollment is still missing, so when several
#rules match the same row the first one in this cell wins.
#From the graphs, it is possible to infer that when the Athlete score is >=40, Late enrollment is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]=1
#From the graphs, it is possible to infer that when the Competition is Regional Tournament, Late enrollment is likely to be True
#(the category is spelled 'Regional Tournament' in the data; 'Regional Tournment' never matches)
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Competition']=='Regional Tournament'),col]=1
#From the graphs, it is possible to infer that when Previous attempts is >0, Late enrollment is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Previous attempts']>0),col]=1

In [None]:
#Every rule below only fills rows whose Late enrollment is still missing, so when several
#rules match the same row the first one in this cell wins. Do not reorder.
#From the graphs, it is possible to infer that when the Physiotherapy is >0, Late enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Physiotherapy']>0),col]=0
#From the graphs, it is possible to infer that when the Recovery is >=150, Late enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']>=150),col]=0
#From the graphs, it is possible to infer that when the Cardiovascular training is >=150, Late enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']>=150),col]=0
#From the graphs, it is possible to infer that when Supplements is >=100, Late enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=100),col]=0
#From the graphs, it is possible to infer that when Train bf competition is >=200, Late enrollment is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=200),col]=0

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 10: Mental preparation

In [None]:
col='Mental preparation'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Zoom in on the boxplots that looked relevant, to take a closer look at the data.
vars=['Recovery', 'Supplements', 'Train bf competition','Strength training']
n_plots = len(vars)
fig, axes = plt.subplots(n_plots, 1, figsize=(15, n_plots*5), sharey=False)
#One panel per variable, drawn only from the rows where both the variable and col are known
for ax, var in zip(axes, vars):
    both_known = df_raw[col].notna() & df_raw[var].notna()
    sns.boxplot(ax=ax, x=col, y=var, data=df_raw[both_known])
    #Clip the y-axis so the boxes are not flattened by extreme values
    ax.set_ylim(0,700)

In [None]:
#Impute missing Mental preparation from the patterns seen in the graphs.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#From the graphs, it is possible to infer that when the Athlete score is >=40, Mental preparation is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]=1
#From the graphs, it is possible to infer that when Cancelled enrollment is True, Mental preparation is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cancelled enrollment']==1),col]=0
#From the graphs, it is possible to infer that when Late enrollment is True, Mental preparation is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Late enrollment']==1),col]=0
#From the graphs, it is possible to infer that when the Other training is >0, Mental preparation is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Other training']>0),col]=1
#From the graphs, it is possible to infer that when Outdoor Workout is True, Mental preparation is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Outdoor Workout']==1),col]=0
#From the graphs, it is possible to infer that when the Plyometric training is >0, Mental preparation is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Plyometric training']>0),col]=1

In [None]:
#Second batch of Mental preparation rules, based on the zoomed boxplots.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#From the graphs, it is possible to infer that when the Recovery is >=350, Mental preparation is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']>=350),col]=1
#From the graphs, it is possible to infer that when the Supplements is >=200, Mental preparation is likely to be True
#NOTE(review): this comment used to say >=400 while the code uses 200 - confirm the intended threshold
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=200),col]=1
#From the graphs, it is possible to infer that when the Supplements is <=150, Mental preparation is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']<=150),col]=0
#From the graphs, it is possible to infer that when the Train bf competition is <=200, Mental preparation is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']<=200),col]=0
#From the graphs, it is possible to infer that when the Train bf competition is >=350, Mental preparation is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=350),col]=1
#From the graphs, it is possible to infer that when the Strength training is <=550, Mental preparation is likely to be False
#NOTE(review): the comments on the two Strength training rules used to say 500 while the code uses 550 - confirm the intended threshold
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<=550),col]=0
#From the graphs, it is possible to infer that when the Strength training is >550, Mental preparation is likely to be True
#Together with the previous rule this covers every row with a known Strength training value
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>550),col]=1

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 11: No coach

In [None]:
col='No coach'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Zoom in on the boxplots that looked relevant, to take a closer look at the data.
vars=['Recovery', 'Supplements', 'Train bf competition','Strength training']
n_plots = len(vars)
fig, axes = plt.subplots(n_plots, 1, figsize=(15, n_plots*5), sharey=False)
#One panel per variable, drawn only from the rows where both the variable and col are known
for ax, var in zip(axes, vars):
    both_known = df_raw[col].notna() & df_raw[var].notna()
    sns.boxplot(ax=ax, x=col, y=var, data=df_raw[both_known])
    #Clip the y-axis so the boxes are not flattened by extreme values
    ax.set_ylim(0,1500)

In [None]:
#Impute missing No coach from the patterns seen in the graphs.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#From the graphs, it is possible to infer that when the Athlete score is >=60, No coach is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=60),col]=1
#From the graphs, it is possible to infer that when the Athlete score is <60, No coach is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']<60),col]=0
#From the graphs, it is possible to infer that when the Other training is >0, No coach is likely to be True
#The two rules above already cover every row with a known Athlete score, so this one only reaches rows where Athlete score is null
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Other training']>0),col]=1

In [None]:
#Second batch of No coach rules, based on the zoomed boxplots.
#Every rule only touches rows that are still null. Each pair of rules below covers all known values of its variable,
#so a later pair only reaches the rows where every earlier variable is null.
#From the graphs, it is possible to infer that when the Recovery is >=500, No coach is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']>=500),col]=1
#From the graphs, it is possible to infer that when the Recovery is <500, No coach is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']<500),col]=0
#From the graphs, it is possible to infer that when the Supplements is >=350, No coach is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=350),col]=1
#From the graphs, it is possible to infer that when the Supplements is <350, No coach is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']<350),col]=0
#From the graphs, it is possible to infer that when the Train bf competition is <500, No coach is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']<500),col]=0
#From the graphs, it is possible to infer that when the Train bf competition is >=500, No coach is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=500),col]=1
#From the graphs, it is possible to infer that when the Strength training is <1500, No coach is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<1500),col]=0
#From the graphs, it is possible to infer that when the Strength training is >=1500, No coach is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=1500),col]=1

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 12: Outdoor Workout

In [None]:
col='Outdoor Workout'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Zoom in on the boxplots that looked relevant, to take a closer look at the data.
vars=['Recovery', 'Cardiovascular training','Supplements', 'Train bf competition','Strength training']
n_plots = len(vars)
fig, axes = plt.subplots(n_plots, 1, figsize=(15, n_plots*5), sharey=False)
#One panel per variable, drawn only from the rows where both the variable and col are known
for ax, var in zip(axes, vars):
    both_known = df_raw[col].notna() & df_raw[var].notna()
    sns.boxplot(ax=ax, x=col, y=var, data=df_raw[both_known])
    #Clip the y-axis so the boxes are not flattened by extreme values
    ax.set_ylim(0,900)

In [None]:
#Impute missing Outdoor Workout from the patterns seen in the graphs.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#From the graphs, it is possible to infer that when the Athlete score is >=40, Outdoor Workout is likely to be True
#NOTE(review): this comment used to say >=60 while the code uses 40 - confirm the intended threshold
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]=1
#From the graphs, it is possible to infer that when Late enrollment is True, Outdoor Workout is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Late enrollment']==1),col]=0
#From the graphs, it is possible to infer that when Mental preparation is True, Outdoor Workout is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Mental preparation']==1),col]=0
#From the graphs, it is possible to infer that when the Other training is >0, Outdoor Workout is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Other training']>0),col]=1
#From the graphs, it is possible to infer that when the Plyometric training is >0, Outdoor Workout is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Plyometric training']>0),col]=1

In [None]:
#Second batch of Outdoor Workout rules, based on the zoomed boxplots.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#From the graphs, it is possible to infer that when the Cardiovascular training is >=300, Outdoor Workout is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']>=300),col]=1
#When the Cardiovascular training is <150, Outdoor Workout is set to False
#NOTE(review): this comment used to describe a rule on Recovery <=150, but the code filters on Cardiovascular training <150 - confirm which column and threshold are intended
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']<150),col]=0
#From the graphs, it is possible to infer that when the Supplements is <=100, Outdoor Workout is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']<=100),col]=0
#From the graphs, it is possible to infer that when the Supplements is >=200, Outdoor Workout is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=200),col]=1
#From the graphs, it is possible to infer that when the Train bf competition is >=350, Outdoor Workout is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=350),col]=1
#From the graphs, it is possible to infer that when the Strength training is <=300, Outdoor Workout is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<=300),col]=0
#From the graphs, it is possible to infer that when the Strength training is >=550, Outdoor Workout is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=550),col]=1

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 13: Past injuries

In [None]:
col='Past injuries'

In [None]:
df_raw[col].describe(include = ['O'])

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Zoom in on the boxplots that looked relevant, to take a closer look at the data.
vars=['Physiotherapy','Recovery','Cardiovascular training', 'Supplements', 'Train bf competition','Strength training']
n_plots = len(vars)
fig, axes = plt.subplots(n_plots, 1, figsize=(15, n_plots*5), sharey=False)
#One panel per variable, drawn only from the rows where both the variable and col are known
for ax, var in zip(axes, vars):
    both_known = df_raw[col].notna() & df_raw[var].notna()
    sns.boxplot(ax=ax, x=col, y=var, data=df_raw[both_known])
    #Clip the y-axis so the boxes are not flattened by extreme values
    ax.set_ylim(0,900)

In [None]:
#For athletes that have more than one observation, and at least one of them is not null for past injuries,
#the known outcome is copied into that athlete's null observations.
#First known value per athlete, in row order (rows without an Athlete Id cannot be matched and are ignored)
has_value = df_raw[col].notnull() & df_raw['Athlete Id'].notnull()
first_known = df_raw[has_value].drop_duplicates(subset='Athlete Id').set_index('Athlete Id')[col]
#Fill every null row whose athlete has a known value in one vectorised step, instead of rescanning the frame once per athlete
to_fill = df_raw[col].isnull() & df_raw['Athlete Id'].isin(first_known.index)
df_raw.loc[to_fill, col] = df_raw.loc[to_fill, 'Athlete Id'].map(first_known)

In [None]:
#Impute missing Past injuries with rules derived from the graphs.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#NOTE(review): the comments here used to state different rules (Athlete score >=60 -> True, Plyometric training >=0 -> False).
#The comments below describe what the code does - confirm against the graphs which version is intended.
#When the Athlete score is >=40, Past injuries is set to False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]=0
#When the Plyometric training is >0, Past injuries is set to True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Plyometric training']>0),col]=1

In [None]:
#Second batch of Past injuries rules, based on the zoomed boxplots.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#From the graphs, it is possible to infer that when the Physiotherapy is >0, Past injuries is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Physiotherapy']>0),col]=1
#From the graphs, it is possible to infer that when the Recovery is >=300, Past injuries is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Recovery']>=300),col]=1
#From the graphs, it is possible to infer that when the Cardiovascular training is >=200, Past injuries is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Cardiovascular training']>=200),col]=1
#From the graphs, it is possible to infer that when the Supplements is >=150, Past injuries is likely to be True
#NOTE(review): this comment used to say >=200 while the code uses 150 - confirm the intended threshold
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=150),col]=1
#From the graphs, it is possible to infer that when the Train bf competition is >=250, Past injuries is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']>=250),col]=1
#From the graphs, it is possible to infer that when the Train bf competition is <=100, Past injuries is likely to be False
#NOTE(review): this comment used to say <=250 while the code uses 100 - confirm the intended threshold
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Train bf competition']<=100),col]=0
#From the graphs, it is possible to infer that when the Strength training is >=300, Past injuries is likely to be True
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']>=300),col]=1
#From the graphs, it is possible to infer that when the Strength training is <=150, Past injuries is likely to be False
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Strength training']<=150),col]=0

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 14: Edition

In [None]:
col='Edition'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Zoom in on the boxplots that looked relevant, to take a closer look at the data.
vars=['Recovery','Supplements', 'Train bf competition','Other training','Strength training']
n_plots = len(vars)
fig, axes = plt.subplots(n_plots, 1, figsize=(15, n_plots*5), sharey=False)
#One panel per variable, drawn only from the rows where both the variable and col are known
for ax, var in zip(axes, vars):
    both_known = df_raw[col].notna() & df_raw[var].notna()
    sns.boxplot(ax=ax, x=col, y=var, data=df_raw[both_known])
    #Clip the y-axis so the boxes are not flattened by extreme values
    ax.set_ylim(0,800)

In [None]:
#Impute missing Edition from the patterns seen in the graphs.
#Every rule only touches rows that are still null, so earlier rules take precedence over later ones.
#NOTE(review): the imputed editions are written as strings ('2020', '2022') - confirm this matches how the known Edition values are stored
#From the graphs, it is possible to infer that when the Athlete score is >=40, the Edition is likely to be 2020
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Athlete score']>=40),col]='2020'
#From the graphs, it is possible to infer that when Outdoor Workout is True, the Edition is likely to be 2022
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Outdoor Workout']==1),col]='2022'

In [None]:
#Second batch of Edition rules, based on the zoomed boxplots; only rows that are still null are touched.
#From the graphs, it is possible to infer that when the Supplements is >=200, the Edition is likely to be 2020
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Supplements']>=200),col]='2020'
#From the graphs, it is possible to infer that when the Other training is >=5, the Edition is likely to be 2020
df_raw.loc[(df_raw[col].isnull()) & (df_raw['Other training']>=5),col]='2020'

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 15: Previous Attempts

In [None]:
col='Previous attempts'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')
#The variable only has 6 possible values, so it does not make sense to remove outliers: that would leave the variable with a single unique value: 0.

In [None]:
gen_graphs(df_raw, col, data_types)

Variable 16: Athlete score

In [None]:
col='Athlete score'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Athlete score is higher if Age group=0-35
#Athlete score is higher if Cancelled enrollment=1
#Athlete score is higher if Disability=1
#Athlete score is higher if Income=Low
#Athlete score is higher if Late enrollment=1
#Athlete score is higher if Mental preparation=1
#Athlete score is higher if No coach=1
#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Cancelled enrollment', 'Disability', 'Income', 'Late enrollment', 'Mental preparation', 'No coach']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 17: Cardiovascular training

In [None]:
col = 'Cardiovascular training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Cardiovascular training is higher if Age group is higher
#Cardiovascular training is higher if Cancelled enrollment=0
#Cardiovascular training depends on Competition
#Cardiovascular training depends on Education
#Cardiovascular training is higher if Mental preparation=1
#Cardiovascular training is higher if Outdoor Workout=1
#Cardiovascular training is higher if Sex=M

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Cancelled enrollment', 'Competition', 'Education', 'Mental preparation', 'Outdoor Workout', 'Sex']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 18: Other training

In [None]:
col = 'Other training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Other training depends on Competition
#Other training depends on Edition
#Other training is higher if Mental preparation=1
#Other training is higher if Outdoor Workout=1

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Competition', 'Edition', 'Mental preparation', 'Outdoor Workout']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 19: Physiotherapy

In [None]:
col = 'Physiotherapy'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Physiotherapy is higher if Age group is higher
#Physiotherapy depends on Competition
#Physiotherapy depends on Edition
#Physiotherapy is higher if Past injuries=1
#Physiotherapy is higher the lower Previous attempts
#Physiotherapy is higher if Sex=M

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Competition', 'Edition', 'Past injuries', 'Previous attempts', 'Sex']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 20: Plyometric training

In [None]:
col = 'Plyometric training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Plyometric training depends on Competition
#Plyometric training is higher if Mental preparation=1
#Plyometric training is higher if Outdoor Workout=1
#Plyometric training is higher if Past injuries=1

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Competition', 'Mental preparation', 'Outdoor Workout', 'Past injuries']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 21: Recovery

In [None]:
col = 'Recovery'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Recovery depends on Competition
#Recovery is higher if Mental preparation=1
#Recovery is higher if Outdoor Workout=1
#Recovery is higher if Past injuries=1
#Recovery is higher if Previous attempts is lower

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Competition', 'Mental preparation', 'Outdoor Workout', 'Past injuries', 'Previous attempts']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 22: Sand training

In [None]:
col = 'Sand training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 23: Sport-specific training

In [None]:
col = 'Sport-specific training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Sport-specific training depends on Age group
#Sport-specific training depends on Competition
#Sport-specific training is higher when Mental preparation=1
#Sport-specific training is higher when No coach=1
#Sport-specific training is higher when Outdoor workout=1
#Sport-specific training is higher when Previous attempts is lower

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Competition', 'Mental preparation', 'No coach', 'Outdoor Workout', 'Previous attempts']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 24: Squad training

In [None]:
col = 'Squad training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Squad training depends on Age group
#Squad training depends on Competition
#Squad training depends on Edition
#Squad training is higher when Mental preparation = 1
#Squad training is higher when Outdoor Workout = 1
#Squad training is higher when Past injuries = 1

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Competition', 'Edition', 'Mental preparation', 'Outdoor Workout', 'Past injuries']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 25: Strength training

In [None]:
col = 'Strength training'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Strength training is higher with Age group
#Strength training is higher when Cancelled enrollment = 0
#Strength training depends on Competition
#Strength training is higher when Mental preparation = 1
#Strength training is higher when Outdoor Workout = 1
#Strength training is higher when Past injuries = 1
#Strength training is higher when Previous attempts = 1
#Strength training is higher when Sex = M

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Cancelled enrollment', 'Competition', 'Mental preparation', 'Outdoor Workout', 'Past injuries', 'Previous attempts', 'Sex']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 26: Supplements

In [None]:
col = 'Supplements'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Changing outliers for null
df_raw.loc[df_raw[col]>=Max,col]=np.nan
df_raw.loc[df_raw[col]<=Min,col]=np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Supplements is higher with Age group
#Supplements depends on Competition
#Supplements is higher when Mental preparation = 1
#Supplements is higher when No coach = 1
#Supplements is higher when Outdoor Workout = 1
#Supplements is higher when Past injuries = 1

#Fill the remaining nulls with the mean of the rows that share the same values of the drivers listed above.
cols_groupby = ['Age group', 'Competition', 'Mental preparation', 'No coach', 'Outdoor Workout', 'Past injuries']
#Select the target column before aggregating: .mean() over the whole frame raises TypeError in pandas>=2.0
#as soon as a column outside cols_groupby holds strings.
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()
#Left merge keeps every observation; the group mean arrives in a temporary '<col>_mean' column
df_raw = df_raw.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean (null key or all-null group) stay null
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns = [col+'_mean'], inplace = True)

In [None]:
len(df_raw[df_raw[col].isnull()])

Variable 27: Train bf competition

In [None]:
col = 'Train bf competition'

In [None]:
df_raw[col].describe()

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Flag outliers with the three-sigma rule: values at or beyond mean -+ 3 * std
col_mean = df_raw[col].mean()
three_std = 3*df_raw[col].std()
Max = col_mean+three_std
Min = col_mean-three_std
#Count the rows on each side of the band
max_len = int((df_raw[col]>=Max).sum())
min_len = int((df_raw[col]<=Min).sum())
print('There are '+str(max_len)+' values higher than '+str(Max)+' which is the mean+3*std.')
print('There are '+str(min_len)+' values lower than '+str(Min)+' which is the mean-3*std.')

In [None]:
gen_graphs(df_raw, col, data_types)

In [None]:
#Replace the values outside the 3-sigma bounds (both sides) with null
is_outlier = (df_raw[col] >= Max) | (df_raw[col] <= Min)
df_raw.loc[is_outlier, col] = np.nan

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Variables that the graphs show to be related to Train bf competition (higher when):
#  Cancelled enrollment = 0, Mental preparation = 1, No coach = 1,
#  Outdoor Workout = 1, Past injuries = 1, more Previous attempts
#Missing values are filled with the mean of the rows sharing the same combination of these variables.
cols_groupby = ['Cancelled enrollment','Mental preparation','No coach','Outdoor Workout','Past injuries','Previous attempts']

#Average only the target column: averaging every column fails on the text columns in pandas >= 2.0
df_means = df_raw.groupby(cols_groupby, as_index=False)[col].mean()

#Bring the group mean next to every row (rows whose combination has no group get a null mean)
df_raw = df_raw.merge(df_means, on=cols_groupby, how='left', suffixes=('', '_mean'))

#Fill only the missing values, then remove the helper column
df_raw[col] = df_raw[col].fillna(df_raw[col+'_mean'])
df_raw.drop(columns=[col+'_mean'], inplace=True)

In [None]:
len(df_raw[df_raw[col].isnull()])

In [None]:
#Share of rows holding at least one missing value, i.e. the most that dropna() could remove when modeling
a=(df_raw.shape[0]-len(df_raw.dropna()))/df_raw.shape[0]
print(f'The maximum percentage that may be dropped when modeling is {a:.2%}')

#Same share without Athlete Id: the identifier is not used as a feature, so its missing values should not count
df_raw2=df_raw.drop(columns='Athlete Id')
b=(df_raw2.shape[0]-len(df_raw2.dropna()))/df_raw2.shape[0]
print(f'The maximum percentage that may be dropped when modeling, not counting the missing values of Athlete Id, is {b:.2%}')

In [None]:
df_raw.info()

#### **Label**

In [None]:
# Encode the train dataset: every category label is replaced by an integer code.
# Each mapping numbers its labels 0, 1, 2, ... in the order they are listed.

# Competition
Competition_mapping = {label: code for code, label in enumerate([
    'Local Match',
    'Regional Match',
    'Regional Tournament',
    'National Cup',
    'Federation League',
    'Continental Championship',
    'Olympic Games',
    'World Championship',
])}

# Edition
# NOTE(review): the keys are integers; if Edition holds text such as '2019' at this point nothing is replaced - confirm the column type
Edition_mapping = {year: code for code, year in enumerate([2019, 2020, 2021, 2022])}

# Sex
Sex_mapping = {label: code for code, label in enumerate(['M', 'F'])}

# Region
Region_mapping = {label: code for code, label in enumerate([
    'Oceania',
    'Middle East',
    'Northern Africa',
    'Southern Africa',
    'Southern Europe',
    'Western Europe',
    'Eastern Europe',
    'East Asia',
    'Central Asia',
    'South Asia',
    'Central America',
    'North America',
    'South America',
])}

# Education
Education_mapping = {label: code for code, label in enumerate([
    'Elementary school',
    'Middle school',
    'High school',
    'University Degree',
    'Post Graduate',
])}

# Age
Age_mapping = {label: code for code, label in enumerate(['0-35', '35-55', '55<='])}

# Income
Income_mapping = {label: code for code, label in enumerate([
    'Low',
    'Middle-Low',
    'Middle',
    'Middle-High',
    'High',
])}

# Replace the values of each column using its mapping
for column_name, value_codes in (
    ('Competition', Competition_mapping),
    ('Edition', Edition_mapping),
    ('Sex', Sex_mapping),
    ('Region', Region_mapping),
    ('Education', Education_mapping),
    ('Age group', Age_mapping),
    ('Income', Income_mapping),
):
    df_raw[column_name] = df_raw[column_name].replace(value_codes)


### Data Cleaning - Test

Apply the same cleaning rules that were applied to the training database.

In [None]:
#Set age group 0 to 'nan'
df_test.loc[df_test['Age group']=='0','Age group']=np.nan

In [None]:
#Correct detected typos: FASE->FALSE in Mental preparation
df_test.replace({'Mental preparation': {'FASE': 'FALSE'}}, inplace=True)

In [None]:
#Correct wrong data types in Mental preparation: the texts 'FALSE' / 'TRUE' become real booleans
text_to_bool = {'FALSE': False, 'TRUE': True}
df_test.replace({'Mental preparation': text_to_bool}, inplace=True)

In [None]:
#Convert boolean to [0, 1]
#Each flag column is rewritten in place: False becomes 0 first, then True becomes 1
for col in ['Cancelled enrollment','Disability','Late enrollment','Mental preparation','No coach','Outdoor Workout','Past injuries']:
    df_test.replace({col: {False: 0}}, inplace=True)
    df_test.replace({col: {True: 1}}, inplace=True)

In [None]:
#Variables defined on [0, ∞[ : a negative value is assumed to be a recording error, so its absolute value is kept
non_negative_cols = ['Athlete score','Physiotherapy']
for col in non_negative_cols:
    df_test[col] = abs(df_test[col])

In [None]:
#Delete the decimals from the year
df_test['Edition']=df_test['Edition'].astype('Int64',errors='ignore')

In [None]:
#Expected type of every column (the key order is kept exactly as before)
data_types = {
    'Athlete Id': int,
    'Age group': str,
    'Athlete score': int,
    'Cancelled enrollment': int,
    'Cardiovascular training': int,
    'Competition': str,
    'Disability': int,
    'Edition': str,
    'Education': str,
    'Income': str,
    'Late enrollment': int,
    'Mental preparation': int,
    'No coach': int,
    'Other training': int,
    'Outdoor Workout': int,
    'Past injuries': int,
    'Physiotherapy': int,
    'Plyometric training': int,
    'Previous attempts': int,
    'RecordID': int,
    'Recovery': int,
    'Region': str,
    'Sand training': int,
    'Sex': str,
    'Sport-specific training': int,
    'Squad training': int,
    'Strength training': int,
    'Supplements': int,
    'Train bf competition': int,
}
for col, expected_type in data_types.items():
    #errors='ignore' leaves the column as it is when the cast is not possible
    df_test[col] = df_test[col].astype(expected_type, errors='ignore')
    if expected_type==str:
        #Casting to text writes missing values as 'nan' / '<NA>'; turn those texts back into real nulls
        df_test.loc[df_test[col].isin(['nan','<NA>']),col]=np.nan
    elif expected_type==int:
        #Rewrite whatever is flagged as missing as np.nan
        df_test.loc[df_test[col].isna(),col]=np.nan

Variable 1: Age group

In [None]:
col='Age group'

In [None]:
#Athletes with several records: when at least one record has a known Age group, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is higher than 40, the Age group is likely to be 0-35
df_test.loc[(df_test[col].isnull()) & (df_test['Athlete score']>=40),col]='0-35'

In [None]:
#From the graphs, it is possible to infere that when the Education level is elementary scholl, the Age group is likely to be 0-35
df_test.loc[(df_test[col].isnull()) & (df_test['Education']=='Elementary school'),col]='0-35'

In [None]:
#Rules inferred from the graphs. They are applied in order and each one only fills the rows where Age group is still null.
age_group_rules = [
    (df_test['Physiotherapy']>=40, '55<='),            #Physiotherapy >= 40 -> likely 55<=
    (df_test['Recovery']>=500, '55<='),                #Recovery >= 500 -> likely 55<=
    (df_test['Supplements']>=250, '55<='),             #Supplements >= 250 -> likely 55<=
    (df_test['Train bf competition']>=450, '55<='),    #Train bf competition >= 450 -> likely 55<=
    (df_test['Strength training']<=75, '0-35'),        #Strength training <= 75 -> likely 0-35
    (df_test['Strength training']>=750, '55<='),       #Strength training >= 750 -> likely 55<=
    (df_test['Sport-specific training']>=50, '55<='),  #Sport-specific training >= 50 -> likely 55<=
    (df_test['Squad training']>=5, '55<='),            #Squad training >= 5 -> likely 55<=
]
for rule_mask, rule_value in age_group_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 2: Competition

In [None]:
col='Competition'

In [None]:
#Rules inferred from the graphs. They are applied in order and each one only fills the rows where Competition is still null.
competition_rules = [
    (df_test['Late enrollment']==1, 'Regional Match'),   #Late enrollment -> Regional Match
    (df_test['Mental preparation']==1, 'Local Match'),   #Mental preparation -> Local Match
    (df_test['Outdoor Workout']==1, 'Local Match'),      #Outdoor Workout -> Local Match
    (df_test['Plyometric training']>0, 'Local Match'),   #Any Plyometric training -> Local Match
]
for rule_mask, rule_value in competition_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

In [None]:
#Physiotherapy >= 100 outside the 2020 edition -> competition is likely a Continental Championship
#Edition holds text at this point (it is cast to str by the data_types step), so it must be compared with '2020':
#comparing with the number 2020 is always "different" and the 2020 exclusion would never apply
df_test.loc[(df_test[col].isnull()) & (df_test['Physiotherapy']>=100) & (df_test['Edition']!='2020'),col]='Continental Championship'
#Strength training >= 1100 -> competition is likely a Local Match
df_test.loc[(df_test[col].isnull()) & (df_test['Strength training']>=1100),col]='Local Match'

In [None]:
#Most frequent competition of each edition, used as the fallback for the rows whose competition is still missing
competition_mapping = {'2019':'Local Match', '2020':'Local Match', '2021':'National Cup', '2022':'National Cup'}

#Fallback of every row, looked up from its edition (null when the edition is missing or not listed above)
fallback_competition = df_test['Edition'].map(competition_mapping)
#Keep the known competitions and use the fallback only where the competition is null
df_test[col] = df_test[col].where(df_test[col].notnull(), fallback_competition)

#Edition x competition counts, with the remaining nulls shown under 'Missing'
pd.crosstab(df_test['Edition'].fillna('Missing'),df_test[col].fillna('Missing'),margins=True)

Variable 3: Education

In [None]:
col='Education'

In [None]:
#Athletes with several records: when at least one record has a known Education, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#Rules inferred from the graphs. They are applied in order and each one only fills the rows where Education is still null.
education_rules = [
    (df_test['Cardiovascular training']>=500, 'Post Graduate'),  #Cardiovascular training >= 500 -> likely Post Graduate
    (df_test['Supplements']>=250, 'Post Graduate'),              #Supplements >= 250 -> likely Post Graduate
    (df_test['Strength training']>=700, 'University Degree'),    #Strength training >= 700 -> likely University Degree
]
for rule_mask, rule_value in education_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 4: Income

In [None]:
col='Income'

In [None]:
#Athletes with several records: when at least one record has a known Income, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is higher than 40, the Income is likely to be Low
df_test.loc[(df_test[col].isnull()) & (df_test['Athlete score']>=40),col]='Low'

In [None]:
#From the graphs, it is possible to infer that when the Strength training is >=650, the Income is likely to be High (only rows where Income is still null are filled)
df_test.loc[(df_test[col].isnull()) & (df_test['Strength training']>=650),col]='High'

Variable 5: Sex

In [None]:
col='Sex'

In [None]:
#Athletes with several records: when at least one record has a known Sex, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#Rules inferred from the graphs. They are applied in order and each one only fills the rows where Sex is still null.
sex_rules = [
    (df_test['Physiotherapy']>0, 'M'),               #Any Physiotherapy -> likely M
    (df_test['Cardiovascular training']>=150, 'M'),  #Cardiovascular training >= 150 -> likely M
    (df_test['Supplements']>=150, 'M'),              #Supplements >= 150 -> likely M
    (df_test['Train bf competition']>=350, 'M'),     #Train bf competition >= 350 -> likely M
    (df_test['Strength training']<=100, 'F'),        #Strength training <= 100 -> likely F
    (df_test['Strength training']>=400, 'M'),        #Strength training >= 400 -> likely M
]
for rule_mask, rule_value in sex_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 6: Region

In [None]:
col='Region'

In [None]:
#Athletes with several records: when at least one record has a known Region, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#From the graphs, it is possible to infer that when the Education is Elementary school, the Region is likely to be South America
#The label is written 'Elementary school' (lower-case "s") everywhere else in this notebook; 'Elementary School' is not one of the Education labels, so the rule did not apply
df_test.loc[(df_test[col].isnull()) & (df_test['Education']=='Elementary school'),col]='South America'

Variable 7: Cancelled enrollment

In [None]:
col='Cancelled enrollment'

In [None]:
#From the graphs, it is possible to infere that when the Athlete score is >=60, Cancelled enrollment is likely to be True
df_test.loc[(df_test[col].isnull()) & (df_test['Athlete score']>=60),col]=1

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Cancelled enrollment is still null.
cancelled_rules = [
    (df_test['Physiotherapy']>=20, 0),             #Physiotherapy >= 20 -> likely False
    (df_test['Recovery']<=40, 1),                  #Recovery <= 40 -> likely True
    (df_test['Recovery']>=175, 0),                 #Recovery >= 175 -> likely False
    (df_test['Cardiovascular training']>=125, 0),  #Cardiovascular training >= 125 -> likely False
    (df_test['Supplements']>=100, 0),              #Supplements >= 100 -> likely False
    (df_test['Train bf competition']<=100, 1),     #Train bf competition <= 100 -> likely True
    (df_test['Train bf competition']>=175, 0),     #Train bf competition >= 175 -> likely False
    (df_test['Strength training']<=70, 1),         #Strength training <= 70 -> likely True
    (df_test['Strength training']>=250, 0),        #Strength training >= 250 -> likely False
    (df_test['Sport-specific training']>=20, 0),   #Sport-specific training >= 20 -> likely False
]
for rule_mask, rule_value in cancelled_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 8: Disability

In [None]:
col='Disability'

In [None]:
#Athletes with several records: when at least one record has a known Disability, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Disability is still null.
disability_rules = [
    (df_test['Athlete score']>=60, 1),              #Athlete score >= 60 -> likely True
    (df_test['Competition']=='Olympic Games', 0),   #Olympic Games -> likely False
    (df_test['Late enrollment']==1, 0),             #Late enrollment -> likely False
    (df_test['Outdoor Workout']==1, 0),             #Outdoor Workout -> likely False
]
for rule_mask, rule_value in disability_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 9: Late enrollment

In [None]:
col='Late enrollment'

In [None]:
#Rules inferred from the graphs; each one only fills the rows where Late enrollment is still null.
#Athlete score >= 40 -> Late enrollment is likely to be True
df_test.loc[(df_test[col].isnull()) & (df_test['Athlete score']>=40),col]=1
#Competition is a Regional Tournament -> Late enrollment is likely to be True
#(the misspelled 'Regional Tournment' is not one of the competition labels used in this notebook, so the rule did not apply)
df_test.loc[(df_test[col].isnull()) & (df_test['Competition']=='Regional Tournament'),col]=1
#Previous attempts > 0 -> Late enrollment is likely to be True
df_test.loc[(df_test[col].isnull()) & (df_test['Previous attempts']>0),col]=1

In [None]:
#Rules inferred from the graphs: each of them points to Late enrollment = False (0). Only the rows where it is still null are filled.
late_enrollment_rules = [
    (df_test['Physiotherapy']>0, 0),               #Any Physiotherapy
    (df_test['Recovery']>=150, 0),                 #Recovery >= 150
    (df_test['Cardiovascular training']>=150, 0),  #Cardiovascular training >= 150
    (df_test['Supplements']>=100, 0),              #Supplements >= 100
    (df_test['Train bf competition']>=200, 0),     #Train bf competition >= 200
]
for rule_mask, rule_value in late_enrollment_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 10: Mental preparation

In [None]:
col='Mental preparation'

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Mental preparation is still null.
mental_preparation_rules = [
    (df_test['Athlete score']>=40, 1),          #Athlete score >= 40 -> likely True
    (df_test['Cancelled enrollment']==1, 0),    #Cancelled enrollment -> likely False
    (df_test['Late enrollment']==1, 0),         #Late enrollment -> likely False
    (df_test['Other training']>0, 1),           #Any Other training -> likely True
    (df_test['Outdoor Workout']==1, 0),         #Outdoor Workout -> likely False
    (df_test['Plyometric training']>0, 1),      #Any Plyometric training -> likely True
]
for rule_mask, rule_value in mental_preparation_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Mental preparation is still null.
mental_preparation_rules = [
    (df_test['Recovery']>=350, 1),               #Recovery >= 350 -> likely True
    (df_test['Supplements']>=200, 1),            #Supplements >= 200 -> likely True
    (df_test['Supplements']<=150, 0),            #Supplements <= 150 -> likely False
    (df_test['Train bf competition']<=200, 0),   #Train bf competition <= 200 -> likely False
    (df_test['Train bf competition']>=350, 1),   #Train bf competition >= 350 -> likely True
    (df_test['Strength training']<=550, 0),      #Strength training <= 550 -> likely False
    (df_test['Strength training']>550, 1),       #Strength training > 550 -> likely True
]
for rule_mask, rule_value in mental_preparation_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 11: No coach

In [None]:
col='No coach'

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where No coach is still null.
no_coach_rules = [
    (df_test['Athlete score']>=60, 1),   #Athlete score >= 60 -> likely True
    (df_test['Athlete score']<60, 0),    #Athlete score < 60 -> likely False
    (df_test['Other training']>0, 1),    #Any Other training -> likely True
]
for rule_mask, rule_value in no_coach_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where No coach is still null.
no_coach_rules = [
    (df_test['Recovery']>=500, 1),               #Recovery >= 500 -> likely True
    (df_test['Recovery']<500, 0),                #Recovery < 500 -> likely False
    (df_test['Supplements']>=350, 1),            #Supplements >= 350 -> likely True
    (df_test['Supplements']<350, 0),             #Supplements < 350 -> likely False
    (df_test['Train bf competition']<500, 0),    #Train bf competition < 500 -> likely False
    (df_test['Train bf competition']>=500, 1),   #Train bf competition >= 500 -> likely True
    (df_test['Strength training']<1500, 0),      #Strength training < 1500 -> likely False
    (df_test['Strength training']>=1500, 1),     #Strength training >= 1500 -> likely True
]
for rule_mask, rule_value in no_coach_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 12: Outdoor Workout

In [None]:
col='Outdoor Workout'

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Outdoor Workout is still null.
outdoor_workout_rules = [
    (df_test['Athlete score']>=40, 1),        #Athlete score >= 40 -> likely True
    (df_test['Late enrollment']==1, 0),       #Late enrollment -> likely False
    (df_test['Mental preparation']==1, 0),    #Mental preparation -> likely False
    (df_test['Other training']>0, 1),         #Any Other training -> likely True
    (df_test['Plyometric training']>0, 1),    #Any Plyometric training -> likely True
]
for rule_mask, rule_value in outdoor_workout_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Outdoor Workout is still null.
#NOTE(review): the second rule tests Cardiovascular training < 150; the note that used to accompany it mentioned Recovery <= 150 - confirm which variable is intended
outdoor_workout_rules = [
    (df_test['Cardiovascular training']>=300, 1),   #Cardiovascular training >= 300 -> likely True
    (df_test['Cardiovascular training']<150, 0),    #Cardiovascular training < 150 -> likely False
    (df_test['Supplements']<=100, 0),               #Supplements <= 100 -> likely False
    (df_test['Supplements']>=200, 1),               #Supplements >= 200 -> likely True
    (df_test['Train bf competition']>=350, 1),      #Train bf competition >= 350 -> likely True
    (df_test['Strength training']<=300, 0),         #Strength training <= 300 -> likely False
    (df_test['Strength training']>=550, 1),         #Strength training >= 550 -> likely True
]
for rule_mask, rule_value in outdoor_workout_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 13: Past injuries

In [None]:
col='Past injuries'

In [None]:
#Athletes with several records: when at least one record has a known Past injuries value, copy it to that athlete's records where it is missing
ids_with_value = df_test[df_test[col].notnull()]['Athlete Id']
ids_to_fill = df_test[df_test[col].isnull() & df_test['Athlete Id'].isin(ids_with_value)]['Athlete Id']
for athlete_id in ids_to_fill:
    #Records without an Athlete Id cannot be matched to any other record
    if np.isnan(athlete_id):
        continue
    same_athlete = df_test['Athlete Id']==athlete_id
    #First known value of this athlete, in row order
    known_value = df_test[same_athlete & df_test[col].notnull()][col].tolist()[0]
    df_test.loc[same_athlete & df_test[col].isnull(), col] = known_value

In [None]:
#Athlete score >= 40 -> Past injuries is set to 0 (False) where it is still null
#NOTE(review): the earlier note on this rule read "score >= 60 -> True", which disagrees with the code; confirm against the graphs
df_test.loc[(df_test[col].isnull()) & (df_test['Athlete score']>=40),col]=0
#Plyometric training > 0 -> Past injuries is set to 1 (True) where it is still null
#NOTE(review): the earlier note on this rule read "-> False", which disagrees with the code; confirm against the graphs
df_test.loc[(df_test[col].isnull()) & (df_test['Plyometric training']>0),col]=1

In [None]:
#Rules inferred from the graphs (1 = True, 0 = False). They are applied in order and each one only fills the rows where Past injuries is still null.
past_injuries_rules = [
    (df_test['Physiotherapy']>0, 1),                #Any Physiotherapy -> likely True
    (df_test['Recovery']>=300, 1),                  #Recovery >= 300 -> likely True
    (df_test['Cardiovascular training']>=200, 1),   #Cardiovascular training >= 200 -> likely True
    (df_test['Supplements']>=150, 1),               #Supplements >= 150 -> likely True
    (df_test['Train bf competition']>=250, 1),      #Train bf competition >= 250 -> likely True
    (df_test['Train bf competition']<=100, 0),      #Train bf competition <= 100 -> likely False
    (df_test['Strength training']>=300, 1),         #Strength training >= 300 -> likely True
    (df_test['Strength training']<=150, 0),         #Strength training <= 150 -> likely False
]
for rule_mask, rule_value in past_injuries_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 14: Edition

In [None]:
col='Edition'

In [None]:
#Rules inferred from the graphs. They are applied in order and each one only fills the rows where Edition is still null.
edition_rules = [
    (df_test['Athlete score']>=40, '2020'),    #Athlete score >= 40 -> likely 2020
    (df_test['Outdoor Workout']==1, '2022'),   #Outdoor Workout -> likely 2022
]
for rule_mask, rule_value in edition_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

In [None]:
#Rules inferred from the graphs: both point to the 2020 edition. Only the rows where Edition is still null are filled.
edition_rules = [
    (df_test['Supplements']>=200, '2020'),     #Supplements >= 200
    (df_test['Other training']>=5, '2020'),    #Other training >= 5
]
for rule_mask, rule_value in edition_rules:
    df_test.loc[df_test[col].isnull() & rule_mask, col] = rule_value

Variable 15: Previous Attempts

In [None]:
col='Previous attempts'

In [None]:
#3-sigma rule on the current column: bounds are mean -+ 3 * std
col_mean = df_test[col].mean()
col_std = df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#How many rows sit at or beyond each bound
max_len = len(df_test[df_test[col] >= Max])
min_len = len(df_test[df_test[col] <= Min])
print('There are ', max_len, ' values higher than ', Max, ' which is the mean+3*std.', sep='')
print('There are ', min_len, ' values lower than ', Min, ' which is the mean-3*std.', sep='')
#The variable only has 6 possible values, so the outliers are not removed: doing so would leave it with a single value (0).

Variable 16: Athlete score

In [None]:
col='Athlete score'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Athlete score is higher if Age group=0-35
#Athlete score is higher if Cancelled enrollment=1
#Athlete score is higher if Disability=1
#Athlete score is higher if Income=Low
#Athlete score is higher if Late enrollment=1
#Athlete score is higher if Mental preparation=1
#Athlete score is higher if No coach=1
#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Cancelled enrollment', 'Disability', 'Income', 'Late enrollment', 'Mental preparation', 'No coach']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 17: Cardiovascular training

In [None]:
#Column handled by the following cells
col = 'Cardiovascular training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Cardiovascular training is higher if Age group is higher
#Cardiovascular training is higher if Cancelled enrollment=0
#Cardiovascular training depends on Competition
#Cardiovascular training depends on Education
#Cardiovascular training is higher if Mental preparation=1
#Cardiovascular training is higher if Outdoor Workout=1
#Cardiovascular training is higher if Sex=M

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Cancelled enrollment', 'Competition', 'Education', 'Mental preparation', 'Outdoor Workout', 'Sex']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 18: Other training

In [None]:
#Column handled by the following cells
col = 'Other training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Other training depends on Competition
#Other training depends on Edition
#Other training is higher if Mental preparation=1
#Other training is higher if Outdoor Workout=1

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Competition', 'Edition', 'Mental preparation', 'Outdoor Workout']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 19: Physiotherapy

In [None]:
#Column handled by the following cells
col = 'Physiotherapy'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Physiotherapy is higher if Age group is higher
#Physiotherapy depends on Competition
#Physiotherapy depends on Edition
#Physiotherapy is higher if Past injuries=1
#Physiotherapy is higher the lower Previous attempts
#Physiotherapy is higher if Sex=M

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Competition', 'Edition', 'Past injuries', 'Previous attempts', 'Sex']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 20: Plyometric training

In [None]:
#Column handled by the following cells
col = 'Plyometric training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Plyometric training depends on Competition
#Plyometric training is higher if Mental preparation=1
#Plyometric training is higher if Outdoor Workout=1
#Plyometric training is higher if Past injuries=1

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Competition', 'Mental preparation', 'Outdoor Workout', 'Past injuries']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 21: Recovery

In [None]:
#Column handled by the following cells
col = 'Recovery'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Recovery depends on Competition
#Recovery is higher if Mental preparation=1
#Recovery is higher if Outdoor Workout=1
#Recovery is higher if Past injuries=1
#Recovery is higher if Previous attempts is lower

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Competition', 'Mental preparation', 'Outdoor Workout', 'Past injuries', 'Previous attempts']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 22: Sand training

In [None]:
#Column handled by the following cells
col = 'Sand training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
#NOTE(review): no imputation cell follows for this variable - confirm these nulls are handled before prediction
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

Variable 23: Sport-specific training

In [None]:
#Column handled by the following cells
col = 'Sport-specific training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Sport-specific training depends on Age group
#Sport-specific training depends on Competition
#Sport-specific training is higher when Mental preparation=1
#Sport-specific training is higher when No coach=1
#Sport-specific training is higher when Outdoor workout=1
#Sport-specific training is higher when Previous attempts is lower

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Competition', 'Mental preparation', 'No coach', 'Outdoor Workout', 'Previous attempts']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 24: Squad training

In [None]:
#Column handled by the following cells
col = 'Squad training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Squad training depends on Age group
#Squad training depends on Competition
#Squad training depends on Edition
#Squad training is higher when Mental preparation = 1
#Squad training is higher when Outdoor Workout = 1
#Squad training is higher when Past injuries = 1

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Competition', 'Edition', 'Mental preparation', 'Outdoor Workout', 'Past injuries']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 25: Strength training

In [None]:
#Column handled by the following cells
col = 'Strength training'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Strength training is higher with Age group
#Strength training is higher when Cancelled enrollment = 0
#Strength training depends on Competition
#Strength training is higher when Mental preparation = 1
#Strength training is higher when Outdoor Workout = 1
#Strength training is higher when Past injuries = 1
#Strength training is higher when Previous attempts = 1
#Strength training is higher when Sex = M

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Cancelled enrollment', 'Competition', 'Mental preparation', 'Outdoor Workout', 'Past injuries', 'Previous attempts', 'Sex']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 26: Supplements

In [None]:
#Column handled by the following cells
col = 'Supplements'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Supplements is higher with Age group
#Supplements depends on Competition
#Supplements is higher when Mental preparation = 1
#Supplements is higher when No coach = 1
#Supplements is higher when Outdoor Workout = 1
#Supplements is higher when Past injuries = 1

#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Age group', 'Competition', 'Mental preparation', 'No coach', 'Outdoor Workout', 'Past injuries']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

Variable 27: Train bf competition

In [None]:
#Column handled by the following cells
col = 'Train bf competition'

In [None]:
#Flag outliers with the 3-sigma rule: values on or beyond mean -+ 3 * std
col_mean, col_std = df_test[col].mean(), df_test[col].std()
Max = col_mean + 3*col_std
Min = col_mean - 3*col_std
#Count how many values reach each bound
max_len = int((df_test[col] >= Max).sum())
min_len = int((df_test[col] <= Min).sum())
print(f'There are {max_len} values higher than {Max} which is the mean+3*std.')
print(f'There are {min_len} values lower than {Min} which is the mean-3*std.')

In [None]:
#Replace every value on or beyond the 3-sigma bounds with null
outlier_mask = (df_test[col] >= Max) | (df_test[col] <= Min)
df_test.loc[outlier_mask, col] = np.nan

In [None]:
#Train bf competition is higher with Cancelled enrollment==0
#Train bf competition is higher with Mental preparation==1
#Train bf competition is higher with No coach==1
#Train bf competition is higher with Outdoor Workout==1
#Train bf competition is higher with Past injuries==1
#Train bf competition is higher with Previous attempts
#Fill the nulls with the mean of the rows sharing the same values on these features
cols_groupby = ['Cancelled enrollment','Mental preparation','No coach','Outdoor Workout','Past injuries','Previous attempts']
#Average only the target column: averaging the whole frame fails on text columns in pandas >= 2.0
df_means = df_test.groupby(cols_groupby, as_index=False)[col].mean()
df_test = df_test.merge(df_means, on = cols_groupby, how = 'left', suffixes = ('', '_mean'))
#Rows whose group has no mean keep their null
df_test[col] = df_test[col].fillna(df_test[col+'_mean'])
df_test.drop(columns = [col+'_mean'], inplace = True)

In [None]:
#df_test.info()

#### Label

In [None]:
# Encode the categorical features of the test dataset (df_test) as integers

# Competition
#Replacement mapping
Competition_mapping = {
    'Local Match': 0,
    'Regional Match': 1,
    'Regional Tournament': 2,
    'National Cup': 3,
    'Federation League': 4,
    'Continental Championship': 5,
    'Olympic Games': 6,
    'World Championship': 7
}

# Replace values in the "Competition" column using the mapping
df_test['Competition'] = df_test['Competition'].replace(Competition_mapping)


# Edition
#Replacement mapping
Edition_mapping = {
    2019: 0,
    2020: 1,
    2021: 2,
    2022: 3
}

# Replace values in the "Edition" column using the mapping.
# The imputation cells write the year as text ('2020', '2022'); text never matches
# the integer keys above, so the column is converted to numbers first.
df_test['Edition'] = pd.to_numeric(df_test['Edition']).replace(Edition_mapping)


# Sex
# Replacement mapping
Sex_mapping = {
    'M': 0,
    'F': 1
}

# Replace values in the "Sex" column using the mapping
df_test['Sex'] = df_test['Sex'].replace(Sex_mapping)



# Region
# Replacement mapping
Region_mapping = {
    'Oceania': 0,
    'Middle East': 1,
    'Northern Africa': 2,
    'Southern Africa': 3,
    'Southern Europe': 4,
    'Western Europe': 5,
    'Eastern Europe': 6,
    'East Asia': 7,
    'Central Asia': 8,
    'South Asia': 9,
    'Central America': 10,
    'North America': 11,
    'South America': 12
}

# Replace values in the "Region" column using the mapping
df_test['Region'] = df_test['Region'].replace(Region_mapping)



# Education
# Replacement mapping
Education_mapping = {
    'Elementary school': 0,
    'Middle school': 1,
    'High school': 2,
    'University Degree': 3,
    'Post Graduate': 4
}

# Replace values in the "Education" column using the mapping
df_test['Education'] = df_test['Education'].replace(Education_mapping)



# Age
# Replacement mapping
Age_mapping = {
    '0-35': 0,
    '35-55': 1,
    '55<=': 2
}

# Replace values in the "Age group" column using the mapping
df_test['Age group'] = df_test['Age group'].replace(Age_mapping)



# Income
# Replacement mapping
Income_mapping = {
    'Low': 0,
    'Middle-Low': 1,
    'Middle': 2,
    'Middle-High': 3,
    'High': 4
}

# Replace values in the "Income" column using the mapping
df_test['Income'] = df_test['Income'].replace(Income_mapping)


## Decision Tree
We create a decision tree to assess the relevance of the different features.

In [None]:
#Keep only the rows where every feature is known
df_raw_no_nulls = df_raw.dropna()

#Target to predict, and the features without the target and the two identifiers
target = df_raw_no_nulls['Outcome']
data = df_raw_no_nulls.drop(columns=['Outcome', 'RecordID', 'Athlete Id'])



In [None]:
#Hold out 20% of the rows for testing, keeping the Outcome proportions in both parts
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=5,
    stratify=target,
)

In [None]:
#Create and fit the decision tree
#A fixed seed keeps the tree, its graph and its scores the same on every run
modelDT = DecisionTreeClassifier(random_state=5)
modelDT.fit(X_train, y_train)

In [None]:
# Render the fitted decision tree with Graphviz
dot_data = tree.export_graphviz(modelDT,
                                out_file=None,
                                # a plain list is accepted by every scikit-learn release; a pandas Index is not
                                feature_names=list(data.columns),
                                class_names=["Losing", "Winning"],
                                filled=True,
                                rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
# Last expression of the cell: the notebook displays the graph
graph

In [None]:
#Score the tree on the held-out rows: accuracy and weighted F1
y_pred = modelDT.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print('Model score:', accuracy_score(y_test, y_pred))
print("F1 Score:", f1)

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw the feature importances of a fitted model as a horizontal bar chart.

    model: fitted estimator exposing ``feature_importances_``.
    feature_names: optional labels, one per importance. When omitted, the names
        stored on the model (``feature_names_in_``) are used, falling back to the
        columns of the global ``X_train``.
    """
    importances = model.feature_importances_
    #Prefer the names the model was fitted with: the global X_train is rebound by later cells
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X_train.columns
    n_features = len(importances)
    plt.figure(figsize=(50,30))
    plt.barh(range(n_features), importances, color='yellowgreen')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.title('Feature Importance in Decision Tree Classifier')
    plt.show()

#plot_feature_importances(modelDT)

Testing the Decision tree as a predictive model for the final test dataset.

In [None]:
#The identifiers are not features: remove both before predicting
data_test = df_test.drop(columns=['RecordID', 'Athlete Id'])

#Outcome predicted by the decision tree for every test row
y_pred_testDT = modelDT.predict(data_test)

In [None]:
# Create a DataFrame with the results
result_df = pd.DataFrame({'RecordID': df_test['RecordID'], 'Outcome': y_pred_testDT.flatten()})

# Convert columns to integer using astype()
result_df['RecordID'] = result_df['RecordID'].astype(int)
result_df['Outcome'] = result_df['Outcome'].astype(int)

# Save the DataFrame into a csv file
result_df.to_csv('Group01_Version01.csv', index=False)

# Download the file to your local machine.
# `files` only exists when the google.colab import at the top is enabled;
# outside Colab the csv is already in the working directory.
if 'files' in globals():
    files.download('Group01_Version01.csv')

Conclusion: The Decision tree is not a very good predictive model for this dataset.

From the graph above we can see that some features have low classification power. They are the following:

*   No coach
*   Plyometric training
*   Outdoor Workout
*   Mental preparation
*  Late enrollment



In [None]:
#Columns left out of the training set
cols_to_drop = [
    #features with low classification power
    'No coach',
    'Plyometric training',
    'Outdoor Workout',
    'Mental preparation',
    'Late enrollment',
    'Athlete Id',  #This feature is not relevant
    'RecordID',    #This is just an identifier
]

#Remove them, then drop the rows with a null in any remaining variable
df_train = df_raw.drop(columns=cols_to_drop).dropna()




In [None]:
#Display the columns kept in the training set
df_train.columns

## Random Forest

In [None]:
# Create a random forest classifier
X = df_train.drop('Outcome', axis=1)
y = df_train['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(random_state=42)

# Train the classifier
rfc.fit(X_train, y_train)

# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.05
sfm = SelectFromModel(rfc, threshold=0.05)

# Train the selector
sfm.fit(X_train, y_train)

# Transform the data to create a new dataset containing only the most important features
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)

# Compare both forests on the held-out rows
print('Accuracy (all features):', accuracy_score(y_test, rfc.predict(X_test)))
print('Accuracy (important features):', accuracy_score(y_test, clf_important.predict(X_important_test)))

# Now we can use the trained model to make predictions on a new dataset
# First, we need to prepare the new dataset in the same way as the training dataset

# Keep the identifier aside and select exactly the training features, in the training order
data_test_ID = df_test[['RecordID']]
data_test = df_test[X.columns]

# Make predictions on the test set with the forest trained on the selected features
# (it was previously trained but never used: the predictions came from `rfc`)
y_pred = clf_important.predict(sfm.transform(data_test))

# Create a DataFrame with the results
result_df = pd.DataFrame({'RecordID': data_test_ID['RecordID'], 'Outcome': y_pred})

# Convert columns to integer using astype()
result_df['RecordID'] = result_df['RecordID'].astype(int)
result_df['Outcome'] = result_df['Outcome'].astype(int)

# Save the DataFrame into a csv file
result_df.to_csv('Group01_Version02.csv', index=False)

# Download the file.
# `files` only exists when the google.colab import at the top is enabled;
# outside Colab the csv is already in the working directory.
if 'files' in globals():
    files.download('Group01_Version02.csv')


## Creation of a hybrid model

The model predicts an outcome of 0 for all the rows where Cancelled enrollment is 1.
For the other rows we apply a multilayer perceptron.

In [None]:
# Decision tree

#Keep only the branch where Cancelled enrollment = 0; the column is then constant, so it is removed
not_cancelled_rows = df_train['Cancelled enrollment'] == 0
df_train_not_cancelled = df_train[not_cancelled_rows].drop(columns='Cancelled enrollment')



In [None]:
# ANN

# Target and features of the athletes that did not cancel their enrollment
y = df_train_not_cancelled['Outcome']
X = df_train_not_cancelled.drop(columns='Outcome')

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline MLP classifier
model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='logistic',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
    random_state=42,
)

# Hyperparameter grid to explore
parameter_space = {
    'hidden_layer_sizes': [
        (50,50,50), (50,100,50), (100,), (50, 50), (50,100), (100,100),
        (100, 50), (30, 20, 10), (64, 32), (24, 24), (10,10,10,10),
        (20,20,20,20), (100,100,100), (64,), (64,64,64,64), (64,64,64),
        (10, 15, 10),
    ],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

# Exhaustive search over the grid with 3-fold cross-validation
grid = GridSearchCV(model, parameter_space, n_jobs=-1, cv=3)
grid.fit(X_train_scaled, y_train)

print("Best parameters found:\n", grid.best_params_)

# Accuracy of the best model found by the search
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# Fit and evaluate the baseline model as well
# NOTE(review): the cells below predict with `model`, not `best_model` - confirm this is intended
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




In [None]:
# Number of layers reported by the fitted MLP (`model`)
num_layers = model.n_layers_
print("Number of layers:", num_layers)

In [None]:
# Calculate the F1 score of `y_pred` (as left by the previous cells) against the held-out labels
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

Apply the model to the test data

In [None]:
#Remove from the test set the features the hybrid model was not trained on
cols_to_drop = [
    'No coach',
    'Plyometric training',
    'Outdoor Workout',
    'Mental preparation',
    'Late enrollment',
    'Athlete Id',  #This feature is not relevant
]
data_test = df_test.drop(columns=cols_to_drop)

In [None]:
#Decision Tree

# Divide the dataset into those with cancelled enrollment=1 and all the others
is_cancelled = data_test['Cancelled enrollment'] == 1

#rows with 1: the outcome is fixed to 0
#copy the selection so the new column is added to an independent frame, not to a slice of data_test
data_test_cancelled = data_test.loc[is_cancelled, ['RecordID']].copy()
data_test_cancelled['Outcome']=0

#every other row goes to the neural network; using the complement of the mask
#guarantees that each RecordID ends up in exactly one of the two groups
data_test_not_cancelled = data_test[~is_cancelled]
data_test_NC_ID=data_test_not_cancelled[['RecordID']]
data_test_not_cancelled = data_test_not_cancelled.drop(columns=['Cancelled enrollment', 'RecordID'])


In [None]:
# ANN

# Apply the scaler fitted on the training features to the non-cancelled test rows
data_scaled = scaler.transform(data_test_not_cancelled)

# Predict their outcome with the neural network
y_pred_not_cancelled = model.predict(data_scaled)

# Pair every prediction with its RecordID
result_df_not_c = pd.DataFrame({
    'RecordID': data_test_NC_ID['RecordID'],
    'Outcome': y_pred_not_cancelled.ravel(),
})

In [None]:
# Put together the predictions of both branches (neural network and cancelled enrollments)
result_df = pd.concat([result_df_not_c,data_test_cancelled])

# Convert columns to integer using astype()
result_df['RecordID'] = result_df['RecordID'].astype(int)
result_df['Outcome'] = result_df['Outcome'].astype(int)

# Save the DataFrame into a csv file
result_df.to_csv('Group01_Version03.csv', index=False)

# Download the file to your local machine.
# `files` only exists when the google.colab import at the top is enabled;
# outside Colab the csv is already in the working directory.
if 'files' in globals():
    files.download('Group01_Version03.csv')