## Importación de librerías

In [111]:
import pandas as pd

## Importación de CSV

In [112]:
df = pd.read_csv("Input/athlete_events.csv")  

df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


## Inspección de datos

In [113]:
df.shape

(271116, 15)

## Limpieza de datos

**Eliminación de columnas de escasa relevancia**

In [114]:
df.drop(columns=["ID","Name","Games","City","Event","NOC"], axis=1, inplace=True)

In [115]:
df.head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Medal
0,M,24.0,180.0,80.0,China,1992,Summer,Basketball,
1,M,23.0,170.0,60.0,China,2012,Summer,Judo,
2,M,24.0,,,Denmark,1920,Summer,Football,
3,M,34.0,,,Denmark/Sweden,1900,Summer,Tug-Of-War,Gold
4,F,21.0,185.0,82.0,Netherlands,1988,Winter,Speed Skating,


**Eliminación de años anteriores a la segunda guerra mundial**

In [116]:
df = df.loc[df.Year>=1990].sort_values(by=["Year"]).reset_index(drop=True)

display(df.head())
df.tail()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Medal
0,M,30.0,,,Hungary,1948,Summer,Boxing,
1,M,,,,Iran,1948,Summer,Wrestling,
2,M,26.0,,,Great Britain,1948,Summer,Rowing,Silver
3,M,25.0,,,Italy,1948,Summer,Basketball,
4,M,34.0,,,Uruguay,1948,Summer,Water Polo,


Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Medal
232339,M,23.0,167.0,67.0,Ukraine,2016,Summer,Gymnastics,
232340,M,23.0,167.0,67.0,Ukraine,2016,Summer,Gymnastics,
232341,M,30.0,167.0,64.0,Iran,2016,Summer,Wrestling,
232342,F,26.0,159.0,45.0,Morocco,2016,Summer,Athletics,
232343,M,28.0,175.0,78.0,Argentina,2016,Summer,Shooting,


**Comprobación de valores nulos**

In [117]:
df.isnull().sum()

Sex            0
Age         3206
Height     28348
Weight     29078
Team           0
Year           0
Season         0
Sport          0
Medal     200881
dtype: int64

In [118]:
df.loc[df.Height.isnull()].head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Medal
0,M,30.0,,,Hungary,1948,Summer,Boxing,
1,M,,,,Iran,1948,Summer,Wrestling,
2,M,26.0,,,Great Britain,1948,Summer,Rowing,Silver
3,M,25.0,,,Italy,1948,Summer,Basketball,
4,M,34.0,,,Uruguay,1948,Summer,Water Polo,


In [119]:
df.loc[df.Age.isnull()].head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Medal
1,M,,,,Iran,1948,Summer,Wrestling,
5,M,,,,Lebanon,1948,Summer,Boxing,
9,M,,,,Afghanistan,1948,Summer,Football,
15,M,,,,Czechoslovakia,1948,Summer,Boxing,
16,M,,189.0,80.0,South Korea,1948,Summer,Basketball,


In [120]:
df.dropna(subset=['Age', 'Height', "Weight"], inplace = True)

df.head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Year,Season,Sport,Medal
8,M,24.0,180.0,79.0,France,1948,Summer,Cycling,Gold
11,M,23.0,171.0,70.0,Greece,1948,Summer,Athletics,
13,M,23.0,171.0,70.0,Greece,1948,Summer,Athletics,
14,M,23.0,171.0,70.0,Greece,1948,Summer,Athletics,
20,M,26.0,176.0,72.0,Yugoslavia,1948,Summer,Athletics,


## Preparar datos para el modelo

**Transformación de datos categóricos a numéricos**

In [121]:
df = pd.get_dummies(df,columns=["Sex"])

df.head()

Unnamed: 0,Age,Height,Weight,Team,Year,Season,Sport,Medal,Sex_F,Sex_M
8,24.0,180.0,79.0,France,1948,Summer,Cycling,Gold,0,1
11,23.0,171.0,70.0,Greece,1948,Summer,Athletics,,0,1
13,23.0,171.0,70.0,Greece,1948,Summer,Athletics,,0,1
14,23.0,171.0,70.0,Greece,1948,Summer,Athletics,,0,1
20,26.0,176.0,72.0,Yugoslavia,1948,Summer,Athletics,,0,1


In [127]:
df["Medal"].value_counts()

Bronze    9788
Gold      9592
Silver    9445
Name: Medal, dtype: int64

In [123]:
df.replace("Gold", 3).replace("Silver", 2).replace("Bronze", 1)

Unnamed: 0,Age,Height,Weight,Team,Year,Season,Sport,Medal,Sex_F,Sex_M
8,24.0,180.0,79.0,France,1948,Summer,Cycling,3.0,0,1
11,23.0,171.0,70.0,Greece,1948,Summer,Athletics,,0,1
13,23.0,171.0,70.0,Greece,1948,Summer,Athletics,,0,1
14,23.0,171.0,70.0,Greece,1948,Summer,Athletics,,0,1
20,26.0,176.0,72.0,Yugoslavia,1948,Summer,Athletics,,0,1
...,...,...,...,...,...,...,...,...,...,...
232339,23.0,167.0,67.0,Ukraine,2016,Summer,Gymnastics,,0,1
232340,23.0,167.0,67.0,Ukraine,2016,Summer,Gymnastics,,0,1
232341,30.0,167.0,64.0,Iran,2016,Summer,Wrestling,,0,1
232342,26.0,159.0,45.0,Morocco,2016,Summer,Athletics,,1,0


## Generación de modelo

Ver apartado de Google Colab