# S09 T01: Tasca Feature Engineering

Descripció:

Aprèn a gestionar paràmetres amb Python.

Objectius:

- Pre-processar les dades realitzant feature engineering
- Interpretar els diferents conceptes de feature engineering

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation / future-behaviour warnings emitted by the imported libraries.
# NOTE(review): consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the athlete-events CSV with 'Year' as the index and discard the 'ID' column
Athletes_df = (
    pd.read_csv('../hypothesis_testing/input/athlete_events.csv', index_col='Year')
    .drop(columns='ID')
)

In [3]:
# Preview the first five rows of the athletes dataframe
Athletes_df.head(n=5)

Unnamed: 0_level_0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Season,City,Sport,Event,Medal
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1992,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,Summer,Barcelona,Basketball,Basketball Men's Basketball,
2012,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,Summer,London,Judo,Judo Men's Extra-Lightweight,
1920,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,Summer,Antwerpen,Football,Football Men's Football,
1900,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
1988,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [4]:
# Dimensions (rows, columns) of the athletes dataframe
print(f'Shape of Athlete DF {Athletes_df.shape}\n')

Shape of Athlete DF (271116, 13)



In [5]:
# Overview of the dataframe: column dtypes, non-null counts and memory usage
Athletes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271116 entries, 1992 to 2002
Data columns (total 13 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Name    271116 non-null  object 
 1   Sex     271116 non-null  object 
 2   Age     261642 non-null  float64
 3   Height  210945 non-null  float64
 4   Weight  208241 non-null  float64
 5   Team    271116 non-null  object 
 6   NOC     271116 non-null  object 
 7   Games   271116 non-null  object 
 8   Season  271116 non-null  object 
 9   City    271116 non-null  object 
 10  Sport   271116 non-null  object 
 11  Event   271116 non-null  object 
 12  Medal   39783 non-null   object 
dtypes: float64(3), object(10)
memory usage: 29.0+ MB


In [6]:
# Summary statistics; with default arguments only the numeric columns
# (Age, Height, Weight) are summarized
Athletes_df.describe()

Unnamed: 0,Age,Height,Weight
count,261642.0,210945.0,208241.0
mean,25.556898,175.33897,70.702393
std,6.393561,10.518462,14.34802
min,10.0,127.0,25.0
25%,21.0,168.0,60.0
50%,24.0,175.0,70.0
75%,28.0,183.0,79.0
max,97.0,226.0,214.0


In [7]:
# Count the missing values in each column
Athletes_df.isna().sum()

Name           0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Season         0
City           0
Sport          0
Event          0
Medal     231333
dtype: int64

The output above shows that "Age", "Height", "Weight" and "Medal" are the only columns with missing values. However, the missing values in the "Medal" column most likely mean that the athlete did not win a medal in that event, not that the information is actually missing. Therefore, only the "Age", "Height" and "Weight" columns need missing-value handling; in this notebook the rows with missing values in those columns are dropped in the Data Cleaning section below.

## Model Building

- Data Cleaning

In [8]:
# List the column names of the athletes dataframe
Athletes_df.columns

Index(['Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')

In [9]:
# Build the working dataframe: a new frame derived from the athletes data
# without the columns that are not needed for this analysis
data = Athletes_df.drop(columns=['Name', 'Games', 'Team', 'Season', 'City', 'Event'])

In [10]:
# Columns that remain after dropping the unused features
data.columns

Index(['Sex', 'Age', 'Height', 'Weight', 'NOC', 'Sport', 'Medal'], dtype='object')

In [11]:
# Encode the target column 'Medal' as a binary flag.
# Alternatives considered (kept for reference, not executed):
#   Option A - ordinal: NoMedal(0), Gold(1), Silver(2), Bronze(3)
#     data['Medal'] = data['Medal'].replace({np.NaN: 0, 'Gold': 1, 'Silver': 2, 'Bronze': 3})
#   Option B - binary via replace: NoMedal(0), Medal(1)
#     data['Medal'] = data['Medal'].replace({np.NaN: 0, 'Gold': 1, 'Silver': 1, 'Bronze': 1})
# Option C (used) - binary: NoMedal(0), Medal(1).
# A missing value is treated as "no medal". notna() tests for missing values
# directly and is vectorized, instead of calling a Python function per row and
# comparing the string form of each value with 'nan'.
# Run this cell once per freshly built `data`: after encoding, no value is
# missing any more, so a second run would set every row to 1.
data['Medal'] = data['Medal'].notna().astype('int64')

In [12]:
# First rows of the entries without a medal (Medal == 0)
data[data['Medal'] == 0].head()

Unnamed: 0_level_0,Sex,Age,Height,Weight,NOC,Sport,Medal
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1992,M,24.0,180.0,80.0,CHN,Basketball,0
2012,M,23.0,170.0,60.0,CHN,Judo,0
1920,M,24.0,,,DEN,Football,0
1988,F,21.0,185.0,82.0,NED,Speed Skating,0
1988,F,21.0,185.0,82.0,NED,Speed Skating,0


In [13]:
# Group the rows by the encoded Medal value.
search = data.groupby('Medal')
# Show first() for each group. GroupBy.first() returns the first non-null
# value of each column, so a displayed row may combine values taken from
# different original rows (it is not necessarily the first row of the group).
search.first() 

Unnamed: 0_level_0,Sex,Age,Height,Weight,NOC,Sport
Medal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,M,24.0,180.0,80.0,CHN,Basketball
1,M,34.0,184.0,85.0,DEN,Tug-Of-War


In [14]:
# Names of the columns that contain at least one missing value.
# The null mask is computed once for the whole frame rather than once per
# column; the result keeps the column order of `data`.
missing_values_columns = data.columns[data.isnull().any()].tolist()
missing_values_columns

['Age', 'Height', 'Weight']

In [15]:
# Mean of the numeric features for every (Medal, Sex) group.
# numeric_only=True restricts the aggregation to the numeric columns; without
# it, pandas >= 2.0 raises a TypeError when it tries to average the string
# columns ('NOC', 'Sport').
# astype(int) truncates the means toward zero (it does not round).
data.groupby(['Medal', 'Sex']).mean(numeric_only=True).astype(int)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Height,Weight
Medal,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,F,23,167,59
0,M,26,178,75
1,F,24,170,63
1,M,26,181,79


In [16]:
# Shape of the working dataframe while it still contains rows with missing values
data.shape

(271116, 7)

In [17]:
# Remove every row that has a missing value in any column, then show the
# resulting shape
data = data.dropna()
data.shape

(206165, 7)

In [18]:
# Save the cleaned dataframe as a CSV file in the current working directory.
# index=False means the index is not written. The index here is 'Year'
# (set through index_col when the data was loaded), so the CSV has no Year column.
# NOTE(review): confirm that leaving Year out of the saved file is intended.
data.to_csv('feature_engineering.csv', header=True, index=False)