# Data Preprocessing (1)

## Prepare Data

In [20]:
import os
import numpy as np
import pandas as pd

In [21]:
if not os.path.exists('titanic'):
    !kaggle competitions download -c titanic
    !unzip titanic.zip -d titanic
    !rm *.zip

In [22]:
train_df = pd.read_csv('titanic/train.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df.set_index('PassengerId', inplace=True)
train_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
target = 'Survived'
y = train_df[target]
X = train_df.drop(target, axis=1)
X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


In [7]:
X = X.drop(['Name', 'Ticket', 'Cabin'], axis=1)
X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,male,22.0,1,0,7.25,S
2,1,female,38.0,1,0,71.2833,C
3,3,female,26.0,0,0,7.925,S
4,1,female,35.0,1,0,53.1,S
5,3,male,35.0,0,0,8.05,S


In [8]:
X.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
X.nunique()

Pclass        3
Sex           2
Age          88
SibSp         7
Parch         7
Fare        248
Embarked      3
dtype: int64

In [10]:
num_cols = X.select_dtypes('floating').columns
num_cols

Index(['Age', 'Fare'], dtype='object')

In [11]:
cat_cols = X.columns.difference(num_cols)
cat_cols

Index(['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp'], dtype='object')

## Missing Value Imputation

For numerical features:
- Impute missing values with mean value of the feature
- Impute missing values with median value of the feature
- Impute missing values with a constant value (0, -1, etc.)

For categorical features:
- Impute missing values with the most frequent value of the feature
- Regard missing values as a new category

In [12]:
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='mean')  # 'median'
# num_imputer = SimpleImputer(strategy='constant', fill_value=0.)
cat_imputer = SimpleImputer(strategy='most_frequent')

X_num_imputed = num_imputer.fit_transform(X[num_cols])
df1 = pd.DataFrame(X_num_imputed, columns=num_cols)
X_cat_imputed = cat_imputer.fit_transform(X[cat_cols])
df2 = pd.DataFrame(X_cat_imputed, columns=cat_cols)
X_imputed = pd.concat([df1, df2], axis=1)
X_imputed = X_imputed[X.columns]

X_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    object 
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    object 
 4   Parch     891 non-null    object 
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(2), object(5)
memory usage: 48.9+ KB


## Feature Encoding

Often features are not given as continuous values but categorical. For example in the Titanic dataset, the "sex" feature is a binary categorical feature, and the "embarked" feature is a feature with three categories. In order to use these features in a machine learning algorithm, we need to encode them as numerical values. This is called feature encoding. 

In [13]:
{col: vals.unique() for col, vals in X[cat_cols].items()}

{'Embarked': array(['S', 'C', 'Q', nan], dtype=object),
 'Parch': array([0, 1, 2, 5, 3, 4, 6], dtype=int64),
 'Pclass': array([3, 1, 2], dtype=int64),
 'Sex': array(['male', 'female'], dtype=object),
 'SibSp': array([1, 0, 3, 4, 2, 5, 8], dtype=int64)}

### Nominal Categories

Note that integer features may also be categorical. In contrast to "SibSp" and "Parch", which have numerical meanings, the value of "Pclass" (ticket class) is only an ID. It is a categorical feature and we need encode it as well.

If the categories of a feature do not follow any order, like "male/female", "N/E/S/W", etc., they are called "nominal categories". The one-hot encoding is commonly applied on these categories.

In [14]:
from sklearn.preprocessing import OneHotEncoder

nom_cats = ['Embarked']
ohe = OneHotEncoder(sparse_output=False).fit(X[nom_cats])
X[ohe.get_feature_names_out(nom_cats)] = ohe.transform(X[nom_cats])
X = X.drop(nom_cats, axis=1)
X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,male,22.0,1,0,7.25,0.0,0.0,1.0,0.0
2,1,female,38.0,1,0,71.2833,1.0,0.0,0.0,0.0
3,3,female,26.0,0,0,7.925,0.0,0.0,1.0,0.0
4,1,female,35.0,1,0,53.1,0.0,0.0,1.0,0.0
5,3,male,35.0,0,0,8.05,0.0,0.0,1.0,0.0


### Ordinal Categories

If the categories of a feature follow an order, like "beginner/intermediate/advanced", they are called "ordinal categories". The common practice is to use a label encoder, which converts $N$ ordinal categories to $0,1,\ldots,N-1$.

In [15]:
from sklearn.preprocessing import LabelEncoder

bin_cats = ['Sex']
ord_cats = ['Pclass']
for cat in bin_cats + ord_cats:
    X[cat] = LabelEncoder().fit_transform(X[cat])
X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2,1,22.0,1,0,7.25,0.0,0.0,1.0,0.0
2,0,0,38.0,1,0,71.2833,1.0,0.0,0.0,0.0
3,2,0,26.0,0,0,7.925,0.0,0.0,1.0,0.0
4,0,0,35.0,1,0,53.1,0.0,0.0,1.0,0.0
5,2,1,35.0,0,0,8.05,0.0,0.0,1.0,0.0


### All in One: ColumnTransformer

In [16]:
from sklearn.preprocessing import StandardScaler

X_imputed[num_cols].describe()

Unnamed: 0,Age,Fare
count,891.0,891.0
mean,29.699118,32.204208
std,13.002015,49.693429
min,0.42,0.0
25%,22.0,7.9104
50%,29.699118,14.4542
75%,35.0,31.0
max,80.0,512.3292


In [17]:
scalar = StandardScaler()
X_imputed[num_cols] = scalar.fit_transform(X_imputed[num_cols])
X_imputed[num_cols].describe()

Unnamed: 0,Age,Fare
count,891.0,891.0
mean,2.232906e-16,3.9873330000000004e-18
std,1.000562,1.000562
min,-2.253155,-0.6484217
25%,-0.5924806,-0.4891482
50%,0.0,-0.3573909
75%,0.407926,-0.02424635
max,3.870872,9.667167


In [19]:
from sklearn.compose import ColumnTransformer

LabelEncoder._ft = LabelEncoder.fit_transform
LabelEncoder.fit_transform = lambda self, x, y=None, **kwargs: \
    np.apply_along_axis(self._ft, 1, x.values.astype(str))
LabelEncoder.get_feature_names_out = lambda self, names: names

feature_encoder = ColumnTransformer([
    ('nominal', OneHotEncoder(sparse_output=False), nom_cats),
    ('ordinal', LabelEncoder(), ord_cats + bin_cats),
    ('numeric', StandardScaler(), num_cols),
], remainder='passthrough')

X = train_df.drop(['Name', 'Ticket', 'Cabin', target], axis=1)
out = feature_encoder.fit_transform(X)
pd.DataFrame(out, columns=feature_encoder.get_feature_names_out()).head()

Unnamed: 0,nominal__Embarked_C,nominal__Embarked_Q,nominal__Embarked_S,nominal__Embarked_nan,ordinal__Pclass,ordinal__Sex,numeric__Age,numeric__Fare,remainder__SibSp,remainder__Parch
0,0.0,0.0,1.0,0.0,0.0,1.0,-0.530377,-0.502445,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.571831,0.786845,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,-0.254825,-0.488854,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.365167,0.42073,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.365167,-0.486337,0.0,0.0
