In [13]:
import pandas as pd
import sklearn

In [15]:
# Load the training set indexed by PassengerId and discard the Name, Ticket
# and Cabin columns in one step.
df = pd.read_csv('train.csv', index_col='PassengerId').drop(
    columns=['Name', 'Ticket', 'Cabin']
)

# Detach the 'Survived' column so that `df` holds only the feature columns;
# the removed column is kept separately as `class_col`.
class_col = df.pop('Survived')

print(df)

             Pclass     Sex   Age  SibSp  Parch     Fare Embarked
PassengerId                                                      
1                 3    male  22.0      1      0   7.2500        S
2                 1  female  38.0      1      0  71.2833        C
3                 3  female  26.0      0      0   7.9250        S
4                 1  female  35.0      1      0  53.1000        S
5                 3    male  35.0      0      0   8.0500        S
...             ...     ...   ...    ...    ...      ...      ...
887               2    male  27.0      0      0  13.0000        S
888               1  female  19.0      0      0  30.0000        S
889               3  female   NaN      1      2  23.4500        S
890               1    male  26.0      0      0  30.0000        C
891               3    male  32.0      0      0   7.7500        Q

[891 rows x 7 columns]


In [16]:
# Report, for each column, whether it contains at least one missing value.
print(df.isna().any())

Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool


In [17]:
# Split the features into text/categorical columns and the remaining columns.
#
# `is_string_dtype` is True for the legacy `object` dtype as well as for the
# dedicated pandas string dtypes, so text columns are still detected when
# pandas does not store them as `object` (a plain `dtypes == object` test
# would then produce an all-False mask and an empty `df_cat`).
cat_mask = df.dtypes.map(pd.api.types.is_string_dtype).astype(bool)
cat_cols = df.columns[cat_mask].tolist()

# `df_cat` keeps only the text columns; `df_num` is everything else.
df_cat = df[cat_cols]
df_num = df.drop(columns=cat_cols)

In [18]:
from sklearn.impute import SimpleImputer

# Fill missing categorical values using the 'most_frequent' strategy, then
# rebuild a DataFrame around the imputer's output with the original row and
# column labels.
imp_cat = SimpleImputer(strategy='most_frequent')
columns, index = df_cat.columns, df_cat.index
df_cat = pd.DataFrame(
    data=imp_cat.fit_transform(df_cat),
    index=index,
    columns=columns,
)

# Confirm that no categorical column still has missing values.
print(df_cat.isna().any())

Sex         False
Embarked    False
dtype: bool


In [20]:
from sklearn import preprocessing
from collections import defaultdict

# One LabelEncoder per column, created on first lookup of the column name.
# Keeping them in `d` lets later cells reuse the fitted encoders.
d = defaultdict(preprocessing.LabelEncoder)


def _fit_encode(col):
    """Fit the encoder registered under the column's name and return its codes."""
    return d[col.name].fit_transform(col)


df_cat_le = df_cat.apply(_fit_encode)
print(df_cat_le)

             Sex  Embarked
PassengerId               
1              1         2
2              0         0
3              0         2
4              0         2
5              1         2
...          ...       ...
887            1         2
888            0         2
889            0         2
890            1         0
891            1         1

[891 rows x 2 columns]


In [21]:
def _decode(col):
    """Turn integer codes back into labels using the column's encoder in `d`."""
    return d[col.name].inverse_transform(col)


def _encode(col):
    """Turn labels into integer codes using the column's encoder in `d`."""
    return d[col.name].transform(col)


# Round trip through the stored encoders: codes -> labels -> codes.
inverse = df_cat_le.apply(_decode)
print(inverse)

transformed = inverse.apply(_encode)
print(transformed)

                Sex Embarked
PassengerId                 
1              male        S
2            female        C
3            female        S
4            female        S
5              male        S
...             ...      ...
887            male        S
888          female        S
889          female        S
890            male        C
891            male        Q

[891 rows x 2 columns]
             Sex  Embarked
PassengerId               
1              1         2
2              0         0
3              0         2
4              0         2
5              1         2
...          ...       ...
887            1         2
888            0         2
889            0         2
890            1         0
891            1         1

[891 rows x 2 columns]


In [23]:
# One-hot encode the categorical columns and request a dense array so the
# result can be wrapped directly in a DataFrame.
#
# The dense-output flag is spelled `sparse_output` in current scikit-learn;
# the older `sparse` spelling is no longer accepted there. Try the current
# name first and fall back to the old one only when the installed release
# rejects it, so the cell runs on both.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(sparse=False)

# Column names come from the fitted encoder (e.g. `Sex_female`); rows keep
# the index of `df_cat`.
df_cat_ohe = pd.DataFrame(
    ohe.fit_transform(df_cat),
    columns=ohe.get_feature_names_out(df_cat.columns.tolist()),
    index=df_cat.index,
)

print(df_cat_ohe)

             Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S
PassengerId                                                          
1                   0.0       1.0         0.0         0.0         1.0
2                   1.0       0.0         1.0         0.0         0.0
3                   1.0       0.0         0.0         0.0         1.0
4                   1.0       0.0         0.0         0.0         1.0
5                   0.0       1.0         0.0         0.0         1.0
...                 ...       ...         ...         ...         ...
887                 0.0       1.0         0.0         0.0         1.0
888                 1.0       0.0         0.0         0.0         1.0
889                 1.0       0.0         0.0         0.0         1.0
890                 0.0       1.0         1.0         0.0         0.0
891                 0.0       1.0         0.0         1.0         0.0

[891 rows x 5 columns]


In [24]:
# Fill missing numeric values using the 'mean' strategy, then rebuild a
# DataFrame around the imputer's output with the original row and column
# labels.
imp_num = SimpleImputer(strategy='mean')
columns, index = df_num.columns, df_num.index
df_num = pd.DataFrame(
    data=imp_num.fit_transform(df_num),
    index=index,
    columns=columns,
)

# Confirm that no numeric column still has missing values.
print(df_num.isna().any())

Pclass    False
Age       False
SibSp     False
Parch     False
Fare      False
dtype: bool


In [25]:
# Combine the one-hot encoded and the imputed numeric features into a single
# frame, matching rows on the 'PassengerId' key (inner join).
df_preprocessed = df_cat_ohe.merge(df_num, how='inner', on='PassengerId')
print(df_preprocessed)

             Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  Pclass  \
PassengerId                                                                     
1                   0.0       1.0         0.0         0.0         1.0     3.0   
2                   1.0       0.0         1.0         0.0         0.0     1.0   
3                   1.0       0.0         0.0         0.0         1.0     3.0   
4                   1.0       0.0         0.0         0.0         1.0     1.0   
5                   0.0       1.0         0.0         0.0         1.0     3.0   
...                 ...       ...         ...         ...         ...     ...   
887                 0.0       1.0         0.0         0.0         1.0     2.0   
888                 1.0       0.0         0.0         0.0         1.0     1.0   
889                 1.0       0.0         0.0         0.0         1.0     3.0   
890                 0.0       1.0         1.0         0.0         0.0     1.0   
891                 0.0     

In [26]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every preprocessed feature against the target with chi2 and keep
# the 4 highest-scoring ones.
fs_k_best_chi2 = SelectKBest(score_func=chi2, k=4)
fs_k_best_chi2.fit(df_preprocessed, class_col)

# `get_support()` yields the selector's column mask; use it to keep only the
# selected feature columns.
col_filter = fs_k_best_chi2.get_support()
df_k_best_chi2 = df_preprocessed.loc[:, col_filter]

print(df_k_best_chi2)

             Sex_female  Sex_male  Pclass     Fare
PassengerId                                       
1                   0.0       1.0     3.0   7.2500
2                   1.0       0.0     1.0  71.2833
3                   1.0       0.0     3.0   7.9250
4                   1.0       0.0     1.0  53.1000
5                   0.0       1.0     3.0   8.0500
...                 ...       ...     ...      ...
887                 0.0       1.0     2.0  13.0000
888                 1.0       0.0     1.0  30.0000
889                 1.0       0.0     3.0  23.4500
890                 0.0       1.0     1.0  30.0000
891                 0.0       1.0     3.0   7.7500

[891 rows x 4 columns]
