## khai báo thư viện

In [800]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [801]:
df = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

# **Xử lý dữ liệu**

## **Xóa cột PassengerID,Ticket** : Vì có vẻ đặc trưng này không giúp ích cho mô hình

In [802]:
train_test_data = [df,df_test]
for ds in train_test_data:
    ds.drop('PassengerId',axis=1,inplace=True)
    ds.drop('Ticket',axis=1,inplace=True)

## **Tiền xử lý đặc trưng Name và Age**

In [803]:
for ds in train_test_data:
    ds['Title'] = ds['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False) #Lấy ra hết danh từ danh xưng thành một đặc trưng title

In [804]:
df['Title'].value_counts()

Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

In [805]:
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}
for ds in train_test_data:
    ds['Title'] = ds['Title'].map(Title_Dictionary) #Map lại cho title ít giá trị hơn.


In [806]:
for ds in train_test_data:
    if(ds['Title'].isna().any()):
        display(ds[ds['Title'].isna()])

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9,C105,C,


### fill NaN cho đặc trưng Age dựa vào danh xưng của tên
- để tránh data leakage từ tập test. Tả sẽ dùng trung vị của tập train rồi sau đó thêm các giá trị thiếu của tập test bằng tập train

In [807]:
grouped_train = df.groupby(['Sex','Title'])
grouped_median_train = grouped_train['Age'].median()
grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Title', 'Age']]
grouped_median_train

Unnamed: 0,Sex,Title,Age
0,female,Miss,21.0
1,female,Mrs,35.0
2,female,Officer,49.0
3,female,Royalty,40.5
4,male,Master,3.5
5,male,Mr,30.0
6,male,Officer,50.5
7,male,Royalty,40.0


- Fill vào cho đặc trưng bị thiếu Age của train và test

In [808]:
lookup_table = grouped_median_train.set_index(['Sex','Title'])['Age']

In [809]:
def fill_age(row):
    if pd.notna(row['Age']):
        return row['Age']
    lookup_keys = (row['Sex'],row['Title'])
    return lookup_table.loc[lookup_keys]

for ds in train_test_data:
    ds['Age'] = ds.apply(fill_age,axis=1)

### Tiếp theo ta sẽ dùng phương pháp One-hot encoding cho đặc trưng đã tạo

In [810]:
#Drop cột không cần thiết
for ds in train_test_data:

    ds.drop('Name',axis=1,inplace=True)

In [811]:
#Encoding dùng thư viện One-hot
from sklearn.preprocessing import OneHotEncoder
def ApplyOneHotEncoder(df,feat):
    encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')    
    encoder.fit(df[[feat]])
    title_encoded = encoder.transform(df[[feat]])
    title_feature_names = encoder.get_feature_names_out([feat])
    df_encoded = pd.DataFrame(title_encoded, columns=title_feature_names)
    df_encoded.index = df.index
    df = pd.concat([df,df_encoded],axis=1)
    df.drop(feat,axis=1,inplace=True)
    return df

df = ApplyOneHotEncoder(df,'Title')

In [812]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0,3,male,22.0,1,0,7.25,,S,0.0,0.0,1.0,0.0,0.0,0.0
1,1,1,female,38.0,1,0,71.2833,C85,C,0.0,0.0,0.0,1.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,,S,0.0,1.0,0.0,0.0,0.0,0.0
3,1,1,female,35.0,1,0,53.1,C123,S,0.0,0.0,0.0,1.0,0.0,0.0
4,0,3,male,35.0,0,0,8.05,,S,0.0,0.0,1.0,0.0,0.0,0.0


In [813]:
df_test = ApplyOneHotEncoder(df_test,'Title')

In [814]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_nan
0,3,male,34.5,0,0,7.8292,,Q,0.0,0.0,1.0,0.0,0.0,0.0
1,3,female,47.0,1,0,7.0,,S,0.0,0.0,0.0,1.0,0.0,0.0
2,2,male,62.0,0,0,9.6875,,Q,0.0,0.0,1.0,0.0,0.0,0.0
3,3,male,27.0,0,0,8.6625,,S,0.0,0.0,1.0,0.0,0.0,0.0
4,3,female,22.0,1,1,12.2875,,S,0.0,0.0,0.0,1.0,0.0,0.0


Sau khi OneHot Encoding xong thì sẽ thấy không còn đặc trưng Name. Các đặc trưng mới xuất hiện Title_X VD: nếu Title_X = 1 thì người đó sẽ có danh từ xưng hô là Mr

## Xử lý Fare

In [815]:
df_test['Fare'].isna().sum()

np.int64(1)

In [816]:
df_test["Fare"].fillna(df['Fare'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["Fare"].fillna(df['Fare'].mean(),inplace=True)


## Xử lý Embarked

In [817]:
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

In [818]:
df = ApplyOneHotEncoder(df,'Embarked')

In [819]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,0,3,male,22.0,1,0,7.25,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,female,38.0,1,0,71.2833,C85,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,C123,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0,3,male,35.0,0,0,8.05,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [820]:
df_test = ApplyOneHotEncoder(df_test,'Embarked')

In [821]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_nan,Embarked_C,Embarked_Q,Embarked_S
0,3,male,34.5,0,0,7.8292,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3,female,47.0,1,0,7.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2,male,62.0,0,0,9.6875,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,male,27.0,0,0,8.6625,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3,female,22.0,1,1,12.2875,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [822]:
train_cabin, test_cabin = set(), set()
for char in df['Cabin']:
    try:
        train_cabin.add(char[0])
    except:
        train_cabin.add('U')

for char in df_test['Cabin']:
    try:
        test_cabin.add(char[0])
    except:
        test_cabin.add('U')

In [823]:
print(train_cabin)

{'B', 'E', 'C', 'A', 'D', 'G', 'T', 'F', 'U'}


In [824]:
print(test_cabin)

{'E', 'C', 'A', 'U', 'D', 'G', 'F', 'B'}


In [825]:
df['Cabin'].fillna('U',inplace=True)
df['Cabin'] = df['Cabin'].map(lambda c: c[0])
df_test['Cabin'].fillna('U',inplace=True)
df_test['Cabin'] = df_test['Cabin'].map(lambda c: c[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('U',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Cabin'].fillna('U',inplace=True)


## Xử lý cabin

In [826]:
df = ApplyOneHotEncoder(df,'Cabin')

In [827]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,3,male,22.0,1,0,7.25,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,female,38.0,1,0,71.2833,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,3,male,35.0,0,0,8.05,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [828]:
df_test = ApplyOneHotEncoder(df_test,'Cabin')

In [829]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_U
0,3,male,34.5,0,0,7.8292,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,female,47.0,1,0,7.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,male,62.0,0,0,9.6875,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,male,27.0,0,0,8.6625,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3,female,22.0,1,1,12.2875,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Xử lý giới tính

In [830]:
train_test_data = [df,df_test]
mapping_gender = {"male":0,"female":1}
for ds in train_test_data:
    ds['Sex'] = ds['Sex'].map(mapping_gender)

In [831]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,...,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,3,0,22.0,1,0,7.25,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,1,38.0,1,0,71.2833,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3,1,26.0,0,0,7.925,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,1,1,35.0,1,0,53.1,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,3,0,35.0,0,0,8.05,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Xử lý SibSp + Parch
**Gộp biến SibSp và Parch** vì có liên quan đến nhau. Gộp theo công thức sau:

**FamilySize** = Sibsp + Parch + 1 (Cộng 1 là vì cộng chính hành khách đó)

In [832]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

In [833]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,...,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,FamilySize
0,0,3,0,22.0,1,0,7.25,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
1,1,1,1,38.0,1,0,71.2833,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,1,3,1,26.0,0,0,7.925,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,1,1,1,35.0,1,0,53.1,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0,3,0,35.0,0,0,8.05,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [834]:
train_test_data = [df,df_test]
for ds in train_test_data:
    ds['Singleton'] = ds['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    ds['SmallFamily'] = ds['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    ds['LargeFamily'] = ds['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
    ds

In [835]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,FamilySize,Singleton,SmallFamily,LargeFamily
0,0,3,0,22.0,1,0,7.25,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2,0,1,0
1,1,1,1,38.0,1,0,71.2833,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1,0
2,1,3,1,26.0,0,0,7.925,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0
3,1,1,1,35.0,1,0,53.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1,0
4,0,3,0,35.0,0,0,8.05,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0


In [836]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_U,FamilySize,Singleton,SmallFamily,LargeFamily
0,3,0,34.5,0,0,7.8292,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0
1,3,1,47.0,1,0,7.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2,0,1,0
2,2,0,62.0,0,0,9.6875,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0
3,3,0,27.0,0,0,8.6625,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0
4,3,1,22.0,1,1,12.2875,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3,0,1,0


### Lưu lại vào exps

#### Tạo thư mục

In [837]:
exp_dir = "../exps"
if os.path.exists(exp_dir) == False:
    os.makedir(exp_dir,exist_ok=True)

save_dir = f"{exp_dir}/feature_Labels"
os.makedirs(save_dir,exist_ok=True)

#### Lưu dữ liệu Features

In [838]:
df.to_csv(f'{save_dir}/train_Onehot.csv',index=False)
df_test.to_csv(f'{save_dir}/test_Onehot.csv',index=False)