## khai báo thư viện

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import itables
from itables import init_notebook_mode
from itables import show

## Gộp train và test vào thành một và bỏ biến mục tiêu Survived

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

targets = train.Survived
train.drop(['Survived'],axis=1,inplace=True)

combined = pd.concat([train,test],ignore_index=True)

combined.shape

(1309, 11)

# **Xử lý dữ liệu**

## **Xóa cột PassengerID** : Vì có vẻ đặc trưng này không giúp ích cho mô hình

In [3]:
combined.drop('PassengerId',axis=1,inplace=True)

## **Tiền xử lý đặc trưng Name và Age**

In [4]:
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False) #Lấy ra hết danh từ danh xưng thành một đặc trưng title

In [5]:
combined['Title'].value_counts()

Title
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Major         2
Mlle          2
Ms            2
Mme           1
Don           1
Sir           1
Lady          1
Capt          1
Countess      1
Jonkheer      1
Dona          1
Name: count, dtype: int64

In [6]:
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

combined['Title'] = combined['Title'].map(Title_Dictionary) #Map lại cho title ít giá trị hơn.


In [7]:
display(combined[combined['Title'].isna()])

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,


### fill NaN cho đặc trưng Age dựa vào danh xưng của tên
- để tránh data leakage từ tập test. Tả sẽ dùng trung vị của tập train rồi sau đó thêm các giá trị thiếu của tập test bằng tập train

In [8]:
grouped_train = combined.iloc[:891].groupby(['Sex','Title'])
grouped_median_train = grouped_train['Age'].median()
grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Title', 'Age']]
grouped_median_train

Unnamed: 0,Sex,Title,Age
0,female,Miss,21.0
1,female,Mrs,35.0
2,female,Officer,49.0
3,female,Royalty,40.5
4,male,Master,3.5
5,male,Mr,30.0
6,male,Officer,50.5
7,male,Royalty,40.0


- Fill vào cho đặc trưng bị thiếu Age của train và test

In [9]:
lookup_table = grouped_median_train.set_index(['Sex','Title'])['Age']

In [10]:
def fill_age(row):
    if pd.notna(row['Age']):
        return row['Age']
    lookup_keys = (row['Sex'],row['Title'])
    return lookup_table.loc[lookup_keys]
    
combined['Age'] = combined.apply(fill_age,axis=1)

### Tiếp theo ta sẽ dùng phương pháp One-hot encoding cho đặc trưng đã tạo

In [11]:
#Drop cột không cần thiết
combined.drop('Name',axis=1,inplace=True)

In [12]:
#Encoding dùng thư viện One-hot
from sklearn.preprocessing import OneHotEncoder
def ApplyOneHotEncoder(combined,feat):
    encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')    
    encoder.fit(combined[[feat]])
    title_encoded = encoder.transform(combined[[feat]])
    title_feature_names = encoder.get_feature_names_out([feat])
    combined_encoded = pd.DataFrame(title_encoded, columns=title_feature_names)
    combined_encoded.index = combined.index
    combined = pd.concat([combined,combined_encoded],axis=1)
    combined.drop(feat,axis=1,inplace=True)
    return combined

combined = ApplyOneHotEncoder(combined,'Title')

In [13]:
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Title_nan
0,3,male,22.0,1,0,A/5 21171,7.25,,S,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1,female,35.0,1,0,113803,53.1,C123,S,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3,male,35.0,0,0,373450,8.05,,S,0.0,0.0,1.0,0.0,0.0,0.0,0.0


Sau khi OneHot Encoding xong thì sẽ thấy không còn đặc trưng Name. Các đặc trưng mới xuất hiện Title_X VD: nếu Title_X = 1 thì người đó sẽ có danh từ xưng hô là Mr

## Xử lý Fare

In [14]:
combined['Fare'].isna().sum()

np.int64(1)

In [15]:
combined["Fare"].fillna(combined['Fare'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined["Fare"].fillna(combined['Fare'].mean(),inplace=True)


## Xử lý Embarked

In [16]:
combined['Embarked'].isna().sum()

np.int64(2)

In [17]:
combined['Embarked'] = combined['Embarked'].fillna('S')

In [18]:
combined = ApplyOneHotEncoder(combined,'Embarked')

In [19]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


In [20]:
train_cabin, test_cabin = set(), set()
for char in combined['Cabin']:
    try:
        train_cabin.add(char[0])
    except:
        train_cabin.add('U')

for char in combined['Cabin']:
    try:
        test_cabin.add(char[0])
    except:
        test_cabin.add('U')

In [21]:
print(train_cabin)

{'D', 'F', 'B', 'T', 'E', 'C', 'A', 'U', 'G'}


In [22]:
print(test_cabin)

{'D', 'F', 'B', 'T', 'E', 'C', 'A', 'U', 'G'}


In [23]:
combined['Cabin'].fillna('U',inplace=True)
combined['Cabin'] = combined['Cabin'].map(lambda c: c[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Cabin'].fillna('U',inplace=True)


## Xử lý cabin

In [24]:
combined = ApplyOneHotEncoder(combined,'Cabin')

In [25]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


## Xử lý giới tính

In [26]:
mapping_gender = {"male":0,"female":1}
combined['Sex'] = combined['Sex'].map(mapping_gender)

In [27]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


## Xử lý Hạng hành khách

In [28]:
combined = ApplyOneHotEncoder(combined,'Pclass')

In [29]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


## Xử lý Ticket

### Lấy ra các chữ cái đầu của mã thẻ.

In [30]:
def process_ticket(df):
    def clean_ticket_prefix(ticket):
        cleaned_ticket = (
            ticket.replace('.', '')
            .replace('/', '')
            .strip()
            .split()
        )
        
        prefix_list = [
            item.strip() 
            for item in cleaned_ticket 
            if not item.strip().isdigit()
        ]

        if prefix_list:
            return prefix_list[0]
        else: 
            return 'XXX'
    
    # Tạo cột mới và trả về DataFrame
    df['Ticket_Prefix'] = df['Ticket'].apply(clean_ticket_prefix)
    df.drop(['Ticket'],axis=1,inplace=True)
    return df

combined = process_ticket(combined)

In [31]:
combined['Ticket_Prefix'].unique()

array(['A5', 'PC', 'STONO2', 'XXX', 'PP', 'CA', 'SCParis', 'SCA4', 'A4',
       'SP', 'SOC', 'WC', 'SOTONOQ', 'WEP', 'STONO', 'C', 'SCPARIS',
       'SOP', 'Fa', 'LINE', 'FCC', 'SWPP', 'SCOW', 'PPP', 'SC', 'SCAH',
       'AS', 'SOPP', 'FC', 'SOTONO2', 'CASOTON', 'SCA3', 'STONOQ', 'AQ4',
       'A', 'LP', 'AQ3'], dtype=object)

In [32]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


### Chuyển thành OneHot

In [33]:
combined = ApplyOneHotEncoder(combined,'Ticket_Prefix')

In [34]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


## Xử lý SibSp + Parch
**Gộp biến SibSp và Parch** vì có liên quan đến nhau. Gộp theo công thức sau:

**FamilySize** = Sibsp + Parch + 1 (Cộng 1 là vì cộng chính hành khách đó)

In [35]:
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1

In [36]:
show(combined)

0
Loading ITables v2.5.2 from the internet...  (need help?)


In [37]:
combined['Singleton'] = combined['FamilySize'].map(lambda s: 1 if s == 1 else 0)
combined['SmallFamily'] = combined['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
combined['LargeFamily'] = combined['FamilySize'].map(lambda s: 1 if 5 <= s else 0)

In [38]:
combined.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,...,Ticket_Prefix_STONO2,Ticket_Prefix_STONOQ,Ticket_Prefix_SWPP,Ticket_Prefix_WC,Ticket_Prefix_WEP,Ticket_Prefix_XXX,FamilySize,Singleton,SmallFamily,LargeFamily
0,0,22.0,1,0,7.25,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1,0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1,0
2,1,26.0,0,0,7.925,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0
3,1,35.0,1,0,53.1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2,0,1,0
4,0,35.0,0,0,8.05,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0


# Lưu lại vào exps

## Tạo thư mục

In [39]:
exp_dir = "../exps"
if os.path.exists(exp_dir) == False:
    os.makedir(exp_dir,exist_ok=True)

save_dir = f"{exp_dir}/feature_Labels"
os.makedirs(save_dir,exist_ok=True)

## Lưu dữ liệu Features

In [40]:
combined.to_csv(f'{save_dir}/combined_OnehotV2.csv',index=False)