<a href="https://colab.research.google.com/github/BanhMiRuoc/CNN_titanic/blob/main/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

In [5]:
data = pd.read_csv('/content/train.csv')
data.head()
print(data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [11]:
#tiền xử lý dữ liệu
## xây dựng lớp titanicDataset
class TitanicDataset(Dataset):

  #Định nghĩa hàm khởi tạo
  def __init__(self, data_path, train=True):

    self.train = train
    self.data = pd.read_csv(data_path)

  def select_features(self):

    columns_to_drop = ['Name', 'Ticket', 'Cabin', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Fare']
    self.data = self.data.drop(columns_to_drop, axis=1)
    selected_columns = ['PassengerId', 'Survived','Sex', 'Age']

    self.data = self.data[selected_columns]
    self.data['Sex'] = self.data['Sex'].map({'male': 0, 'female': 1})


    return self.data

  def handle_missing_values(self):

    self.data['Age'] = self.data['Age'].fillna(self.data['Age'].mean())

  def preprocess(self):
    X = self.data.drop('Survived', axis=1)
    y = self.data['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    numeric_features = ['Age']
    categorical_features = ['Sex', 'PassengerId']

    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    X_train = pipeline.fit_transform(X_train)
    X_test = pipeline.transform(X_test)

    return X_train, X_test, y_train, y_test


In [12]:
data_path = '/content/train.csv'
titanic_dataset = TitanicDataset(data_path)
titanic_dataset.select_features()
titanic_dataset.handle_missing_values()
X_train, X_test, y_train, y_test = titanic_dataset.preprocess()
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Sample X_train:", X_train[:5])

Shape of X_train: (712, 715)
Shape of X_test: (179, 715)
Sample X_train: [[ 1.22920747  1.          0.         ...  0.          0.
   0.        ]
 [-0.50350514  1.          0.         ...  0.          0.
   0.        ]
 [ 0.18957991  1.          0.         ...  0.          0.
   0.        ]
 [-0.27247679  1.          0.         ...  0.          0.
   0.        ]
 [-1.81266577  0.          1.         ...  0.          0.
   0.        ]]
