In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

In [123]:
class TitanicDataset(Dataset):
    """Map-style dataset exposing a feature array and a label array as tensors.

    Args:
        x_np_arr: NumPy array of features, indexed along its first dimension.
        y_np_arr: NumPy array of labels with the same first-dimension length.
        transform: Optional callable that receives each ``(x, y)`` tuple and
            returns the value handed back to the caller.

    Raises:
        ValueError: If the two arrays disagree on their number of samples.
    """

    def __init__(self, x_np_arr, y_np_arr, transform=None):
        # torch.from_numpy keeps the source dtype and shares memory with the
        # NumPy arrays instead of copying them.
        self.x = torch.from_numpy(x_np_arr)
        self.y = torch.from_numpy(y_np_arr)
        # Compare only the leading dimension; slicing the shape (rather than
        # indexing it) keeps the check safe for 0-d inputs as well.
        if self.x.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                "x and y must have the same number of samples, got shapes "
                f"{tuple(self.x.shape)} and {tuple(self.y.shape)}"
            )
        self.transform = transform

    def __len__(self):
        """Return the number of samples (first dimension of the features)."""
        return self.x.shape[0]

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair at ``index``, transformed if configured.

        ``index`` is forwarded to the underlying tensors, so anything they
        accept (an integer, a slice, ...) works here too.
        """
        sample = self.x[index], self.y[index]
        # Identity check: `!= None` would dispatch to a custom __ne__/__eq__.
        if self.transform is not None:
            return self.transform(sample)
        return sample

In [51]:
# Load the raw passenger table; the path is relative to the current working directory.
df = pd.read_csv("./titanic.csv")

In [52]:
# Build the feature frame: drop the target ('Survived') together with the
# columns that are not used as features.
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'])
# Per-column count of missing values.
print(X.isna().sum())
# Preview the first rows (shown as the cell's output).
X.head()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [53]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [54]:
# Preprocessing: mean-impute Age, one-hot encode Sex and Embarked, and pass the
# remaining columns through unchanged.
imputer = SimpleImputer(strategy='mean')
ohe = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('impute', imputer, ['Age']),
                  ('ohe', ohe, ['Sex', 'Embarked'])],
    remainder='passthrough',
    # OneHotEncoder produces sparse output by default; sparse_threshold=0 makes
    # the combined result always a dense ndarray, which torch.from_numpy needs.
    sparse_threshold=0,
)
# NOTE: this rebinds X from the DataFrame to the transformed array, so the cell
# is not idempotent -- re-run the cell that builds X from df before re-running.
# NOTE(review): Embarked has missing values that are not imputed here; they
# presumably end up as their own one-hot category -- confirm this is intended.
X = transformer.fit_transform(X)

In [55]:
# (rows, columns) of the transformed feature array.
X.shape

(891, 11)

In [56]:
# Target labels: the 'Survived' column of the raw frame.
Y = df['Survived']
# Fail fast on missing labels. A bare `Y.isna().sum()` in the middle of a cell
# is evaluated and thrown away, so it neither displayed nor verified anything.
assert not Y.isna().any(), "Survived contains missing labels"
# Convert to a NumPy array so it can be handed to torch.from_numpy.
Y = Y.to_numpy()

In [102]:
# Wrap the feature and label arrays in the Dataset, with no transform.
dataset = TitanicDataset(X, Y)

In [103]:
# The slice is forwarded to the underlying tensors, so this returns the first
# ten samples as one (features, labels) pair of batched tensors.
dataset[0:10]

(tensor([[22.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,
           1.0000,  0.0000,  7.2500],
         [38.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  1.0000,
           1.0000,  0.0000, 71.2833],
         [26.0000,  1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,
           0.0000,  0.0000,  7.9250],
         [35.0000,  1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,
           1.0000,  0.0000, 53.1000],
         [35.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,
           0.0000,  0.0000,  8.0500],
         [29.6991,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  3.0000,
           0.0000,  0.0000,  8.4583],
         [54.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,
           0.0000,  0.0000, 51.8625],
         [ 2.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,
           3.0000,  1.0000, 21.0750],
         [27.0000,  1.0000,  0.0000,  0.

In [124]:
class Scaler:
    """Callable transform that multiplies both parts of a sample by a constant.

    Args:
        scale_factor: Multiplier applied to the first two elements of each
            sample. Defaults to 1, which leaves values unchanged.
    """

    def __init__(self, scale_factor=1):
        self.scale_factor = scale_factor

    def __call__(self, sample):
        """Return ``(sample[0] * factor, factor * sample[1])`` as a tuple.

        Only the first two elements of ``sample`` are read; note that the
        second element is scaled as well, not just the first.
        """
        factor = self.scale_factor
        scaled_first = sample[0] * factor
        scaled_second = factor * sample[1]
        return scaled_first, scaled_second


# Same data, but every sample is passed through Scaler(2) on access.
# Note that Scaler multiplies the label as well as the features.
dataset_scaled = TitanicDataset(X, Y, transform=Scaler(2))

In [127]:
# Sample at index 1 with the Scaler(2) transform applied.
dataset_scaled[1]

(tensor([ 76.0000,   2.0000,   0.0000,   2.0000,   0.0000,   0.0000,   0.0000,
           2.0000,   2.0000,   0.0000, 142.5666], dtype=torch.float64),
 tensor(2))

In [128]:
# The same sample from the untransformed dataset, for comparison.
dataset[1]

(tensor([38.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          1.0000,  0.0000, 71.2833], dtype=torch.float64),
 tensor(1))

tensor([2., 4., 6.], dtype=torch.float64)