In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df

In [2]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv', index_col=0)
train_df.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_df.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

mean, std = dict(), dict()

def age_type(age):
    if age <= 1:
        return "newborn"
    if age <= 8:
        return "child"
    if age <= 13:
        return "teenager"
    if age <= 18:
        return "old teenager"
    if age <= 23:
        return "very young"
    if age <= 75:
        return "old"
    return "very old"


def family_type(family_size):
    if family_size <= 1:
        return "solo"
    if family_size <= 2:
        return "couple"
    if family_size <= 4:
        return "average family"
    return "big family"
    

def normalize(df, train):
    df["AgeType"] = df["Age"].apply(age_type)
    df["FamilySize"] = (df["SibSp"] + df["Parch"] + 1)
    df["FamilyType"] = df["FamilySize"].apply(family_type)
    for col in df:
        if col == 'Survived':
            continue
        if df[col].dtype == object:
            le.fit(df[col])
            df[col] = le.transform(df[col])
            
        df[col] = df[col].astype(float)
        if train:
            mean[col] = df[col].mean()
            std[col] = df[col].std()
        
        df[col].fillna(mean[col], inplace=True)        
        df[col] = (df[col] - mean[col]) / std[col]    
    
#     df.drop('Name', axis=1, inplace=True)    

In [5]:
normalize(train_df, True)
print(abs(train_df.corr()['Survived']).sort_values())

FamilySize    0.016639
SibSp         0.035322
Name          0.057343
Age           0.069809
Parch         0.081629
AgeType       0.137737
Embarked      0.163517
Ticket        0.164549
FamilyType    0.186737
Cabin         0.254888
Fare          0.257307
Pclass        0.338481
Sex           0.543351
Survived      1.000000
Name: Survived, dtype: float64


In [6]:
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import torch


def get_probabilities(df):
    survived, nsurvived = tensor(df['Survived'].values), tensor((1 - df['Survived']).values)
    survived, nsurvived = survived.view(survived.shape[0], 1), nsurvived.view(nsurvived.shape[0], 1)
    return torch.cat((nsurvived, survived), axis=1).float()

class CustomDataset(Dataset):
    def __init__(self, df):        
        self.X = tensor(df.drop('Survived', axis=1).values)
        self.Y = get_probabilities(df)
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        return self.X[index], self.Y[index]
    

In [7]:
from torch import nn
class Net(nn.Module):
    def __init__(self, input_size, hidden_size1, dropout1, hidden_size2, dropout2, hidden_size3, dropout3, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(hidden_size1, dtype=float),
            nn.ReLU(),        
        )
        
        self.dropout1 = nn.Dropout(dropout1)
        
        self.fc2 = nn.Sequential(
            nn.LazyLinear(hidden_size2, dtype=float),
            nn.ReLU(),
        )
        
        self.dropout2 = nn.Dropout(dropout2)
        
        self.fc3 = nn.Sequential(
            nn.LazyLinear(hidden_size3, dtype=float),
            nn.ReLU(),
        )
        
        self.dropout3 = nn.Dropout(dropout3)
        
        self.fc4 = nn.LazyLinear(output_size, dtype=float)
        
                
        
    def forward(self, x):                
        y = self.fc1(x)
        if self.train:
            y = self.dropout1(y)
            
        y = self.fc2(y)        
        if self.train:
            y = self.dropout2(y)
            
        y = self.fc3(y)
        
        if self.train:
            y = self.dropout3(y)
            
        y = self.fc4(y)
        return y


In [8]:
hparams = {
    'input_size': train_df.shape[1] - 2,
    'hidden_size1': 100,
    'dropout1': 0,
    'hidden_size2': 500,
    'dropout2': 0.5,
    'hidden_size3': 250,
    'dropout3': 0.25,
    'output_size': 2,    
}

In [9]:
criterion = torch.nn.CrossEntropyLoss()

In [10]:
def train(df, ec, lr, wd):
    model = Net(**hparams)
    model.train()
    
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
    dataset = CustomDataset(df)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    for epoch in range(ec):
        correct = 0
        total = 0

        for batch in dataloader:
            x_tensor, y_tensor = batch                                           
            y_pred = model(x_tensor)
            loss = criterion(y_pred, y_tensor)        

            predictions = torch.argmax(y_pred, dim=1)
            targets = torch.argmax(y_tensor, dim=1)
            total += predictions.shape[0]
            correct += (predictions==targets).sum().item()

            assert not torch.isnan(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        if epoch == ec - 1:
            print(f'{epoch:} {round(100*correct / total, 2)}')

    return model
    

In [11]:
def k_fold(df, ec, lr, wd, k = 5):
    fold_indices = np.array_split(np.arange(len(df)), k)    
    total = 0
    correct = 0
    
    for fold in range(k):
        test_indices = fold_indices[fold]
        train_indices = np.concatenate(fold_indices[:fold] + fold_indices[fold+1:])
        
        model = train(df.iloc[train_indices], ec, lr, wd)        
        model.eval()
        
        x_tensor, y_tensor = tensor(df.values[test_indices, :-1]), get_probabilities(df.iloc[test_indices])
        y_pred = model(x_tensor)    
        correct += (torch.argmax(y_pred, dim=1) == torch.argmax(y_tensor, dim=1)).sum().item()
        total += y_pred.shape[0]
    
    return round(100*correct / total, 2)

In [12]:
# candidates = [0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

# for candidate in candidates:    
#     print(f'wd={candidate} --> {k_fold(train_df, ec=250, lr=1.5e-2, wd=candidate)}')

In [13]:
model = train(train_df, ec=250, lr=1.5e-2, wd=1e-5)
model.eval()



249 89.56


Net(
  (fc1): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=13, out_features=100, bias=True)
    (2): ReLU()
  )
  (dropout1): Dropout(p=0, inplace=False)
  (fc2): Sequential(
    (0): Linear(in_features=100, out_features=500, bias=True)
    (1): ReLU()
  )
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Sequential(
    (0): Linear(in_features=500, out_features=250, bias=True)
    (1): ReLU()
  )
  (dropout3): Dropout(p=0.25, inplace=False)
  (fc4): Linear(in_features=250, out_features=2, bias=True)
)

In [14]:
test_df = pd.read_csv('/kaggle/input/titanic/test.csv', index_col=0)

In [15]:
normalize(test_df, True)
test_df.describe()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeType,FamilySize,FamilyType
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,-3.0810020000000004e-17,0.0,-3.8246920000000005e-17,3.505967e-17,-1.381139e-17,-5.5245550000000004e-17,-1.184592e-16,-1.9123460000000003e-17,1.019918e-16,-9.774212000000001e-17,-1.083663e-16,-6.586969e-17,2.4966740000000003e-17
std,1.0,1.0,1.0,0.8909349,1.0,1.0,1.0,0.9988002,1.0,1.0,1.0,1.0,1.0
min,-1.503319,-1.725844,-1.321292,-2.12271,-0.4988722,-0.3997686,-1.682681,-0.6372515,-3.532364,-1.640632,-1.90096,-0.5527802,-1.953925
25%,-1.503319,-0.862922,-1.321292,-0.5128329,-0.4988722,-0.3997686,-0.8899063,-0.496022,0.4484846,-0.4703517,-0.7281169,-0.5527802,-0.1918093
50%,0.8724364,0.0,0.7550242,0.0,-0.4988722,-0.3997686,0.0005116897,-0.3787141,0.4484846,0.6999282,-0.7281169,-0.5527802,0.6892487
75%,0.8724364,0.862922,0.7550242,0.3862442,0.6162539,-0.3997686,0.9188279,-0.07382163,0.4484846,0.6999282,1.031148,0.1055164,0.6892487
max,0.8724364,1.725844,0.7550242,3.224507,8.422137,8.770534,1.683704,8.526608,0.4484846,0.6999282,1.61757,6.030186,0.6892487


In [16]:
x_tensor = torch.tensor(test_df.iloc[:, :].values, dtype=float)
x_tensor.shape

torch.Size([418, 13])

In [17]:
with torch.no_grad():
    outputs = model(x_tensor)
    predicted_classes = torch.argmax(outputs, dim=1)
    result = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predicted_classes})
    result.to_csv('sub.csv', index=False)