In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df

In [2]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv', index_col=0)
test_df = pd.read_csv('/kaggle/input/titanic/test.csv', index_col=0)
merged_df = train_df.append(test_df, ignore_index=True)
merged_df.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
train_df['Survived'].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [4]:
train_df.groupby('Sex')['Survived'].value_counts(normalize=True)

Sex     Survived
female  1           0.742038
        0           0.257962
male    0           0.811092
        1           0.188908
Name: Survived, dtype: float64

In [5]:
merged_df['Sex'] = merged_df['Sex'].apply(lambda s: 1 if s == 'female' else 0)

In [6]:
def get_title(name):
    title = "#"
    start = 0
    while title[-1] != '.':
        while name[start] == ' ':
            start += 1
            
        end = start
        while name[end] != ' ':
            end += 1
        
        title = name[start:end]
        start = end
    return title

titles = ['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Rev.']
for title in titles:
    merged_df[title] = merged_df['Name'].apply(lambda name: 1 if get_title(name) == title else 0)
merged_df.drop('Name', axis=1, inplace=True)

In [7]:
merged_df['Newborn'] = merged_df['Age'].apply(lambda age: 1 if age <= 1 else 0)
merged_df.drop('Age', axis=1, inplace=True)

In [8]:
merged_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)

In [9]:
train_df.groupby(['Pclass', 'Sex'])['Survived'].value_counts(normalize=True)

Pclass  Sex     Survived
1       female  1           0.968085
                0           0.031915
        male    0           0.631148
                1           0.368852
2       female  1           0.921053
                0           0.078947
        male    0           0.842593
                1           0.157407
3       female  0           0.500000
                1           0.500000
        male    0           0.864553
                1           0.135447
Name: Survived, dtype: float64

In [10]:
pclasses = [('p1', 1), ('p2', 2), ('p3', 3)]
for (col, val) in pclasses:
    merged_df[col] = merged_df['Pclass'].apply(lambda p: 1 if p == val else 0)

merged_df.drop('Pclass', axis=1, inplace=True)
merged_df.head(5)

Unnamed: 0,Survived,Sex,Ticket,Fare,Cabin,Embarked,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3
0,0.0,0,A/5 21171,7.25,,S,1,0,0,0,0,0,0,0,1
1,1.0,1,PC 17599,71.2833,C85,C,0,1,0,0,0,0,1,0,0
2,1.0,1,STON/O2. 3101282,7.925,,S,0,0,1,0,0,0,0,0,1
3,1.0,1,113803,53.1,C123,S,0,1,0,0,0,0,1,0,0
4,0.0,0,373450,8.05,,S,1,0,0,0,0,0,0,0,1


In [11]:
merged_df['TicketNumber'] = merged_df['Ticket'].apply(lambda t: merged_df['Ticket'].value_counts()[t])
merged_df.drop('Ticket', axis=1, inplace=True)
merged_df.head(10)

Unnamed: 0,Survived,Sex,Fare,Cabin,Embarked,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber
0,0.0,0,7.25,,S,1,0,0,0,0,0,0,0,1,1
1,1.0,1,71.2833,C85,C,0,1,0,0,0,0,1,0,0,2
2,1.0,1,7.925,,S,0,0,1,0,0,0,0,0,1,1
3,1.0,1,53.1,C123,S,0,1,0,0,0,0,1,0,0,2
4,0.0,0,8.05,,S,1,0,0,0,0,0,0,0,1,1
5,0.0,0,8.4583,,Q,1,0,0,0,0,0,0,0,1,1
6,0.0,0,51.8625,E46,S,1,0,0,0,0,0,1,0,0,2
7,0.0,0,21.075,,S,0,0,0,1,0,0,0,0,1,5
8,1.0,1,11.1333,,S,0,1,0,0,0,0,0,0,1,3
9,1.0,1,30.0708,,C,0,1,0,0,0,0,0,1,0,2


In [12]:
merged_df['Fare'].fillna(merged_df['Fare'].mean(), inplace=True)
merged_df['Fare'] = merged_df['Fare'] / merged_df['TicketNumber']
merged_df['TicketNumber'] = (merged_df['TicketNumber'] - merged_df['TicketNumber'].mean()) / merged_df['TicketNumber'].std()
merged_df['Fare'] = (merged_df['Fare'] - merged_df['Fare'].mean()) / merged_df['Fare'].std()
merged_df.head(10)

Unnamed: 0,Survived,Sex,Fare,Cabin,Embarked,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber
0,0.0,0,-0.554699,,S,1,0,0,0,0,0,0,0,1,-0.618937
1,1.0,1,1.53906,C85,C,0,1,0,0,0,0,1,0,0,-0.057086
2,1.0,1,-0.50492,,S,0,0,1,0,0,0,0,0,1,-0.618937
3,1.0,1,0.868591,C123,S,0,1,0,0,0,0,1,0,0,-0.057086
4,0.0,0,-0.495702,,S,1,0,0,0,0,0,0,0,1,-0.618937
5,0.0,0,-0.465592,,Q,1,0,0,0,0,0,0,0,1,-0.618937
6,0.0,0,0.822961,E46,S,1,0,0,0,0,0,1,0,0,-0.057086
7,0.0,0,-0.778516,,S,0,0,0,1,0,0,0,0,1,1.628466
8,1.0,1,-0.815677,,S,0,1,0,0,0,0,0,0,1,0.504764
9,1.0,1,0.01944,,C,0,1,0,0,0,0,0,1,0,-0.057086


In [13]:
merged_df.drop('Cabin', axis=1, inplace=True)
merged_df.head(10)

Unnamed: 0,Survived,Sex,Fare,Embarked,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber
0,0.0,0,-0.554699,S,1,0,0,0,0,0,0,0,1,-0.618937
1,1.0,1,1.53906,C,0,1,0,0,0,0,1,0,0,-0.057086
2,1.0,1,-0.50492,S,0,0,1,0,0,0,0,0,1,-0.618937
3,1.0,1,0.868591,S,0,1,0,0,0,0,1,0,0,-0.057086
4,0.0,0,-0.495702,S,1,0,0,0,0,0,0,0,1,-0.618937
5,0.0,0,-0.465592,Q,1,0,0,0,0,0,0,0,1,-0.618937
6,0.0,0,0.822961,S,1,0,0,0,0,0,1,0,0,-0.057086
7,0.0,0,-0.778516,S,0,0,0,1,0,0,0,0,1,1.628466
8,1.0,1,-0.815677,S,0,1,0,0,0,0,0,0,1,0.504764
9,1.0,1,0.01944,C,0,1,0,0,0,0,0,1,0,-0.057086


In [14]:
merged_df['EC'] = merged_df['Embarked'].apply(lambda e: 1 if e == 'C' else 0)
merged_df['ES'] = merged_df['Embarked'].apply(lambda e: 1 if e == 'S' else 0)
merged_df['EQ'] = merged_df['Embarked'].apply(lambda e: 1 if e == 'Q' else 0)
merged_df.drop('Embarked', axis=1, inplace=True)
merged_df.head(10)

Unnamed: 0,Survived,Sex,Fare,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber,EC,ES,EQ
0,0.0,0,-0.554699,1,0,0,0,0,0,0,0,1,-0.618937,0,1,0
1,1.0,1,1.53906,0,1,0,0,0,0,1,0,0,-0.057086,1,0,0
2,1.0,1,-0.50492,0,0,1,0,0,0,0,0,1,-0.618937,0,1,0
3,1.0,1,0.868591,0,1,0,0,0,0,1,0,0,-0.057086,0,1,0
4,0.0,0,-0.495702,1,0,0,0,0,0,0,0,1,-0.618937,0,1,0
5,0.0,0,-0.465592,1,0,0,0,0,0,0,0,1,-0.618937,0,0,1
6,0.0,0,0.822961,1,0,0,0,0,0,1,0,0,-0.057086,0,1,0
7,0.0,0,-0.778516,0,0,0,1,0,0,0,0,1,1.628466,0,1,0
8,1.0,1,-0.815677,0,1,0,0,0,0,0,0,1,0.504764,0,1,0
9,1.0,1,0.01944,0,1,0,0,0,0,0,1,0,-0.057086,1,0,0


In [15]:
train_df = merged_df.iloc[:len(train_df)]
train_df.describe()

Unnamed: 0,Survived,Sex,Fare,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber,EC,ES,EQ
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,0.352413,-0.016276,0.580247,0.140292,0.204265,0.044893,0.006734,0.015713,0.242424,0.20651,0.551066,0.011017,0.188552,0.722783,0.08642
std,0.486592,0.47799,1.0011,0.493796,0.347485,0.40339,0.207186,0.08183,0.124431,0.42879,0.405028,0.497665,1.009455,0.391372,0.447876,0.281141
min,0.0,0.0,-1.089354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.618937,0.0,0.0,0.0
25%,0.0,0.0,-0.536572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.618937,0.0,0.0,0.0
50%,0.0,0.0,-0.495702,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.618937,0.0,1.0,0.0
75%,1.0,1.0,-0.069821,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.504764,0.0,1.0,0.0
max,1.0,1.0,8.356147,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.99957,1.0,1.0,1.0


In [16]:
test_df = merged_df.iloc[len(train_df):]
test_df.describe()

Unnamed: 0,Survived,Sex,Fare,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber,EC,ES,EQ
count,0.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,,0.363636,0.034693,0.574163,0.172249,0.186603,0.050239,0.004785,0.019139,0.255981,0.222488,0.521531,-0.023483,0.244019,0.645933,0.110048
std,,0.481622,0.997961,0.495062,0.378049,0.390059,0.2187,0.069088,0.137177,0.436934,0.416416,0.500135,0.980325,0.430019,0.478803,0.313324
min,,0.0,-1.089354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.618937,0.0,0.0,0.0
25%,,0.0,-0.519902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.618937,0.0,0.0,0.0
50%,,0.0,-0.450533,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.618937,0.0,1.0,0.0
75%,,1.0,0.446397,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.057086,0.0,1.0,0.0
max,,1.0,8.356147,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.99957,1.0,1.0,1.0


In [17]:
test_df.drop('Survived', axis=1, inplace=True)
test_df.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Sex,Fare,Mr.,Mrs.,Miss.,Master.,Rev.,Newborn,p1,p2,p3,TicketNumber,EC,ES,EQ
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.363636,0.034693,0.574163,0.172249,0.186603,0.050239,0.004785,0.019139,0.255981,0.222488,0.521531,-0.023483,0.244019,0.645933,0.110048
std,0.481622,0.997961,0.495062,0.378049,0.390059,0.2187,0.069088,0.137177,0.436934,0.416416,0.500135,0.980325,0.430019,0.478803,0.313324
min,0.0,-1.089354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.618937,0.0,0.0,0.0
25%,0.0,-0.519902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.618937,0.0,0.0,0.0
50%,0.0,-0.450533,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.618937,0.0,1.0,0.0
75%,1.0,0.446397,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.057086,0.0,1.0,0.0
max,1.0,8.356147,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.99957,1.0,1.0,1.0


In [18]:
from torch.utils.data import Dataset, DataLoader
from torch import tensor
import torch


def get_probabilities(df):
    survived, nsurvived = tensor(df['Survived'].values), tensor((1 - df['Survived']).values)
    survived, nsurvived = survived.view(survived.shape[0], 1), nsurvived.view(nsurvived.shape[0], 1)
    return torch.cat((nsurvived, survived), axis=1).float()

class CustomDataset(Dataset):
    def __init__(self, df):        
        self.X = tensor(df.drop('Survived', axis=1).values)
        self.Y = get_probabilities(df)
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        return self.X[index], self.Y[index]
    

In [19]:
from torch import nn
class Net(nn.Module):
    def __init__(self, input_size, hidden_size1, dropout1, hidden_size2, dropout2, hidden_size3, dropout3, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(hidden_size1, dtype=float),
            nn.ReLU(),        
        )
        
        self.dropout1 = nn.Dropout(dropout1)
        
        self.fc2 = nn.Sequential(
            nn.LazyLinear(hidden_size2, dtype=float),
            nn.ReLU(),
        )
        
        self.dropout2 = nn.Dropout(dropout2)
        
        self.fc3 = nn.Sequential(
            nn.LazyLinear(hidden_size3, dtype=float),
            nn.ReLU(),
        )
        
        self.dropout3 = nn.Dropout(dropout3)
        
        self.fc4 = nn.LazyLinear(output_size, dtype=float)
        
                
        
    def forward(self, x):                
        y = self.fc1(x)
        if self.train:
            y = self.dropout1(y)
            
        y = self.fc2(y)        
        if self.train:
            y = self.dropout2(y)
            
        y = self.fc3(y)
        
        if self.train:
            y = self.dropout3(y)
            
        y = self.fc4(y)
        return y


In [20]:
hparams = {
    'input_size': train_df.shape[1] - 2,
    'hidden_size1': 100,
    'dropout1': 0,
    'hidden_size2': 500,
    'dropout2': 0.5,
    'hidden_size3': 250,
    'dropout3': 0.25,
    'output_size': 2,    
}

In [21]:
criterion = torch.nn.CrossEntropyLoss()

In [22]:
def train(df, ec, lr, wd):
    model = Net(**hparams)
    model.train()
    
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
    dataset = CustomDataset(df)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    for epoch in range(ec):
        correct = 0
        total = 0

        for batch in dataloader:
            x_tensor, y_tensor = batch                                           
            y_pred = model(x_tensor)
            loss = criterion(y_pred, y_tensor)        

            predictions = torch.argmax(y_pred, dim=1)
            targets = torch.argmax(y_tensor, dim=1)
            total += predictions.shape[0]
            correct += (predictions==targets).sum().item()

            assert not torch.isnan(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        if epoch == ec - 1:
            print(f'{epoch:} {round(100*correct / total, 2)}')

    return model
    

In [23]:
def k_fold(df, ec, lr, wd, k = 5):
    fold_indices = np.array_split(np.arange(len(df)), k)    
    total = 0
    correct = 0
    
    for fold in range(k):
        test_indices = fold_indices[fold]
        train_indices = np.concatenate(fold_indices[:fold] + fold_indices[fold+1:])
        
        model = train(df.iloc[train_indices], ec, lr, wd)        
        model.eval()
        
        x_tensor, y_tensor = tensor(df.values[test_indices, :-1]), get_probabilities(df.iloc[test_indices])
        y_pred = model(x_tensor)    
        correct += (torch.argmax(y_pred, dim=1) == torch.argmax(y_tensor, dim=1)).sum().item()
        total += y_pred.shape[0]
    
    return round(100*correct / total, 2)

In [24]:
print(f'--> {k_fold(train_df, ec=250, lr=0.015, wd=1e-5)}')



249 84.83
249 84.57
249 84.15
249 85.83
249 83.03
--> 71.83


In [25]:
model = train(train_df, ec=250, lr=1.5e-2, wd=1e-5)
model.eval()

249 84.51


Net(
  (fc1): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=15, out_features=100, bias=True)
    (2): ReLU()
  )
  (dropout1): Dropout(p=0, inplace=False)
  (fc2): Sequential(
    (0): Linear(in_features=100, out_features=500, bias=True)
    (1): ReLU()
  )
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Sequential(
    (0): Linear(in_features=500, out_features=250, bias=True)
    (1): ReLU()
  )
  (dropout3): Dropout(p=0.25, inplace=False)
  (fc4): Linear(in_features=250, out_features=2, bias=True)
)

In [26]:
x_tensor = torch.tensor(test_df.iloc[:, :].values, dtype=float)
x_tensor.shape

torch.Size([418, 15])

In [27]:
with torch.no_grad():
    outputs = model(x_tensor)
    predicted_classes = torch.argmax(outputs, dim=1)
    result = pd.DataFrame({'PassengerId': test_df.index+1, 'Survived': predicted_classes})
    result.to_csv('sub.csv', index=False)