In [1]:
!nvidia-smi

Sun Aug 21 09:18:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [40]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, QuantileTransformer,RobustScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
warnings.filterwarnings("ignore")

In [41]:
def num_cat_split(df):
    num_col = list(df.select_dtypes(include=['number']).columns)
    cat_col = list(set(df.columns) - set(num_col))

    print(f"Numerical columns : \n{num_col}")
    print("")
    print(f"Categorical columns :\n {cat_col}")
    
    return num_col, cat_col

def cabin_letter(x):
    if isinstance(x, str):
        return x[0]
    else:
        return str(np.nan)
    
def featuring(DF):
    DF.drop(['Name','PassengerId', 'Ticket'], axis=1, inplace=True)
    DF['Cabin'] = DF['Cabin'].apply(cabin_letter)
    DF['Embarked'] = DF['Embarked'].str.upper()
    return DF.copy()

In [43]:
DF = pd.read_csv('./train.csv')
DF

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [44]:
p = 0.7 # columns with more than 30 n/a rows 
df_rows = DF.shape[0]
na_freq = DF.isna().sum().sort_values(ascending=False) / df_rows
na_freq = na_freq[na_freq > 0.3]

In [45]:
na_columns = na_freq.index
na_num_col, na_cat_col = num_cat_split(DF[na_columns])

Numerical columns : 
[]

Categorical columns :
 ['Cabin']


In [46]:
num_col , _ = num_cat_split(DF)

Numerical columns : 
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Categorical columns :
 ['Embarked', 'Name', 'Cabin', 'Ticket', 'Sex']


In [48]:
_ , cat_col = num_cat_split(DF)

Numerical columns : 
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Categorical columns :
 ['Embarked', 'Name', 'Cabin', 'Ticket', 'Sex']


In [49]:
DF = featuring(DF)
DF

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C,C


In [50]:
X = DF.drop(['Survived'], axis=1).copy()
y = DF.Survived

sample = RandomOverSampler(sampling_strategy='minority')

X_new, y_new = sample.fit_resample(X, y)

In [51]:
X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new, test_size=.2, random_state=77, stratify=y_new)

In [54]:
num_col, cat_col = num_cat_split(X)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('numerical_imputer', SimpleImputer(strategy='median')),
    ('quantile', QuantileTransformer(n_quantiles=100, output_distribution='normal')),
    #('robust', RobustScaler((5,95))),
    #('scaler', StandardScaler()),
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('categorical_imputer', SimpleImputer(fill_value='NA', strategy='constant')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num1', numerical_transformer, num_col),
        ('cat', categorical_transformer, cat_col)
    ])

Numerical columns : 
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Categorical columns :
 ['Embarked', 'Cabin', 'Sex']


In [55]:
X_train__transformed = preprocessor.fit_transform(X_train)
X_train__transformed

array([[ 5.19933758, -0.83442701,  0.79908276, ...,  1.        ,
         0.        ,  1.        ],
       [ 5.19933758, -2.04959427, -5.19933758, ...,  1.        ,
         1.        ,  0.        ],
       [-0.28221615, -0.28221615, -5.19933758, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [-0.28221615,  0.74673073,  0.79908276, ...,  1.        ,
         0.        ,  1.        ],
       [-5.19933758,  0.78178075, -5.19933758, ...,  0.        ,
         0.        ,  1.        ],
       [ 5.19933758,  0.01266008,  0.79908276, ...,  1.        ,
         1.        ,  0.        ]])

In [56]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.input = nn.Linear(input_size, hidden_size)
        self.h1 = nn.Linear(hidden_size, hidden_size // 2)
        self.h2 = nn.Linear(hidden_size // 2, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.drop = nn.Dropout(0.3)

        
    def forward(self, x):
        '''Forward propagation'''
        out = self.input(x)
#         out = self.drop(out)
        out = self.relu(self.h1(out))
#         out = self.drop(out)
        out = self.sigmoid(self.h2(out))
        return out

In [57]:
input_size = X_train__transformed.shape[1]

model = NeuralNet(input_size, input_size)

In [58]:
criterion = nn.BCELoss() #Binary Cross Entropy
# optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, nesterov=True, momentum=0.8)

In [94]:
epochs = 200
batch_size = 16

# Numpy to tensor
train_sample_y = torch.from_numpy(y_train.values).float().view(-1,1)
train_sample_X = torch.from_numpy(X_train__transformed).float()

train_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
train_loader = torch.utils.data.DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

In [95]:
total_steps = len(train_loader)
for epoch in range(epochs):
    total_loss = 0
    model.train()
    for i, (features, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()
        outputs = model(features) #forward propagation
        
        loss = criterion(outputs, labels) 
        
        # backward propagation
        loss.backward()
        optimizer.step()
        total_loss += loss.data.item()
    print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader)))

Epoch: 0, BCELoss: 0.3151511212641543
Epoch: 1, BCELoss: 0.3074849165298722
Epoch: 2, BCELoss: 0.30181280252608383
Epoch: 3, BCELoss: 0.31549546420574187
Epoch: 4, BCELoss: 0.31034158305688336
Epoch: 5, BCELoss: 0.3130348047072237
Epoch: 6, BCELoss: 0.31271033950827337
Epoch: 7, BCELoss: 0.30975683317943054
Epoch: 8, BCELoss: 0.31015641303224994
Epoch: 9, BCELoss: 0.31632099564779886
Epoch: 10, BCELoss: 0.3178364726630124
Epoch: 11, BCELoss: 0.3176340359178456
Epoch: 12, BCELoss: 0.31144479716365986
Epoch: 13, BCELoss: 0.31084369962865654
Epoch: 14, BCELoss: 0.305103592032736
Epoch: 15, BCELoss: 0.3169666520573876
Epoch: 16, BCELoss: 0.3138429441235282
Epoch: 17, BCELoss: 0.30695190903815356
Epoch: 18, BCELoss: 0.31217921552332967
Epoch: 19, BCELoss: 0.3057590183886615
Epoch: 20, BCELoss: 0.2988995126702569
Epoch: 21, BCELoss: 0.31231021041219886
Epoch: 22, BCELoss: 0.31616709367795426
Epoch: 23, BCELoss: 0.3120511289347302
Epoch: 24, BCELoss: 0.3098180036653172
Epoch: 25, BCELoss: 0.3

In [96]:
X_valid__transformed = preprocessor.transform(X_valid).astype('float32')

#predictions
# model.eval()
preds = model(torch.from_numpy(X_valid__transformed)).squeeze().detach().numpy().round()

# model performence
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       110
           1       0.93      0.80      0.86       110

    accuracy                           0.87       220
   macro avg       0.88      0.87      0.87       220
weighted avg       0.88      0.87      0.87       220



In [97]:
# Save
torch.save(model.state_dict(), 'titanic_model.pth')

# Load
_model = NeuralNet(input_size, input_size)
_model.load_state_dict(torch.load('titanic_model.pth'))
_model.eval()

NeuralNet(
  (input): Linear(in_features=19, out_features=19, bias=True)
  (h1): Linear(in_features=19, out_features=9, bias=True)
  (h2): Linear(in_features=9, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
  (drop): Dropout(p=0.3, inplace=False)
)

In [98]:
#predictions
_preds = _model(torch.from_numpy(X_valid__transformed)).squeeze().detach().numpy().round()

# model performence
print(classification_report(y_valid, _preds))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       110
           1       0.93      0.80      0.86       110

    accuracy                           0.87       220
   macro avg       0.88      0.87      0.87       220
weighted avg       0.88      0.87      0.87       220



In [99]:
X_test = pd.read_csv('./test.csv')
Id = X_test['PassengerId']
X_test = featuring(X_test)
X_test = preprocessor.transform(X_test).astype('float32')
preds_test = model(torch.from_numpy(X_test)).squeeze().detach().numpy().round().astype('int')
# Save test predictions to file
output = pd.DataFrame({'PassengerId': Id,
                       'Survived': preds_test})
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [100]:
output.to_csv('titanic_submission.csv', index=False)