In [127]:
# // * Copyright (C) 2023 Matthew Favela - All Rights Reserved
# // * You may use, distribute and modify this code under the
# // * terms of the MIT license, which unfortunately won't be
# // * written for another century.
#     // *
# // * You should have received a copy of the MIT license with
#     // * this file. If not, please write to: Chewy42
# // * @author Matthew Favela
# // * @version 1.0
# // * @since 2023-02-17
# // */

In [1]:
#import modules
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
#combine datasets into 1 dataframe
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
df = pd.concat([train_df, test_df], sort=False)
df.to_csv('combinedDataset.csv')

In [4]:
#condense to only neccessary columns & get rid of empty rows
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked']]
df = df.dropna()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked
0,0.0,3,male,22.0,1,7.2500,S
1,1.0,1,female,38.0,1,71.2833,C
2,1.0,3,female,26.0,0,7.9250,S
3,1.0,1,female,35.0,1,53.1000,S
4,0.0,3,male,35.0,0,8.0500,S
...,...,...,...,...,...,...,...
885,0.0,3,female,39.0,0,29.1250,Q
886,0.0,2,male,27.0,0,13.0000,S
887,1.0,1,female,19.0,0,30.0000,S
889,1.0,1,male,26.0,0,30.0000,C


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked
0,0.0,3,male,22.0,1,7.25,S
1,1.0,1,female,38.0,1,71.2833,C
2,1.0,3,female,26.0,0,7.925,S
3,1.0,1,female,35.0,1,53.1,S
4,0.0,3,male,35.0,0,8.05,S
6,0.0,1,male,54.0,0,51.8625,S
7,0.0,3,male,2.0,3,21.075,S
8,1.0,3,female,27.0,0,11.1333,S
9,1.0,2,female,14.0,1,30.0708,C
10,1.0,3,female,4.0,1,16.7,S


In [146]:
le = LabelEncoder() # instantiate the label encoder to encode categorical values

df[['Sex', 'Embarked']] = df[['Sex', 'Embarked']].apply(le.fit_transform) # encode the categorical values
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked']].values.astype(np.float32) # get the features as X and convert to numpy float 32
y = df['Survived'].values.astype(np.float32).reshape(-1, 1) # get the labels as y and convert to numpy float 32. Also reshape to 2D array

In [132]:
scaler = StandardScaler() # instantiate the scaler
X = scaler.fit_transform(X) # scale the features
y = scaler.fit_transform(y) # scale the labels

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # split the data into training and testing sets

In [134]:
# convert the numpy arrays to torch tensors that can be interpreted by the neural network
X_train = torch.from_numpy(X_train)
X_test = torch.from_numpy(X_test)
y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)

In [135]:
class RegressionModel(nn.Module): # create the regression model
    def __init__(self):
        super().__init__() # inherit from the nn.Module class
        self.linear = nn.Linear(6, 1) # create a linear layer with 6 inputs and 1 output. The 6 inputs will be the features (factors into outcome) and the 1 output (prediction) will be the label
        
    def forward(self, x):
        pred = torch.sigmoid(self.linear(x)) # apply the sigmoid function to the linear layer to get the prediction. the sigmoid function will return a value between 0 and 1
        return pred

In [136]:
model = RegressionModel() # instantiate the model
criterion = nn.MSELoss() # instantiate the loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # instantiate the optimizer

In [137]:
epochs = 1000 # number of times the model will train

In [147]:
for i in range(epochs): # train the model
    model.train() # set the model to training mode
    y_pred = model.forward(X_train) # get the prediction
    loss = criterion(y_pred, y_train) # calculate the loss

    #print('epoch: ', i, 'loss: ', loss.item())

    loss.backward() # backpropagate the loss
    optimizer.step() # update the weights
    optimizer.zero_grad() # reset the gradients

model.eval() # set the model to evaluation mode
with torch.no_grad(): # test the model
    y_eval = model.forward(X_test) # get the prediction
    loss = criterion(y_eval, y_test) # calculate the loss
    print('loss: ', np.round((loss.item() * 100)), '%') # print the loss

loss:  73.0 %


In [139]:
#test the model
test_df = pd.read_csv('test.csv') # read the test data
test_df = test_df[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked']] # get the neccessary columns needed to test

#reapply data cleaning procedures
test_df[['Sex', 'Embarked']] = test_df[['Sex', 'Embarked']].apply(le.fit_transform)
test_df['Survived'] = 0

X = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked']].values.astype(np.float32)
X = scaler.fit_transform(X)
X = torch.from_numpy(X)

y = test_df[['Survived']].values.astype(np.float32)
y = scaler.fit_transform(y)
y = torch.from_numpy(y)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,892,3,1,34.5,0,7.8292,1,0
1,893,3,0,47.0,1,7.0,2,0
2,894,2,1,62.0,0,9.6875,1,0
3,895,3,1,27.0,0,8.6625,2,0
4,896,3,0,22.0,1,12.2875,2,0


In [140]:
model.eval()
with torch.no_grad():
    y_eval = model.forward(X)
    loss = criterion(y_eval, y)
    print('loss: ', np.round((loss.item() * 100)), '%')

loss:  nan %


In [141]:
test_df['Survived'] = y_eval

In [142]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,892,3,1,34.5,0,7.8292,1,0.000737
1,893,3,0,47.0,1,7.0,2,0.049068
2,894,2,1,62.0,0,9.6875,1,0.002545
3,895,3,1,27.0,0,8.6625,2,0.001105
4,896,3,0,22.0,1,12.2875,2,0.230029


In [143]:
test_df['Survived'] = test_df['Survived'].apply(lambda x: 1 if x > 0.5 else 0)

In [144]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,Embarked,Survived
0,892,3,1,34.5,0,7.8292,1,0
1,893,3,0,47.0,1,7.0,2,0
2,894,2,1,62.0,0,9.6875,1,0
3,895,3,1,27.0,0,8.6625,2,0
4,896,3,0,22.0,1,12.2875,2,0


In [145]:
#export to csv
test_df = test_df[['PassengerId', 'Survived']]
test_df = test_df.dropna()
test_df.to_csv('submission.csv', index=False)