In [14]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from google.colab import files

In [33]:
files.upload()
df = pd.read_csv("train.csv")

Saving train.csv to train (2).csv


In [34]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
# Intuitive feature engineering, 
# Pclass would serve as a proxy for socioeconomic status, indicating higher likelihood of finding a boat, as well as fair 
# Sex and Age are also accounted for, Survived will be the dependent variable
df = df[["Pclass", "Sex", "Age", "Fare", "Survived"]]

In [36]:
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [37]:
df.info() # have some null values for age, will fill them with the mean of each feature

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   Fare      891 non-null    float64
 4   Survived  891 non-null    int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [38]:
mean_age = df['Age'].mean()
df.fillna(mean_age,axis=1,inplace=True)

df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [39]:
#converting sex to nominal values
df['Sex'] = df['Sex'].map({"male":0, "female":1})
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,0,22.0,7.25,0
1,1,1,38.0,71.2833,1
2,3,1,26.0,7.925,1
3,1,1,35.0,53.1,1
4,3,0,35.0,8.05,0


In [40]:
x = df[['Pclass', 'Sex', 'Age', 'Fare']].values 
y = df['Survived'].values

In [41]:
sc = StandardScaler()
x = sc.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = .2) # 80% of data set will be used for training

In [42]:
#converting to tensors
x_train, x_test = torch.tensor(x_train, dtype = torch.float32), torch.tensor(x_test, dtype = torch.float32)
y_train, y_test = torch.tensor(y_train,  dtype = torch.float32), torch.tensor(y_test,  dtype = torch.float32)

In [43]:
y_train = y_train.view(-1,1)

y_train.shape

input_size = 4 # "Pclass", "Sex", "Age", "Fare"
output_size = 1 # 'Survived'

In [44]:
model = nn.Sequential(nn.Linear(input_size, output_size),nn.Sigmoid())

In [45]:
#hyper-paramters
learning_rate = .01
iters = 10000

criterion = nn.BCELoss() #binary classification loss
optimizer = optim.SGD(model.parameters(), lr = learning_rate) #stochastic gradient descent


In [46]:
for epoch in range(iters):
  y_pred = model(x_train)
  loss = criterion(y_pred, y_train)
  loss.backward() #backpropogation
  optimizer.step()
  optimizer.zero_grad()

  if epoch % 100 == 0: 
    print("Epoch: {}, loss: {}".format(epoch, loss))

Epoch: 0, loss: 0.6780282855033875
Epoch: 100, loss: 0.59254390001297
Epoch: 200, loss: 0.5454593896865845
Epoch: 300, loss: 0.5189638733863831
Epoch: 400, loss: 0.5032781362533569
Epoch: 500, loss: 0.49344998598098755
Epoch: 600, loss: 0.4869846701622009
Epoch: 700, loss: 0.4825620651245117
Epoch: 800, loss: 0.4794398546218872
Epoch: 900, loss: 0.47717753052711487
Epoch: 1000, loss: 0.4755013585090637
Epoch: 1100, loss: 0.4742353558540344
Epoch: 1200, loss: 0.47326260805130005
Epoch: 1300, loss: 0.4725034832954407
Epoch: 1400, loss: 0.4719028174877167
Epoch: 1500, loss: 0.47142139077186584
Epoch: 1600, loss: 0.4710310995578766
Epoch: 1700, loss: 0.47071126103401184
Epoch: 1800, loss: 0.47044673562049866
Epoch: 1900, loss: 0.470225989818573
Epoch: 2000, loss: 0.4700404107570648
Epoch: 2100, loss: 0.4698833227157593
Epoch: 2200, loss: 0.4697495400905609
Epoch: 2300, loss: 0.469635009765625
Epoch: 2400, loss: 0.4695364832878113
Epoch: 2500, loss: 0.4694513976573944
Epoch: 2600, loss: 0.4

In [47]:
pred = model(x_test).round()

In [48]:
# accuracy of model 
print(accuracy_score(y_test.detach().numpy(), pred.detach().numpy() ))

0.8044692737430168
