In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
%matplotlib inline

import seaborn as sns

In [2]:
import tensorflow as tf
tf.__version__

'2.12.0'

Dataset

In [5]:
train_csv_path: str = 'train.csv'

data: pd.DataFrame = pd.read_csv(train_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Watch the data style

In [6]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remove any columns that aren't needed from the dataset.

In [7]:
data = data.drop(['PassengerId','Name','Ticket','Cabin','Parch'],axis=1)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked
0,0,3,male,22.0,1,7.25,S
1,1,1,female,38.0,1,71.2833,C
2,1,3,female,26.0,0,7.925,S
3,1,1,female,35.0,1,53.1,S
4,0,3,male,35.0,0,8.05,S


Checking null values

In [8]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Fare          0
Embarked      2
dtype: int64

In [9]:
data.dropna(subset=['Embarked'], inplace=True)

Filling null values

In [10]:
data['Age'].fillna(data['Age'].mean(),inplace = True)

In [11]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Fare        0
Embarked    0
dtype: int64

In [12]:
sex_col = data['Sex'] == 'male'
sex_col = sex_col.astype('int32')


data = data.drop(['Sex'],axis=1)

data['Sex'] = sex_col

data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Fare,Embarked,Sex
0,0,3,22.0,1,7.25,S,1
1,1,1,38.0,1,71.2833,C,0
2,1,3,26.0,0,7.925,S,0
3,1,1,35.0,1,53.1,S,0
4,0,3,35.0,0,8.05,S,1


In [13]:
data = pd.get_dummies(data, columns = ['Embarked'])
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Fare,Sex,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,7.25,1,0,0,1
1,1,1,38.0,1,71.2833,0,1,0,0
2,1,3,26.0,0,7.925,0,0,0,1
3,1,1,35.0,1,53.1,0,0,0,1
4,0,3,35.0,0,8.05,1,0,0,1


Split Data

In [14]:
X = data.drop('Survived', axis=1).to_numpy()
y = data['Survived'].to_numpy()

In [15]:
X.shape, y.shape

((889, 8), (889,))

Feature scaling

In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

Splitting traning set

In [17]:
from sklearn.model_selection import train_test_split

tf.random.set_seed(42)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((711, 8), (178, 8), (711,), (178,))

Building and Training model

In [18]:

tf.random.set_seed(42)

model_1 = tf.keras.Sequential([
           tf.keras.layers.Dense(9, activation='relu'),
           tf.keras.layers.Dense(15, activation='relu'),
           tf.keras.layers.Dense(50, activation='relu'),
           tf.keras.layers.Dense(2, activation='softmax')
])

model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                 metrics=['accuracy'])

history = model_1.fit(X_train, 
                      tf.one_hot(y_train, depth=2), 
                      epochs=250,
                      verbose = 1,
                      validation_data=(X_valid, tf.one_hot(y_valid, depth=2)))

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250


Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch

Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 

In [19]:
model_1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 9)                 81        
                                                                 
 dense_1 (Dense)             (None, 15)                150       
                                                                 
 dense_2 (Dense)             (None, 50)                800       
                                                                 
 dense_3 (Dense)             (None, 2)                 102       
                                                                 
Total params: 1,133
Trainable params: 1,133
Non-trainable params: 0
_________________________________________________________________


In [20]:
from tensorflow.keras.utils import plot_model

plot_model(model_1, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


Test

In [23]:
test_dataset = pd.read_csv('test.csv')

In [24]:
test_passengerIds = test_dataset['PassengerId'].values
test_dataset=test_dataset.drop(['PassengerId', 'Name','Ticket','Cabin', 'Parch'],axis=1)

In [25]:
test_dataset.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked
0,3,male,34.5,0,7.8292,Q
1,3,female,47.0,1,7.0,S
2,2,male,62.0,0,9.6875,Q
3,3,male,27.0,0,8.6625,S
4,3,female,22.0,1,12.2875,S


In [26]:
test_dataset.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Fare         1
Embarked     0
dtype: int64

In [27]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Fare      417 non-null    float64
 5   Embarked  418 non-null    object 
dtypes: float64(2), int64(2), object(2)
memory usage: 19.7+ KB


In [28]:
test_dataset['Age'].fillna(test_dataset['Age'].mean(),inplace = True)
test_dataset['Fare'].fillna(test_dataset['Fare'].mean(),inplace = True)
test_dataset.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Fare        0
Embarked    0
dtype: int64

In [29]:
sex_col = test_dataset['Sex'] == 'male'
sex_col = sex_col.astype('int32')


test_dataset = test_dataset.drop(['Sex'],axis=1)

test_dataset['Sex'] = sex_col

test_dataset.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,Embarked,Sex
0,3,34.5,0,7.8292,Q,1
1,3,47.0,1,7.0,S,0
2,2,62.0,0,9.6875,Q,1
3,3,27.0,0,8.6625,S,1
4,3,22.0,1,12.2875,S,0


In [30]:
test_dataset = pd.get_dummies(test_dataset, columns = ['Embarked'])
test_dataset.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,7.8292,1,0,1,0
1,3,47.0,1,7.0,0,0,0,1
2,2,62.0,0,9.6875,1,0,1,0
3,3,27.0,0,8.6625,1,0,0,1
4,3,22.0,1,12.2875,0,0,0,1


In [31]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
test_dataset = sc.fit_transform(test_dataset)

In [32]:
y_pred = model_1.predict(test_dataset).argmax(axis = 1)
y_pred.shape



(418,)

In [33]:
test_passengerIds.shape

(418,)

In [66]:
output = pd.DataFrame({'PassengerId':test_passengerIds, 'Survived': y_pred})
output.to_csv('submission.csv', index=False)

In [67]:
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Fare,Sex,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,7.25,1,0,0,1
1,1,1,38.0,1,71.2833,0,1,0,0
2,1,3,26.0,0,7.925,0,0,0,1
3,1,1,35.0,1,53.1,0,0,0,1
4,0,3,35.0,0,8.05,1,0,0,1


In [68]:
data2 = data.drop(['Survived'],axis = 1)

In [70]:
data2.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,7.25,1,0,0,1
1,1,38.0,1,71.2833,0,1,0,0
2,3,26.0,0,7.925,0,0,0,1
3,1,35.0,1,53.1,0,0,0,1
4,3,35.0,0,8.05,1,0,0,1


In [71]:
predst = model_1.predict(data2.values).argmax(axis = 1)



In [73]:
predst.shape

(889,)

In [74]:
z = data['Survived']

In [76]:
print(classification_report(z,predst))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76       549
           1       0.61      0.50      0.55       340

    accuracy                           0.69       889
   macro avg       0.66      0.65      0.65       889
weighted avg       0.68      0.69      0.68       889

