# Import Libraries

In [73]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load data

Columns like `PassengerId`, `Name`, and `Ticket` don't seem to be good features, and `Cabin` is mostly `NaN`, so it's better to remove these columns from the data.

In [74]:
# Load the training set
data = pd.read_csv('train.csv')

# Drop columns that are not used as features (no rows are removed)
data = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Impute missing values by assigning the filled column back.
# `data['Age'].fillna(..., inplace=True)` acts on a temporary column object;
# with pandas copy-on-write that leaves `data` itself unchanged.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

print(f"Number of rows {data.shape[0]}")
data.head(5)

Number of rows 891


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Add some features

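I add three engineered features to the dataset.
- **FamilySize**: The number of relatives aboard, i.e. the sum of `SibSp` and `Parch`, bucketed into `Solo` (0), `Small` (1–3), `Medium` (4–7) and `Large` (8+).
- **HasSomebody**: It is 0 if the person is young (`Age` ≤ 22) or old (`Age` > 60) and does not have any family aboard (=Solo); otherwise it is 1.
- **Ability**: A rough indication of the physical ability to escape or save oneself, derived from age: `Low` for `Age` ≤ 4 or `Age` > 60, `Medium` for 4 < `Age` ≤ 15 or 40 < `Age` ≤ 60, and `High` for 15 < `Age` ≤ 40.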

In [75]:
# Number of relatives aboard (siblings/spouses + parents/children)
family_count = data['SibSp'] + data['Parch']

# HasSomebody is 0 only for passengers who are young (Age <= 22) or old
# (Age > 60) AND travel without any relatives; everybody else gets 1.
# It is stored as a plain int64 column rather than a categorical so the
# feature matrix stays purely numeric.
needs_care = (data['Age'] <= 22) | (data['Age'] > 60)
data['HasSomebody'] = (~(needs_care & (family_count == 0))).astype('int64')

# Ability bucket derived from age; the labels repeat on purpose
# (Low - Medium - High - Medium - Low), hence ordered=False
data['Ability'] = pd.cut(
    data['Age'],
    bins=[-1, 4, 15, 40, 60, 200],
    labels=['Low', 'Medium', 'High', 'Medium', 'Low'],
    ordered=False,
)

# Family size bucket: 0 -> Solo, 1-3 -> Small, 4-7 -> Medium, 8-20 -> Large
data['FamilySize'] = pd.cut(
    family_count,
    bins=[-1, 0, 3, 7, 20],
    labels=['Solo', 'Small', 'Medium', 'Large'],
)

# One-hot encode the categorical features. dtype=int forces 0/1 integers:
# recent pandas versions default to bool dummies, and a frame that mixes bool
# with numeric columns converts to an object array that Keras cannot consume.
data = pd.get_dummies(
    data,
    columns=['Sex', 'Embarked', 'FamilySize', 'Ability'],
    dtype=int,
)

data.head(5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,HasSomebody,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,FamilySize_Solo,FamilySize_Small,FamilySize_Medium,FamilySize_Large,Ability_High,Ability_Low,Ability_Medium
0,0,3,22.0,1,0,7.25,1,0,1,0,0,1,0,1,0,0,1,0,0
1,1,1,38.0,1,0,71.2833,1,1,0,1,0,0,0,1,0,0,1,0,0
2,1,3,26.0,0,0,7.925,1,1,0,0,0,1,1,0,0,0,1,0,0
3,1,1,35.0,1,0,53.1,1,1,0,0,0,1,0,1,0,0,1,0,0
4,0,3,35.0,0,0,8.05,1,0,1,0,0,1,1,0,0,0,1,0,0


# Provide the train and test data

I hold out 30% of the training set as test data. There are not many records to train on; as far as I know, in such cases it is better to dedicate more than 20% of the training data to testing.

In [76]:
# Separate the target column from the predictor columns
y = data['Survived']
X = data.drop(columns=['Survived'])

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build the Model and Training

In [77]:
# Fix TensorFlow's global seed so that training is repeatable from run to run
# as far as TensorFlow allows (the train/test split is already seeded with 42)
tf.random.set_seed(42)

# Build the model: one hidden ReLU layer followed by a single sigmoid unit
# that outputs the survival probability
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_data only reports loss/accuracy on the hold-out split after each
# epoch; it is not used to update the weights
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fab49e2a910>

# Evaluation

In [78]:
# Score the trained network on the 30% hold-out split; evaluate() returns the
# loss followed by the compiled metrics (accuracy here)
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8022388219833374


# Judge the unseen test data

In [79]:
# Load the unseen test set. `backup_data` keeps a handle on the raw frame
# because PassengerId is needed later for the output file.
data = pd.read_csv('test.csv')
backup_data = data

# Drop the same non-feature columns as for the training data
# (this creates a new frame, so `backup_data` is left untouched)
data = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Impute missing values by assigning the filled column back;
# `fillna(..., inplace=True)` on a column selection acts on a temporary object
# under pandas copy-on-write and would leave `data` unchanged.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
# Also fill Fare in case the test file has gaps there: a NaN in any feature
# gives a NaN probability, which a `>= 0.5` threshold silently maps to 0.
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

print(f"Number of rows {data.shape[0]}")

# Number of relatives aboard (siblings/spouses + parents/children)
family_count = data['SibSp'] + data['Parch']

# HasSomebody is 0 only for passengers who are young (Age <= 22) or old
# (Age > 60) AND travel without any relatives; everybody else gets 1
needs_care = (data['Age'] <= 22) | (data['Age'] > 60)
data['HasSomebody'] = (~(needs_care & (family_count == 0))).astype('int64')

# Ability bucket derived from age (labels repeat on purpose, hence ordered=False)
data['Ability'] = pd.cut(
    data['Age'],
    bins=[-1, 4, 15, 40, 60, 200],
    labels=['Low', 'Medium', 'High', 'Medium', 'Low'],
    ordered=False,
)

# Family size bucket: 0 -> Solo, 1-3 -> Small, 4-7 -> Medium, 8-20 -> Large
data['FamilySize'] = pd.cut(
    family_count,
    bins=[-1, 0, 3, 7, 20],
    labels=['Solo', 'Small', 'Medium', 'Large'],
)

# One-hot encode as 0/1 integers (recent pandas defaults to bool dummies,
# which turn the mixed frame into an object array that Keras cannot consume)
data = pd.get_dummies(
    data,
    columns=['Sex', 'Embarked', 'FamilySize', 'Ability'],
    dtype=int,
)

# Line the columns up with the training feature frame `X`: same set and same
# order, with 0 for any dummy column that does not occur in the test file.
# The network consumes features by position, so the order matters.
data = data.reindex(columns=X.columns, fill_value=0)

Number of rows 418


In [80]:
# Predicted survival probability per test passenger; with a single sigmoid
# output unit Keras returns an array of shape (n_rows, 1)
y_pred = model.predict(data)

# Threshold at 0.5 and flatten to 1-D so that exactly one 0/1 label per row is
# stored, instead of relying on pandas to squeeze a 2-D single-column array
y_pred_binary = (y_pred >= 0.5).astype(int).ravel()
backup_data['Survived'] = y_pred_binary



In [81]:
# Create the output file: one row per test passenger with the predicted label
output = backup_data[['PassengerId', 'Survived']].copy()

# index=False keeps the DataFrame index out of the file; without it the CSV
# gets an extra unnamed first column in addition to PassengerId and Survived
output.to_csv("gender_submission.csv", header=True, index=False)