<a href="https://www.kaggle.com/code/gabriellagloria/first-competition-titanic-using-neural-network?scriptVersionId=142969401" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries and Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from tensorflow import keras
from tensorflow.keras import layers

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
        
import warnings
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training and test sets from the Kaggle input directory
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Explore Data

In [None]:
# Preview the first rows of the training set
train_data.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values
train_data.info()

In [None]:
# Histogram of every numeric column; the trailing semicolon hides the cell's text output
train_data.hist();

### Drop and fill missing values

In [None]:
# "Cabin" has many missing values, so remove that column from both data sets
for frame in (train_data, test_data):
    frame.drop(columns='Cabin', inplace=True)

In [None]:
# "Age" has some missing data, so we fill them using "Age" mean
# Each set is filled with its own rounded mean age.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on train_data['Age']: that form mutates an intermediate Series and does not
# update the DataFrame when pandas Copy-on-Write is active.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean().round())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean().round())

In [None]:
train_data["Embarked"].hist()
# S is the most frequent value, so we fill missing data with S.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which does not update the DataFrame under pandas
# Copy-on-Write.
train_data["Embarked"] = train_data["Embarked"].fillna('S')

In [None]:
test_data["Embarked"].hist()
# S is the most frequent value, so we fill missing data with S.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which does not update the DataFrame under pandas
# Copy-on-Write.
test_data["Embarked"] = test_data["Embarked"].fillna('S')

### Drop features with too many unique values

**PassengerId, Name, and Ticket** have too many unique values, so we drop them (PassengerId is kept in the test set because the submission file needs it)

In [None]:
# Show how many distinct values each column of the training set holds
for column_name, column in train_data.items():
    print(column_name, column.unique().size)

In [None]:
# Remove the near-unique columns; the test set keeps "PassengerId"
train_data.drop(columns=["PassengerId", "Name", "Ticket"], inplace=True)
test_data.drop(columns=["Name", "Ticket"], inplace=True)

### Correlation between Numerical features and Survived values

In [None]:
# Correlation heatmap over the numerical features only.
# "Sex" and "Embarked" still hold strings at this point, and since pandas 2.0
# corr() raises on non-numeric columns unless numeric_only=True is passed.
# The matrix is computed once, directly for the plot.
sns.heatmap(train_data.corr(numeric_only=True));

In [None]:
# Side-by-side survival counts split by "Sex" (left) and "Embarked" (right)
fig, (sex_axis, embarked_axis) = plt.subplots(1, 2)
for hue_column, axis in (('Sex', sex_axis), ('Embarked', embarked_axis)):
    sns.countplot(x='Survived', hue=hue_column, data=train_data, ax=axis)
fig.show()

In [None]:
# Distribution of the target column "Survived"
train_data[["Survived"]].hist();

In [None]:
# Histograms of the numeric columns that remain after the drops above
train_data.hist();

In [None]:
# Preview the training set after dropping and filling columns
train_data.head()

# Change Categorical Features to Numerical

### 1) Sex column

In [None]:
# Encode "Sex" as a boolean flag: True for "female", False otherwise.
# The flag becomes 1/0 once the column is cast with astype('int').
train_data["Sex"] = train_data["Sex"].eq("female")
test_data["Sex"] = test_data["Sex"].eq("female")

### 2) Age, Embarked, and Fare Columns

In [None]:
# Scatter plot of the training set with "Fare" on the y-axis
fig = px.scatter(train_data, y ='Fare')
fig.show()

In [None]:
# Summary statistics of "Fare"
train_data["Fare"].describe()

In [None]:
# Summary statistics of "Age"
train_data["Age"].describe()

In [None]:
# Categorize Fare :
def changeFare(data):
    """Replace the "Fare" column of ``data`` with a three-level fare class.

    Classes: 0 for fares up to 10, 1 for fares above 10 and up to 50, and
    2 for everything else.

    NOTE(review): a missing fare compares false against both thresholds and
    so lands in class 2; impute "Fare" beforehand if that is not intended.

    The column is rewritten in one vectorized assignment. Element-wise
    chained assignment (``data["Fare"][i] = ...``) is not guaranteed to write
    to ``data`` (it has no effect under pandas Copy-on-Write), and the label
    lookup ``[i]`` fails unless the index is exactly 0..n-1.

    Args:
        data: DataFrame with a numeric "Fare" column; modified in place.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    fare = data["Fare"].to_numpy()
    # Conditions are checked in order, so the second one only applies to
    # fares above 10.
    data["Fare"] = np.select([fare <= 10, fare <= 50], [0, 1], default=2)
    return data

# Categorize Embarked :
def changeEmbarked(data):
    """Replace the port letters in the "Embarked" column with integer codes.

    "S" becomes 0, "C" becomes 1 and "Q" becomes 2. Any other value
    (including a missing one) is left unchanged.

    The column is rewritten in a single assignment. Element-wise chained
    assignment (``data["Embarked"][i] = ...``) is not guaranteed to write to
    ``data`` (it has no effect under pandas Copy-on-Write), and the label
    lookup ``[i]`` fails unless the index is exactly 0..n-1.

    Args:
        data: DataFrame with an "Embarked" column; modified in place.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    port_codes = {"S": 0, "C": 1, "Q": 2}
    # dict.get with the value itself as fallback keeps unknown entries as-is.
    data["Embarked"] = data["Embarked"].map(
        lambda port: port_codes.get(port, port)
    )
    return data

# Categorize Age :
def changeAge(data):
    """Replace the "Age" column of ``data`` with a three-level age class.

    Classes: 0 for ages up to 20, 1 for ages above 20 and up to 50, and
    2 for ages above 50 and up to 100. Ages above 100 and missing ages are
    left unchanged.

    The column is rewritten in one vectorized assignment. Element-wise
    chained assignment (``data["Age"][i] = ...``) is not guaranteed to write
    to ``data`` (it has no effect under pandas Copy-on-Write), and the label
    lookup ``[i]`` fails unless the index is exactly 0..n-1.

    Args:
        data: DataFrame with a numeric "Age" column; modified in place.

    Returns:
        The same DataFrame object, so calls can be chained.
    """
    age = data["Age"]
    # Every condition tests the original ages; applying the widest bin first
    # lets each narrower bin overwrite it.
    data["Age"] = (
        age.mask(age <= 100, 2)
        .mask(age <= 50, 1)
        .mask(age <= 20, 0)
    )
    return data

In [None]:
# Bin "Fare", encode "Embarked" and bin "Age" on both data sets, then store
# the four encoded columns as integers
train_data = changeAge(changeEmbarked(changeFare(train_data)))
test_data = changeAge(changeEmbarked(changeFare(test_data)))

for frame in (train_data, test_data):
    for column in ("Age", "Fare", "Embarked", "Sex"):
        frame[column] = frame[column].astype('int')

#### Final Correlation

In [None]:
# Heatmap of the pairwise correlations after the features above were categorized
correlations = train_data.corr()
sns.heatmap(correlations);

In [None]:
# Check the column dtypes after the integer casts
train_data.info()

In [None]:
# Preview the encoded training set
train_data.head()

In [None]:
# Report the (rows, columns) shape of both data sets
for label, frame in (("Train Data : ", train_data), ("Test Data : ", test_data)):
    print(label, frame.shape)

# Build Model

In [None]:
# Separate the target from the training features; "PassengerId" is removed
# from the test features
y_train = train_data["Survived"]
X_train = train_data.drop(columns="Survived")
X_test = test_data.drop(columns="PassengerId")
for label, part in (("X_train : ", X_train), ("y_train : ", y_train), ("X_test : ", X_test)):
    print(label, part.shape)

In [None]:
# Fully connected binary classifier: three ReLU Dense layers (16, 8, 4 units),
# each followed by batch normalization and dropout, ending in one sigmoid unit.
# The input width is declared with keras.Input as the first entry; an
# input_shape argument on a layer that is not first in the stack does not
# define the model input, so the model would stay unbuilt until fit().
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dense(16, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(8, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),
    layers.Dense(4, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and binary
# accuracy as the reported metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

In [None]:
# Train on the full training set without per-epoch console output; the
# returned history object is kept in `history`
fit_options = dict(batch_size=8, epochs=500, verbose=0)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Threshold the sigmoid outputs at 0.5 to get 0/1 labels, flattened to 1-D
probabilities = model.predict(X_test)
pred = (probabilities > 0.5).astype(int).ravel()
print(pred)

# Submit Prediction!

In [None]:
# Pair each test "PassengerId" with its predicted label and write the CSV
# without the index column
submission = test_data[["PassengerId"]].assign(Survived=pred)
submission.to_csv("final_submission.csv", index=False)

**Thank you, any suggestions will be appreciated!**