In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the data and create a combined dataset

In [None]:
# Load the Kaggle Titanic train and test splits.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# `combined` is a list holding references to both frames (not a concatenation), so the
# same per-frame transformation can be applied to each of them in a loop.
combined = [train_data, test_data] # combining train and test dataset

# Feature Engineering
1. Create FamilyOnBoard column from SibSp and Parch
2. Create Title column from Name
3. Encode all categorical columns except Embarked (has missing values)
4. Drop unnecessary columns


# inspect the data

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# Summary of the test frame (columns, dtypes, non-null counts).
test_data.info()

In [None]:
# Box plot of the Age distribution for each Survived outcome in the training data.
fig, ax = plt.subplots(figsize=(7, 6))
sns.boxplot(data=train_data, x="Survived", y="Age", ax=ax)
ax.set_title("Survived vs Age")
plt.show()

In [None]:
# Passenger counts per Survived outcome, split by ticket class.
fig, ax = plt.subplots(figsize=(7, 6))
sns.countplot(data=train_data, x="Survived", hue="Pclass", ax=ax)
ax.set_title("Survived vs Pclass")
plt.show()

We can notice that most of the dead passengers were from the 3rd class and most of the 1st class passengers survived.

# Creating the column Title from the name of Passengers

In [None]:
# Extract the title from each Name: the run of letters that follows a space and is
# terminated by a period (e.g. "Mr", "Mrs", "Miss").
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped dot
# instead of being parsed as an (invalid) Python string escape.
TITLE_PATTERN = r' ([A-Za-z]+)\.'

for dataset in combined:
    dataset['Title'] = dataset['Name'].str.extract(TITLE_PATTERN, expand=False)

train_data['Title'].value_counts()

In [None]:
# Frequency of each extracted title in the test set.
test_data['Title'].value_counts()

In [None]:
# Collapse rare titles into 'Other' and fold spelling variants into 'Miss' / 'Mrs'.
TITLE_GROUPS = {
    'Other': ['Dr', 'Rev', 'Major', 'Col', 'Countess', 'Capt', 'Sir', 'Lady', 'Don', 'Jonkheer', 'Dona'],
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
}
# Flatten to {raw title -> grouped title} so a single replace call handles every case.
title_lookup = {
    raw_title: grouped_title
    for grouped_title, raw_titles in TITLE_GROUPS.items()
    for raw_title in raw_titles
}

for dataset in combined:
    dataset['Title'] = dataset['Title'].replace(title_lookup)

Simplifying the Title column by combining the similar titles and the rare ones

In [None]:
# Title frequencies in the training set after grouping.
train_data['Title'].value_counts()

In [None]:
# Title frequencies in the test set after grouping.
test_data['Title'].value_counts()

# Observing the missing data and filling it in

We have some missing data in the columns Age, Cabin, Embarked, Fare

In [None]:
# Number of missing values per column in the training set.
train_data.isnull().sum()

In [None]:
# Number of missing values per column in the test set.
test_data.isnull().sum()

In [None]:
# Fill missing ages with the median age of passengers that share the same Title and Sex.
# Each frame is imputed from its own group medians.
# The filled column is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form may operate on a temporary copy and is
# flagged by recent pandas versions.
for dataset in (train_data, test_data):
    group_median_age = dataset.groupby(["Title", "Sex"])["Age"].transform("median")
    dataset["Age"] = dataset["Age"].fillna(group_median_age)

In [None]:
# Re-check missing values in the training set after the Age imputation.
train_data.isnull().sum()

In [None]:
# Re-check missing values in the test set after the Age imputation.
test_data.isnull().sum()

In [None]:
# Kernel-density estimate of Age in the training set, one curve per Survived value.
age_grid = sns.FacetGrid(train_data, hue="Survived", aspect=4)
age_grid.map(sns.kdeplot, 'Age', fill=True)

# Limit the x-axis to the observed age range.
oldest_age = train_data['Age'].max()
age_grid.set(xlim=(0, oldest_age))
age_grid.add_legend()

plt.show()

Most of the passengers embarked at S (Southampton), so we fill the missing Embarked values with S.

In [None]:
# Replace missing ports of embarkation with 'S' in both frames.
for dataset in combined:
    embarked = dataset['Embarked']
    dataset['Embarked'] = embarked.mask(embarked.isna(), 'S')

Looking at the data, we can observe that the missing fare value belongs to a 3rd-class solo passenger, so we can fill it in with the median fare of solo passengers from the same class.

In [None]:
# Group key of a 3rd-class passenger travelling with no parents/children and no
# siblings/spouse, in the order of the groupby columns below.
SOLO_THIRD_CLASS = (3, 0, 0)  # (Pclass, Parch, SibSp)

for dataset in combined:
    median_fare_by_group = dataset.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
    # One explicit label lookup on the MultiIndex instead of three chained `[...]`
    # selections, whose meaning depends on how each intermediate Series is indexed.
    med_fare = median_fare_by_group.loc[SOLO_THIRD_CLASS]
    # Filling the missing value in Fare with the median Fare of 3rd class alone passenger
    dataset['Fare'] = dataset['Fare'].fillna(med_fare)

Creating Deck column from the first letter of the Cabin column and everything else is U as in Unknown.

In [None]:
# Keep only the first character of each Cabin value; missing cabins become 'U'.
for dataset in combined:
    deck_letter = dataset['Cabin'].str[0]
    dataset['Cabin'] = deck_letter.fillna('U')

In [None]:
# Inspect the Cabin column of the training set after reducing it to one letter.
train_data['Cabin']

In [None]:
# Display the test frame in its current state.
test_data

In [None]:
# Earlier, disabled experiment: fixed-width age bins via np.digitize.
#test_data['AgeG'] = np.digitize(test_data['Age'], bins=[10,20,30,40,50,60,70,80]) # convert column to bin
# Inspect the Age column of the test set.
test_data['Age']

# Bin the age values to remove outlier effect.

In [None]:
# Derive 9 quantile-based age bins from train and test together, then apply the same
# bin edges to each frame so that both share identical categories.
all_data = pd.concat([train_data, test_data], sort=True).reset_index(drop=True)
_, all_bins = pd.qcut(all_data['Age'], q=9, retbins=True)

# `pd.cut` intervals are open on the left by default, so a value equal to the first
# edge (the overall minimum age) would fall outside every bin and become NaN.
# `include_lowest=True` makes the first interval include that edge.
# NOTE: this cell replaces the numeric Age column and is not meant to be re-run
# without reloading the data.
train_data['Age'] = pd.cut(train_data['Age'], bins=all_bins, include_lowest=True)
test_data['Age'] = pd.cut(test_data['Age'], bins=all_bins, include_lowest=True)

In [None]:
# Inspect the binned Age column of the training set.
train_data['Age']

In [None]:
# Count of passengers per Age bin in the training set, split by survival outcome.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(x='Age', hue='Survived', data=train_data, ax=axs)

axs.set_xlabel('Age', size=15, labelpad=20)
axs.set_ylabel('Passenger Count', size=15, labelpad=20)
for axis_name in ('x', 'y'):
    axs.tick_params(axis=axis_name, labelsize=15)

axs.set_title('Surviver Counts in {} Feature'.format('Age'), size=15, y=1.05)

plt.show()

In [None]:
# Inspect the binned Age column of the training set.
train_data['Age']

In [None]:
# Number of test passengers in each Age bin.
test_data['Age'].value_counts()

In [None]:
# Number of training passengers in each Age bin.
train_data['Age'].value_counts()

# Create the column FamilyOnBoard from the columns SibSp and Parch

In [None]:
# Family size on board = the passenger plus siblings/spouses plus parents/children.
for dataset in combined:
    relatives = dataset[['SibSp', 'Parch']].sum(axis=1)
    dataset['FamilyOnBoard'] = relatives + 1

In [None]:
# Distribution of family sizes in the training set.
train_data['FamilyOnBoard'].value_counts()

# Bin the values in the column FamilyOnBoard

In [None]:
# Bucket family size into named groups: 1 -> Solo, 2-4 -> Small, 5-6 -> Medium, 7-11 -> Large.
family_bin_edges = [0.5, 1, 4, 6, 11]
family_bin_labels = ['Solo', 'Small', 'Medium', 'Large']

for dataset in combined:
    dataset['FamilyOnBoard'] = pd.cut(
        dataset['FamilyOnBoard'],
        bins=family_bin_edges,
        labels=family_bin_labels,
    )

In [None]:
# Number of training passengers in each family-size group.
train_data['FamilyOnBoard'].value_counts()

All the missing data has been filled in

In [None]:
# Confirm the remaining missing-value counts per column in the training set.
train_data.isnull().sum()

# Creating the column TicketCount

Passengers who travelled in groups shared the same ticket; this helps differentiate between passengers travelling alone and grouped passengers, regardless of their biological relations.

In [None]:
# For every passenger, record how many passengers in the same frame hold the same ticket.
for dataset in combined:
    tickets = dataset['Ticket']
    holders_per_ticket = tickets.value_counts()
    dataset['TicketCount'] = tickets.map(holders_per_ticket)

In [None]:
# Display the training frame in its current state.
train_data

In [None]:
# Distribution of ticket group sizes in the training set.
train_data['TicketCount'].value_counts()

In [None]:
# Passenger counts per ticket group size, split by survival outcome.
fig, ax = plt.subplots(figsize=(7, 6))
sns.countplot(data=train_data, x="TicketCount", hue="Survived", ax=ax)
ax.set_title("Survived vs TicketCount")
plt.show()

# Binning the fare Column into 13 bins

In [None]:
# Derive 13 quantile-based fare bins from train and test together, then apply the same
# bin edges to each frame so that both share identical categories.
all_data = pd.concat([train_data, test_data], sort=True).reset_index(drop=True)
all_data['Fare'], all_bins = pd.qcut(all_data['Fare'], q=13, retbins=True)

# `pd.cut` intervals are open on the left by default, so every fare equal to the first
# edge (the overall minimum fare) would fall outside all bins and become NaN.
# `include_lowest=True` makes the first interval include that edge.
# NOTE: this cell replaces the numeric Fare column and is not meant to be re-run
# without reloading the data.
train_data['Fare'] = pd.cut(train_data['Fare'], bins=all_bins, include_lowest=True)
test_data['Fare'] = pd.cut(test_data['Fare'], bins=all_bins, include_lowest=True)

In [None]:
# Convert the Age bins to integer codes, fitting a fresh LabelEncoder on each frame.
# NOTE(review): Age is label-encoded again in the "Label Encoding the Dataset Columns"
# cell further down, so this earlier pass looks redundant - confirm before removing.
for dataset in combined:
    dataset['Age'] = LabelEncoder().fit_transform(dataset['Age'])

In [None]:
# Passenger counts per Fare bin in the training set, split by survival outcome.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=train_data, x="Fare", hue="Survived", ax=ax)
ax.set_title("Survived vs Fare")
plt.show()

In [None]:
# Display the training frame after binning.
train_data

In [None]:
# Title frequencies in the training set.
train_data['Title'].value_counts()

In [None]:
# Display the test frame after binning.
test_data

# Label Encoding the Dataset Columns

In [None]:
# Columns converted to integer codes. Pclass is already numeric and is left as-is.
ENCODED_COLUMNS = ['Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilyOnBoard']

# Fit ONE encoder per column on the values of train and test together, then apply it to
# both frames. Fitting a separate encoder on each frame assigns codes by that frame's
# own sorted categories, so whenever a category occurs in only one frame the same value
# ends up with different integer codes in train and test.
for column in ENCODED_COLUMNS:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([dataset[column] for dataset in combined]))
    for dataset in combined:
        dataset[column] = encoder.transform(dataset[column])

# Preparing the data for training

In [None]:
# Columns removed from both frames before modelling.
unused_columns = ['PassengerId', 'Ticket', 'Name', 'SibSp', 'Parch', 'Title']

# Separate the label from the features. `train_data` is rebound to the feature-only
# frame here, so this cell cannot be re-run without reloading the data.
target_data = train_data['Survived']
train_data = train_data.drop(columns=['Survived'] + unused_columns)

# The test frame keeps PassengerId in `test_data`; this copy is the model input.
test_data_noIndex = test_data.drop(columns=unused_columns)

In [None]:
# Summary of the test feature frame passed to the model.
test_data_noIndex.info()

In [None]:
# Summary of the training feature frame passed to the model.
train_data.info()

# Importing the libraries for the models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# 10-fold shuffled splitter with a fixed seed, shared by every model evaluation below.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# Testing the models using Kfolds and calculating their scores with cross_val_score

The best model has been left uncommented; the others are commented out to prevent them from running.

In [None]:
# Random forest evaluated with the shared 10-fold splitter.
# `random_state` is fixed so that the cross-validation scores, the fitted model and the
# resulting predictions are reproducible from one run to the next; without it every
# run builds a different forest.
model = RandomForestClassifier(n_estimators=1100, max_depth=7, random_state=0)
score = cross_val_score(model, train_data, target_data, cv=k_fold, n_jobs=-1, scoring='accuracy')
print(score)  # accuracy of each fold
print(np.mean(score))  # mean accuracy across folds

In [None]:
# model = RandomForestClassifier(n_estimators=800,max_depth=5)
# score = cross_val_score(model, train_data, target_data, cv=k_fold, n_jobs=-1, scoring='accuracy')
# print(score)
# print(np.mean(score))

In [None]:
# model = RandomForestClassifier(n_estimators=1200,max_depth=8)
# score = cross_val_score(model, train_data, target_data, cv=k_fold, n_jobs=-1, scoring='accuracy')
# print(score)
# print(np.mean(score))

In [None]:
# model = KNeighborsClassifier(n_neighbors = 10)
# score = cross_val_score(model, train_data, target_data, cv=k_fold, n_jobs=-1, scoring='accuracy')
# print(score)
# print(np.mean(score))

In [None]:
# model = SVC()
# score = cross_val_score(model, train_data, target_data, cv=k_fold, n_jobs=-1, scoring='accuracy')
# print(score)
# print(np.mean(score))

In [None]:
# model = GaussianNB()
# score = cross_val_score(model, train_data, target_data, cv=k_fold, n_jobs=-1, scoring='accuracy')
# print(score)
# print(np.mean(score))

# Do the actual training

In [None]:
# Fit the most recently defined `model` on the full training features and labels.
model.fit(train_data, target_data)

# Calculate the prediction array

In [None]:
# Predict survival for every row of the test feature frame.
prediction = model.predict(test_data_noIndex)

# Save the submission output

In [None]:
# Pair each test PassengerId with its predicted label and write the submission file.
submission = test_data[["PassengerId"]].copy()
submission["Survived"] = prediction

submission.to_csv('submission2.csv', index=False)

# Import the answerKey for the Titanic for evaluating

In [None]:
from sklearn.metrics import accuracy_score

# Load the labelled test set used to score the predictions.
ansDataSet = pd.read_csv("/kaggle/input/testdatasetanswerkey/test_answered.csv")
# Keep the labels as a NumPy array. The previous `ansKey.to_numpy();` discarded its
# result, leaving `ansKey` as a Series.
# NOTE(review): assumes the answer-key rows are in the same order as test.csv - confirm.
ansKey = ansDataSet['Survived'].to_numpy()

# Evaluate the score of the model on the test dataset

In [None]:
# Accuracy of the predictions against the answer key, as a percentage with 5 decimals.
test_accuracy_pct = accuracy_score(ansKey, prediction) * 100
round(test_accuracy_pct, 5)

The model achieved an accuracy of 78.47% on the test dataset.