## Start by importing libraries and listing the input file paths

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## The sinking of the Titanic

In [None]:
# Display a remote image referenced by URL (the file name suggests an illustration of the
# Titanic sinking); rendering it requires network access.
from IPython.display import Image
Image(url= "https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/5095eabce4b06cb305058603/5095eabce4b02d37bef4c24c/1352002236895/100_anniversary_titanic_sinking_by_esai8mellows-d4xbme8.jpg")

## Read csv files using pandas

In [None]:
# Load the train and test CSV files (paths are relative to the notebook's working directory).
train_df = pd.read_csv("../input/titanic/train.csv")
test_df = pd.read_csv("../input/titanic/test.csv")
# Preview the first 15 training rows.
train_df.head(15)

In [None]:
test_df.head(10)

### Checking rows and columns

In [None]:
print(train_df.shape)
print(test_df.shape)

In [None]:
train_df.info()

In [None]:
test_df.info()

## Checking missing values

In [None]:
train_df.isna().sum()

In [None]:
test_df.isna().sum()

## Describing data

In [None]:
train_df.describe()

In [None]:
train_df.describe(include=['O'])

In [None]:
# Share of female passengers in the training set who survived, printed as a percentage
is_female = train_df['Sex'] == 'female'
women_surv = train_df.loc[is_female, 'Survived']
women_rate = women_surv.sum() / len(women_surv)
print("Total women survived (%):", women_rate*100)

In [None]:
# Split the training column names by dtype: numeric columns vs everything else
num_col = train_df.select_dtypes(include='number').columns
cat_col = train_df.select_dtypes(exclude='number').columns

print("Numerical columns: \n",num_col)
print("Categorical columns: \n",cat_col)

In [None]:
train_df["Survived"].value_counts()

### A simple stacked bar chart to visualize survival counts per feature value

In [None]:
def bar_chart(feature):
    """Plot a stacked bar chart of `feature` value counts, split by survival.

    Reads the notebook-level `train_df`. One bar is drawn for passengers with
    Survived == 1 and one for Survived == 0; each bar is stacked by the distinct
    values of `feature`.

    Parameters
    ----------
    feature : str
        Name of a column in `train_df`.
    """
    outcome_counts = {
        label: train_df.loc[train_df['Survived'] == flag, feature].value_counts()
        for label, flag in (('Survived', 1), ('Dead', 0))
    }
    # One row per outcome, one column per distinct feature value
    counts = pd.DataFrame(list(outcome_counts.values()))
    counts.index = list(outcome_counts.keys())
    counts.plot(
        kind='bar',
        color=["pink","grey","cyan","red","gold","black","blue"],
        stacked=True,
        figsize=(5,7),
    )

In [None]:
bar_chart('Sex')

In [None]:
bar_chart('SibSp')

In [None]:
bar_chart('Embarked')

In [None]:
bar_chart('Pclass')

In [None]:
bar_chart('Parch')

In [None]:
train_df.head(2)

In [None]:
test_df.head(2)

## Missing value treatment for embarked feature

In [None]:
# Impute missing Embarked values with the most frequent port in the training set
# (the same training-set value is reused for the test set).
most_freq = train_df.Embarked.dropna().mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(most_freq)
test_df['Embarked'] = test_df['Embarked'].fillna(most_freq)

# Sex and Embarked encoding
# Series.map is used instead of DataFrame.replace: turning string columns into ints via
# replace() relies on silent object->int downcasting, which is deprecated in pandas >= 2.2.
# Any value missing from these dictionaries would become NaN.
sex_codes = {'female': 1, 'male': 0}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

# assign() returns a new frame, so train_df / test_df keep their string columns.
train_gender_encode = train_df.assign(Sex=train_df['Sex'].map(sex_codes))
train_data = train_gender_encode.assign(Embarked=train_gender_encode['Embarked'].map(embarked_codes))

test_gender_encode = test_df.assign(Sex=test_df['Sex'].map(sex_codes))
test_data = test_gender_encode.assign(Embarked=test_gender_encode['Embarked'].map(embarked_codes))

In [None]:
test_data.head(2)

In [None]:
print(train_data.shape)
print(test_data.shape)

## Processing title feature

In [None]:
# Title: the first run of letters in Name that follows a space and ends with a period
# (e.g. "Mr", "Mrs", "Miss").
# The pattern is a raw string so that `\.` is passed to the regex engine as written;
# in a normal string literal `\.` is an invalid escape sequence and triggers a warning.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train_data['Title'].value_counts()

In [None]:
# Same title extraction for the test set. The pattern is a raw string so that `\.` is
# passed to the regex engine as written (it is an invalid escape in a normal string literal).
test_data['Title'] = test_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data['Title'].value_counts()

In [None]:
test_data.head(2)

In [None]:
# Numeric codes for the extracted titles: Mr, Miss, Mrs and Master get their own code
# (0-3); every other listed title shares code 4.
title_map = {"Mr": 0, "Miss": 1, "Mrs": 2, 
             "Master": 3, "Dr": 4, "Rev": 4, 
             "Col": 4, "Major": 4, "Mlle": 4,
             "Countess": 4,"Ms": 4, "Lady": 4, 
             "Jonkheer": 4, "Don": 4, "Dona" : 4, 
             "Mme": 4,"Capt": 4,"Sir": 4 }
# Series.map yields NaN for any title that is not a key of title_map.
train_data['Title'] = train_data['Title'].map(title_map)
test_data['Title'] = test_data['Title'].map(title_map)

In [None]:
train_data.head(2)

In [None]:
# Remove the Name column from both the train and the test frame
train_data = train_data.drop(columns=['Name'])
test_data = test_data.drop(columns=['Name'])

In [None]:
#Checking train data missing value
train_data.isna().sum()* 100 / len(train_data)

In [None]:
#Checking test data missing value
test_data.isna().sum()* 100 / len(test_data)

## Processing cabin feature

In [None]:
# cabin
train_data.Cabin.value_counts()


In [None]:
train_data.Pclass.value_counts()

In [None]:
# Keep only the first character of each Cabin value (presumably the deck letter);
# missing cabins stay NaN.
for frame in (train_data, test_data):
    frame['Cabin'] = frame['Cabin'].str.slice(0, 1)
train_data.head(20)

In [None]:
# Train data: count of each cabin letter within every passenger class
Pclass1, Pclass2, Pclass3 = (
    train_data.loc[train_data['Pclass'] == pclass, 'Cabin'].value_counts()
    for pclass in (1, 2, 3)
)
train_data_df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
train_data_df.index = ['1st class','2nd class', '3rd class']
train_data_df.head()

In [None]:
# Test data: count of each cabin letter within every passenger class
Pclass1, Pclass2, Pclass3 = (
    test_data.loc[test_data['Pclass'] == pclass, 'Cabin'].value_counts()
    for pclass in (1, 2, 3)
)
test_data_df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
test_data_df.index = ['1st class','2nd class', '3rd class']
test_data_df.head()

In [None]:
train_data_df.plot(kind='bar',stacked=True, figsize=(10,5))

In [None]:
test_data_df.plot(kind='bar',stacked=True, figsize=(10,5))

In [None]:
# Map each cabin letter to an evenly spaced numeric code (A=0 ... T=2.8, step 0.4).
# Missing cabins and letters that are not keys of cabin_map become NaN.
cabin_map = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2, "G": 2.4, "T": 2.8}
train_data['Cabin'] = train_data['Cabin'].map(cabin_map)
test_data['Cabin'] = test_data['Cabin'].map(cabin_map)

In [None]:
# Fill missing Cabin codes with the median Cabin code of the passenger's Pclass
# (each frame uses its own per-class medians).
# The result is assigned back to the column instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which raises a FutureWarning on
# pandas 2.x and does not modify the frame once copy-on-write is enabled.
train_data["Cabin"] = train_data["Cabin"].fillna(train_data.groupby('Pclass')['Cabin'].transform("median"))
test_data["Cabin"] = test_data["Cabin"].fillna(test_data.groupby('Pclass')['Cabin'].transform("median"))

In [None]:
train_data.head()

In [None]:
train_data.isna().sum()

## Missing value treatment for age

In [None]:
# Let's use Title's and pclass median age for missing Age
# (each frame uses its own group medians; a row stays NaN if nobody in its
# (Title, Pclass) group has a known age).
# The result is assigned back to the column instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which raises a FutureWarning on
# pandas 2.x and does not modify the frame once copy-on-write is enabled.
train_data["Age"] = train_data["Age"].fillna(train_data.groupby(['Title','Pclass'])['Age'].transform("median"))
test_data["Age"] = test_data["Age"].fillna(test_data.groupby(['Title','Pclass'])['Age'].transform("median"))

In [None]:
# Bucket Age into ordinal codes using right-closed intervals:
#   (-inf, 16] -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, (62, inf) -> 4
# Missing ages stay NaN; the column is kept as float.
age_edges = [-np.inf, 16, 26, 36, 62, np.inf]

# encoding age in train data
train_data['Age'] = pd.cut(train_data['Age'], bins=age_edges, labels=False).astype(float)

# encoding age in test data
test_data['Age'] = pd.cut(test_data['Age'], bins=age_edges, labels=False).astype(float)


In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
# Show what is still missing in the test set before the final fill. The original bare
# expression was not the last statement of the cell, so its result was never displayed.
print(test_data.isna().sum())

# Age already holds the bucket codes 0-4 at this point, so fill the remaining gaps with
# the most frequent code; the mean of the codes would be a fractional value that is not
# a valid bucket.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mode()[0])
# Fare is still the raw fare here (it is bucketed in a later cell), so the mean is usable.
# Results are assigned back rather than using fillna(inplace=True) on a column selection,
# which is chained assignment and does not update the frame under copy-on-write.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

In [None]:
test_data.isna().sum()

## Processing Fare feature

In [None]:
# Split the datasets into 4 ranges using qcut
# (quantile-based bins of the training fares, stored in the helper column 'Farerange').
train_data['Farerange'] = pd.qcut(train_data['Fare'], 4)
# Mean of Survived within each fare range, ordered by range.
train_data[['Farerange', 'Survived']].groupby(['Farerange'], as_index=False).mean().sort_values(by='Farerange', ascending=True)

In [None]:
# Bucket Fare into ordinal codes using right-closed intervals:
#   (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, (31, inf) -> 3
# Missing fares stay NaN; the column is kept as float.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]

#Categorize the fare values for train data
train_data['Fare'] = pd.cut(train_data['Fare'], bins=fare_edges, labels=False).astype(float)

#Categorize the fare values for test data
test_data['Fare'] = pd.cut(test_data['Fare'], bins=fare_edges, labels=False).astype(float)


In [None]:
train_data.head()

In [None]:
# Drop the columns that are not used as model features.
# PassengerId is removed from the training frame only; the test frame keeps it.
train_data = train_data.drop(columns=['PassengerId', 'Farerange', 'Ticket'])
test_data = test_data.drop(columns=['Ticket'])

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
# Separate the target column (Survived) from the feature columns
y_train = train_data['Survived']
x_train = train_data.drop(columns=['Survived'])

In [None]:
x_train.head()

In [None]:
# Independent copy of the test frame.
x_test  = test_data.copy()
# NOTE: drop() returns a new frame that is only displayed here; it is not assigned back,
# so x_test itself still contains the PassengerId column.
x_test.drop(['PassengerId'], axis=1)

In [None]:
print("x_train",x_train.shape)
print("y_train",y_train.shape)

print("x_test",x_test.shape)

# Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 


In [None]:
def build_models(model,x_train,y_train,folds=10):
    """Fit `model` on the training data and score it on the training set and by cross-validation.

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `score` (e.g. a scikit-learn classifier).
    x_train, y_train
        Training features and target.
    folds : int, default 10
        Passed to `cross_val_score` as `cv`.

    Returns
    -------
    tuple
        `(cv_score, accuracy, cv_accuracy)`: the array of per-fold cross-validation
        scores, `model.score` on the training data as a percentage rounded to 2
        decimals, and the mean of the fold scores as a percentage rounded to 2 decimals.
    """
    fitted = model.fit(x_train, y_train)
    # Score on the same data the model was fitted on (an optimistic estimate)
    train_accuracy = round(100 * fitted.score(x_train, y_train), 2)

    # Performing cross validation
    fold_scores = cross_val_score(fitted, x_train, y_train, cv=folds, n_jobs=1)
    mean_cv_accuracy = round(fold_scores.mean() * 100, 2)

    return fold_scores, train_accuracy, mean_cv_accuracy

## Logistic Regression

In [None]:
# build_models returns (per-fold CV scores, training accuracy %, mean CV accuracy %);
# despite its name, log_train_pred holds the CV score array, not predictions.
log_train_pred, log_acc, log_cv_acc = build_models(LogisticRegression(),x_train,y_train)
print("Accuracy: %s" % log_acc)
print("CV Accuracy: %s" % log_cv_acc)

## KNN

In [None]:

# To find k use elbow method
def elbow(k):
    """Return the mean cross-validated score of a KNN classifier for each value in `k`.

    Reads the notebook-level `x_train` and `y_train`; `cross_val_score` is called with
    its default `cv`.

    Parameters
    ----------
    k : iterable of int
        Candidate values for `n_neighbors`.

    Returns
    -------
    list
        One mean score per candidate, in the order given.
    """
    return [
        cross_val_score(KNeighborsClassifier(n_neighbors=n_neighbors), x_train, y_train).mean()
        for n_neighbors in k
    ]

In [None]:
k_vlaue = elbow(range(1,40))

In [None]:
# Plot mean CV accuracy against the number of neighbours (k = 1..39)
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(range(1,40),k_vlaue,marker = 'o')
ax.set_xlabel('No of Neighbours')
ax.set_ylabel('Accuracy')


In [None]:
# Lets take k=12
# build_models returns (per-fold CV scores, training accuracy %, mean CV accuracy %);
# despite its name, knn_train_pred holds the CV score array, not predictions.
knn_train_pred, knn_acc, knn_cv_acc = build_models(KNeighborsClassifier(n_neighbors=12),x_train,y_train)
print("Accuracy: %s" % knn_acc)
print("CV Accuracy: %s" % knn_cv_acc)

## SVC

In [None]:
# Support vector classifier with default settings.
# svc_train_pred holds the per-fold CV scores returned by build_models, not predictions.
svc_train_pred, svc_acc, svc_cv_acc = build_models(SVC(),x_train,y_train)
print("Accuracy: %s" % svc_acc)
print("CV Accuracy: %s" % svc_cv_acc)

## GaussianNB

In [None]:
# Gaussian naive Bayes with default settings.
# gnb_train_pred holds the per-fold CV scores returned by build_models, not predictions.
gnb_train_pred, gnb_acc, gnb_cv_acc = build_models(GaussianNB(),x_train,y_train)
print("Accuracy: %s" % gnb_acc)
print("CV Accuracy: %s" % gnb_cv_acc)

## Decision tree

In [None]:
# Decision tree with default settings (no random_state is set).
# dt_train_pred holds the per-fold CV scores returned by build_models, not predictions.
dt_train_pred, dt_acc, dt_cv_acc = build_models(DecisionTreeClassifier(),x_train,y_train)
print("Accuracy: %s" % dt_acc)
print("CV Accuracy: %s" % dt_cv_acc)

## Random forest

In [None]:
# Random forest with default settings (no random_state is set).
# rf_train_pred holds the per-fold CV scores returned by build_models, not predictions.
rf_train_pred, rf_acc, rf_cv_acc = build_models(RandomForestClassifier(),x_train,y_train)
print("Accuracy: %s" % rf_acc)
print("CV Accuracy: %s" % rf_cv_acc)

## Ada boost

In [None]:
# AdaBoost with default settings.
# ada_train_pred holds the per-fold CV scores returned by build_models, not predictions.
ada_train_pred, ada_acc, ada_cv_acc = build_models(AdaBoostClassifier(),x_train,y_train)
print("Accuracy: %s" % ada_acc)
print("CV Accuracy: %s" % ada_cv_acc)

## Gradient boost

In [None]:
# Gradient boosting with default settings.
# gb_train_pred holds the per-fold CV scores returned by build_models, not predictions.
gb_train_pred, gb_acc, gb_cv_acc = build_models(GradientBoostingClassifier(),x_train,y_train)
print("Accuracy: %s" % gb_acc)
print("CV Accuracy: %s" % gb_cv_acc)

## XG boost

In [None]:
# XGBoost with default settings.
# xgb_train_pred holds the per-fold CV scores returned by build_models, not predictions.
xgb_train_pred, xgb_acc, xgb_cv_acc = build_models(XGBClassifier(),x_train,y_train)
print("Accuracy: %s" % xgb_acc)
print("CV Accuracy: %s" % xgb_cv_acc)

In [None]:
# Normal accuracy
# (each model scored on the same training data it was fitted on), best first.
models = pd.DataFrame({
    'model': ['Logistic Regression','KNN','SVC', 
              'GaussianNB','Decision Tree','Random Forest',
              'Ada Boost','Gradient Boost','XGBoost'],
    'accuracy': [log_acc,knn_acc,svc_acc,gnb_acc,dt_acc,rf_acc,ada_acc,gb_acc,xgb_acc]})
models.sort_values(by='accuracy', ascending=False)

In [None]:
# Cross validation accuracy
# (mean of the per-fold scores for each model), best first.
# Note: this rebinds the name `models` used for the training-accuracy table.
models = pd.DataFrame({
    'model': ['Logistic Regression','KNN','SVC', 
              'GaussianNB','Decision Tree','Random Forest',
              'Ada Boost','Gradient Boost','XGBoost'],
    'cv_accuracy': [log_cv_acc,knn_cv_acc,svc_cv_acc,gnb_cv_acc,dt_cv_acc,rf_cv_acc,ada_cv_acc,gb_cv_acc,xgb_cv_acc]})
models.sort_values(by='cv_accuracy', ascending=False)

# Hyperparameter tuning

In [None]:
# Lets tune with the best parameters for xgb, random forest and gradient boosting
# Random forest tuning
# Grid of 7 max_depth values x 4 n_estimators values = 28 candidate settings.
param_grid = { 'max_depth'   : [3,4,7,10,15,20,30],
               'n_estimators' : [88,100,155,200]
             } 
rfclassifier = RandomForestClassifier()
# cv is left at GridSearchCV's default; refit=True refits the best candidate on all of x_train.
grid = GridSearchCV(rfclassifier, param_grid, refit = True, verbose = 1) 
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_) 
print(grid.best_estimator_)

In [None]:
# Random forest
# The hyperparameters are hard-coded rather than read from grid.best_params_ — presumably
# copied from an earlier search run; TODO confirm they match the printed best_params_.
rf_hyper_train_pred, rf_hyper_acc, rf_cv_hyper_acc = build_models(RandomForestClassifier(max_depth=4, n_estimators=100),x_train,y_train)
print("Accuracy: %s" % rf_hyper_acc)
print("CV Accuracy: %s" % rf_cv_hyper_acc)

In [None]:
# Gradient boosting tuning
# Grid of 3 x 4 x 3 x 3 = 108 candidate settings.
gb_params = {'n_estimators':[90,100,150],
             'learning_rate':[1e-2,0.1,0.5,1],
             'max_depth':[3,4,6], 
             'min_samples_leaf':[1,3,5]}
gbclassifier = GradientBoostingClassifier()
# Rebinds `grid` (previously the random forest search); cv is left at GridSearchCV's default.
grid = GridSearchCV(gbclassifier, gb_params, refit = True, verbose = 1) 
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_) 
print(grid.best_estimator_)

In [None]:
# Gradient boosting
# The hyperparameters are hard-coded rather than read from grid.best_params_ — presumably
# copied from an earlier search run; TODO confirm they match the printed best_params_.
gb_hyper_train_pred, gb_hyper_acc, gb_cv_hyper_acc = build_models(GradientBoostingClassifier(max_depth=6, min_samples_leaf=3, n_estimators=100,learning_rate=0.1),x_train,y_train)
print("Accuracy: %s" % gb_hyper_acc)
print("CV Accuracy: %s" % gb_cv_hyper_acc)

In [None]:
# XGB
# Grid of 3 x 3 x 4 x 4 = 144 candidate settings.
xgb_params = {'learning_rate':[0.1,0.5,1],
              'max_depth':[4,6,9],
              'gamma':[0,10,55,73],
              'alpha':[0,23,67,103]}
xgbclassifier = XGBClassifier()
# Rebinds `grid` (previously the gradient boosting search); cv is left at GridSearchCV's default.
grid = GridSearchCV(xgbclassifier, xgb_params, refit = True, verbose = 1) 
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_) 
print(grid.best_estimator_)

In [None]:
# XG boost
# The hyperparameters are hard-coded rather than read from grid.best_params_ — presumably
# copied from an earlier search run; TODO confirm they match the printed best_params_.
xgb_hyper_train_pred, xgb_hyper_acc, xgb_hyper_cv_acc = build_models(XGBClassifier(alpha=0, gamma=0, learning_rate=0.5, max_depth=4),x_train,y_train)
print("Accuracy: %s" % xgb_hyper_acc)
print("CV Accuracy: %s" % xgb_hyper_cv_acc)

In [None]:
# Final comparison (no prediction is made here): mean cross-validation accuracy per model,
# using the hand-tuned variants for Random Forest, Gradient Boost and XGBoost and the
# default-parameter scores for the other models.
all_models = pd.DataFrame({
    'model': ['Logistic Regression','KNN','SVC', 
              'GaussianNB','Decision Tree','Random Forest',
              'Ada Boost','Gradient Boost','XGBoost'],
    'final_accuracy': [log_cv_acc,knn_cv_acc,svc_cv_acc,gnb_cv_acc,dt_cv_acc,rf_cv_hyper_acc,ada_cv_acc,gb_cv_hyper_acc,xgb_hyper_cv_acc]})
all_models.sort_values(by='final_accuracy', ascending=False)

In [None]:
test_data.head()

In [None]:
test_data.isna().sum()

In [None]:
print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(test_data.shape)

## Model testing with test data

In [None]:
# Fit the chosen model on the full training set and predict the test set.
# Alternative candidate that was considered:
#   GradientBoostingClassifier(max_depth=6, min_samples_leaf=3, n_estimators=100, learning_rate=0.1)
best_model = XGBClassifier(alpha=0, gamma=0, learning_rate=0.5, max_depth=4)
best_model.fit(x_train,y_train)
# Select exactly the training feature columns, in training order, so the prediction input
# cannot drift from what the model was fitted on. This also leaves PassengerId out, and
# raises a KeyError if the test frame is missing a training feature.
test_data_1 = test_data[x_train.columns].copy()
prediction = best_model.predict(test_data_1)

In [None]:
# Build the submission file: one row per test passenger with the predicted Survived label.
# PassengerId is read from test_data (the frame the predictions were computed from) rather
# than from x_test, which only carries that column because its drop() result was never
# assigned back.
result_submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": prediction
    })
result_submission.to_csv('submission.csv', index=False)

In [None]:
submission = pd.read_csv('submission.csv')
submission.Survived.value_counts()


In [None]:
submission.head(10)