### This solution is based on the ideas of [Amer Wafiy](https://www.kaggle.com/amerwafiy)

# Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data Input
# Labelled training passengers (includes the 'Survived' column used as the target later on)
train = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')
# Passengers to predict for the submission
test = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')
# Example submission file shipped with the competition data
sub_example = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/gender_submission.csv')

In [None]:
# Peek at five randomly chosen training rows
train.sample(n=5)

In [None]:
# Peek at five randomly chosen test rows
test.sample(n=5)

In [None]:
# Summary statistics of the training data (pandas describes the numeric columns by default)
train.describe()

In [None]:
# Summary of the object-typed (string) columns of the training data
train.describe(include=[object])

In [None]:
# Summary statistics of the test data (pandas describes the numeric columns by default)
test.describe()

In [None]:
# Summary of the object-typed (string) columns of the test data
test.describe(include=[object])

In [None]:
# Number of missing values per column in the training data
train.isna().sum()

In [None]:
# Number of missing values per column in the test data
test.isna().sum()

# Exploratory Data Analysis

In [None]:
# Another random five-row look at the training data before plotting
train.sample(n=5)

In [None]:
# Pclass - 1 = 1st, 2 = 2nd, 3 = 3rd
# Bar height is the mean of 'Survived' per class, with one bar per sex
plt.figure(dpi=100)
sns.barplot(data=train, x='Pclass', y='Survived', hue='Sex')

In [None]:
# Age - 0 to 76
# Histogram (100 bins) with a KDE overlay, coloured by survival outcome
plt.figure(dpi=100)
sns.histplot(data=train, x='Age', hue='Survived', bins=100, kde=True)

In [None]:
# SibSp - number of siblings / spouses aboard the Titanic
# Bar height is the mean of 'Survived' per SibSp value, with one bar per sex
plt.figure(dpi=100)
sns.barplot(data=train, x='SibSp', y='Survived', hue='Sex')

In [None]:
# Parch - number of parents / children aboard the Titanic
# Bar height is the mean of 'Survived' per Parch value, with one bar per sex
plt.figure(dpi=100)
sns.barplot(data=train, x='Parch', y='Survived', hue='Sex')

In [None]:
# Fare - 0 to 512
# Histogram (100 bins) with a KDE overlay, coloured by survival outcome
plt.figure(dpi=100)
sns.histplot(data=train, x='Fare', hue='Survived', bins=100, kde=True)

In [None]:
# Embarked - C = Cherbourg, Q = Queenstown, S = Southampton
# Bar height is the mean of 'Survived' per port, with one bar per sex
plt.figure(dpi=100)
sns.barplot(data=train, x='Embarked', y='Survived', hue='Sex')

In [None]:
# Correlation between features
# Restrict to numeric columns first: DataFrame.corr() raises on string
# columns (Name, Sex, Ticket, ...) in pandas >= 2.0.
plt.figure(dpi=100)
numeric_corr = train.select_dtypes(include='number').corr()
# 'linewidths' is the documented seaborn.heatmap parameter for the cell separator width
sns.heatmap(numeric_corr, linewidths=0.5, annot=True, cmap='viridis')

# Feature Engineering

In [None]:
# Combining Train Test Data for simplicity
# Rows of `train` come first, followed by the rows of `test`
train_test_data = pd.concat(objs=[train, test], axis='index')
# Show the first 895 rows (presumably chosen to reach just past the end of the training rows - confirm)
train_test_data.head(n=895)

In [None]:
# Display the combined train + test frame
train_test_data

In [None]:
# Name
# The title is the text between the first ", " and the following "." of the
# name, i.e. names are expected to look like "<surname>, <title>. <given names>".
train_test_data['Title'] = train_test_data['Name'].apply(lambda name: name.split(', ')[1].split('.')[0])

# Collapse these titles into a single 'Other' bucket
rare_titles = ['Rev', 'Dr', 'Col', 'Mlle', 'Major', 'Ms', 'Lady', 'Sir', 'Mme', 'Don',
               'Capt', 'the Countess', 'Jonkheer', 'Dona']
train_test_data['Title'] = train_test_data['Title'].replace(to_replace=rare_titles, value='Other')

# Print explicitly: only the last expression of a cell is displayed, so a bare
# value_counts() in the middle of the cell would be silently discarded.
print(train_test_data['Title'].value_counts())
train_test_data

In [None]:
# SibSp and Parch
# Family size = siblings/spouses + parents/children + the passenger themselves
relatives_aboard = train_test_data['SibSp'].add(train_test_data['Parch'])
train_test_data['FamilySize'] = relatives_aboard.add(1)
train_test_data

In [None]:
# Ticket
# TicketType is 1 when the ticket string is purely numeric, 0 otherwise.
# A direct integer cast replaces the two chained replace(True/False) calls,
# which relied on pandas' deprecated implicit downcasting in `replace`.
# NOTE(review): assumes 'Ticket' has no missing values (the cast would fail on NaN) - confirm.
train_test_data['TicketType'] = train_test_data['Ticket'].str.isnumeric().astype(int)

In [None]:
# Fare
# Fill missing fares with the median fare of the combined train + test data.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which warns in pandas 2.x and
# does not modify the frame under Copy-on-Write.
fare_median = train_test_data['Fare'].median()
train_test_data['Fare'] = train_test_data['Fare'].fillna(fare_median)

In [None]:
# Embarked
# Fill any missing port with the most frequent one. Without this, a missing
# value would be one-hot encoded as all zeros by get_dummies(drop_first=True)
# and become indistinguishable from the dropped first category.
most_common_port = train_test_data['Embarked'].mode().iloc[0]
train_test_data['Embarked'] = train_test_data['Embarked'].fillna(most_common_port)
train_test_data['Embarked'].value_counts()

In [None]:
# Drop useless columns
# ('Survived' is removed from the feature frame as well; the target is read from `train` later)
unused_columns = ['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
train_test_data = train_test_data.drop(columns=unused_columns)

In [None]:
# Remaining missing values per column after feature engineering
train_test_data.isna().sum()

In [None]:
# Get dummies
# One-hot encode the categorical columns, dropping the first level of each
train_test_data_dum = pd.get_dummies(data=train_test_data, drop_first=True)

In [None]:
# Filling out missing age using KNN
# The imputer fills every remaining NaN in the frame from the 3 nearest rows
# (presumably only 'Age' still has gaps at this point - verify with the null counts above)
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
imputer.fit(train_test_data_dum)
train_test_data_dum_filled = imputer.transform(train_test_data_dum)

In [None]:
# The imputer returns a plain array; wrap it in a DataFrame again with the original column names
train_test_data_dum_filled = pd.DataFrame(
    train_test_data_dum_filled,
    columns=train_test_data_dum.columns,
)

In [None]:
# Display the imputed, one-hot encoded feature frame
train_test_data_dum_filled

In [None]:
# Split the combined feature frame back into its training and submission parts.
# The boundary is derived from len(train) instead of hard-coded row counts, so
# the split stays correct if the input files change size.
n_train = len(train)
X = train_test_data_dum_filled.iloc[:n_train]
y = train['Survived']
X_submission = train_test_data_dum_filled.iloc[n_train:]

In [None]:
# Sanity check: shapes of the training features, the target and the submission features
print(X.shape, y.shape, X_submission.shape, sep='\n')

In [None]:
# Display the target column ('Survived' from the training data)
y

# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the labelled data for local model comparison (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# def find_best_model(X_train, X_test, y_train, y_test):
#     # Logistic Regression
#     logreg = LogisticRegression(max_iter = 600)
#     logreg.fit(X_train, y_train)
#     y_pred = logreg.predict(X_test)
#     logreg_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     # Decision Tree
#     decision_tree = DecisionTreeClassifier()
#     decision_tree.fit(X_train, y_train)
#     y_pred = decision_tree.predict(X_test)
#     decision_tree_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     # Random Forest
#     random_forest = RandomForestClassifier()
#     random_forest.fit(X_train, y_train)
#     y_pred = random_forest.predict(X_test)
#     random_forest_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     # XGBoost
#     xgb = XGBClassifier()
#     xgb.fit(X_train, y_train)
#     y_pred = xgb.predict(X_test)
#     xgb_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     # GBM
#     gbm = GradientBoostingClassifier()
#     gbm.fit(X_train, y_train)
#     y_pred = gbm.predict(X_test)
#     gbm_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     # LightGBM
#     lgbm = LGBMClassifier()
#     lgbm.fit(X_train, y_train)
#     y_pred = lgbm.predict(X_test)
#     lgbm_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
        
#     # Catboost
#     catb = CatBoostClassifier(verbose = 0)
#     catb.fit(X_train, y_train)
#     y_pred = catb.predict(X_test)
#     catb_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     # Histogram-based Gradient Boosting Classification Tree
#     hgb = HistGradientBoostingClassifier()
#     hgb.fit(X_train, y_train)
#     y_pred = hgb.predict(X_test)
#     hgb_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    
#     model_df = pd.DataFrame({'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'GBM', 'LightGBM', 'Catboost', 'HistBoost'],
#                        'Score': [logreg_acc, decision_tree_acc, random_forest_acc, xgb_acc, gbm_acc, lgbm_acc, catb_acc, hgb_acc]})
#     print(model_df.sort_values('Score', ascending = False).reset_index(drop = True))

In [None]:
# find_best_model(X_train, X_test, y_train, y_test)

In [None]:
# Grid Search on Random Forest Model

# n_estimators=[100,500,1000]
# max_features= ['sqrt','log2']
# bootstrap = [True,False]
# oob_score = [True,False]
# param_grid = {'n_estimators':n_estimators,
#              'max_features':max_features,
#              'bootstrap':bootstrap,
#              'oob_score':oob_score}  # Note, oob_score only makes sense when bootstrap=True!
# rfc = RandomForestClassifier(class_weight='balanced')
# grid = GridSearchCV(rfc,param_grid)
# grid.fit(X,y)

In [None]:
# Final model for the leaderboard submission, fitted on all labelled rows.
leaderboard_model = RandomForestClassifier(
    criterion='gini',
    n_estimators=1750,
    max_depth=7,
    min_samples_split=6,
    min_samples_leaf=6,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises an error. For classifiers it was an alias of 'sqrt', so the
    # fitted model is unchanged.
    max_features='sqrt',
    oob_score=True,
    # Fixed seed so that repeated runs produce the same forest and submission
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
leaderboard_model.fit(X, y)

In [None]:
# Predict survival for the submission rows and show the resulting array
predictions = leaderboard_model.predict(X=X_submission)
predictions

In [None]:
# Pair every test passenger id with its predicted label
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)
output

In [None]:
# Write the submission file to the working directory, without the index column
output.to_csv(path_or_buf='submission.csv', index=False)