# Importing Libraries

In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 54kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Dataset Preprocessing

## Importing Training Set

In [None]:
from google.colab import drive
drive.mount("/content/drive")
data = pd.read_csv("drive/MyDrive/Colab Notebooks/Programs/Kaggle/Titanic/train.csv")
data_test = pd.read_csv("drive/MyDrive/Colab Notebooks/Programs/Kaggle/Titanic/test.csv")


Mounted at /content/drive


In [None]:
all_scores = {}

## Splitting X and Y variables

In [None]:

X = data.iloc[:, [2,4,5,6,7]].values
y = data.iloc[:, 1].values
X

array([[3, 'male', 22.0, 1, 0],
       [1, 'female', 38.0, 1, 0],
       [3, 'female', 26.0, 0, 0],
       ...,
       [3, 'female', nan, 1, 2],
       [1, 'male', 26.0, 0, 0],
       [3, 'male', 32.0, 0, 0]], dtype=object)

In [None]:
# Creating prediction data of the same structure as the training set
X_test = data_test.iloc[:, [1,3,4,5,6]].values


## Encoding Categorical Data

In [None]:
# Encoding Gender
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(X[:, 1])
X[:, 1] = label_encoder.transform(X[:, 1])

In [None]:
X_test[:, 1] = label_encoder.transform(X_test[:, 1])

In [None]:
print(X)
print(X_test)

[[3 1 22.0 1 0]
 [1 0 38.0 1 0]
 [3 0 26.0 0 0]
 ...
 [3 0 nan 1 2]
 [1 1 26.0 0 0]
 [3 1 32.0 0 0]]
[[3 1 34.5 0 0]
 [3 0 47.0 1 0]
 [2 1 62.0 0 0]
 ...
 [3 1 38.5 0 0]
 [3 1 nan 0 0]
 [3 1 nan 1 1]]


## Taking Care of Missing Data

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, [2]])
X[:, [2]]= imputer.transform(X[:, [2]])

In [None]:
X_test[:, [2]] = imputer.transform(X_test[:, [2]])

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X= sc.fit_transform(X)
X_test = sc.transform(X_test)

## Splitting into Training and Validation Set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1)

# Random Forest


## Creating Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
RFClassifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RFClassifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Testing Random Forest Model on Validation Set

In [None]:
val_pred_RF = RFClassifier.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
cm = confusion_matrix(y_val, val_pred_RF)
accuracy = accuracy_score(y_val, val_pred_RF)

cross_val_accuracies = cross_val_score(RFClassifier, X_val, y_val, cv = 10, n_jobs = -1)

print(cm)
print("Accuracy(Regular): {:.2f}".format(accuracy*100))
print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))

all_scores["Random Forest"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

[[53 10]
 [13 14]]
Accuracy(Regular): 74.44
Accuracy(Cross Validation): 77.78%
Standard Deviation: 12.17%


# Decision Tree

## Creating Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
DT_Classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DT_Classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [None]:
cross_val_accuracies = cross_val_score(DT_Classifier, X_train, y_train, cv = 10, n_jobs = -1)

print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))

Accuracy(Cross Validation): 80.40%
Standard Deviation: 2.92%


## Testing on Validation Set

In [None]:
val_pred_DT = DT_Classifier.predict(X_val)

In [None]:
cm = confusion_matrix(y_val, val_pred_DT)
accuracy = accuracy_score(y_val, val_pred_DT)

cross_val_accuracies = cross_val_score(DT_Classifier, X_val, y_val, cv = 10, n_jobs = -1)

print(cm)
print("Accuracy(Regular): {:.2f}".format(accuracy*100))
print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))


all_scores["Decision Tree"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

[[53 10]
 [13 14]]
Accuracy(Regular): 74.44
Accuracy(Cross Validation): 75.56%
Standard Deviation: 10.89%


# XGBoost

## Creating XGBoost Model

In [None]:
from xgboost import XGBClassifier
XGB_Classifier = XGBClassifier()
XGB_Classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
XGB_Classifier = XGBClassifier(gamma = 1, learning_rate = 0.5, max_depth = 4, n_estimators = 120, n_jobs = -1)
XGB_Classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=1,
              learning_rate=0.5, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=120, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

### Testing Model on Validation Set

In [None]:
val_pred_XGB = XGB_Classifier.predict(X_val)

In [None]:
accuracy = accuracy_score(y_val, val_pred_XGB)

cross_val_accuracies = cross_val_score(XGB_Classifier, X_val, y_val, cv = 10, n_jobs = -1)

print("Accuracy(Regular): {:.2f}".format(accuracy*100))
print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))

all_scores["XGBClassifier"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

Accuracy(Regular): 81.11
Accuracy(Cross Validation): 77.78%
Standard Deviation: 13.15%


## Creating XGBRFClassifier Model

In [None]:
from xgboost import XGBRFClassifier
XGBRF_Classifier = XGBRFClassifier()

In [None]:
XGBRF_Classifier.fit(X_train, y_train)

XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                n_jobs=1, nthread=None, objective='binary:logistic',
                random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                seed=None, silent=None, subsample=0.8, verbosity=1)

In [None]:
cross_val_accuracies = cross_val_score(XGBRF_Classifier, X_train, y_train, cv = 10, n_jobs = -1)

print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))

all_scores["XGBClassifier"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

Accuracy(Cross Validation): 80.91%
Standard Deviation: 4.03%


### Testing XGBRFClassifier on Validation Set


In [None]:
val_pred_XGBRF = XGBRF_Classifier.predict(X_val)

In [None]:
# XGBRFClassifier
cm = confusion_matrix(y_val, val_pred_XGBRF)
accuracy = accuracy_score(y_val, val_pred_XGBRF)

cross_val_accuracies = cross_val_score(XGBRF_Classifier, X_val, y_val, cv = 10, n_jobs = -1)

print(cm)
print("Accuracy(Regular): {:.2f}".format(accuracy*100))
print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))

all_scores["XGBRFClassifier"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

[[55  8]
 [11 16]]
Accuracy(Regular): 78.89
Accuracy(Cross Validation): 77.78%
Standard Deviation: 12.17%


# CatBoost


## Creating the Model

In [None]:
from catboost import CatBoostClassifier
CB_Classifier = CatBoostClassifier()
classifier = CatBoostClassifier(learning_rate = 0.5)
CB_Classifier.fit(X_train, y_train)

Learning rate set to 0.009371
0:	learn: 0.6895108	total: 645us	remaining: 645ms
1:	learn: 0.6830046	total: 1.8ms	remaining: 897ms
2:	learn: 0.6769441	total: 2.91ms	remaining: 969ms
3:	learn: 0.6712165	total: 4.05ms	remaining: 1.01s
4:	learn: 0.6662332	total: 4.98ms	remaining: 992ms
5:	learn: 0.6614360	total: 5.85ms	remaining: 970ms
6:	learn: 0.6557063	total: 6.98ms	remaining: 990ms
7:	learn: 0.6509770	total: 8.01ms	remaining: 994ms
8:	learn: 0.6460224	total: 9.14ms	remaining: 1.01s
9:	learn: 0.6406902	total: 10.2ms	remaining: 1.01s
10:	learn: 0.6364175	total: 11.2ms	remaining: 1s
11:	learn: 0.6332162	total: 12ms	remaining: 990ms
12:	learn: 0.6289389	total: 12.9ms	remaining: 979ms
13:	learn: 0.6242186	total: 13.9ms	remaining: 980ms
14:	learn: 0.6202557	total: 14.8ms	remaining: 972ms
15:	learn: 0.6166470	total: 15.6ms	remaining: 961ms
16:	learn: 0.6141495	total: 16.4ms	remaining: 946ms
17:	learn: 0.6100964	total: 17.5ms	remaining: 952ms
18:	learn: 0.6060077	total: 18.4ms	remaining: 949ms

<catboost.core.CatBoostClassifier at 0x7f0e1243d2d0>

In [None]:
cross_val_accuracies = cross_val_score(CB_Classifier, X_train, y_train, cv = 10, n_jobs = -1)

print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))




Accuracy(Cross Validation): 82.15%
Standard Deviation: 3.80%


## Testing the model on the Validation set

In [None]:
val_pred_CB = CB_Classifier.predict(X_val)

In [None]:
cm = confusion_matrix(y_val, val_pred_CB)
accuracy = accuracy_score(y_val, val_pred_CB)

cross_val_accuracies = cross_val_score(CB_Classifier, X_val, y_val, cv = 10, n_jobs = -1)

print(cm)
print("Accuracy(Regular): {:.2f}".format(accuracy*100))
print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))


all_scores["CatBoost"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

[[53 10]
 [11 16]]
Accuracy(Regular): 76.67
Accuracy(Cross Validation): 78.89%
Standard Deviation: 11.60%


# Kernel SVM

## Creating KSVM Model

In [None]:
from sklearn.svm import SVC
SVM_Classifier = SVC(kernel = 'rbf', random_state = 0)
SVM_Classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## Testing on Validation Set

In [None]:
val_pred_SVM = SVM_Classifier.predict(X_val)

In [None]:
cm = confusion_matrix(y_val, val_pred_XGB)
accuracy = accuracy_score(y_val, val_pred_XGB)

cross_val_accuracies = cross_val_score(XGB_Classifier, X_val, y_val, cv = 10, n_jobs = -1)

print(cm)
print("Accuracy(Regular): {:.2f}".format(accuracy*100))
print("Accuracy(Cross Validation): {:.2f}%".format(cross_val_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(cross_val_accuracies.std()*100))


all_scores["Kernel SVM"] = cross_val_accuracies.mean()-cross_val_accuracies.std()

[[58  5]
 [12 15]]
Accuracy(Regular): 81.11
Accuracy(Cross Validation): 77.78%
Standard Deviation: 13.15%


## Hyperparameter Tuning


In [None]:
from sklearn.model_selection import GridSearchCV
parameters = [{'C': [0.1, 0.2, 0.25, 0.3, 0.35,0.4, 0.45, 0.5, 0.6, 0.75, 1], 'kernel': ['linear']},
              {'C': [0.1, 0.2, 0.25, 0.3, 0.35,0.4, 0.45, 0.5, 0.6, 0.75, 1], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
SVM_Classifier = GridSearchCV(estimator = SVM_Classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
SVM_Classifier.fit(X_train, y_train)
best_accuracy = SVM_Classifier.best_score_
best_parameters = SVM_Classifier.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 82.40 %
Best Parameters: {'C': 0.25, 'gamma': 0.4, 'kernel': 'rbf'}


# Creating Test Set Predictions on Winner

In [None]:
print(all_scores)

{'Random Forest': 0.6560616538877408, 'Decision Tree': 0.6466893447651921, 'XGBClassifier': 0.7687795604723556, 'XGBRFClassifier': 0.6560616538877407, 'CatBoost': 0.6728854832343271, 'Kernel SVM': 0.6463093381533418}


In [None]:
prediction = SVM_Classifier.predict(X_test)
prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
predictions = []
UserId = data_test.iloc[:, 0].values

for x in range(len(prediction)):
  predictions.append([UserId[x], prediction[x]])

predictions = pd.DataFrame(predictions)

predictions.head()

Unnamed: 0,0,1
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [None]:
predictions.columns = ["PassengerId", "Survived"]

In [None]:
predictions.to_csv("output_new.csv", index = False)