In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Import Package
from collections import Counter

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

import math

In [33]:
dataset_x_y_train = pd.read_csv("train.csv")
dataset_x_test = pd.read_csv("test.csv")
dataset_y_test = pd.read_csv("gender_submission.csv")

In [34]:
dataset_x_y_test = dataset_y_test.merge(dataset_x_test, on='PassengerId')

In [35]:
dataset = pd.concat([dataset_x_y_test, dataset_x_y_train])

In [36]:
dataset = dataset.sort_values(['PassengerId'])

In [37]:
print('\nNull Values in data \n{}'.format(dataset.isnull().sum()))
print('\nDuplicated values in data {}'.format(dataset.duplicated().sum()))


Null Values in data 
PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

Duplicated values in data 0


In [38]:
print('Embarkation per ports \n{}'.format(dataset['Embarked'].value_counts()))

Embarkation per ports 
Embarked
S    914
C    270
Q    123
Name: count, dtype: int64


In [39]:
dataset['Embarked'].fillna(value='S', inplace=True)
dataset['Fare'].fillna(value=dataset.Fare.mean(), inplace=True)

In [40]:
mean_age_miss = dataset[dataset["Name"].str.contains('Miss.', na=False)]['Age'].mean().round()
mean_age_mrs = dataset[dataset["Name"].str.contains('Mrs.', na=False)]['Age'].mean().round()
mean_age_mr = dataset[dataset["Name"].str.contains('Mr.', na=False)]['Age'].mean().round()
mean_age_master = dataset[dataset["Name"].str.contains('Master.', na=False)]['Age'].mean().round()

print('Mean age of Miss. title {}'.format(mean_age_miss))
print('Mean age of Mrs. title {}'.format(mean_age_mrs))
print('Mean age of Mr. title {}'.format(mean_age_mr))
print('Mean age of Master. title {}'.format(mean_age_master))

Mean age of Miss. title 22.0
Mean age of Mrs. title 37.0
Mean age of Mr. title 33.0
Mean age of Master. title 5.0


In [41]:
def fill_age(name_age):
    
    name = name_age[0]
    age = name_age[1]
    
    if pd.isnull(age):
        if 'Mr.' in name:
            return mean_age_mr
        if 'Mrs.' in name:
            return mean_age_mrs
        if 'Miss.' in name:
            return mean_age_miss
        if 'Master.' in name:
            return mean_age_master
        if 'Dr.' in name:
            return mean_age_master
        if 'Ms.' in name:
            return mean_age_miss
    else:
        return age

In [42]:
dataset['Age'] = dataset[['Name', 'Age']].apply(fill_age,axis=1)

In [43]:
dataset['Cabin'] = pd.Series(['X' if pd.isnull(ii) else ii[0] for ii in dataset['Cabin']])

In [44]:
print('Mean Fare of Cabin B {}'.format(round(dataset[dataset['Cabin']=='B']['Fare'].mean(), 2)))
print('Mean Fare of Cabin C {}'.format(round(dataset[dataset['Cabin']=='C']['Fare'].mean(), 2)))
print('Mean Fare of Cabin D {}'.format(round(dataset[dataset['Cabin']=='D']['Fare'].mean(), 2)))
print('Mean Fare of Cabin E {}'.format(round(dataset[dataset['Cabin']=='E']['Fare'].mean(), 2)))
print('Mean Fare of Cabin A {}'.format(round(dataset[dataset['Cabin']=='A']['Fare'].mean(), 2)))
print('Mean Fare of Cabin T {}'.format(round(dataset[dataset['Cabin']=='T']['Fare'].mean(), 2)))
print('Mean Fare of Cabin X {}'.format(round(dataset[dataset['Cabin']=='X']['Fare'].mean(), 2)))
print('Mean Fare of Cabin F {}'.format(round(dataset[dataset['Cabin']=='F']['Fare'].mean(), 2)))
print('Mean Fare of Cabin G {}'.format(round(dataset[dataset['Cabin']=='G']['Fare'].mean(), 2)))

Mean Fare of Cabin B 90.92
Mean Fare of Cabin C 72.76
Mean Fare of Cabin D 50.28
Mean Fare of Cabin E 40.41
Mean Fare of Cabin A 38.58
Mean Fare of Cabin T 21.36
Mean Fare of Cabin X 25.29
Mean Fare of Cabin F 25.73
Mean Fare of Cabin G 14.74


In [45]:
def reasign_cabin(cabin_fare):
    
    cabin = cabin_fare[0]
    fare = cabin_fare[1]
    
    if cabin=='X':
        if (fare >= 113.5):
            return 'B'
        if ((fare < 113.5) and (fare > 100)):
            return 'C'
        if ((fare < 100) and (fare > 57)):
            return 'D'
        if ((fare < 57) and (fare > 46)):
            return 'E'
        if ((fare < 46) and (fare > 39)):
            return 'A'            
        else:
            return 'X'
    else:
        return cabin

In [46]:
dataset['Cabin'] = dataset[['Cabin', 'Fare']].apply(reasign_cabin, axis=1)

In [47]:
print('\nNull Values in data \n{}'.format(dataset.isnull().sum()))
print('\nDuplicated values in data {}'.format(dataset.duplicated().sum()))


Null Values in data 
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Duplicated values in data 0


In [48]:
def create_alone_feature(SibSp_Parch):
    if (SibSp_Parch[0]+SibSp_Parch[1])==0:
        return 1
    else:
        return 0

dataset['Alone'] = dataset[['SibSp','Parch']].apply(create_alone_feature, axis=1)
dataset['Familiars'] = dataset['SibSp'] + dataset['Parch']

In [49]:
categories = {"female": 1, "male": 0}
dataset['Sex']= dataset['Sex'].map(categories)

categories = {"S": 1, "C": 2, "Q": 3}
dataset['Embarked']= dataset['Embarked'].map(categories)

categories = {"X" : 0, "C" : 1, "E" : 2, "G" : 3, "D" : 4, "A" : 5, "B" : 6, "F" : 7, "T" : 8}
dataset['Cabin']= dataset['Cabin'].map(categories)


# dataset['Ticket'] = dataset['Ticket'].astype('category')
# dataset['Ticket'] = dataset['Ticket'].cat.codes

dataset = dataset.drop(['Name','PassengerId'], axis=1)

In [50]:
train = dataset[:891]
test = dataset[891:]

In [51]:
x_train = train.loc[:,['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Alone', 'Familiars']]
y_train = train.loc[:, 'Survived']

x_test = test.loc[:,['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Alone', 'Familiars']]
y_test = test.loc[:, 'Survived']

In [52]:
# Random Forest Implemetion

def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    #result = pd.DataFrame(X.values[idxs], columns=['PassengerId', 'Pclass','Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])
    result = pd.DataFrame(X.values[idxs], columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Alone', 'Familiars'])
    return result, y[idxs]


def most_common_label(y):
    counter = Counter(y)
    most_common = counter.most_common(1)[0][0]
    return most_common


class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None, criterion='entropy', random_state=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.criterion = criterion
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            clf = DecisionTreeClassifier(min_samples_split = self.min_samples_split,
                                         random_state = self.random_state,
                                         criterion = self.criterion,
                                         max_features = self.n_feats,
                                         max_depth = self.max_depth,
                                         class_weight = "balanced")
            
            X_samp, y_samp = bootstrap_sample(X, y)
            clf.fit(X_samp, y_samp)
            self.trees.append(clf)
        return self.trees

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)

In [95]:
clf = RandomForest(n_trees=100, max_depth=4, n_feats=5, random_state=876)
trees = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [96]:
result = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

Confusion Matrix:
[[264   4]
 [  2 148]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       266
           1       0.99      0.97      0.98       152

    accuracy                           0.99       418
   macro avg       0.99      0.98      0.98       418
weighted avg       0.99      0.99      0.99       418

Accuracy: 0.9856459330143541
