# Importing Libraries

In [9]:
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import re
import csv

# Modifying Input Data

In [2]:
# Read the raw training table. Cabin and Embarked are passed through str()
# so that every entry, including missing ones, is a plain string.
trainingData = pd.read_csv("../Data/train.csv")
for textColumn in ("Cabin", "Embarked"):
    trainingData[textColumn] = trainingData[textColumn].apply(str)

# Fixed-width bins used to discretise the numeric columns.
# Ages: 15-wide bins from 0 to 120.
AgeBins = pd.IntervalIndex.from_tuples(
    [(low, low + 15) for low in range(0, 120, 15)]
)
# Numeric ticket numbers: 50000-wide bins from 0 to 350000.
TicketBins = pd.IntervalIndex.from_tuples(
    [(low, low + 50000) for low in range(0, 350000, 50000)]
)
# Fares: 50-wide bins up to 550; the first bin starts at -1 instead of 0.
CostBins = pd.IntervalIndex.from_tuples(
    [(-1, 50)] + [(low, low + 50) for low in range(50, 550, 50)]
)

# Rows that pd.cut leaves without a bin are assigned this sentinel interval
# so that no NaN remains in the grouped columns.
unbinned = pd.Interval(-2, -1)

trainingData["AgeGroup"] = (
    pd.cut(trainingData["Age"], bins=AgeBins)
    .cat.add_categories(unbinned)
    .fillna(unbinned)
)

# Tickets that are not purely numeric are coerced to NaN before binning.
ticketNumbers = pd.to_numeric(trainingData["Ticket"], errors="coerce")
trainingData["TicketGroup"] = (
    pd.cut(ticketNumbers, bins=TicketBins)
    .cat.add_categories(unbinned)
    .fillna(unbinned)
)

trainingData["CostGroup"] = (
    pd.cut(trainingData["Fare"], bins=CostBins)
    .cat.add_categories(unbinned)
    .fillna(unbinned)
)

def nameClass(row):
    """Return the honorific found in a passenger's name.

    Parameters
    ----------
    row : mapping-like
        Must provide a string under the key ``"Name"`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``"Mr."``, ``"Mrs."`` or ``"Miss."`` for the first of those titles
        (checked in that order) that occurs anywhere in the name, otherwise
        ``"No Title"``.
    """
    name = row["Name"]
    # Raw strings keep the backslash for the regex engine; "\." in a normal
    # string literal is an invalid escape sequence and triggers a warning.
    titlePatterns = (
        (r"Mr\.", "Mr."),
        (r"Mrs\.", "Mrs."),
        (r"Miss\.", "Miss."),
    )
    for pattern, title in titlePatterns:
        if re.search(pattern, name):
            return title
    return "No Title"
        
# Derive the honorific for every passenger, then discard the raw columns that
# have been replaced by engineered ones.
trainingData["Title"] = trainingData.apply(nameClass, axis=1)
trainingData = trainingData.drop(columns=["Name", "Age", "Ticket", "Fare"])

# Columns fed to the model, and the label column.
features = [
    "Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked",
    "AgeGroup", "TicketGroup", "CostGroup", "Title",
]
results = trainingData["Survived"]

# Map every feature value to an ordinal code. From here on `trainingData` is
# the encoded matrix returned by the encoder, no longer a DataFrame.
encoder = OrdinalEncoder()
trainingData = encoder.fit_transform(trainingData[features])

# CV-Sets

In [3]:
# Build NUM_SETS train/test splits for cross-validation.
# Each entry of TrainingSets / TestSets is a two-element list:
# [encoded feature rows, matching labels].
TrainingSets = []
TestSets = []
NUM_SETS = 5

for index in range(NUM_SETS):
    # Contiguous held-out slice; index 0 holds out the last slice of rows and
    # each later index moves one slice toward the start of the data.
    start = (NUM_SETS-index-1)*len(trainingData)//NUM_SETS
    end = (NUM_SETS-index)*len(trainingData)//NUM_SETS

    # Training part = everything outside [start, end).
    trainFeatures = np.append(trainingData[0:start], trainingData[end:], axis=0)
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    trainLabels = pd.concat([results[0:start], results[end:]])
    TrainingSets.append([trainFeatures, trainLabels])

    # Held-out part used for scoring this split.
    TestSets.append([trainingData[start:end], results[start:end]])

# Accuracy Function

In [4]:
def accuracy(data,model,result):
    """Return the fraction of rows in ``data`` that ``model`` labels correctly.

    Parameters
    ----------
    data : sized collection of feature rows
        Passed unchanged to ``model.predict``; its length is the denominator.
    model : object with a ``predict(data)`` method
        ``predict`` must return one label per row.
    result : iterable of labels
        Expected labels in the same order as ``data``. Labels are matched by
        position, so a pandas Series works regardless of its index.

    Returns
    -------
    float
        Number of matching predictions divided by ``len(data)``.

    Raises
    ------
    IndexError
        If ``result`` holds fewer labels than there are predictions.
    ZeroDivisionError
        If ``data`` is empty.
    """
    predictions = model.predict(data)
    # list() walks the values in order; indexing a Series with an integer
    # would instead look the integer up as an index label.
    expected = list(result)
    if len(expected) < len(predictions):
        raise IndexError("result has fewer labels than there are predictions")
    total = sum(1 for predicted, actual in zip(predictions, expected) if predicted == actual)
    return total/len(data)

# Cross-Validation

In [5]:
def CV(measure, depth, random_state=None):
    """Score a decision tree on every cross-validation split.

    Parameters
    ----------
    measure : str
        Split criterion handed to ``DecisionTreeClassifier`` (the notebook
        uses "gini" and "entropy").
    depth : int
        ``max_depth`` of the tree.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. The default ``None`` keeps
        the previous unseeded behaviour; pass an integer for repeatable
        scores.

    Returns
    -------
    list of float
        One accuracy per split, in the order of ``TrainingSets``/``TestSets``.
    """
    scores = []
    for index in range(NUM_SETS):
        trainFeatures, trainLabels = TrainingSets[index]
        testFeatures, testLabels = TestSets[index]
        dTree = tree.DecisionTreeClassifier(
            criterion=measure,
            max_depth=depth,
            random_state=random_state,
        )
        dTree.fit(trainFeatures, trainLabels)
        # Labels are converted to a plain array before scoring so they are
        # compared by position rather than by the Series index.
        scores.append(accuracy(testFeatures, dTree, testLabels.to_numpy()))
    return scores
    

# Sweep both split criteria over tree depths 1..10 and report, for each
# combination, the mean and standard deviation of the per-split accuracies.
for criterionName in ("gini", "entropy"):
    for treeDepth in range(1, 11):
        print("Measurement:", criterionName)
        print("Depth:", treeDepth)
        result = CV(criterionName, treeDepth)
        averageAccuracy = sum(result)/NUM_SETS
        print("Average Accuracy:", averageAccuracy)
        print("Standard Deviation:", np.std(result))
        print()

Measurement: gini
Depth: 1
Average Accuracy: 0.7867428284476806
Standard Deviation: 0.026923752473539498

Measurement: gini
Depth: 2
Average Accuracy: 0.7732596823802649
Standard Deviation: 0.02175865121177268

Measurement: gini
Depth: 3
Average Accuracy: 0.8080534806352395
Standard Deviation: 0.018157507166960015

Measurement: gini
Depth: 4
Average Accuracy: 0.8001757579561861
Standard Deviation: 0.029684749858098134

Measurement: gini
Depth: 5
Average Accuracy: 0.8237273240851171
Standard Deviation: 0.0375734129157371

Measurement: gini
Depth: 6
Average Accuracy: 0.8327223651999247
Standard Deviation: 0.03204999831629725

Measurement: gini
Depth: 7
Average Accuracy: 0.8214864101437449
Standard Deviation: 0.035283495133146166

Measurement: gini
Depth: 8
Average Accuracy: 0.819270604481828
Standard Deviation: 0.01711168389757386

Measurement: gini
Depth: 9
Average Accuracy: 0.8102692862971566
Standard Deviation: 0.02856604358119529

Measurement: gini
Depth: 10
Average Accuracy: 0.81925

# Creating Tree Model

In [6]:
# Final model: an entropy-criterion tree limited to depth 8, trained on the
# complete encoded training matrix and its labels.
Tree = tree.DecisionTreeClassifier(
    max_depth=8,
    criterion="entropy",
)
Tree.fit(trainingData, results)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

# Importing Test Data

In [7]:
# Load the held-out passengers into their own frame and apply the same
# feature engineering that was applied to the training table.
testData = pd.read_csv("../Data/test.csv")
testData["Cabin"] = testData["Cabin"].apply(str)
testData["Embarked"] = testData["Embarked"].apply(str)

# Sentinel bucket for rows that pd.cut leaves without a bin.
missingBin = pd.Interval(-2, -1)
testData["AgeGroup"] = pd.cut(testData['Age'], bins=AgeBins).cat.add_categories(missingBin).fillna(missingBin)
testData['TicketGroup'] = pd.cut(pd.to_numeric(testData['Ticket'], errors="coerce"), bins=TicketBins).cat.add_categories(missingBin).fillna(missingBin)
testData['CostGroup'] = pd.cut(testData['Fare'], bins=CostBins).cat.add_categories(missingBin).fillna(missingBin)

testData["Title"] = testData.apply(nameClass, axis=1)
testData = testData.drop(columns=["Name", "Age", "Ticket", "Fare"])

features = ["Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked", "AgeGroup", "TicketGroup", "CostGroup", "Title"]

# The tree was trained on codes produced by `encoder`, which at this point
# must still be the encoder fitted on the training features. Fitting a fresh
# encoder on the test set would hand out different integers for the same
# category, so the training category lists are reused here. Values that never
# occurred in training are encoded as -1 instead of raising.
testEncoder = OrdinalEncoder(
    categories=encoder.categories_,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
# With explicit categories, fit() learns nothing from the test data; it is
# only required to initialise the encoder before transform().
testEncoder.fit(testData[features])

# The prediction cell reads the encoded test matrix from `trainingData`, so
# the result is stored under that (historical) name.
trainingData = testEncoder.transform(testData[features])

array([[2., 1., 0., ..., 6., 1., 1.],
       [2., 0., 1., ..., 0., 1., 2.],
       [1., 1., 0., ..., 4., 1., 1.],
       ...,
       [2., 1., 0., ..., 0., 1., 1.],
       [2., 1., 0., ..., 0., 1., 1.],
       [2., 1., 1., ..., 1., 1., 3.]])

# Predicting and Recording Data Points

In [13]:
# PassengerId written for the first predicted row; later rows count upward.
# NOTE(review): assumes the test rows are ordered by consecutive PassengerId
# starting at 892 -- confirm against test.csv.
FIRST_TEST_ID = 892

# Predict before opening the output file so that a failing prediction does
# not leave a header-only CSV behind. The predictions get their own name so
# the training labels held in `results` are not overwritten.
predictions = Tree.predict(trainingData)

with open('ScikitLearn_DecisionTree.csv', mode='w', newline='') as outFile:
    writer = csv.writer(outFile)
    writer.writerow(['PassengerId', 'Survived'])
    # One row per prediction (however many there are); any label other than
    # 0 is written as 1.
    writer.writerows(
        [passengerId, 0 if label == 0 else 1]
        for passengerId, label in enumerate(predictions, start=FIRST_TEST_ID)
    )