In [3]:
import pandas as pd 
from sklearn import tree
import numpy as np
import graphviz
import os
# Hard-coded Windows install location of Graphviz, appended to PATH
# (presumably so the `graphviz` package can find its executables -- confirm).
GRAPHVIZ_DIR = 'C:/Program Files/GraphViz'

# Append only when the directory is not already present, so re-running this
# cell does not keep growing PATH; also tolerate an unset PATH.
_current_path = os.environ.get("PATH", "")
if GRAPHVIZ_DIR not in _current_path:
    os.environ["PATH"] = os.pathsep.join(
        part for part in (_current_path, GRAPHVIZ_DIR) if part
    )

In [30]:
def clean_df(df):
    """Return a cleaned copy of a raw Titanic passenger frame.

    The input frame is left untouched, so the function can be called
    repeatedly on the same raw data and always gives the same result.

    Steps:
      * ``SibSp`` becomes ``SibSp + Parch``.
      * ``Sex`` is encoded as ``male -> 1`` and ``female -> 0``.
      * ``Name``, ``Parch``, ``Ticket``, ``Embarked``, ``PassengerId``,
        ``Cabin`` and ``Fare`` are dropped.
      * Remaining missing values are filled with the column mean
        truncated to an integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    # Build the combined column on a new frame instead of writing into the
    # caller's object (writing into `df` would double-count Parch on a re-run).
    cleaned = df.assign(SibSp=df["SibSp"] + df["Parch"])
    cleaned = cleaned.replace({"Sex": {'male': 1, 'female': 0}})

    cleaned = cleaned.drop(columns=[
        "Name",
        "Parch",
        "Ticket",
        "Embarked",
        "PassengerId",
        "Cabin",
        # I have doubts about this one
        "Fare",
    ])

    # Column means are truncated to int before being used as fill values.
    return cleaned.fillna(cleaned.mean().astype(int))


In [31]:
# Training data: clean the raw CSV, then split the label column off the features.
train = clean_df(pd.read_csv("titanic/input/train.csv"))
train_y = train.pop("Survived")

# Test features go through the same cleaning function.
test = clean_df(pd.read_csv("titanic/input/test.csv"))

# Reference labels the predictions are compared against; only "Survived" is kept.
# NOTE(review): the file name suggests a sample submission rather than
# ground-truth labels -- confirm what "accuracy" against it actually measures.
answer = pd.read_csv("titanic/input/gender_submission.csv").drop(columns=['PassengerId'])

In [32]:
def create_train(train, y, criterion, max_depth, test, random_state=0):
    """Fit a decision tree, export its diagram and predictions, and return the predictions.

    Side effects: renders the fitted tree with graphviz to
    ``titanic/output/<criterion><max_depth>`` (no depth suffix when
    ``max_depth`` is None) and writes the predictions to the same path
    with a ``.csv`` suffix.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; the column names label the nodes of the diagram.
    y : array-like
        Training labels.
    criterion : str
        Split criterion passed to ``DecisionTreeClassifier`` (e.g. "gini").
    max_depth : int or None
        Depth limit passed to ``DecisionTreeClassifier``; None means unlimited.
    test : pandas.DataFrame
        Features to predict on.
    random_state : int or None, default 0
        Seed passed to ``DecisionTreeClassifier`` so repeated runs build the
        same tree. Pass None to restore unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One column, "Survived", holding the predictions for ``test``.
    """
    clf = tree.DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        random_state=random_state,
    )
    clf.fit(train, y)

    # Only an actual depth limit is appended to the output name.
    file_name = "titanic/output/" + criterion
    if max_depth is not None:
        file_name += str(max_depth)

    # A plain list of names is passed because some scikit-learn releases
    # reject a pandas Index for `feature_names`.
    dot_data = tree.export_graphviz(clf, out_file=None,
                      feature_names=list(train.columns),
                      class_names=["Didn't", "Survived"],
                      filled=True, rounded=True,
                      special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render(file_name)

    result = clf.predict(test)
    out_df = pd.DataFrame(data=result, columns=["Survived"])
    out_df.to_csv(file_name + ".csv")
    return out_df

In [33]:
# Train one tree per (criterion, depth) pair. The nesting order yields
# gini/None, gini/2, entropy/None, entropy/2 -- matching d1..d4.
d1, d2, d3, d4 = (
    create_train(train, train_y, criterion, depth, test)
    for criterion in ("gini", "entropy")
    for depth in (None, 2)
)

In [34]:
def accuracy(df, answer):
    """Return the fraction of rows in ``df`` that equal the matching rows of ``answer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions.
    answer : pandas.DataFrame
        Reference values, compared element-wise with ``df`` using ``==``.

    Returns
    -------
    float
        Number of fully matching rows divided by ``len(df)``; 0.0 when
        nothing matches.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("cannot compute accuracy of an empty prediction frame")

    # A row counts as correct only when every column matches.
    row_matches = (df == answer).all(axis=1)

    # Divide by the size of the frame being scored, not by a notebook global.
    return int(row_matches.sum()) / len(df)

In [35]:
# One report line per fitted model, each scored against `answer`.
reports = [
    ("Accuracy for gini - unlimited depth is %s", d1),
    ("Accuracy for gini - 2 depth is %s", d2),
    ("Accuracy for entropy - unlimited depth is %s", d3),
    ("Accuracy for entropy - 2 depth is %s", d4),
]
for template, predictions in reports:
    print(template % accuracy(predictions, answer))

Accuracy for gini - unlimited depth is 0.7751196172248804
Accuracy for gini - 2 depth is 0.8086124401913876
Accuracy for entropy - unlimited depth is 0.7751196172248804
Accuracy for entropy - 2 depth is 0.8277511961722488
