In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

In [2]:
titanic = pd.read_csv("../dataset/titanic/train.csv")

In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked']

In [5]:
titanic = titanic.drop(drop_cols, axis=1)

In [12]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(5)
memory usage: 41.8 KB


In [9]:
le = LabelEncoder()
titanic.Sex = le.fit_transform(titanic.Sex)

In [11]:
titanic.Age.fillna(np.mean(titanic.Age), inplace=True)

In [21]:
X = titanic.drop(["Survived"], axis=1)
y = titanic.Survived

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [29]:
# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier(10)
# model.fit(X_train, y_train)
# print(model.score(X_test, y_test))

0.7627118644067796


In [24]:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()
# model.fit(X_train, y_train)
# print(model.score(X_test, y_test))

0.8135593220338984




In [33]:
def entropy(col):
    items, counts = np.unique(col.values, return_counts=True)
    size = col.values.shape[0]
    acc = 0
    for count in counts:
        pi = count/size
        acc += (pi * np.log2(pi))
    
    return -acc


In [35]:
vals = pd.Series([1, 1, 1, 0, 0, 0])
print(entropy(vals))

1.0


In [32]:
print(items, counts)

[0 1] [549 342]


In [41]:
def info_gain(X, Y, label):
    mean = np.mean(X[label])

    left_y = Y[X[label]<mean]
    right_y = Y[X[label]>=mean]
    
    s_total = Y.shape[0]
    s_left = left_y.shape[0]
    s_right = right_y.shape[0]
    
    if s_left == 0 or s_right == 0:
        return -10000
    
    return entropy(Y) - ((s_left/s_total)*entropy(left_y) + (s_right/s_total)*entropy(right_y))
    

In [45]:

for col in X_train.columns:
    print(col, info_gain(X_train, y_train, col))

Pclass 0.0693612979550764
Sex 0.20332074264151545
Age 0.0016815830685426025
SibSp 0.004524587061436547
Parch 0.014104142987097612


In [46]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')

In [47]:
class Node:
    
    def __init__(self, label=None, value=None, result=None):
        self.label = label
        self.value = value
        self.result = result
        
        self.left = None
        self.right = None

In [110]:
class DecisionTree:
    
    def __init__(self):
        self.root = None
        
    def best_col(self, x_data, y_data):
        gains = []
        
        for col in x_data.columns:
            gains.append((info_gain(x_data, y_data, col), col))
                         
        gains = sorted(gains)
        
        return gains[-1][1]
        
        
    def generate(self, x_data, y_data, max_dep=5):
        
        if max_dep == 0:
            return Node(result=np.mean(y_data))
        
        best = self.best_col(x_data, y_data)
        
        mean = np.mean(x_data[best])

        left_x = x_data[x_data[best]<mean]
        right_x = x_data[x_data[best]>=mean]
        
        left_y = y_data[x_data[best]<mean]
        right_y = y_data[x_data[best]>=mean]
        
        if left_y.shape[0] == 0 or right_y.shape[0] == 0:
            return Node(result=np.mean(y_data))
        
        node = Node(label=best, value=mean)
        
        node.left = self.generate(left_x, left_y, max_dep-1)
        node.right = self.generate(right_x, right_y, max_dep-1)
        
        return node
        
    
    def display(self, node, indent=0):
        if node == None:
            return
        
        print("\t"*indent, node.label, node.value, node.result)
        
        self.display(node.left, indent+1)
        self.display(node.right, indent+1)
        
    def predict(self, x_data):
        res = []
        for data in x_data.iterrows():
            val = self.find(data[1], self.root)
            if val > .5:
                res.append(1)
            else:
                res.append(0)
        return res
    
    def find(self, data, node):
        if node.label == None:
            return node.result
        
        if (data[node.label] < node.value):
            return self.find(data, node.left)
        else:
            return self.find(data, node.right)
        
    def score(self, x_data, y_data):
        pred = self.predict(x_data)
        
        return sum(pred == y_data) / y_data.shape[0]
            
        

In [111]:
tree = DecisionTree()

In [121]:
tree.root = tree.generate(X_train, y_train, 10)

In [122]:
# tree.display(tree.root)

In [123]:
print(tree.score(X_train, y_train))
print(tree.score(X_test, y_test))

0.9211409395973155
0.7898305084745763


In [95]:
# for r in X_train[:10].iterrows():
#     print(r[1])

Pclass     1.0
Sex        1.0
Age       54.0
SibSp      0.0
Parch      0.0
Name: 6, dtype: float64
Pclass     3.000000
Sex        1.000000
Age       29.699118
SibSp      0.000000
Parch      0.000000
Name: 718, dtype: float64
Pclass     2.0
Sex        1.0
Age       25.0
SibSp      1.0
Parch      2.0
Name: 685, dtype: float64
Pclass     3.0
Sex        1.0
Age       26.0
SibSp      1.0
Parch      0.0
Name: 73, dtype: float64
Pclass     3.0
Sex        0.0
Age       22.0
SibSp      0.0
Parch      0.0
Name: 882, dtype: float64
Pclass     3.0
Sex        0.0
Age       31.0
SibSp      1.0
Parch      1.0
Name: 328, dtype: float64
Pclass     1.0
Sex        1.0
Age       49.0
SibSp      1.0
Parch      0.0
Name: 453, dtype: float64
Pclass     2.0
Sex        1.0
Age       19.0
SibSp      1.0
Parch      1.0
Name: 145, dtype: float64
Pclass     2.0
Sex        1.0
Age       24.0
SibSp      0.0
Parch      0.0
Name: 234, dtype: float64
Pclass     3.0
Sex        1.0
Age       16.0
SibSp      0.0
Parch    

In [124]:
from sklearn.tree import DecisionTreeClassifier

In [128]:
dtc = DecisionTreeClassifier(max_depth=6)

In [129]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [130]:
dtc.score(X_test, y_test)

0.8203389830508474