In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the Titanic training set. The relative path is resolved against the kernel's working directory.
data = pd.read_csv("../datasets/titanic/train.csv")

In [3]:
# Preview the first five rows to see the raw columns and value formats.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# List the available columns before choosing which ones to keep as features.
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Keep only Survived, Pclass, Sex, Age, SibSp and Parch; every other column is discarded.
mod = data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
)

In [6]:
# Check column types: Sex is still a string (object) column and needs a numeric encoding.
mod.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
dtype: object

In [7]:
# Encode Sex as integers. In the previews, female rows become 0 and male rows become 1.
# Note: this overwrites the column in place, so `mod` no longer holds the original strings.
mod["Sex"] = LabelEncoder().fit_transform(mod["Sex"])

In [8]:
# Confirm the encoding: Sex should now be an integer column.
mod.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
dtype: object

In [9]:
# Preview the reduced, encoded frame.
mod.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,1,22.0,1,0
1,1,1,0,38.0,1,0
2,1,3,0,26.0,0,0
3,1,1,0,35.0,1,0
4,0,3,1,35.0,0,0


In [10]:
# Summary statistics. The Age count (714 of 891) shows it is the only column with missing values.
mod.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,891.0,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057
min,0.0,1.0,0.0,0.42,0.0,0.0
25%,0.0,2.0,0.0,20.125,0.0,0.0
50%,0.0,3.0,1.0,28.0,0.0,0.0
75%,1.0,3.0,1.0,38.0,1.0,0.0
max,1.0,3.0,1.0,80.0,8.0,6.0


In [11]:
# Rows with no missing values; these are the rows used to compute the group mean ages.
mod_full = mod.dropna()

In [12]:
# Rows whose Age is missing; these are the rows that receive an imputed Age.
mod_blank = mod.loc[mod["Age"].isna()]

In [13]:
# Lookup table of the mean Age per (Sex, Pclass) group, flattened to columns Sex, Pclass, Age.
mean_age_by_group = mod_full.groupby(["Sex", "Pclass"])["Age"].mean()
table = mean_age_by_group.reset_index()
del mean_age_by_group  # keep the notebook namespace identical to the one-line form

In [14]:
# Sanity check on a single row: take the first passenger with a missing Age ...
row = mod_blank.iloc[0]

# ... and read the mean Age of the lookup-table entry matching that row's Sex and Pclass.
table.loc[(table["Sex"] == row["Sex"]) &(table["Pclass"] == row["Pclass"])].iloc[0]["Age"]

26.507588932806325

In [15]:
def fillAge(row):
    """Return a copy of ``row`` whose Age is the mean Age of its (Sex, Pclass) group.

    Meant to be used with ``DataFrame.apply(fillAge, axis=1)``. The group
    means are read from the notebook-level ``table`` frame (columns Sex,
    Pclass, Age).

    Parameters
    ----------
    row : pandas.Series
        One passenger; must provide "Sex" and "Pclass".

    Returns
    -------
    pandas.Series
        A copy of ``row`` with "Age" overwritten.

    Raises
    ------
    IndexError
        If ``table`` has no entry for the row's (Sex, Pclass) combination.
    """
    # Work on a copy: pandas does not support apply() functions that mutate
    # the object they are handed.
    filled = row.copy()
    same_group = (table["Sex"] == row["Sex"]) & (table["Pclass"] == row["Pclass"])
    filled["Age"] = table.loc[same_group, "Age"].iloc[0]
    return filled

# Impute Age for every row that lacks it, one row at a time.
# NOTE(review): with axis=1 each row is passed as a single Series, so the integer columns
# presumably come back as floats — confirm if dtypes matter downstream.
mod_filled = mod_blank.apply(fillAge, axis=1)

In [16]:
# Recombine complete and imputed rows. Original index labels are kept, and the imputed rows
# are appended after the complete ones, so `total` is no longer in file order.
total = pd.concat([mod_full, mod_filled])

In [17]:
# Separate the target (Survived) from the feature matrix (every remaining column).
y = total["Survived"]
X = total.drop(columns=["Survived"])

In [18]:
# Hold out 33% of the rows as a test set; random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Reference model: scikit-learn's decision tree with default hyper-parameters.
# NOTE(review): no random_state is passed, so tie-breaking between equally good splits is
# presumably not reproducible and the score may vary between runs — consider fixing a seed.
model = DecisionTreeClassifier()

In [21]:
# Train the reference scikit-learn tree on the training split.
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
# Accuracy of the reference tree on the held-out split.
model.score(X_test, y_test)

0.7932203389830509

In [24]:
# Feature columns the models are trained on.
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')

In [27]:
# Mean Age after imputation; info_gain uses a column's mean as its split threshold.
X["Age"].mean()

29.318642716644238

In [23]:
def entropy(column):
    """Shannon entropy of the values in ``column``, in nats (natural logarithm).

    Parameters
    ----------
    column : array-like
        Sequence of discrete values (labels).

    Returns
    -------
    float
        ``sum(-p_i * ln(p_i))`` over the distinct values, where ``p_i`` is
        the relative frequency of value ``i``.
    """
    _, frequencies = np.unique(column, return_counts=True)
    probabilities = frequencies / len(column)
    # Every distinct value occurs at least once, so no probability is zero
    # and the logarithm is always finite.
    surprisal = -probabilities * np.log(probabilities)
    return np.sum(surprisal)

In [29]:
def info_gain(X, y, label):
    """Information gain from splitting ``y`` at the mean of column ``label``.

    Rows whose ``label`` value is strictly below the column mean go to one
    side, rows at or above it go to the other. The gain is the entropy of
    ``y`` minus the size-weighted entropies of the two sides.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing column ``label``.
    y : pandas.Series
        Target values, aligned with ``X`` by index.
    label : hashable
        Name of the column to split on.

    Returns
    -------
    float
        Entropy reduction, in the same unit as ``entropy``.
    """
    feature = X[label]
    threshold = feature.mean()

    y_below = y.loc[feature < threshold]
    y_above = y.loc[feature >= threshold]

    n_total = len(y)
    weight_below = len(y_below) / n_total
    weight_above = len(y_above) / n_total

    parent_entropy = entropy(y)
    return parent_entropy - weight_below * entropy(y_below) - weight_above * entropy(y_above)
    

In [30]:
# Print the information gain of a mean-threshold split on each feature, over the full data set.
for col in X.columns:
    print(col, info_gain(X, y, col))

Pclass 0.05253613916173083
Sex 0.15087048925218172
Age 0.0007663481843047659
SibSp 0.006643498134917131
Parch 0.010661126611803101


In [37]:
class Node:
    """One node of the hand-rolled decision tree.

    A node is a leaf when ``label`` is None, in which case ``result`` holds
    its value. Otherwise it is a split on column ``label`` at threshold
    ``pivot``, with its two subtrees in ``left`` and ``right``.
    """

    def __init__(self, label=None, pivot=None, result=None):
        self.label = label    # column tested at this node; None marks a leaf
        self.pivot = pivot    # split threshold for `label`
        self.result = result  # leaf value (DecisionTreeImpl stores the mean of y here)

        self.left = None      # subtree for rows with value < pivot
        self.right = None     # subtree for rows with value >= pivot

    def __repr__(self):
        """Leaves show their predicted class as True/False; splits show "label : pivot"."""
        if self.label is None:
            # Same `>= .5` cut-off as DecisionTreeImpl.predict, so what is
            # displayed matches what is predicted for a leaf at exactly 0.5.
            return str(self.result >= .5)
        # str() on the label keeps this working for non-string column names.
        return str(self.label) + " : " + str(self.pivot)
        

In [106]:
class DecisionTreeImpl:
    """Minimal binary decision-tree classifier built on ``info_gain`` and ``Node``.

    Every internal node splits one column at that column's mean (computed on
    the rows reaching the node), choosing the column with the highest
    information gain. Leaves store the mean of ``y`` over their rows and
    ``predict`` thresholds that mean at 0.5, so the target is assumed to be
    0/1 coded.
    """

    def __init__(self, max_depth=5):
        """Create an unfitted tree that will grow at most ``max_depth`` levels of splits."""
        self.max_depth = max_depth
        # Defined up front so display() on an unfitted tree prints nothing
        # instead of failing with an AttributeError.
        self.root = None

    def fit(self, X, y):
        """Build the tree from feature frame ``X`` and target Series ``y``."""
        self.root = self.fit_rec(X, y, max_depth=self.max_depth)

    def fit_rec(self, X, y, max_depth):
        """Recursively grow and return the subtree for the rows in ``X`` / ``y``.

        ``max_depth`` is the remaining depth budget; it decreases by one per level.
        """
        # Depth budget used up, or no feature column to split on: emit a leaf.
        if max_depth == 0 or len(X.columns) == 0:
            return Node(result=y.mean())

        # Best (gain, column) pair. Ties on gain are broken by comparing the
        # column labels, as dictated by tuple ordering.
        selected_gain, selected_col = max(
            (info_gain(X, y, col), col) for col in X.columns
        )

        # No column separates the classes any further: emit a leaf.
        if selected_gain <= 0:
            return Node(result=y.mean())

        # Must mirror the split rule used inside info_gain (strictly below
        # the mean goes left, the rest goes right).
        pivot = X[selected_col].mean()
        left = X[selected_col] < pivot
        right = X[selected_col] >= pivot

        node = Node(label=selected_col, pivot=pivot)
        node.left = self.fit_rec(X.loc[left], y.loc[left], max_depth - 1)
        node.right = self.fit_rec(X.loc[right], y.loc[right], max_depth - 1)
        return node

    def display(self):
        """Print the tree, one node per line, children indented by one tab per level."""
        self.display_rec(self.root)

    def display_rec(self, node, indent=""):
        """Print ``node`` and then its left and right subtrees (pre-order)."""
        if node is None:
            return

        print(indent, node)
        self.display_rec(node.left, indent + "\t")
        self.display_rec(node.right, indent + "\t")

    def predict_point(self, row):
        """Return the leaf value (not yet thresholded) reached by a single ``row``."""
        return self.predict_point_rec(self.root, row)

    def predict_point_rec(self, node, row):
        """Walk down from ``node`` following ``row``'s feature values; return the leaf value."""
        if node.label is None:
            return node.result

        # Same rule as at fit time: strictly below the pivot goes left.
        if row[node.label] < node.pivot:
            return self.predict_point_rec(node.left, row)
        return self.predict_point_rec(node.right, row)

    def predict(self, X):
        """Return a 0/1 integer array with one prediction per row of ``X``."""
        leaf_values = [self.predict_point(row) for _, row in X.iterrows()]
        # A leaf value of exactly 0.5 is predicted as class 1.
        return (np.array(leaf_values) >= .5).astype(int)

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        yp = self.predict(X)
        return (yp == y).mean()
    

In [107]:
# Rebinds `model` (previously the scikit-learn tree) to the hand-rolled implementation, capped at depth 10.
model = DecisionTreeImpl(max_depth=10)

In [108]:
# Train the hand-rolled tree on the same training split as the reference model.
model.fit(X_train, y_train)

In [109]:
# model.display()

In [110]:
# model.predict(X_test.iloc[:20])

In [111]:
# y_test.iloc[:20]

In [112]:
# Accuracy of the hand-rolled tree on the same held-out split, for comparison with the scikit-learn score.
model.score(X_test, y_test)

0.7864406779661017