In [4]:
import numpy as np
import pandas as pd

In [44]:
pd.options.display.float_format = '{:,.2f}'.format

df = pd.read_excel('titanic.xls', 'titanic3', index_col=None, na_values=['NA'])


In [45]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.34,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [46]:
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [47]:
del_cols = ['name', 'ticket', 'fare', 'cabin', 'embarked']

In [48]:
df  = df.drop(del_cols,1)

In [49]:
df.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'boat', 'body',
       'home.dest'],
      dtype='object')

In [50]:
df = df.fillna(df["age"].mean())

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   sex        1309 non-null   object 
 3   age        1309 non-null   float64
 4   sibsp      1309 non-null   int64  
 5   parch      1309 non-null   int64  
 6   boat       1309 non-null   object 
 7   body       1309 non-null   float64
 8   home.dest  1309 non-null   object 
dtypes: float64(2), int64(4), object(3)
memory usage: 92.2+ KB


In [52]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [53]:
df["sex"] = le.fit_transform(df["sex"])

In [54]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,boat,body,home.dest
0,1,1,0,29.0,0,0,2.0,29.88,"St Louis, MO"
1,1,1,1,0.92,1,2,11.0,29.88,"Montreal, PQ / Chesterville, ON"
2,1,0,0,2.0,1,2,29.88,29.88,"Montreal, PQ / Chesterville, ON"
3,1,0,1,30.0,1,2,29.88,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,0,25.0,1,2,29.88,29.88,"Montreal, PQ / Chesterville, ON"


In [55]:
df.shape

(1309, 9)

In [56]:
df = df.drop(['boat','home.dest','body'], axis =1)

In [57]:
df.columns = ['output', 'pclass', 'sex', 'age', 'sibsp', 'parch']

In [58]:
df.head()

Unnamed: 0,output,pclass,sex,age,sibsp,parch
0,1,1,0,29.0,0,0
1,1,1,1,0.92,1,2
2,1,0,0,2.0,1,2
3,1,0,1,30.0,1,2
4,1,0,0,25.0,1,2


In [59]:
def entropy(col):
    counts = np.unique(col,return_counts=True)
    ent = 0.0
    for ix in counts[1]:
        p = ix/col.shape[0]
        ent += (-1.0*p*np.log2(p))
    return ent

In [60]:
def divide_data(x_data,fkey,fval):
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    for ix in range(x_data.shape[0]):
        val = x_data[fkey].loc[ix]
        if val >=fval:
            x_right = x_right.append(x_data.iloc[ix])
        else:
            x_left = x_left.append(x_data.iloc[ix])
    return x_right,x_left

In [61]:
def information_gain(x_data,fkey,fval):
    right,left = divide_data(x_data,fkey,fval)
    
    l = float(left.shape[0])/x_data.shape[0]
    r = float(right.shape[0])/x_data.shape[0]
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -99999
    i_gain = entropy(x_data.output) - (l * entropy(left.output) + r*entropy(right.output))
    return i_gain
    

In [62]:
class DecisionTree:
    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
        
    def train(self,x_train):
        features = ['pclass', 'sex', 'age', 'sibsp', 'parch']
        info_gains = []
        for ix in features:
            i_gain = information_gain(x_train,ix,x_train[ix].mean())
            info_gains.append(i_gain)
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x_train[self.fkey].mean()
        print("Splitting Tree",self.fkey)
        data_right,data_left = divide_data(x_train,self.fkey,self.fval)
        data_right = data_right.reset_index(drop=True)
        data_left = data_left.reset_index(drop=True)
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            if x_train.output.mean() >= 0.5:
                self.target = "Positive"
            else:
                self.target = "Negative"
            return
        if self.depth >= self.max_depth:
            if x_train.output.mean() >= 0.5:
                self.target = "Positive"
            else:
                self.target = "Negative"
            return
        self.left = DecisionTree(self.depth+1,self.max_depth)
        self.left.train(data_left)
        self.right = DecisionTree(self.depth+1,self.max_depth)
        self.right.train(data_right)
        if x_train.output.mean() >= 0.5:
            self.target = "Positive"
        else:
            self.target = "Negative"
        return
    def predict(self,test):
        if test[self.fkey] > self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        if test[self.fkey] < self.fval:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [63]:
split = int(0.7*df.shape[0])
train_data = df[:split]
test_data = df[split:]
test_data= test_data.reset_index(drop=True)

In [64]:
dt = DecisionTree()

In [65]:
dt.train(train_data)

Splitting Tree age
Splitting Tree pclass
Splitting Tree age
Splitting Tree age
Splitting Tree sibsp
Splitting Tree parch
Splitting Tree sex
Splitting Tree sex
Splitting Tree parch
Splitting Tree sibsp
Splitting Tree sex
Splitting Tree parch
Splitting Tree sibsp
Splitting Tree age
Splitting Tree age
Splitting Tree parch
Splitting Tree age
Splitting Tree age
Splitting Tree age
Splitting Tree parch
Splitting Tree parch
Splitting Tree sex
Splitting Tree sibsp
Splitting Tree parch
Splitting Tree sex
Splitting Tree sex
Splitting Tree sibsp
Splitting Tree parch
Splitting Tree sibsp
Splitting Tree sibsp
Splitting Tree parch
Splitting Tree parch
Splitting Tree age
Splitting Tree pclass
Splitting Tree parch
Splitting Tree sibsp
Splitting Tree age
Splitting Tree sibsp
Splitting Tree parch
Splitting Tree age
Splitting Tree sex
Splitting Tree age
Splitting Tree age
Splitting Tree parch
Splitting Tree sex
Splitting Tree sibsp
Splitting Tree age
Splitting Tree age
Splitting Tree pclass
Splitting Tree

In [66]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

In [67]:
y_pred[:10]

['Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive']

In [68]:
for i in range(len(y_pred)):
    if y_pred[i] == "Negative":
        y_pred[i] = 0
    else:
        y_pred[i] = 1

In [69]:
np.mean(y_pred == test_data['output'])

0.0

In [70]:
dt.left.left.fkey

'age'

In [71]:
from sklearn.tree import DecisionTreeClassifier

In [72]:
features = ['pclass', 'sex', 'age', 'sibsp', 'parch']

In [73]:
sk_tree = DecisionTreeClassifier()

In [74]:
sk_tree.fit(train_data[features],train_data["output"])

DecisionTreeClassifier()

In [75]:
sk_tree.predict(test_data[features])

array([3, 1, 3, 2, 3, 3, 1, 1, 2, 2, 1, 3, 3, 3, 3, 3, 2, 3, 2, 2, 1, 3,
       1, 2, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2,
       2, 3, 3, 2, 2, 3, 3, 3, 1, 3, 1, 3, 2, 3, 2, 3, 3, 3, 1, 2, 2, 3,
       3, 3, 1, 2, 3, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 3, 1, 1, 1, 1, 1,
       1, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 2, 3, 1, 2, 2,
       2, 3, 1, 3, 3, 2, 3, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1, 1, 1, 2, 3, 3,
       2, 2, 2, 3, 3, 3, 3, 3, 1, 2, 2, 2, 3, 1, 3, 2, 1, 3, 1, 3, 1, 3,
       3, 1, 3, 3, 3, 2, 3, 1, 1, 3, 1, 1, 3, 2, 1, 3, 3, 2, 3, 3, 3, 3,
       3, 2, 2, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 1,
       3, 3, 3, 3, 2, 3, 1, 2, 1, 1, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 1, 3, 3, 1, 3, 3, 3, 3, 1,
       2, 1, 1, 2, 1, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 1, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3,
       3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,

In [76]:
sk_tree.score(test_data[features],test_data["output"])

0.6259541984732825

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
rf = RandomForestClassifier(n_estimators=10,criterion="entropy",max_depth=5)

In [79]:
rf.fit(train_data[features],train_data["output"])

RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=10)

In [80]:
rf.score(train_data[features],train_data["output"])

0.5938864628820961

In [81]:
rf.score(test_data[features],test_data["output"])

0.7150127226463104

In [82]:
cross_val_score(RandomForestClassifier(n_estimators=10,criterion="entropy",max_depth=5),df[features],df["output"],cv=5).mean()

NameError: name 'cross_val_score' is not defined

0.823870065363093