In [4]:
import numpy as np
import pandas as pd
from pprint import pprint
data = pd.read_csv('titanic.csv')

In [5]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
def checkAdult(age):
    if age>=18:
        return "Adult"
    else:
        return "Child"
    
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Adult/Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult


In [7]:
data.groupby(['Gender'])['Survived'].sum()

Gender
female    233
male      109
Name: Survived, dtype: int64

In [8]:
trainingData=data[["Pclass","Adult/Child","Gender","Survived"]]
trainingData.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,3,Adult,male,0
1,1,Adult,female,1
2,3,Adult,female,1
3,1,Adult,female,1
4,3,Adult,male,0


In [9]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

catData=trainingData[["Pclass","Adult/Child","Gender","Survived"]].apply(catToNum)
trainingData[["Pclass","Adult/Child","Gender","Survived"]]=catData
trainingData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,2,0,1,0
1,0,0,0,1
2,2,0,0,1
3,0,0,0,1
4,2,0,1,0


In [10]:
len(trainingData)

891

In [11]:
trainingData = trainingData.dropna()
len(trainingData)

891

In [12]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(trainingData, test_size = 0.2)

In [13]:
train.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
18,2,0,0,0
668,2,0,1,0
755,1,1,1,1
835,0,0,0,1
64,0,1,1,0


In [14]:
test.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
741,0,0,1,0
455,2,0,1,1
465,2,0,1,0
795,1,0,1,0
218,0,0,0,1


In [15]:
len(train)

712

In [16]:
len(test)

179

In [17]:

def entropy(col):
    
    elements,freq = np.unique(col,return_counts = True)
    entropy = np.sum([(-freq[i]/np.sum(freq))*np.log2(freq[i]/np.sum(freq)) for i in range(len(elements))])
    return entropy


In [18]:
def InfoGain(data,split_attribute_name,target_name="Survived"):
    """
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The default for this example is "class"
    """    
    #Calculate the entropy of the total dataset
    parent_entropy = entropy(data[target_name])
    
    ##Calculate the entropy of the dataset
    
    #Calculate the values and the corresponding counts for the split attribute 
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    
    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    #Calculate the information gain
    Information_Gain = parent_entropy - Weighted_Entropy
    return Information_Gain
       

In [74]:
def ID3(data, originaldata, features, target_attribute_name="Survived", parent_node_class=None):
    #Define the stopping criteria --> If one of this is satisfied, we want to return a leaf node#

    # If all target_values have the same value, return this value
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]

    # If the dataset is empty, return the mode target feature value in the original dataset
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]

    # If the feature space is empty, return the mode target feature value of the direct parent node --> Note that
    # the direct parent node is that node which has called the current run of the ID3 algorithm and hence
    # the mode target feature value is stored in the parent_node_class variable.
    elif len(features) == 0:
        return parent_node_class

     # If none of the above holds true, grow the tree!
    # Set the default value for this node --> The mode target feature value of the current node
    parent_node = np.unique(data[target_attribute_name])[np.argmax(
        np.unique(data[target_attribute_name], return_counts=True)[1])]
    # Select the feature which best splits the dataset
    item_value = []
    for feat in features:
        item_value.append(InfoGain(data, feat))
    # item_value = InfoGain(data, features)
    # #Return the information gain values for the features in the dataset
    best_feature_index = np.argmax(item_value)
    best_feature = features[best_feature_index]
    # print(best_feature)
    #     #Create the tree structure. The root gets the name of the feature (best_feature) with the maximum information
    # #gain in the first run
    tree = {best_feature: {}}
    # #Remove the feature with the best inforamtion gain from the feature space
    # data[best_features] ...values remove best
    for x in np.unique(data[best_feature]):
        subdata = data[data[best_feature] != x]
        subdata = subdata.drop([best_feature], 1)
        features = [i for i in features if i != best_feature]
        # print(features)
        tree[best_feature][x] = ID3(
            subdata, originaldata, features, target_attribute_name, parent_node)
    
    return (tree)
    # #Grow a branch under the root node for each possible value of the root node feature
    # for values in data[best_features]:

    #     #Split the dataset along the value of the feature with the largest information gain and therwith create sub_datasets
    # subdata = data.bestvalue.drop values
    #     #Call the ID3 algorithm for each of those sub_datasets with the new parameters --> Here the recursion comes in!
    #     ID3(sub_data , )
    #     #Add the sub tree, grown from the sub_dataset to the tree under the root node
    # tree[best_features][value] = subtree

    #   return (tree)




In [75]:
tree = ID3(train,train,train.columns[:-1])
print(tree)

{'Gender': {0: {'Pclass': {0: {'Adult/Child': {0: 0, 1: 0}}, 1: {'Adult/Child': {0: 0, 1: 0}}, 2: {'Adult/Child': {0: 0, 1: 0}}}}, 1: {'Pclass': {0: {'Adult/Child': {0: 1, 1: 1}}, 1: {'Adult/Child': {0: 1, 1: 1}}, 2: {'Adult/Child': {0: 1, 1: 1}}}}}}
