In [39]:
import numpy as np
import pandas as pd
from pprint import pprint
data = pd.read_csv('titanic.csv')

In [40]:
data.head(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [41]:
def checkAdult(age):
    if age>=18:
        return "Adult"
    else:
        return "Child"
    
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Adult/Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult


In [42]:
data.groupby(['Gender'])['Survived'].sum()

Gender
female    233
male      109
Name: Survived, dtype: int64

In [43]:
trainingData=data[["Pclass","Adult/Child","Gender","Survived"]]
trainingData.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,3,Adult,male,0
1,1,Adult,female,1
2,3,Adult,female,1
3,1,Adult,female,1
4,3,Adult,male,0


In [44]:
len(trainingData)

891

In [45]:
trainingData = trainingData.dropna()
len(trainingData)

891

In [46]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(trainingData, test_size = 0.2)

In [47]:
train.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
771,3,Adult,male,0
358,3,Child,female,1
761,3,Adult,male,0
874,2,Adult,female,1
877,3,Adult,male,0


In [48]:
test.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
487,1,Adult,male,0
803,3,Child,male,1
81,3,Adult,male,1
580,2,Adult,female,1
457,1,Child,female,1


In [49]:
len(train)

712

In [50]:
len(test)

179

In [51]:
def cal_entropy(col):
    # first convert the column to numpy array and then find the unique elements and their frequency
    col = np.array(col)
    elements,freq = np.unique(col,return_counts = True)
    # print(elements,freq)
    entropy = np.sum([(-freq[i]/np.sum(freq))*np.log2(freq[i]/np.sum(freq)) for i in range(len(elements))])
    return entropy

entropy = cal_entropy(train["Survived"])
print(entropy)

0.9554629988004848


In [52]:
def InfoGain(data,split_attribute_name,target_name="Survived"):
    """
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The default for this example is "class"
    """    
    #Calculate the entropy of the total dataset

    parent_entropy = cal_entropy(data[target_name])
    print(f"Total dataset/Parent Entropy: {parent_entropy}")

    #Calculate the values and the corresponding counts for the split attribute 
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    print(f"Split attribute Name: {split_attribute_name}")
    print(f"Split attribute values: {vals}")
    print(f"Split attribute values counts: {counts}") 

    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*cal_entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    print(f"Weighted Entropy: {Weighted_Entropy}")

    #Calculate the information gain
    print(f"Calculating Information Gain using: {parent_entropy} - {Weighted_Entropy}")
    Information_Gain = parent_entropy - Weighted_Entropy
    print(f"Information Gain: {Information_Gain}")
    return Information_Gain
       

In [53]:
def ID3(data, originaldata, features, target_attribute_name="Survived", parent_node_class=None):

    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]

    elif len(features) == 0:
        return parent_node_class
    
    parent_node = np.unique(data[target_attribute_name])[np.argmax(
        np.unique(data[target_attribute_name], return_counts=True)[1])]
    item_value = []
    for feat in features:
        item_value.append(InfoGain(data, feat))
    best_feature_index = np.argmax(item_value)
    best_feature = features[best_feature_index]
    tree = {best_feature: {}}
    for x in np.unique(data[best_feature]):
        subdata = data[data[best_feature] != x]
        subdata = subdata.drop([best_feature], 1)
        features = [i for i in features if i != best_feature]
        tree[best_feature][x] = ID3(
            subdata, originaldata, features, target_attribute_name, parent_node)
    
    return (tree)

In [54]:
tree = ID3(train,train,train.columns[:-1])
print(tree)

Total dataset/Parent Entropy: 0.9554629988004848
Split attribute Name: Pclass
Split attribute values: [1 2 3]
Split attribute values counts: [175 144 393]
Weighted Entropy: 0.8758311719206275
Calculating Information Gain using: 0.9554629988004848 - 0.8758311719206275
Information Gain: 0.07963182687985726
Total dataset/Parent Entropy: 0.9554629988004848
Split attribute Name: Adult/Child
Split attribute values: ['Adult' 'Child']
Split attribute values counts: [480 232]
Weighted Entropy: 0.955266046536738
Calculating Information Gain using: 0.9554629988004848 - 0.955266046536738
Information Gain: 0.00019695226374683017
Total dataset/Parent Entropy: 0.9554629988004848
Split attribute Name: Gender
Split attribute values: ['female' 'male']
Split attribute values counts: [253 459]
Weighted Entropy: 0.7379627213363358
Calculating Information Gain using: 0.9554629988004848 - 0.7379627213363358
Information Gain: 0.21750027746414902
Total dataset/Parent Entropy: 0.6818848073951012
Split attribute

  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
  subdata = subdata.drop([best_feature], 1)
