In [63]:
import numpy as np
import pandas as pd
from pprint import pprint
data = pd.read_csv('titanic.csv')

In [64]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [65]:
def checkAdult(age):
    if age>=18:
        return "Adult"
    else:
        return "Child"
    
data["Adult/Child"]=data["Age"].apply(checkAdult)
data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Adult/Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult


In [66]:
data.groupby(['Gender'])['Survived'].sum()

Gender
female    233
male      109
Name: Survived, dtype: int64

In [67]:
trainingData=data[["Pclass","Adult/Child","Gender","Survived"]]
trainingData.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,3,Adult,male,0
1,1,Adult,female,1
2,3,Adult,female,1
3,1,Adult,female,1
4,3,Adult,male,0


In [68]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

catData=trainingData[["Pclass","Adult/Child","Gender","Survived"]].apply(catToNum)
trainingData[["Pclass","Adult/Child","Gender","Survived"]]=catData
trainingData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Pclass,Adult/Child,Gender,Survived
0,2,0,1,0
1,0,0,0,1
2,2,0,0,1
3,0,0,0,1
4,2,0,1,0


In [69]:
len(trainingData)

891

In [70]:
trainingData = trainingData.dropna()
len(trainingData)

891

In [71]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(trainingData, test_size = 0.2)

In [72]:
train.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
398,1,0,1,0
830,2,1,0,1
524,2,1,1,0
310,0,0,0,1
560,2,1,1,0


In [73]:
test.head()

Unnamed: 0,Pclass,Adult/Child,Gender,Survived
778,2,1,1,0
141,2,0,0,1
776,2,1,1,0
769,2,0,1,0
51,2,0,1,0


In [74]:
len(train)

712

In [75]:
len(test)

179

In [76]:
def cal_entropy(col):
    # first convert the column to numpy array and then find the unique elements and their frequency
    col = np.array(col)
    elements,freq = np.unique(col,return_counts = True)
    # print(elements,freq)
    entropy = np.sum([(-freq[i]/np.sum(freq))*np.log2(freq[i]/np.sum(freq)) for i in range(len(elements))])
    return entropy


In [77]:
entropy = cal_entropy(train["Survived"])
print(entropy)

0.9584773114470639


In [78]:
def InfoGain(data,split_attribute_name,target_name="Survived"):
    """
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The default for this example is "class"
    """    
    #Calculate the entropy of the total dataset

    parent_entropy = cal_entropy(data[target_name])
    print(f"Total dataset/Parent Entropy: {parent_entropy}")

    #Calculate the values and the corresponding counts for the split attribute 
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    print(f"Split attribute Name: {split_attribute_name}")
    print(f"Split attribute values: {vals}")
    print(f"Split attribute values counts: {counts}") 

    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*cal_entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    print(f"Weighted Entropy: {Weighted_Entropy}")

    #Calculate the information gain
    print(f"Calculating Information Gain using: {parent_entropy} - {Weighted_Entropy}")
    Information_Gain = parent_entropy - Weighted_Entropy
    print(f"Information Gain: {Information_Gain}")
    return Information_Gain
       

In [79]:
#Test the function 
InfoGain(data=train,split_attribute_name="Pclass",target_name="Survived") 

Total dataset/Parent Entropy: 0.9584773114470639
Split attribute Name: Pclass
Split attribute values: [0 1 2]
Split attribute values counts: [176 144 392]
Weighted Entropy: 0.8741116537872167
Calculating Information Gain using: 0.9584773114470639 - 0.8741116537872167
Information Gain: 0.08436565765984727


0.08436565765984727

In [80]:
def ID3(data,originaldata,features,target_attribute_name="Survived",parent_node_class = None):
    pass

In [81]:
tree = ID3(train,train,train.columns[:-1])
pprint(tree)

None
