In [1]:
import pandas as pd
import math

In [2]:

# Training set for the "buys computer" example, written one row per observation
# so each tuple reads as a single labelled training example.
df = pd.DataFrame(
    [
        ('<=30', 'High', 'No', 'Fair', 'No'),
        ('<=30', 'High', 'No', 'Excellent', 'No'),
        ('31…40', 'High', 'No', 'Fair', 'Yes'),
        ('>40', 'Medium', 'No', 'Fair', 'Yes'),
        ('>40', 'Low', 'Yes', 'Fair', 'Yes'),
        ('>40', 'Low', 'Yes', 'Excellent', 'No'),
        ('31…40', 'Low', 'Yes', 'Excellent', 'Yes'),
        ('<=30', 'Medium', 'No', 'Fair', 'No'),
        ('<=30', 'Low', 'Yes', 'Fair', 'Yes'),
        ('>40', 'Medium', 'Yes', 'Fair', 'Yes'),
        ('<=30', 'Medium', 'Yes', 'Excellent', 'Yes'),
        ('31…40', 'Medium', 'No', 'Excellent', 'Yes'),
        ('31…40', 'High', 'Yes', 'Fair', 'Yes'),
        ('>40', 'Medium', 'No', 'Excellent', 'No'),
    ],
    columns=['Age', 'Income', 'Student', 'Credit_rating', 'Buys_computer'],
)

# Column-oriented view of the same records (column name -> list of values).
data = df.to_dict(orient='list')

df


Unnamed: 0,Age,Income,Student,Credit_rating,Buys_computer
0,<=30,High,No,Fair,No
1,<=30,High,No,Excellent,No
2,31…40,High,No,Fair,Yes
3,>40,Medium,No,Fair,Yes
4,>40,Low,Yes,Fair,Yes
5,>40,Low,Yes,Excellent,No
6,31…40,Low,Yes,Excellent,Yes
7,<=30,Medium,No,Fair,No
8,<=30,Low,Yes,Fair,Yes
9,>40,Medium,Yes,Fair,Yes


In [4]:
def entropy(data, target='Buys_computer'):
    """Return the Shannon entropy, in bits, of the class-label column of `data`.

    The entropy is computed over every distinct label found in `target`, so it
    works for binary and multi-class labels alike rather than only for the
    literal values 'Yes' and 'No'.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to measure. Must contain the `target` column unless it is empty.
    target : str, default 'Buys_computer'
        Name of the column holding the class label.

    Returns
    -------
    int or float
        0 when `data` has no rows or at most one distinct label; otherwise
        ``-sum(p * log2(p))`` over the labels present, where ``p`` is each
        label's share of the labelled rows.
    """
    if len(data) == 0:
        return 0

    # value_counts() skips missing labels by default; zero counts (which can
    # appear for unused categories of a categorical column) are dropped so
    # that log2 is never evaluated at 0.
    class_counts = [count for count in data[target].value_counts().tolist() if count > 0]
    if len(class_counts) <= 1:
        return 0

    labelled_rows = sum(class_counts)
    return -sum(
        (count / labelled_rows) * math.log2(count / labelled_rows)
        for count in class_counts
    )

In [5]:
def information_gain(data, attribute):
    """Return how much splitting `data` on `attribute` reduces label entropy.

    The gain is the entropy of `data` minus the size-weighted average entropy
    of the partitions obtained by grouping rows on each distinct value of
    `attribute`.
    """
    n_rows = len(data)
    partitions = (
        data[data[attribute] == level]
        for level in data[attribute].unique()
    )
    # Each partition contributes its entropy scaled by its share of the rows.
    remainder = sum(
        (len(part) / n_rows) * entropy(part)
        for part in partitions
    )
    return entropy(data) - remainder


In [6]:
def build_decision_tree(data, features):
    """Recursively build a decision tree for the 'Buys_computer' label.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain 'Buys_computer' and every name in `features`.
    features : list of str
        Attribute names that may still be used for splitting.

    Returns
    -------
    str or dict
        A class label for a leaf, or ``{attribute: {value: subtree, ...}}``
        for an internal node, where the attribute is the one with the highest
        information gain on `data`.
    """
    # Leaf: nothing to learn from, fall back to "No".
    if len(data) == 0:
        return "No"

    labels = data['Buys_computer']

    # Leaf: every remaining row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Leaf: no attribute left to split on, take the most frequent label.
    if len(features) == 0:
        return labels.mode().iloc[0]

    print('---------------*-------------')
    split_on = max(features, key=lambda name: information_gain(data, name))
    # The chosen attribute is not reused further down this branch.
    remaining = [name for name in features if name != split_on]
    return {
        split_on: {
            level: build_decision_tree(data[data[split_on] == level], remaining)
            for level in data[split_on].unique()
        }
    }


In [7]:
# Every column is a candidate split attribute except the class label itself.
features = df.columns.tolist()
print('All features are:', features)
features.remove('Buys_computer')
print('Features into consideration are:', features)

All features are: ['Age', 'Income', 'Student', 'Credit_rating', 'Buys_computer']
Features into consideration are: ['Age', 'Income', 'Student', 'Credit_rating']


In [8]:
decision_tree = build_decision_tree(df, features)
# Leaving the tree as the cell's last expression displays the nested dict.
decision_tree

---------------*-------------
---------------*-------------
---------------*-------------


{'Age': {'<=30': {'Student': {'No': 'No', 'Yes': 'Yes'}},
  '31…40': 'Yes',
  '>40': {'Credit_rating': {'Fair': 'Yes', 'Excellent': 'No'}}}}

In [9]:
def formatData(t,s):
    """Print the nested structure `t` as a tab-indented outline.

    `s` is the current depth: every line is prefixed with `s` tab characters.
    A value that is neither a dict nor a list is printed as a single line.
    For a dict or list, each key/item is printed followed by a connector line;
    dict values are then printed one level deeper, while list items are not
    descended into.
    """
    indent = "\t" * s

    if not isinstance(t, (dict, list)):
        print(indent + str(t))
        return

    descend = not isinstance(t, list)
    for key in t:
        print(indent + str(key))
        print(indent + "    \\")
        if descend:
            formatData(t[key], s + 1)

# Print the learned tree as an indented outline, starting at depth 0.
formatData(decision_tree,0)

Age
    \
	<=30
	    \
		Student
		    \
			No
			    \
				No
			Yes
			    \
				Yes
	31…40
	    \
		Yes
	>40
	    \
		Credit_rating
		    \
			Fair
			    \
				Yes
			Excellent
			    \
				No


In [13]:
def _age_bucket(years):
    """Translate an age in years into the categorical label used by the 'Age' column."""
    if years > 40:
        return '>40'
    if years > 30:
        return '31…40'
    return '<=30'


def _ask_branch(attribute, branches):
    """Prompt the user for `attribute` and return the matching key of `branches`.

    Parameters
    ----------
    attribute : str
        Name of the attribute tested at the current tree node.
    branches : dict
        Mapping from attribute value to subtree (or leaf label) at that node.

    Returns
    -------
    The key of `branches` selected by the user's answer. Answers are matched
    exactly first, then ignoring letter case and surrounding whitespace.

    Raises
    ------
    ValueError
        If the age entered is not an integer (raised by ``int``).
    KeyError
        If the answer does not correspond to any branch of this node.
    """
    if attribute == 'Age':
        # Ages are entered as a number and mapped to the dataset's buckets.
        answer = _age_bucket(int(input("Enter age: ")))
    elif attribute == 'Student':
        answer = input("Enter student or not: ")
    elif attribute == 'Credit_rating':
        answer = input("Enter credit rating: ")
    else:
        answer = input(f"Enter {attribute}: ")

    if answer in branches:
        return answer

    # Be forgiving about case and stray spaces, e.g. "excellent " -> 'Excellent'.
    normalized = answer.strip().lower()
    for label in branches:
        if str(label).lower() == normalized:
            return label

    raise KeyError(
        f"{answer!r} is not a known value of {attribute}; "
        f"expected one of {list(branches)}"
    )


# Walk from the root to a leaf. Internal nodes are single-key dicts of the form
# {attribute: {value: subtree}}; anything that is not a dict is the predicted label.
classified = decision_tree
while isinstance(classified, dict):
    attribute, branches = next(iter(classified.items()))
    classified = branches[_ask_branch(attribute, branches)]

print(f'Buys Computer: {classified}')

Enter age: 45
Enter credit rating: Excellent
Buys Computer: No
