In [48]:
import pandas as pd

# Raw dataset from playCricket.pdf
play_data = [
    ["Sunny","Hot","High","Weak","No"],
    ["Sunny","Hot","High","Strong","No"],
    ["Overcast","Hot","High","Weak","Yes"],
    ["Rain","Mild","High","Weak","Yes"],
    ["Rain","Cool","Normal","Weak","Yes"],
    ["Rain","Cool","Normal","Strong","No"],
    ["Overcast","Cool","Normal","Strong","Yes"],
    ["Sunny","Mild","High","Weak","No"],
    ["Sunny","Cool","Normal","Weak","Yes"],
    ["Rain","Mild","Normal","Weak","Yes"],
    ["Sunny","Mild","Normal","Strong","Yes"],
    ["Overcast","Mild","High","Strong","Yes"],
    ["Overcast","Hot","Normal","Weak","Yes"],
    ["Rain","Mild","High","Strong","No"]
]

df_play = pd.DataFrame(play_data, columns=["Outlook","Temperature","Humidity","Wind","Play"])
df_play.head()


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [49]:
from collections import Counter
from math import log2

def entropy(y):
    total = len(y)
    counts = Counter(y)
    ent = 0
    for c in counts.values():
        p = c/total
        ent -= p*log2(p)
    return ent

print("Entropy of target Play =", round(entropy(df_play["Play"]),3))


Entropy of target Play = 0.94


In [50]:
def info_gain(X, y, feature, verbose=True):
    base = entropy(y)
    n = len(y)
    weighted = 0
    if verbose:
        print(f"\nFeature: {feature}, Parent entropy = {base:.3f}")
    for v in X[feature].unique():
        sub = y[X[feature]==v]
        e = entropy(sub)
        weighted += (len(sub)/n) * e
        if verbose:
            print(f"  {feature}={v} | size={len(sub)}, entropy={e:.3f}")
    ig = base - weighted
    if verbose: print(f"  IG = {ig:.3f}")
    return ig

def split_info(X, feature):
    n = len(X)
    si = 0
    for v in X[feature].unique():
        p = len(X[X[feature]==v])/n
        if p>0: si -= p*log2(p)
    return si

def gain_ratio(X, y, feature, verbose=True):
    ig = info_gain(X,y,feature,verbose)
    si = split_info(X,feature)
    gr = ig/si if si>0 else 0
    if verbose: print(f"  SplitInfo = {si:.3f}, GainRatio = {gr:.3f}")
    return gr


In [51]:
def most_common(y):
    return Counter(y).most_common(1)[0][0]

def build_tree(X,y,algo="ID3",depth=0):
    if len(set(y))==1:
        return {"leaf":True,"label":y.iloc[0]}
    if X.shape[1]==0:
        return {"leaf":True,"label":most_common(y)}

    print("\n" + "="*50)
    print(f"Depth {depth} | Algorithm = {algo}")

    best_feat, best_score = None, -1
    for f in X.columns:
        score = info_gain(X,y,f,True) if algo=="ID3" else gain_ratio(X,y,f,True)
        if score > best_score:
            best_feat, best_score = f, score
    print(f">>> Best feature = {best_feat}, score = {best_score:.3f}")

    node = {"leaf":False,"feature":best_feat,"children":{}}
    for v in X[best_feat].unique():
        mask = (X[best_feat]==v)
        if mask.sum()==0:
            node["children"][v] = {"leaf":True,"label":most_common(y)}
        else:
            child = build_tree(X[mask].drop(columns=[best_feat]), y[mask], algo, depth+1)
            node["children"][v] = child
    return node


In [52]:
def predict_one(tree,row):
    if tree["leaf"]:
        return tree["label"]
    f = tree["feature"]
    v = row[f]
    if v in tree["children"]:
        return predict_one(tree["children"][v], row)
    else:
        return most_common([c["label"] for c in tree["children"].values() if c["leaf"]])

def predict(tree,X):
    return [predict_one(tree,row) for _,row in X.iterrows()]


In [53]:
import numpy as np

def accuracy(y_true,y_pred):
    return np.mean(np.array(y_true)==np.array(y_pred))

def precision_recall_f1(y_true,y_pred):
    labels = sorted(set(y_true))
    per_class = {}
    micro_tp = micro_fp = micro_fn = 0
    for l in labels:
        tp = sum((y_true==l)&(y_pred==l))
        fp = sum((y_true!=l)&(y_pred==l))
        fn = sum((y_true==l)&(y_pred!=l))
        p = tp/(tp+fp) if tp+fp>0 else 0
        r = tp/(tp+fn) if tp+fn>0 else 0
        f = (2*p*r)/(p+r) if p+r>0 else 0
        per_class[l] = (p,r,f)
        micro_tp += tp; micro_fp += fp; micro_fn += fn

    macro_f = np.mean([v[2] for v in per_class.values()])
    micro_p = micro_tp/(micro_tp+micro_fp) if micro_tp+micro_fp>0 else 0
    micro_r = micro_tp/(micro_tp+micro_fn) if micro_tp+micro_fn>0 else 0
    micro_f = (2*micro_p*micro_r)/(micro_p+micro_r) if micro_p+micro_r>0 else 0

    return per_class, macro_f, micro_f


In [54]:
def manual_kfold(df,k=5,target="Play",algo="ID3"):
    X = df.drop(columns=[target])
    y = df[target].reset_index(drop=True)
    n = len(df)
    idx = np.arange(n)
    np.random.seed(0)
    np.random.shuffle(idx)

    fold_size = n//k
    accs, macros, micros = [], [], []
    for i in range(k):
        start, end = i*fold_size, (i+1)*fold_size
        test_idx = idx[start:end]
        train_idx = np.setdiff1d(idx,test_idx)

        Xtr, ytr = X.iloc[train_idx], y.iloc[train_idx]
        Xte, yte = X.iloc[test_idx], y.iloc[test_idx]

        tree = build_tree(Xtr,ytr,algo)
        ypred = np.array(predict(tree,Xte))

        acc = accuracy(yte.values,ypred)
        _, macro, micro = precision_recall_f1(yte.values,ypred)

        accs.append(acc); macros.append(macro); micros.append(micro)
        print(f"Fold {i+1} | Acc={acc:.2f}, Macro-F1={macro:.2f}, Micro-F1={micro:.2f}")

    print("\n=== Final Results (",algo,") ===")
    print("Avg Acc =", np.mean(accs))
    print("Avg Macro-F1 =", np.mean(macros))
    print("Avg Micro-F1 =", np.mean(micros))

print("\nID3 Evaluation:")
manual_kfold(df_play,5,"Play","ID3")

print("\nC4.5 Evaluation:")
manual_kfold(df_play,5,"Play","C4.5")



ID3 Evaluation:

Depth 0 | Algorithm = ID3

Feature: Outlook, Parent entropy = 0.980
  Outlook=Sunny | size=4, entropy=0.811
  Outlook=Overcast | size=3, entropy=0.000
  Outlook=Rain | size=5, entropy=0.971
  IG = 0.305

Feature: Temperature, Parent entropy = 0.980
  Temperature=Hot | size=4, entropy=1.000
  Temperature=Mild | size=6, entropy=0.918
  Temperature=Cool | size=2, entropy=1.000
  IG = 0.021

Feature: Humidity, Parent entropy = 0.980
  Humidity=High | size=7, entropy=0.985
  Humidity=Normal | size=5, entropy=0.722
  IG = 0.104

Feature: Wind, Parent entropy = 0.980
  Wind=Weak | size=7, entropy=0.863
  Wind=Strong | size=5, entropy=0.971
  IG = 0.072
>>> Best feature = Outlook, score = 0.305

Depth 1 | Algorithm = ID3

Feature: Temperature, Parent entropy = 0.811
  Temperature=Hot | size=2, entropy=0.000
  Temperature=Mild | size=2, entropy=1.000
  IG = 0.311

Feature: Humidity, Parent entropy = 0.811
  Humidity=High | size=3, entropy=0.000
  Humidity=Normal | size=1, entr

In [55]:
def print_tree(tree,indent=""):
    if tree["leaf"]:
        print(indent,"→",tree["label"])
    else:
        f = tree["feature"]
        for v,ch in tree["children"].items():
            print(indent,f,"=",v)
            print_tree(ch,indent+"   ")

print("\nFinal ID3 Tree on full data:")
id3_tree = build_tree(df_play.drop(columns=["Play"]), df_play["Play"], "ID3")
print_tree(id3_tree)

print("\nFinal C4.5 Tree on full data:")
c45_tree = build_tree(df_play.drop(columns=["Play"]), df_play["Play"], "C4.5")
print_tree(c45_tree)



Final ID3 Tree on full data:

Depth 0 | Algorithm = ID3

Feature: Outlook, Parent entropy = 0.940
  Outlook=Sunny | size=5, entropy=0.971
  Outlook=Overcast | size=4, entropy=0.000
  Outlook=Rain | size=5, entropy=0.971
  IG = 0.247

Feature: Temperature, Parent entropy = 0.940
  Temperature=Hot | size=4, entropy=1.000
  Temperature=Mild | size=6, entropy=0.918
  Temperature=Cool | size=4, entropy=0.811
  IG = 0.029

Feature: Humidity, Parent entropy = 0.940
  Humidity=High | size=7, entropy=0.985
  Humidity=Normal | size=7, entropy=0.592
  IG = 0.152

Feature: Wind, Parent entropy = 0.940
  Wind=Weak | size=8, entropy=0.811
  Wind=Strong | size=6, entropy=1.000
  IG = 0.048
>>> Best feature = Outlook, score = 0.247

Depth 1 | Algorithm = ID3

Feature: Temperature, Parent entropy = 0.971
  Temperature=Hot | size=2, entropy=0.000
  Temperature=Mild | size=2, entropy=1.000
  Temperature=Cool | size=1, entropy=0.000
  IG = 0.571

Feature: Humidity, Parent entropy = 0.971
  Humidity=High 