In [31]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log2

# Load CSV (already cleaned) from the current working directory.
df_drug = pd.read_csv("drug_200.csv")

# Print the (rows, columns) shape, then show the first rows as the cell output.
print("Dataset shape:", df_drug.shape)
df_drug.head()


Dataset shape: (200, 6)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [32]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns as integer codes. Each fitted encoder is kept in
# le_dict (keyed by column name) so its classes_ can be inspected afterwards.
# NOTE: the encoded values overwrite the columns of df_drug in place, so running
# this cell a second time would fit the encoders on the integer codes rather
# than on the original string labels.
le_dict = {}
for col in ["Sex","BP","Cholesterol","Drug"]:   # leave Age, Na_to_K as numeric
    le = LabelEncoder()
    df_drug[col] = le.fit_transform(df_drug[col])
    le_dict[col] = le

# Show the original Drug labels known to the encoder, then preview the encoded frame.
print("Unique classes for Drug:", le_dict["Drug"].classes_)
df_drug.head()


Unique classes for Drug: ['drugA' 'drugB' 'drugC' 'drugX' 'drugY']


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4


In [33]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Parameters
    ----------
    y : iterable of hashable labels with a length (list, pandas Series, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. A pure or empty sample yields exactly ``0.0``.
    """
    total = len(y)
    counts = Counter(y)
    h = -sum((c/total)*log2(c/total) for c in counts.values())
    # Negating a zero sum gives -0.0 for pure samples (printed as "-0.000") and
    # the int 0 for empty ones; normalise both to a plain positive 0.0.
    return h if h > 0 else 0.0


In [34]:
def info_gain(X, y, feature, verbose=True, continuous_features=("Age","Na_to_K")):
    """Compute the information gain of splitting ``y`` on ``feature``.

    Parameters
    ----------
    X : pandas DataFrame holding the feature columns.
    y : pandas Series of labels, indexed like ``X`` (it is filtered with
        boolean masks derived from ``X``).
    feature : name of the column in ``X`` to evaluate.
    verbose : when true, print the parent entropy and the per-split details.
    continuous_features : names of the columns to treat as continuous. A column
        is searched for a binary threshold only if it is listed here and has a
        numeric dtype; every other column gets a multiway split on its distinct
        values. The default keeps the original hard-coded pair.

    Returns
    -------
    (gain, threshold)
        For a continuous feature, the best gain over all ``<= t`` / ``> t``
        splits and the corresponding ``t``; ``(0, None)`` if no threshold gives
        a strictly positive gain. For any other feature, the multiway gain and
        ``None``.
    """
    base = entropy(y)
    n = len(y)
    column = X[feature]
    best_ig, best_split = 0, None
    if verbose:
        print(f"\nFeature: {feature}, Parent entropy = {base:.3f}")

    # is_numeric_dtype accepts every integer/float width, so int32 or float32
    # columns are no longer silently routed to the categorical branch.
    if feature in continuous_features and pd.api.types.is_numeric_dtype(column):
        for t in sorted(column.unique()):
            left, right = y[column<=t], y[column>t]
            # A threshold that leaves one side empty (e.g. the maximum) is not a split.
            if len(left)==0 or len(right)==0: continue
            weighted = (len(left)/n)*entropy(left) + (len(right)/n)*entropy(right)
            ig = base - weighted
            if ig > best_ig:
                best_ig, best_split = ig, t
        if verbose: print(f"  Best split at {best_split}, IG = {best_ig:.3f}")
        return best_ig, best_split
    else:
        weighted = 0
        for v in column.unique():
            sub = y[column==v]
            e = entropy(sub)
            weighted += (len(sub)/n)*e
            if verbose:
                print(f"  {feature}={v} | size={len(sub)}, entropy={e:.3f}")
        ig = base - weighted
        if verbose: print(f"  IG = {ig:.3f}")
        return ig, None

def split_info(X, feature, threshold=None):
    """Return the split information (entropy of the partition sizes) of a split.

    With ``threshold=None`` the rows of ``X`` are partitioned by each distinct
    value of ``feature``; otherwise they are partitioned into ``<= threshold``
    and ``> threshold``. Empty partitions contribute nothing.
    """
    total_rows = len(X)
    column = X[feature]

    # Collect the size of every partition first, then fold them into the sum.
    if threshold is None:
        part_sizes = [len(X[column == value]) for value in column.unique()]
    else:
        part_sizes = [len(X[column <= threshold]), len(X[column > threshold])]

    info = 0
    for size in part_sizes:
        fraction = size / total_rows
        if fraction > 0:
            info -= fraction * log2(fraction)
    return info

def gain_ratio(X, y, feature, verbose=True):
    """Return ``(gain ratio, threshold)`` for splitting ``y`` on ``feature``.

    The gain ratio is the information gain divided by the split information of
    the same split; it is 0 when the split information is not positive. The
    threshold is whatever ``info_gain`` reported for the feature.
    """
    gain, threshold = info_gain(X,y,feature,verbose)
    si = split_info(X,feature,threshold)

    # Guard against dividing by a zero split information.
    if si>0:
        gr = gain/si
    else:
        gr = 0

    if verbose:
        print(f"  SplitInfo = {si:.3f}, GainRatio = {gr:.3f}")
    return gr, threshold


In [35]:
def most_common(y):
    """Return the label that occurs most often in ``y``."""
    ranked = Counter(y).most_common(1)
    top_label, _occurrences = ranked[0][0], ranked[0][1]
    return top_label

def build_tree(X,y,algo="ID3",depth=0,verbose=True):
    """Recursively grow a decision tree as nested dicts.

    Parameters
    ----------
    X : pandas DataFrame of features.
    y : pandas Series of labels, indexed like ``X``.
    algo : ``"ID3"`` scores features with ``info_gain``; any other value
        scores them with ``gain_ratio``.
    depth : depth of the current node, used only in the log output.
    verbose : when true (the default, matching the previous behaviour), print
        the per-feature scores and the chosen split at every node. Pass
        ``False`` to build the tree silently.

    Returns
    -------
    dict
        A leaf ``{"leaf": True, "label": ...}`` or an internal node
        ``{"leaf": False, "feature", "threshold", "children"}``. When
        ``threshold`` is ``None`` the children are keyed by feature value;
        otherwise by ``"<="`` and ``">"``.
    """
    # Stop when the node is pure or when there is nothing left to split on.
    if len(set(y))==1:
        return {"leaf":True,"label":y.iloc[0]}
    if X.shape[1]==0:
        return {"leaf":True,"label":most_common(y)}

    if verbose:
        print("\n" + "="*50)
        print(f"Depth {depth} | Algorithm = {algo}")

    score_feature = info_gain if algo=="ID3" else gain_ratio
    best_feat, best_score, best_thresh = None,-1,None
    for f in X.columns:
        score,th = score_feature(X,y,f,verbose)
        # Strict comparison: on ties the earliest column wins.
        if score > best_score:
            best_feat, best_score, best_thresh = f, score, th
    if verbose:
        print(f">>> Best feature = {best_feat}, threshold = {best_thresh}, score = {best_score:.3f}")

    node = {"leaf":False,"feature":best_feat,"threshold":best_thresh,"children":{}}
    if best_thresh is None:
        # Multiway split: the feature is consumed and dropped for the subtree.
        for v in X[best_feat].unique():
            mask = (X[best_feat]==v)
            child = build_tree(X[mask].drop(columns=[best_feat]), y[mask], algo, depth+1, verbose)
            node["children"][v] = child
    else:
        # Binary threshold split: the feature stays available for further splits.
        mask_left = (X[best_feat]<=best_thresh)
        mask_right = (X[best_feat]>best_thresh)
        node["children"]["<="] = build_tree(X[mask_left], y[mask_left], algo, depth+1, verbose)
        node["children"][">"] = build_tree(X[mask_right], y[mask_right], algo, depth+1, verbose)
    return node


In [36]:
def _descendant_leaf_labels(node):
    """Collect the labels of every leaf reachable from ``node``."""
    labels, pending = [], [node]
    while pending:
        current = pending.pop()
        if current["leaf"]:
            labels.append(current["label"])
        else:
            pending.extend(current["children"].values())
    return labels

def predict_one(tree,row):
    """Predict the label of a single sample by walking ``tree``.

    Parameters
    ----------
    tree : node dict with a ``"leaf"`` flag; leaves carry ``"label"``, internal
        nodes carry ``"feature"``, ``"threshold"`` and ``"children"``.
    row : mapping from feature name to value (e.g. a pandas Series or a dict).

    Returns
    -------
    The label of the leaf that is reached. If a categorical node has no branch
    for the sample's value, the most frequent label among that node's direct
    leaf children is returned; if it has no direct leaf children, the most
    frequent label among all leaves below the node is used instead.
    """
    if tree["leaf"]: return tree["label"]
    f,th = tree["feature"], tree["threshold"]
    children = tree["children"]
    if th is None:
        v = row[f]
        if v in children:
            return predict_one(children[v], row)
        # Value never seen at this node during training: fall back to a vote.
        labels = [c["label"] for c in children.values() if c["leaf"]]
        if not labels:
            # Previously this case indexed into an empty list and raised IndexError.
            labels = _descendant_leaf_labels(tree)
        return Counter(labels).most_common(1)[0][0]
    else:
        branch = "<=" if row[f]<=th else ">"
        return predict_one(children[branch], row)

def predict(tree,X):
    """Return a list with the predicted label for every row of ``X``, in row order."""
    predictions = []
    for _index, record in X.iterrows():
        predictions.append(predict_one(tree, record))
    return predictions


In [37]:
def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``."""
    truth = np.array(y_true)
    guess = np.array(y_pred)
    hits = truth == guess
    return np.mean(hits)

def precision_recall_f1(y_true,y_pred):
    """Compute per-class precision/recall/F1 plus macro- and micro-averaged F1.

    Parameters
    ----------
    y_true, y_pred : equal-length 1-D sequences of labels (lists, NumPy arrays
        or pandas Series).

    Returns
    -------
    (per_class, macro_f, micro_f)
        ``per_class`` maps each label to ``(precision, recall, f1)``;
        ``macro_f`` is the unweighted mean of the per-class F1 scores;
        ``micro_f`` is the F1 of the pooled TP/FP/FN counts. Any ratio with a
        zero denominator is reported as 0.
    """
    # Convert up front so element-wise comparison also works for plain lists.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Use every label seen in either vector: a class that is only ever
    # predicted still produces false positives that the averages must count.
    labels = sorted(set(y_true.tolist()) | set(y_pred.tolist()))
    per_class = {}
    micro_tp = micro_fp = micro_fn = 0
    for l in labels:
        is_true = (y_true==l)
        is_pred = (y_pred==l)
        tp = int(np.sum(is_true & is_pred))
        fp = int(np.sum(~is_true & is_pred))
        fn = int(np.sum(is_true & ~is_pred))
        p = tp/(tp+fp) if tp+fp>0 else 0
        r = tp/(tp+fn) if tp+fn>0 else 0
        f = (2*p*r)/(p+r) if p+r>0 else 0
        per_class[l] = (p,r,f)
        micro_tp += tp; micro_fp += fp; micro_fn += fn

    # Averaging an empty list would give NaN; report 0 like the other ratios.
    macro_f = np.mean([v[2] for v in per_class.values()]) if per_class else 0
    micro_p = micro_tp/(micro_tp+micro_fp) if micro_tp+micro_fp>0 else 0
    micro_r = micro_tp/(micro_tp+micro_fn) if micro_tp+micro_fn>0 else 0
    micro_f = (2*micro_p*micro_r)/(micro_p+micro_r) if micro_p+micro_r>0 else 0

    return per_class, macro_f, micro_f


In [38]:
def manual_kfold(df,k=5,target="Drug",algo="ID3"):
    """Run k-fold cross-validation of the hand-written decision tree and print the scores.

    Parameters
    ----------
    df : pandas DataFrame containing the feature columns and the target column.
    k : number of folds; must satisfy ``2 <= k <= len(df)``.
    target : name of the label column.
    algo : passed to ``build_tree`` (``"ID3"`` or anything else for gain ratio).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of rows.

    The row order is shuffled with a fixed seed of 0, so every call uses the
    same folds. Per-fold and averaged accuracy, macro-F1 and micro-F1 are
    printed; nothing is returned.
    """
    n = len(df)
    if k < 2 or k > n:
        raise ValueError(f"k must be between 2 and the number of rows ({n}), got {k}")

    # Reset both indices so boolean masks built from X line up with y even
    # when df does not carry a default 0..n-1 index.
    X = df.drop(columns=[target]).reset_index(drop=True)
    y = df[target].reset_index(drop=True)

    # A private RandomState seeded with 0 yields the same shuffle as seeding
    # the global generator, without disturbing other code's random state.
    idx = np.arange(n)
    np.random.RandomState(0).shuffle(idx)

    accs, macros, micros = [], [], []
    # array_split spreads any remainder over the first folds, so every sample
    # lands in exactly one test fold even when n is not a multiple of k.
    for i, test_idx in enumerate(np.array_split(idx, k)):
        train_idx = np.setdiff1d(idx,test_idx)

        Xtr,ytr = X.iloc[train_idx], y.iloc[train_idx]
        Xte,yte = X.iloc[test_idx], y.iloc[test_idx]

        tree = build_tree(Xtr,ytr,algo)
        ypred = np.array(predict(tree,Xte))

        acc = accuracy(yte.values, ypred)
        _, macro, micro = precision_recall_f1(yte.values, ypred)
        accs.append(acc); macros.append(macro); micros.append(micro)

        print(f"Fold {i+1} | Acc={acc:.2f}, Macro-F1={macro:.2f}, Micro-F1={micro:.2f}")

    print("\n=== Final Results (",algo,") ===")
    print("Avg Acc =", np.mean(accs))
    print("Avg Macro-F1 =", np.mean(macros))
    print("Avg Micro-F1 =", np.mean(micros))

# Cross-validate the information-gain tree with 5 folds on the encoded data.
print("\nID3 Evaluation:")
manual_kfold(df_drug,5,"Drug","ID3")

# Repeat with the gain-ratio criterion (any algo other than "ID3" selects it).
print("\nC4.5 Evaluation:")
manual_kfold(df_drug,5,"Drug","C4.5")



ID3 Evaluation:

Depth 0 | Algorithm = ID3

Feature: Age, Parent entropy = 2.008
  Best split at 50, IG = 0.218

Feature: Sex, Parent entropy = 2.008
  Sex=0 | size=75, entropy=1.924
  Sex=1 | size=85, entropy=2.070
  IG = 0.006

Feature: BP, Parent entropy = 2.008
  BP=0 | size=66, entropy=1.520
  BP=1 | size=50, entropy=1.519
  BP=2 | size=44, entropy=0.962
  IG = 0.642

Feature: Cholesterol, Parent entropy = 2.008
  Cholesterol=0 | size=86, entropy=2.060
  Cholesterol=1 | size=74, entropy=1.742
  IG = 0.095

Feature: Na_to_K, Parent entropy = 2.008
  Best split at 14.642, IG = 0.993
>>> Best feature = Na_to_K, threshold = 14.642, score = 0.993

Depth 1 | Algorithm = ID3

Feature: Age, Parent entropy = 1.846
  Best split at 50, IG = 0.397

Feature: Sex, Parent entropy = 1.846
  Sex=1 | size=50, entropy=1.858
  Sex=0 | size=38, entropy=1.825
  IG = 0.002

Feature: BP, Parent entropy = 1.846
  BP=1 | size=26, entropy=1.000
  BP=2 | size=27, entropy=-0.000
  BP=0 | size=35, entropy=0.9

In [39]:
def print_tree(tree,indent=""):
    """Print ``tree`` as an indented outline, one condition or leaf label per line.

    Leaves are printed as ``→ label``. Internal nodes print one line per branch
    (``feature = value`` for multiway splits, ``feature <= t`` / ``feature > t``
    for threshold splits), each followed by its subtree indented three more spaces.
    """
    if tree["leaf"]:
        print(indent,"→",tree["label"])
        return

    feature_name = tree["feature"]
    cut = tree["threshold"]
    branches = tree["children"]
    deeper = indent+"   "

    if cut is not None:
        # Threshold node: always the "<=" branch first, then the ">" branch.
        for op in ("<=", ">"):
            print(indent,feature_name,op,cut)
            print_tree(branches[op],deeper)
        return

    # Multiway node: one branch per stored feature value, in insertion order.
    for value in branches:
        print(indent,feature_name,"=",value)
        print_tree(branches[value],deeper)

# Fit one tree per criterion on the complete dataset (no hold-out) and print its structure.
print("\nFinal ID3 Tree (full data):")
id3_tree = build_tree(df_drug.drop(columns=["Drug"]), df_drug["Drug"], "ID3")
print_tree(id3_tree)

# Same data, scored with gain ratio instead of raw information gain.
print("\nFinal C4.5 Tree (full data):")
c45_tree = build_tree(df_drug.drop(columns=["Drug"]), df_drug["Drug"], "C4.5")
print_tree(c45_tree)



Final ID3 Tree (full data):

Depth 0 | Algorithm = ID3

Feature: Age, Parent entropy = 1.969
  Best split at 50, IG = 0.195

Feature: Sex, Parent entropy = 1.969
  Sex=0 | size=96, entropy=1.865
  Sex=1 | size=104, entropy=2.050
  IG = 0.008

Feature: BP, Parent entropy = 1.969
  BP=0 | size=77, entropy=1.495
  BP=1 | size=64, entropy=1.527
  BP=2 | size=59, entropy=0.965
  IG = 0.620

Feature: Cholesterol, Parent entropy = 1.969
  Cholesterol=0 | size=103, entropy=2.041
  Cholesterol=1 | size=97, entropy=1.701
  IG = 0.093

Feature: Na_to_K, Parent entropy = 1.969
  Best split at 14.642, IG = 0.994
>>> Best feature = Na_to_K, threshold = 14.642, score = 0.994

Depth 1 | Algorithm = ID3

Feature: Age, Parent entropy = 1.788
  Best split at 50, IG = 0.358

Feature: Sex, Parent entropy = 1.788
  Sex=1 | size=60, entropy=1.850
  Sex=0 | size=49, entropy=1.695
  IG = 0.008

Feature: BP, Parent entropy = 1.788
  BP=1 | size=34, entropy=0.998
  BP=2 | size=36, entropy=-0.000
  BP=0 | size=3