<a href="https://colab.research.google.com/github/Ayush-hm/AI-Toolbox/blob/main/Tree%20Based%20Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
# Toy dataset: five samples with one numeric feature (`age`) and a 0/1 target (`label`).
df = pd.DataFrame(
    {
        'age': [10, 12, 14, 16, 18],
        'label': [0, 0, 1, 1, 1],
    }
)

In [3]:
# Display the whole frame as the cell's output.
df

Unnamed: 0,age,label
0,10,0
1,12,0
2,14,1
3,16,1
4,18,1


In [4]:
def gini(labels):
  """Return the Gini impurity of a collection of class labels.

  The impurity is ``1 - sum(p_k ** 2)`` over the classes ``k`` present in
  ``labels``, where ``p_k`` is the fraction of samples in class ``k``.
  Class fractions are derived from per-class counts, so the result is valid
  for any hashable labels (0/1, several integer classes, strings, ...) and
  not only for binary 0/1 targets.

  Parameters
  ----------
  labels : sized iterable of hashable values
      Class label of every sample in the node (e.g. a list or a Series).

  Returns
  -------
  float or int
      0 for an empty or pure node; larger values mean more mixed labels.
  """
  # Local import keeps this cell runnable on its own.
  from collections import Counter

  n=len(labels)
  if n==0:
    # An empty node has nothing to mislabel; treat it as pure.
    return 0
  class_counts=Counter(labels)
  return 1-sum((count/n)**2 for count in class_counts.values())

Gini = 0 → the node is pure: every sample has the same label, so guessing a label according to the node’s class proportions is never wrong.

Higher Gini → more mixed labels → a higher chance of mislabeling a sample if you guess according to the node’s class proportions. For two classes the maximum is 0.5, reached at a 50/50 mix.

In [5]:
# Impurity of the whole label column, i.e. the node before any split.
labels = df['label']
gini(labels)

0.48

In [9]:
# Gini impurity of the full label set, i.e. the node's impurity before any split.
G_before=gini(labels)

def split_gini(threshold):
  """Return the Gini gain of splitting the global ``df`` on ``age <= threshold``.

  The gain is the impurity of the unsplit node minus the size-weighted
  impurity of the two children. The parent impurity is computed here from
  the current ``df`` instead of being read from a module-level variable, so
  the result cannot go stale when ``df`` changes or cells run out of order.

  Parameters
  ----------
  threshold : number
      Rows with ``age <= threshold`` go to the left child, rows with
      ``age > threshold`` go to the right child.

  Returns
  -------
  float or int
      Impurity decrease achieved by the split; 0 when ``df`` has no rows.
  """
  n=len(df)
  if n==0:
    # Nothing to split; avoids dividing by zero below.
    return 0
  parent_impurity=gini(df['label'])
  left=df[df['age']<=threshold]
  right=df[df['age']>threshold]
  GL=gini(left['label'])
  GR=gini(right['label'])
  # Each child's impurity is weighted by its share of the samples.
  G_after= GL*len(left)/n+GR*len(right)/n
  return parent_impurity-G_after

# Print the Gini gain for a few hand-picked thresholds.
for candidate in (11, 13, 15, 17):
  print(split_gini(candidate))


0.18
0.48
0.21333333333333332
0.07999999999999996


In [11]:
import numpy as np
# Candidate thresholds: midpoints between consecutive distinct ages.
ages = sorted(df['age'].unique())
thresholds = [(lower + upper) / 2 for lower, upper in zip(ages, ages[1:])]
thresholds

[np.float64(11.0), np.float64(13.0), np.float64(15.0), np.float64(17.0)]

In [12]:
# Linear scan for the threshold with the highest gain; the first one wins ties.
best = None
for t in thresholds:
  scored = (t, split_gini(t))
  if best is None or scored[1] > best[1]:
    best = scored
best

(np.float64(13.0), 0.48)

In [14]:
# Add a second numeric feature, `income`, alongside `age`.
df['income'] = [1200, 1500, 2700, 3900, 4800]
df.head()

Unnamed: 0,age,label,income
0,10,0,1200
1,12,0,1500
2,14,1,2700
3,16,1,3900
4,18,1,4800


In [17]:
def best_feature_split(df,feature,target='label'):
  """Find the threshold on one feature that gives the largest Gini gain.

  Candidate thresholds are the midpoints between consecutive distinct values
  of ``df[feature]``. For each candidate, rows with ``feature <= t`` form the
  left child and rows with ``feature > t`` the right child; the gain is the
  parent impurity minus the size-weighted impurity of the children.

  Parameters
  ----------
  df : pandas.DataFrame
      Data for the node being split.
  feature : str
      Name of the column to split on.
  target : str, default 'label'
      Name of the column holding the class labels.

  Returns
  -------
  dict or None
      ``{'gain': ..., 'feature': ..., 'threshold': ...}`` for the best
      candidate (the lowest threshold wins ties), or ``None`` when the
      feature has fewer than two distinct values and cannot be split.
  """
  values=np.sort(df[feature].unique())
  if len(values)<2:
    # A constant (or empty) feature offers no place to split.
    return None

  # Loop-invariant quantities are computed once, outside the loop.
  n=len(df)
  G_before=gini(df[target])
  thresholds=(values[:-1]+values[1:])/2

  best=None
  for t in thresholds:
    left=df.loc[df[feature]<=t,target]
    right=df.loc[df[feature]>t,target]
    # Each child's impurity is weighted by its share of the samples.
    G_after= gini(left)*len(left)/n+gini(right)*len(right)/n

    gain=G_before-G_after
    # Strict '>' keeps the first (lowest) threshold among equal gains.
    if best is None or gain>best['gain']:
      best={'gain':gain,'feature':feature,'threshold':t}
  return best



In [18]:
# Best single split on the `age` column.
best_feature_split(df, 'age')

{'gain': 0.48, 'feature': 'age', 'threshold': np.float64(13.0)}

In [20]:
def best_split_over_features(df, features):
    """Return the highest-gain split among the given features.

    Each feature is scored with ``best_feature_split``; features that cannot
    be split (it returned ``None``) are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Data for the node being split.
    features : iterable of str
        Column names to consider.

    Returns
    -------
    dict or None
        The candidate dict with the largest ``'gain'`` (the earliest feature
        wins ties), or ``None`` when ``features`` is empty or no feature can
        be split.
    """
    per_feature = (best_feature_split(df, f) for f in features)
    candidates = [c for c in per_feature if c is not None]
    # default=None: with no splittable feature, report "no split" the same way
    # best_feature_split does instead of letting max() raise ValueError.
    return max(candidates, key=lambda c: c['gain'], default=None)


In [22]:
# Search the listed features (only `age` here) for the best split.
best_split_over_features(df, ['age'])

{'gain': 0.48, 'feature': 'age', 'threshold': np.float64(13.0)}