<a href="https://colab.research.google.com/github/AnshulSingh-eZ/Machine_Learning_Lab_23CS067/blob/main/Exp_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [26]:
from ucimlrepo import fetch_ucirepo
adult = fetch_ucirepo(id=2)
X = adult.data.features # features (pandas DataFrame)
Y = adult.data.targets # target (pandas DataFrame)

In [54]:
print(X.dtypes)
print("_________________Y__________________")
print(Y.dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object
_________________Y__________________
income    object
dtype: object


In [55]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()

In [57]:
for col in X.columns:
  if X[col].dtype == 'object':
    X[col] = enc.fit_transform(X[col])

Y = enc.fit_transform(Y.values.ravel())
Y = Y.reshape(-1, 1)

X = X.astype(float)
Y = Y.astype(float)

In [30]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


# **Splitting data into train, test and validation dataset (60, 20, 20)**

In [66]:
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y,
    train_size = 0.6
)
X_test, X_val, Y_test, Y_val = train_test_split(
    X_temp, Y_temp,
    train_size = 0.5
)

In [67]:
X_train.shape

(29305, 14)

In [42]:
class Node(object):
  def __init__(self, feature=None, threshold=None, left=None, right=None, gain=None, value=None):
    self.feature = feature
    self.threshold = threshold
    self.left = left
    self.right = right
    self.gain = gain
    self.value = value

## **Decision Tree Class**

In [76]:
class decisionTree(object):
  def __init__(self, max_depth = 2, min_samples = 2):
    self.max_depth = max_depth
    self.min_samples = min_samples

  def split_dataset(self, dataset, feature, threshold):
    left_partition = []
    right_partition = []
    for r in dataset:
      if(r[feature] <= threshold):
        left_partition.append(r)
      else:
        right_partition.append(r)

    left_partition = np.array(left_partition)
    right_partition = np.array(right_partition)
    return left_partition, right_partition

  def entropy(self, y):
    entropy = 0
    labels = np.unique(y)
    for label in labels:
      total_labels = y[y==label]
      x = len(total_labels)/ len(y)
      entropy += -x*np.log2(x)
    return entropy

  def gini(self, y):
    labels = np.unique(y)
    gini = 1.0
    for label in labels:
        p = np.sum(y == label) / len(y)
        gini -= p**2
    return gini


  def info_gain(self, parent, right, left):
    parent_entropy = self.entropy(parent)
    right_entropy = self.entropy(right)
    left_entropy = self.entropy(left)
    right_wt = len(right)/len(parent)
    left_wt = len(left)/len(parent)
    weighted_entropy  = right_wt*right_entropy + left_wt*left_entropy
    infogain = parent_entropy - weighted_entropy
    return infogain

  def best_split(self, dataset, num_samples, num_features):
    best_split = {"gain":- 1, "feature": None, "threshold": None}
    for i in range(num_features):
      features_values = dataset[:, i]
      uniq_features_values = np.unique(features_values)
      for threshold in uniq_features_values:
        left_dataset, right_dataset = self.split_dataset(dataset, i, threshold)
        if len(left_dataset) > 0 and len(right_dataset) > 0:
          y, left_child, right_child = dataset[:,-1], left_dataset[:,-1], right_dataset[:,-1]
          information = self.info_gain(y, left_child, right_child)
          if(information>best_split["gain"]):
            best_split["gain"] = information
            best_split["feature"] = i
            best_split["threshold"] = threshold
            best_split["left_dataset"] = left_dataset
            best_split["right_dataset"] = right_dataset
    return best_split

  def calc_leaf_val(self, y):
    y = list(y)
    mode = max(y, key=y.count)
    return mode

  def build_tree(self, dataset, current_depth = 0):
    x, y  = dataset[:,:-1], dataset[:,-1]
    n_samples, n_features = x.shape
    if n_samples >= self.min_samples and current_depth <= self.max_depth:
      best_split = self.best_split(dataset, n_samples, n_features)
      if(best_split["gain"]):
        left_node = self.build_tree(best_split["left_dataset"], current_depth+1)
        right_node = self.build_tree(best_split["right_dataset"], current_depth+1)
        return Node(best_split["feature"], best_split["threshold"], left_node, right_node, best_split["gain"])
    leaf_val = self.calc_leaf_val(y)
    return Node(value=leaf_val)


  def fit(self, X, Y):
    dataset = np.concatenate((X, Y), axis=1)
    self.root = self.build_tree(dataset)

  def make_prediction(self, X, node):
    if node.value is not None:
      return node.value
    else:
      feature = X[node.feature]
      if feature<=node.threshold:
        return self.make_prediction(X, node.left)
      return self.make_prediction(X, node.right)

  def predict(self, X):
    predictions = []
    for x in X:
      prediction = self.make_prediction(x, self.root)
      predictions.append(prediction)
    np.array(predictions)
    return predictions

  def post_prune(self, node, X_val, Y_val):
    if node.left is not None and node.right is not None:
        self.post_prune(node.left, X_val, Y_val)
        self.post_prune(node.right, X_val, Y_val)

        backup_left = node.left
        backup_right = node.right
        backup_feature = node.feature
        backup_threshold = node.threshold
        backup_value = node.value

        node.left = None
        node.right = None
        node.feature = None
        node.threshold = None
        node.value = self.calc_leaf_val(self.get_node_labels(node, X_val, Y_val))

        y_pred = np.array([self.make_prediction(x, self.root) for x in X_val])
        acc_new = (y_pred == Y_val).mean()
        y_pred_orig = np.array([self.make_prediction(x, self.root) for x in X_val])
        acc_orig = (y_pred_orig == Y_val).mean()
        if acc_new < acc_orig:
            node.left = backup_left
            node.right = backup_right
            node.feature = backup_feature
            node.threshold = backup_threshold
            node.value = backup_value


In [74]:
def accuracy(y_true, y_pred):
    y_true = y_true.flatten()
    total_samples = len(y_true)
    correct_predictions = np.sum(y_true == y_pred)
    return (correct_predictions / total_samples)



## No tuning (with validation set) - Basic Pre Pruning - 56% accuracy

In [68]:
model = decisionTree()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
acc = accuracy(Y_test, predictions)
print(acc)




0.5611179361179361


# **Since Training took a lot of time, we will be training it again on a smaller dataset!!**

Tuning Hyperparameters using validation data

In [81]:
X = X[:500]
Y = Y[:500]

In [None]:
best_acc = 0
best_params = {}

for depth in [2, 4, 6, 8]:
    for min_s in [2, 5, 10]:
        model = decisionTree(max_depth=depth, min_samples=min_s)
        model.fit(X_train, Y_train)
        y_val_pred = model.predict(X_val)
        acc = (y_val_pred == Y_val).mean()

        if acc > best_acc:
            best_acc = acc
            best_params = {'max_depth': depth, 'min_samples': min_s}

print("Best validation accuracy:", best_acc)
print("Best params:", best_params)
