## Foundations Of Machine Learning

Name : Ahmik Virani <br>
Roll Number : ES22BTECH11001

Import the libraries

In [154]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Load The Dataset

In [155]:
# Read the two semicolon-delimited wine-quality tables
red_wine = pd.read_csv('winequality-red.csv', sep=';')
white_wine = pd.read_csv('winequality-white.csv', sep=';')

# Stack red on top of white and renumber the rows 0..n-1
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

# Turn the quality score into a binary target: 1 when quality >= 7, otherwise 0
wine_data['quality'] = wine_data['quality'].ge(7).astype(int)

# Separate the label vector from the feature matrix (both as NumPy arrays)
y = wine_data['quality'].values  # Labels
X = wine_data.drop(columns='quality').values  # Features

Define the Node of the Tree

In [156]:
class Node:
    """One vertex of the binary decision tree.

    An internal node carries the split (``feature`` index and ``threshold``)
    plus its two subtrees; a leaf node carries only ``value``.
    """

    def __init__(self, feature=None, threshold=None, left_child=None, right_child=None, value=None):
        # Split description: index of the tested feature and the cut-off value
        self.feature, self.threshold = feature, threshold
        # Subtrees: left holds samples with value <= threshold, right holds values > threshold
        self.left, self.right = left_child, right_child
        # Predicted label; remains None unless this node is a leaf
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a prediction value."""
        if self.value is None:
            return False
        return True

### Part (a) : Implement your own version of the decision tree using binary univariate split, entropy and information gain.

In [157]:
class DecisionTree:
    """Binary decision-tree classifier with univariate threshold splits.

    Every internal node tests ``x[feature] <= threshold``. Splits are chosen
    to maximise information gain (reduction in Shannon entropy). The tree is
    grown without pruning until a node is pure, has fewer than
    ``min_samples_split`` samples, or cannot be split any further.

    Labels must be non-negative integers because class counts are taken with
    ``np.bincount``.
    """

    def __init__(self, min_sample_split=2, n_features=None, random_state=None):
        """Store the hyper-parameters and optionally seed NumPy's RNG.

        Args:
            min_sample_split: Nodes with fewer samples than this become leaves.
            n_features: Number of features examined at each split; a falsy
                value (``None`` or 0) means "all features" and is resolved
                in ``fit``.
            random_state: When not ``None`` it is passed to ``np.random.seed``,
                i.e. it seeds NumPy's *global* generator.
        """
        self.min_samples_split = min_sample_split   # Minimum number of samples needed to split a node
        self.n_features = n_features                # Number of features to consider for splitting
        self.root = None                            # Root of the fitted tree (set by fit)
        self.random_state = random_state            # Kept for reference / reproducibility
        # `is not None` (rather than truthiness) so that a seed of 0 is honoured too
        if random_state is not None:
            np.random.seed(random_state)

    def fit(self, X, y):
        """Grow the tree from the 2-D feature array ``X`` and 1-D label array ``y``."""
        # Never consider more features per split than the data actually has
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X`` / ``y``.

        ``depth`` is only forwarded to the recursive calls; this variant has
        no depth limit.
        """
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # Stop when the node is pure or too small to be split
        if n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self._most_common_label(y))

        # Random subset (a random ordering when all features are used) of candidate features
        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        best_feature, best_thresh = self._best_split(X, y, feat_idxs)

        # No threshold separates the samples (all candidate features are
        # constant here): make a majority-vote leaf instead of recursing on
        # an empty child, which previously crashed.
        if best_feature is None:
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)

        # Recursively grow both subtrees
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when none of the candidate features offers a
        threshold that leaves samples on both sides.
        """
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            # The largest value would send every sample left (x <= max), so it
            # can never separate anything; dropping it guarantees that a
            # chosen split has two non-empty children.
            thresholds = np.unique(X_column)[:-1]

            for thr in thresholds:
                gain = self._information_gain(y, X_column, thr)

                # Strict '>' keeps the first candidate among equal gains
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer label array ``y``."""
        hist = np.bincount(y)
        ps = hist / len(y)
        # Classes with zero count are skipped (0 * log 0 is taken as 0)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _information_gain(self, y, X_column, threshold):
        """Return parent entropy minus the size-weighted entropy of the children.

        Returns 0 when the threshold leaves one side empty.
        """
        parent_entropy = self._entropy(y)

        left_idxs, right_idxs = self._split(X_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # Weighted average of the child entropies
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - child_entropy

    def _split(self, X_column, split_thresh):
        """Return index arrays of the samples with value <= / > ``split_thresh``."""
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # Iterative descent: same path as the recursive formulation
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [158]:
# Hold out 10% of the samples for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Build the decision tree on the training portion only
tree = DecisionTree(random_state=42)
tree.fit(X_train, y_train)

# Predict a label for every held-out sample
predictions = tree.predict(X_test)

# Define accuracy function
def accuracy(y_test, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted with ``np.asarray`` first, so plain Python
    lists are compared element-wise as well (``list == list`` would collapse
    to a single bool and give a wrong score).

    Args:
        y_test: True labels (array-like, 1-D).
        y_pred: Predicted labels (array-like, same shape as ``y_test``).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    if y_test.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_test has shape {y_test.shape}, y_pred has shape {y_pred.shape}"
        )
    return np.sum(y_test == y_pred) / len(y_test)

# Score the held-out predictions and print the result with four decimal places
acc = accuracy(y_test, predictions)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.8554


Results from the above observation:
1. The accuracy of the model using train_test_split is 85.54%
2. This accuracy is quite good, given that the question only required an accuracy above 78%

### Part (b): 10 Fold cross validation

In [159]:
# Fix the seed so that the shuffle below is repeatable
np.random.seed(42)

# Draw a random ordering of the row positions and reorder the frame with it
sample_size = len(wine_data)
permuted_arr = np.random.permutation(sample_size)
wine_data = wine_data.iloc[permuted_arr]

In [160]:
# 10-fold cross-validation over the shuffled data
K = 10
section_size = wine_data.shape[0] // K

tree = DecisionTree(random_state=42)  # Fixed random state
acc = []

# Column views shared by every fold: all columns but the last are features
features = wine_data.iloc[:, :-1]
labels = wine_data['quality']

for i in range(K):
    # Rows [start_ind, end_ind) are held out; the last fold also takes the remainder
    start_ind = i * section_size
    end_ind = start_ind + section_size if i != K - 1 else wine_data.shape[0]

    # Train on everything outside the held-out slice
    X_train = pd.concat([features.iloc[:start_ind], features.iloc[end_ind:]]).values
    y_train = pd.concat([labels.iloc[:start_ind], labels.iloc[end_ind:]]).values
    tree.fit(X_train, y_train)

    # Evaluate on the held-out slice
    unseen_data = features.iloc[start_ind:end_ind].values
    y_pred = tree.predict(unseen_data)
    y_actual = labels.iloc[start_ind:end_ind].values

    acc.append(accuracy_score(y_actual, y_pred))

acc_np = np.array(acc)
print("The mean of the 10 fold cross validation accuracy is : ", acc_np.mean())

The mean of the 10 fold cross validation accuracy is :  0.8462336521477696


In [161]:
# Position of the fold with the highest accuracy
i = acc_np.argmax()
print("While 10 fold cross validation, the best accuracy is: ", acc_np[i])

While 10 fold cross validation, the best accuracy is:  0.8582434514637904


Results from the above observations:
1. The average accuracy is 84.62%
2. The best result is 85.82%
3. The best fold (85.82%) is slightly higher than the accuracy of the single train/test split in part (a), which was 85.54%. This is plausible, since the maximum over 10 folds usually exceeds the result of one typical split.
4. This confirms that different training sets can produce different trees; this happens because decision trees are sensitive to the data they are trained on.
5. The gap between the best result and the average result is only slightly above 1 percentage point, which means that, more often than not, the model is consistent and gives similarly accurate results across folds. This indicates a stable model with low variance, which is a desirable characteristic.

### Part(c)

##### Gini Index

In [162]:
# This is the same decision tree as before, except that  I have changed the entropy with gini index

class DecisionTree:
    """Binary decision-tree classifier that scores splits with the Gini index.

    Identical in structure to the entropy-based tree: every internal node
    tests ``x[feature] <= threshold`` and the tree is grown without pruning.
    The only difference is the impurity measure used to rank candidate splits.

    Labels must be non-negative integers because class counts are taken with
    ``np.bincount``.
    """

    def __init__(self, min_sample_split=2, n_features=None, random_state=None):
        """Store the hyper-parameters and optionally seed NumPy's RNG.

        Args:
            min_sample_split: Nodes with fewer samples than this become leaves.
            n_features: Number of features examined at each split; a falsy
                value (``None`` or 0) means "all features" and is resolved
                in ``fit``.
            random_state: When not ``None`` it is passed to ``np.random.seed``,
                i.e. it seeds NumPy's *global* generator.
        """
        self.min_samples_split = min_sample_split
        self.n_features = n_features
        self.root = None
        self.random_state = random_state
        # `is not None` (rather than truthiness) so that a seed of 0 is honoured too
        if random_state is not None:
            np.random.seed(random_state)

    def fit(self, X, y):
        """Grow the tree from the 2-D feature array ``X`` and 1-D label array ``y``."""
        # Never consider more features per split than the data actually has
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X`` / ``y``.

        ``depth`` is only forwarded to the recursive calls; this variant has
        no depth limit.
        """
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # Stop when the node is pure or too small to be split
        if n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self._most_common_label(y))

        # Random subset (a random ordering when all features are used) of candidate features
        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        best_feature, best_thresh = self._best_split(X, y, feat_idxs)

        # No threshold separates the samples (all candidate features are
        # constant here): make a majority-vote leaf instead of recursing on
        # an empty child, which previously crashed.
        if best_feature is None:
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)

        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` with the largest Gini reduction.

        Returns ``(None, None)`` when none of the candidate features offers a
        threshold that leaves samples on both sides.
        """
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            # The largest value would send every sample left (x <= max), so it
            # can never separate anything; dropping it guarantees that a
            # chosen split has two non-empty children.
            thresholds = np.unique(X_column)[:-1]

            for thr in thresholds:
                gain = self._information_gain_with_info_gain(y, X_column, thr)

                # Strict '>' keeps the first candidate among equal gains
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold

    def _gini_index(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the label array ``y``."""
        hist = np.bincount(y)
        ps = hist / len(y)
        return 1 - np.sum([p ** 2 for p in ps if p > 0])

    def _information_gain_with_info_gain(self, y, X_column, threshold):
        """Return parent Gini index minus the size-weighted Gini index of the children.

        Despite its name this is the Gini-based gain. Returns 0 when the
        threshold leaves one side empty.
        """
        parent_gini_index = self._gini_index(y)

        left_idxs, right_idxs = self._split(X_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # Weighted average of the child impurities
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        gini_index_l, gini_index_r = self._gini_index(y[left_idxs]), self._gini_index(y[right_idxs])
        child_gini_index = (n_l / n) * gini_index_l + (n_r / n) * gini_index_r

        return parent_gini_index - child_gini_index

    def _split(self, X_column, split_thresh):
        """Return index arrays of the samples with value <= / > ``split_thresh``."""
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # Iterative descent: same path as the recursive formulation
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [163]:
# Fix the seed so that the shuffle below is repeatable
np.random.seed(1)

# Draw a random ordering of the row positions and reorder the frame with it
sample_size = len(wine_data)
permuted_arr = np.random.permutation(sample_size)
wine_data = wine_data.iloc[permuted_arr]

In [164]:
# 10-fold cross-validation of the Gini-based tree
K = 10
section_size = wine_data.shape[0] // K

obj = DecisionTree(random_state=42)  # Fixed random state
acc = []

# Column views shared by every fold: all columns but the last are features
features = wine_data.iloc[:, :-1]
labels = wine_data['quality']

for i in range(K):
    # Rows [start_ind, end_ind) are held out; the last fold also takes the remainder
    start_ind = i * section_size
    end_ind = start_ind + section_size if i != K - 1 else wine_data.shape[0]

    # Train on everything outside the held-out slice
    X_train = pd.concat([features.iloc[:start_ind], features.iloc[end_ind:]]).values
    y_train = pd.concat([labels.iloc[:start_ind], labels.iloc[end_ind:]]).values
    obj.fit(X_train, y_train)

    # Evaluate on the held-out slice
    unseen_data = features.iloc[start_ind:end_ind].values
    y_pred = obj.predict(unseen_data)
    y_actual = labels.iloc[start_ind:end_ind].values

    acc.append(accuracy_score(y_actual, y_pred))

acc_np = np.array(acc)
print("Calculating the accuracy using Gini Index instead of Entropy is : ", acc_np.mean())

Calculating the accuracy using Gini Index instead of Entropy is :  0.8456354053139915


In [165]:
# Position of the fold with the highest accuracy, then that accuracy
i = acc_np.argmax()
print(acc_np[i])

0.8690292758089369


Results of the above observations:
1. The average accuracy of the cross validation is : 84.56%
2. The best accuracy is : 86.90%
3. Clearly, the average is close to the result for entropy, underperforming by just 0.06 percentage points, whereas the best case is noticeably better (86.90% vs 85.82%).
4. We can see the tradeoff between varying entropy v/s gini index, where in entropy the results are quite close with low variance, whereas the better result overall is using gini index

From these results I come to the following conclusion:  Entropy and Gini Index measure impurity slightly differently, leading to minor variations in splits and performance. Gini can sometimes find more aggressive splits, which may explain the higher best-case performance, while entropy often leads to more balanced splits, resulting in lower variance.

##### Pruning

In [173]:
# This is the same decision tree as before except I have introduced pruning

class DecisionTree:
    """Entropy-based binary decision tree with a depth limit (pre-pruning).

    Every internal node tests ``x[feature] <= threshold`` and splits are
    chosen to maximise information gain. Growth stops when a node is pure,
    has fewer than ``min_samples_split`` samples, cannot be split any further,
    or sits at depth ``max_depth``.

    Labels must be non-negative integers because class counts are taken with
    ``np.bincount``.
    """

    def __init__(self, min_sample_split=2, max_depth=100, n_features=None, random_state=None):
        """Store the hyper-parameters and optionally seed NumPy's RNG.

        Args:
            min_sample_split: Nodes with fewer samples than this become leaves.
            max_depth: Nodes at this depth (root is depth 0) become leaves.
            n_features: Number of features examined at each split; a falsy
                value (``None`` or 0) means "all features" and is resolved
                in ``fit``.
            random_state: When not ``None`` it is passed to ``np.random.seed``,
                i.e. it seeds NumPy's *global* generator.
        """
        self.min_samples_split = min_sample_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None
        self.random_state = random_state
        # `is not None` (rather than truthiness) so that a seed of 0 is honoured too
        if random_state is not None:
            np.random.seed(random_state)

    def fit(self, X, y):
        """Grow the tree from the 2-D feature array ``X`` and 1-D label array ``y``."""
        # Never consider more features per split than the data actually has
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X`` / ``y``.

        ``depth`` is the depth of the node being built (0 for the root).
        """
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # Stop at the depth limit, or when the node is pure or too small to split
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self._most_common_label(y))

        # Random subset (a random ordering when all features are used) of candidate features
        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        best_feature, best_thresh = self._best_split(X, y, feat_idxs)

        # No threshold separates the samples (all candidate features are
        # constant here): make a majority-vote leaf instead of recursing on
        # an empty child, which previously crashed.
        if best_feature is None:
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)

        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when none of the candidate features offers a
        threshold that leaves samples on both sides.
        """
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            # The largest value would send every sample left (x <= max), so it
            # can never separate anything; dropping it guarantees that a
            # chosen split has two non-empty children.
            thresholds = np.unique(X_column)[:-1]

            for thr in thresholds:
                gain = self._information_gain(y, X_column, thr)

                # Strict '>' keeps the first candidate among equal gains
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer label array ``y``."""
        hist = np.bincount(y)
        ps = hist / len(y)
        # H = -sum(p * log2(p)). The earlier '1 - sum(...)' was a leftover of
        # the Gini formula and returned 1 + H; the constant cancelled in the
        # information gain but made this function itself incorrect.
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _information_gain(self, y, X_column, threshold):
        """Return parent entropy minus the size-weighted entropy of the children.

        Returns 0 when the threshold leaves one side empty.
        """
        parent_entropy = self._entropy(y)

        left_idxs, right_idxs = self._split(X_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # Weighted average of the child entropies
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - child_entropy

    def _split(self, X_column, split_thresh):
        """Return index arrays of the samples with value <= / > ``split_thresh``."""
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # Iterative descent: same path as the recursive formulation
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [174]:
# Fix the seed so that the shuffle below is repeatable
np.random.seed(1)

# Draw a random ordering of the row positions and reorder the frame with it
sample_size = len(wine_data)
permuted_arr = np.random.permutation(sample_size)
wine_data = wine_data.iloc[permuted_arr]

In [175]:
# 10-fold cross-validation repeated for every depth limit from 1 to 30
K = 10
section_size = wine_data.shape[0] // K

# Column views shared by every fold: all columns but the last are features
features = wine_data.iloc[:, :-1]
labels = wine_data['quality']

values = []
for d in range(1, 31):
    obj = DecisionTree(max_depth=d, random_state=42)  # Fixed random state
    acc = []
    for i in range(K):
        # Rows [start_ind, end_ind) are held out; the last fold also takes the remainder
        start_ind = i * section_size
        end_ind = start_ind + section_size if i != K - 1 else wine_data.shape[0]

        # Train on everything outside the held-out slice
        x_train = pd.concat([features.iloc[:start_ind], features.iloc[end_ind:]]).values
        y_train = pd.concat([labels.iloc[:start_ind], labels.iloc[end_ind:]]).values
        obj.fit(x_train, y_train)

        # Evaluate on the held-out slice
        unseen_data = features.iloc[start_ind:end_ind].values
        y_pred = obj.predict(unseen_data)
        y_actual = labels.iloc[start_ind:end_ind].values

        acc.append(accuracy_score(y_actual, y_pred))

    # Mean accuracy over the folds for this depth limit
    acc_np = np.array(acc)
    values.append(acc_np.mean())

# Best mean accuracy first, then the whole depth-vs-accuracy table
values = np.array(values)
i = values.argmax()
print(values[i])
print(values)

0.8517674940809499
[0.80344785 0.80175293 0.80851451 0.82282451 0.83022708 0.83051717
 0.8286731  0.82991563 0.83191378 0.82697818 0.83437911 0.83867535
 0.84176195 0.84237664 0.84576318 0.84360602 0.84883498 0.84728922
 0.84807114 0.84591726 0.84868254 0.85145933 0.85176749 0.84929723
 0.85145275 0.84992014 0.84945789 0.85022666 0.85022666 0.85022666]


In [176]:
# `i` is a 0-based position in `values`, whose first entry belongs to depth 1, hence `i + 1`
print("The depth of the decision tree giving the best result it : ", i + 1)
print("The best accuracy is: ", values[i])

The depth of the decision tree giving the best result it :  23
The best accuracy is:  0.8517674940809499


The observations from the above results are as follows:
1. The depth at which the best result is observed is 23, whereas the maximum depth the tree actually reaches is at most 28.
- **Why am I claiming the depth is 28:** In the printed values, the accuracy stays constant (0.85022666) from depth 28 onwards, which means that raising the limit further no longer changes the tree, i.e. the tree never grows deeper than that.
2. The accuracy at this depth : 85.18%
3. This is better than the average of both gini index and entropy accuracies.
4. This shows that the model, to some extent overfits
5. Also, by removing some nodes at the end, we are limiting the over fitting and generalizing to some extent