In [82]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statistics as stats

# Datasets

In [14]:
WineQualityDataset = pd.read_csv('../datasets/WineQT.csv')
WineQualityDataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043,804.969379
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824,463.997116
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0,411.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0,794.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0,1209.5
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0,1597.0


In [74]:
Wine_X = WineQualityDataset.drop('quality', axis=1)
Wine_y = WineQualityDataset['quality']

Wine_X_train, Wine_X_test, Wine_y_train, Wine_y_test = train_test_split(Wine_X, Wine_y, test_size=0.1, random_state=42)
Wine_X_train, Wine_X_val, Wine_y_train, Wine_y_val = train_test_split(Wine_X_train, Wine_y_train, test_size=0.1, random_state=52)

Wine_X_train = Wine_X_train.to_numpy()
Wine_y_train = Wine_y_train.to_numpy()
Wine_y_train -= 3 # To make the range of values from 0 to 5

Wine_X_val = Wine_X_val.to_numpy()
Wine_y_val = Wine_y_val.to_numpy()
Wine_y_val -= 3 # To make the range of values from 0 to 5

Wine_X_test = Wine_X_test.to_numpy()
Wine_y_test = Wine_y_test.to_numpy()
Wine_y_test -= 3 # To make the range of values from 0 to 5

scaler = StandardScaler()
Wine_X_train = scaler.fit_transform(Wine_X_train)
Wine_X_val = scaler.fit_transform(Wine_X_val)
Wine_X_test = scaler.fit_transform(Wine_X_test)


print("For Wine Dataset:")
print(f"Size of training set = {len(Wine_X_train)}\nSize of validation set = {len(Wine_X_val)}\nSize of test set = {len(Wine_X_test)}")

For Wine Dataset:
Size of training set = 925
Size of validation set = 103
Size of test set = 115


In [59]:
class DecisionTree:
    def __init__(self, criterion='gini', max_depth=None, random_state=None, classes=None, task_type='classification'):
        if task_type == 'classification':
            self.tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=random_state)
        elif task_type == 'regression':
            self.tree = DecisionTreeRegressor(criterion=criterion, max_depth=max_depth, random_state=random_state)
        self.classes = classes

    def fit(self, X, y, sample_weight=None):
        self.tree.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        return self.tree.predict(X)

    def predict_proba(self, X):
        proba = self.tree.predict_proba(X)
        if self.classes is not None and proba.shape[1] != len(self.classes):
            # Adjust the probability array to include missing classes
            full_proba = np.zeros((proba.shape[0], len(self.classes)))
            indices = np.array([np.where(self.classes == c)[0][0] for c in self.tree.classes_])
            full_proba[:, indices] = proba
            return full_proba
        return proba

    def evaluate(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)


In [60]:
class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None, max_features='auto'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        if self.max_features == 'auto':
            self.max_features = n_features
        elif isinstance(self.max_features, float):
            self.max_features = int(self.max_features * n_features)

        for _ in range(self.n_estimators):
            idxs = np.random.choice(range(n_samples), size=n_samples, replace=True)
            sample_X, sample_y = X[idxs], y[idxs]

            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(sample_X, sample_y)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=tree_preds)
        return predictions

## Finding best hyperparmeters for the random forest model

In [86]:
n_estimators_options = [10, 50, 100, 200]
max_depth_options = [5, 10, 20]

best_accuracy = 0
best_params = {}

for n_estimators in n_estimators_options:
    for max_depth in max_depth_options:
        model = RandomForest(n_estimators=n_estimators, max_depth=max_depth)
        model.fit(Wine_X_train, Wine_y_train)
        preds = model.predict(Wine_X_test)
        accuracy = accuracy_score(Wine_y_test, preds)
        print(f"Accuracy for n_estimators = {n_estimators} and max_depth = {max_depth} is {accuracy}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}

print()
print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_accuracy}")

Accuracy for n_estimators = 10 and max_depth = 5 is 0.6260869565217392
Accuracy for n_estimators = 10 and max_depth = 10 is 0.6086956521739131
Accuracy for n_estimators = 10 and max_depth = 20 is 0.6608695652173913
Accuracy for n_estimators = 50 and max_depth = 5 is 0.6347826086956522
Accuracy for n_estimators = 50 and max_depth = 10 is 0.6173913043478261
Accuracy for n_estimators = 50 and max_depth = 20 is 0.6173913043478261
Accuracy for n_estimators = 100 and max_depth = 5 is 0.6173913043478261
Accuracy for n_estimators = 100 and max_depth = 10 is 0.6260869565217392
Accuracy for n_estimators = 100 and max_depth = 20 is 0.6347826086956522
Accuracy for n_estimators = 200 and max_depth = 5 is 0.6260869565217392
Accuracy for n_estimators = 200 and max_depth = 10 is 0.6347826086956522
Accuracy for n_estimators = 200 and max_depth = 20 is 0.6347826086956522

Best parameters: {'n_estimators': 10, 'max_depth': 20}
Best accuracy: 0.6608695652173913


## Ada boost and Gradient boost

In [70]:
# class AdaBoost:
#     def __init__(self, n_estimators=50, learning_rate=1.0):
#         self.n_estimators = n_estimators
#         self.learning_rate = learning_rate
#         self.trees = []
#         self.tree_weights = []

#     def fit(self, X, y):
#         n_samples = X.shape[0]
#         sample_weights = np.full(n_samples, 1 / n_samples)

#         for _ in range(self.n_estimators):
#             tree = DecisionTree(max_depth=1)  # Using stumps as weak learners
#             tree.fit(X, y, sample_weight=sample_weights)
#             predictions = tree.predict(X)

#             misclassified = predictions != y
#             error = np.sum(sample_weights * misclassified) / np.sum(sample_weights)
#             alpha = self.learning_rate * np.log((1 - error) / error)

#             sample_weights *= np.exp(alpha * misclassified)
#             sample_weights /= np.sum(sample_weights)  # Normalize weights

#             self.trees.append(tree)
#             self.tree_weights.append(alpha)

#     def predict(self, X):
#         tree_preds = np.array([tree.predict(X) for tree in self.trees])
#         final_prediction = np.sign(np.dot(self.tree_weights, tree_preds))
#         return final_prediction

In [90]:
class AdaBoost:
    def __init__(self, M):
        self.M = M  # Number of trees
    
    def fit(self, X, y):
        self.models = []
        self.alphas = []

        N, _ = X.shape
        w = np.ones(N) / N

        for m in range(self.M):
            tree = DecisionTree(max_depth=1)  # Using stumps (depth-1 trees)
            tree.fit(X, y, sample_weight=w)
            P = tree.predict(X)

            err = np.sum(w * (P != y))
            alpha = 0.5 * np.log((1 - err) / err)

            w = w * np.exp(-alpha * y * P)  # Update weights
            w = w / w.sum()  # Normalize weights

            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X):
        N, _ = X.shape
        FX = np.zeros(N)
        for alpha, tree in zip(self.alphas, self.models):
            FX += alpha * tree.predict(X)
        return np.sign(FX)

In [92]:
adaboost = AdaBoost(M=10)  # 10 trees
adaboost.fit(Wine_X_train, Wine_y_train)
predictions = adaboost.predict(Wine_X_test)
accuracy = accuracy_score(Wine_y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.017391304347826087
