# 1. Implement Boosting method (50 points)

In [2]:
import numpy as np
from sklearn.svm import SVC
from scipy.io import loadmat
from tqdm import tqdm
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
class AdaBoost:
    def __init__(self, n_estimators,model):
        '''
        n_estimators - number of estimators
        estimators - list that contains all estimators of our model
        estimator_weights-hold the weights assigned to each weak classifier in the final model.
        self.estimator_errors - hold the errors made by each weak classifier during training
        '''
        self.n_estimators = n_estimators
        self.estimators = []
        self.estimator_weights = np.zeros(n_estimators)
        self.estimator_errors = np.zeros(n_estimators)
        self.model=model
    
    def fit(self, X, y):
        # Initialize sample weights
        sample_weights = np.full(X.shape[0], 1/X.shape[0])

        for i in tqdm(range(self.n_estimators)):
            # Train weak classifier on weighted data
          estimator = self.model

          estimator.fit(X, y, sample_weight=sample_weights)

          # Compute error of weak classifier
          y_pred = estimator.predict(X)
          incorrect = y_pred != y
          estimator_error = np.sum(sample_weights[incorrect])

          # Compute weight of weak classifier
          if estimator_error == 0:
            estimator_weight = 1
          elif estimator_error == 1:
            estimator_weight = 0
          else:
            estimator_weight = np.log((1 - estimator_error) / estimator_error)

           # Update sample weights
          sample_weights *= np.exp((estimator_weight * incorrect) * 1e-3)
          sample_weights /= np.sum(sample_weights)

           # Save weak classifier and its weight
          self.estimators.append(estimator)
          self.estimator_weights[i] = estimator_weight
          self.estimator_errors[i] = estimator_error


    
    def predict(self, X,num_classes):
        n_classes = num_classes
        pred = np.zeros((X.shape[0], n_classes))
        for i, estimator in enumerate(self.estimators):
            pred += self.estimator_weights[i] * estimator.predict_proba(X)
        return np.argmax(pred, axis=1)+1


# 2. Load train and test mat files, perform Boosting with Decision Tree and report acuracy on the test dataset (20 points)

In [44]:
train_data=loadmat('train.mat')
X_train =train_data['features']
y_train=train_data['labels'].flatten()

In [45]:
test_data=loadmat('test.mat')
X_test =test_data['features']
y_test=test_data['labels'].flatten()
accuracies = []

In [46]:
DT = DecisionTreeClassifier(max_depth=10, random_state=20)
adaboost = AdaBoost(n_estimators=15, model=DT)
adaboost.fit(X_train, y_train)

100%|██████████| 15/15 [00:22<00:00,  1.47s/it]


In [47]:
y_pred=adaboost.predict(X_test, num_classes=10)
accuracies.append((y_pred == y_test).sum()/len(y_pred))
print(f'Accuracy from this model is {accuracies[-1]}')

Accuracy from this model is 0.918580375782881


# 3. Compare results with single decision tree, SVM and KNN model (20 points)

Decision tree

In [48]:
DT.fit(X_train,y_train)
y_pred=DT.predict(X_test)
accuracies.append((y_pred == y_test).sum()/len(y_pred))
print(f'Accuracy from this model is {accuracies[-1]}')

Accuracy from this model is 0.9196242171189979


Support Vector Classifier

In [49]:
svc = SVC(kernel='linear')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
accuracies.append((y_pred == y_test).sum()/len(y_pred))
print(f'Accuracy from this model is {accuracies[-1]}')

Accuracy from this model is 0.9530271398747391


KNN

In [50]:
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
accuracies.append((y_pred == y_test).sum()/len(y_pred))
print(f'Accuracy from this model is {accuracies[-1]}')

Accuracy from this model is 0.9551148225469729


In [51]:
DataFrame(accuracies, index=["AdaBoost (DT)", "Decision Tree", "SVC", "KNN"], columns=["Accuracy"]).sort_values("Accuracy", ascending=False)

Unnamed: 0,Accuracy
KNN,0.955115
SVC,0.953027
Decision Tree,0.919624
AdaBoost (DT),0.91858


# 4. Explain reasons (10 points)

All classifiers have the high accuracy more than 90% except Boosting model.

We can observe that AdaBoost performs slightly better than the Decision Tree as it combines many Decision Trees

I infer that KNN and SVC use hyperplanes to separate the datapoints which leads to more accuracy.