In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Load the training set. header=None tells pandas the file has no header row,
# so the columns are labelled with integers (column 0 is used as the class
# label further down, the remaining columns as features).
train_data = pd.read_csv("train.csv", header=None)
# Bare expression: the notebook renders the frame as this cell's output.
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,5,-1.000,-1.000,-1.000,-0.813,-0.671,-0.809,-0.887,-0.671,-0.853,...,-0.671,-0.671,-0.033,0.761,0.762,0.126,-0.095,-0.671,-0.828,-1.0
1,3,-1.000,-1.000,-1.000,-1.000,-1.000,-0.928,-0.204,0.751,0.466,...,0.466,0.639,1.000,1.000,0.791,0.439,-0.199,-0.883,-1.000,-1.0
2,3,-1.000,-1.000,-1.000,-0.830,0.442,1.000,1.000,0.479,-0.328,...,1.000,0.671,0.345,-0.507,-1.000,-1.000,-1.000,-1.000,-1.000,-1.0
3,3,-1.000,-1.000,-1.000,-1.000,-1.000,-0.104,0.549,0.579,0.579,...,0.388,0.579,0.811,1.000,1.000,0.715,0.107,-0.526,-1.000,-1.0
4,3,-1.000,-1.000,-1.000,-1.000,-1.000,-1.000,-0.107,1.000,1.000,...,-0.280,0.322,0.813,1.000,1.000,0.633,-0.144,-0.994,-1.000,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,3,-1.000,-0.882,-0.334,0.267,0.333,0.749,1.000,1.000,1.000,...,0.968,1.000,1.000,1.000,0.809,0.325,-0.820,-1.000,-1.000,-1.0
1210,3,-0.985,-0.048,0.226,0.226,0.226,-0.355,-0.807,-1.000,-0.726,...,-0.307,-0.555,-0.555,-0.555,-0.556,-1.000,-1.000,-1.000,-1.000,-1.0
1211,3,-1.000,-1.000,-1.000,-0.988,-0.527,-0.208,0.620,1.000,0.467,...,-0.116,0.899,0.416,-0.510,-1.000,-1.000,-1.000,-1.000,-1.000,-1.0
1212,3,-1.000,-1.000,-1.000,-0.990,0.708,0.557,0.347,-0.107,-0.758,...,0.697,0.636,0.167,-0.968,-1.000,-1.000,-1.000,-1.000,-1.000,-1.0


In [3]:
# Features are every column after the first one.
X_train = train_data.iloc[:, 1:]

# The first column is the class label; recode it for boosting
# (3 -> -1, 5 -> +1) and hand it over as a plain NumPy array.
Y_train = train_data.iloc[:, 0].replace({3: -1, 5: 1}).to_numpy()

In [4]:
# Decision stump used as weak classifier
class DecisionStump:
    """Single-feature threshold classifier that outputs -1.0 or +1.0.

    The attributes are meant to be filled in by the trainer:

    * ``polarity``    -- 1: values below the threshold are labelled -1;
                         -1: the mirrored rule (values below the threshold
                         are labelled +1).
    * ``feature_idx`` -- column of ``X`` the stump looks at.
    * ``threshold``   -- cut point on that column.
    * ``alpha``       -- vote weight of this stump inside an ensemble
                         (not used by ``predict``).
    """

    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    # Predict using the decision stump
    def predict(self, X):
        """Classify every row of ``X``.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Indexed as ``X[:, feature_idx]``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Float array containing -1.0 / +1.0.
        """
        X_column = X[:, self.feature_idx]
        below = X_column < self.threshold

        if self.polarity == 1:
            return np.where(below, -1.0, 1.0)

        # Polarity -1 must be the exact negation of polarity 1. Using a strict
        # ``>`` here would leave samples equal to the threshold at +1 under
        # both polarities, so the flipped rule would not have the
        # ``1 - error`` weighted error that training assumes for it.
        return np.where(below, 1.0, -1.0)

In [5]:
class Adaboost:
    """Boosted ensemble of ``DecisionStump`` weak learners for -1 / +1 labels.

    Every round picks the stump (feature, threshold, polarity) with the lowest
    weighted error, gives it a vote weight ``alpha`` and re-weights the samples
    so that the ones it got wrong matter more in the next round.

    Unlike textbook AdaBoost, ``alpha`` is computed from a Gini-style impurity
    of the misclassified weight mass instead of the plain weighted error.

    Parameters
    ----------
    n_clf : int, default 5
        Number of boosting rounds (stumps) to train.
    """

    # Keeps the log-ratio used for alpha finite when the impurity is 0 or 1.
    _EPSILON = 1e-10

    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        self.clfs = []

    @staticmethod
    def _to_array(data):
        """Return ``data`` as a NumPy array (pandas objects via ``to_numpy``)."""
        if hasattr(data, "to_numpy"):
            return data.to_numpy()
        return np.asarray(data)

    # Train the model using AdaBoost
    def fit(self, X, y):
        """Train ``n_clf`` stumps on ``X`` / ``y``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Labels encoded as -1 / +1.

        Raises
        ------
        ValueError
            If ``X`` has no samples or no features.
        """
        X = self._to_array(X)
        y = self._to_array(y)
        n_samples, n_features = X.shape
        if n_samples == 0 or n_features == 0:
            raise ValueError("X must contain at least one sample and one feature")

        # Initialize weights to 1/N
        w = np.full(n_samples, 1 / n_samples)
        classes = np.unique(y)

        self.clfs = []

        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error = float("inf")

            # Exhaustive search for the best feature / threshold / polarity.
            for feature_i in range(n_features):
                X_column = X[:, feature_i]

                for threshold in np.unique(X_column):
                    # Candidate rule with polarity 1: below the threshold -> -1.
                    p = 1
                    predictions = np.where(X_column < threshold, -1.0, 1.0)

                    # Error = sum of weights of misclassified samples
                    error = w[y != predictions].sum()

                    # A rule that is wrong more than half the time (by weight)
                    # is used mirrored; its error is taken as 1 - error.
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # store the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error

            # Predictions of the selected stump on the training data.
            predictions = clf.predict(X)

            # Gini-style impurity of the misclassified weight mass: for each
            # class, p = total weight of misclassified samples of that class.
            misclassified = y != predictions
            gini_impurity = 0.0
            for class_val in classes:
                prob = np.sum(w[misclassified & (y == class_val)])
                gini_impurity += prob * (1.0 - prob)

            # Scale by the total weight.
            gini_impurity /= np.sum(w)

            # Vote weight based on the Gini impurity instead of the error rate.
            clf.alpha = 0.5 * np.log(
                (1.0 - gini_impurity + self._EPSILON) / (gini_impurity + self._EPSILON)
            )

            # Up-weight misclassified samples, down-weight the others ...
            w *= np.exp(-clf.alpha * y * predictions)
            # ... and normalize to one.
            w /= np.sum(w)

            # Save classifier
            self.clfs.append(clf)

    def predict(self, X):
        """Return the sign of the alpha-weighted vote of all stumps.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            -1.0 / +1.0 per row (0.0 where the weighted vote is exactly tied).
        """
        X = self._to_array(X)
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        return np.sign(np.sum(clf_preds, axis=0))

In [None]:
# Load the test set; header=None tells pandas the file has no header row.
# NOTE(review): the models index features by position, so this presumably
# expects test.csv to hold only the feature columns (no leading label
# column as in train.csv) — confirm against the file.
test_data = pd.read_csv("test.csv", header=None)

In [None]:
# Ensemble sizes to evaluate: 10, 20, ..., 200 boosting rounds.
boosts = list(range(10, 201, 10))
error_vals = []

# Adaboost.fit is deterministic and adds its stumps one round at a time, each
# round depending only on the rounds before it. The first i stumps of the
# largest ensemble are therefore exactly what Adaboost(i).fit would produce,
# so train once and evaluate prefixes instead of refitting for every size.
full_model = Adaboost(max(boosts))
full_model.fit(X_train, Y_train)

for i in boosts:
    clf = Adaboost(i)
    clf.clfs = full_model.clfs[:i]

    # Training error = 1 - training accuracy.
    y_pred = clf.predict(X_train)
    acc = accuracy_score(Y_train, y_pred)
    error = 1 - acc
    error_vals.append(error)
    print("Error for {} is {}".format(i, error))

    # Map the -1 / +1 votes back to the digit labels 3 / 5 and write one
    # prediction file per ensemble size.
    y_test_pred = pd.DataFrame(clf.predict(test_data))
    y_test_pred = y_test_pred.replace({-1: 3, 1: 5}).astype(int)
    y_test_pred.to_csv('new_test_pred_{}.csv'.format(i), index=False, header=False, escapechar=None)

In [None]:
# Training error as a function of the number of boosting rounds.
fig, ax = plt.subplots()
ax.plot(boosts, error_vals)
ax.set_xlabel('Number of boosts')
ax.set_ylabel('Error')
ax.set_title('AdaBoost with Decision Stumps')
plt.show()

In [None]:
from sklearn import tree

# Baseline: a single decision tree trained on the same data.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)

# Score the tree on its own training predictions. y_pred still holds the
# AdaBoost output from the boosting loop, so reusing it here would report the
# ensemble's accuracy instead of the tree's.
tree_train_pred = clf.predict(X_train)
acc = accuracy_score(Y_train, tree_train_pred)
print(acc)

# Predict the test set and map -1 / +1 back to the digit labels 3 / 5.
single_tree_pred = clf.predict(test_data)
single_tree_pred = pd.DataFrame(single_tree_pred)
single_tree_pred = single_tree_pred.replace({-1:3, 1:5})