In [None]:
import matplotlib.pyplot as plt

from sklearn.datasets import make_gaussian_quantiles
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

import scipy.io as sio
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, cohen_kappa_score


In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import copy

class Adaboost:
    """AdaBoost over depth-1 decision trees with an optional weight regularizer.

    The weight update multiplies ``trainLabels * h_t(x)`` and ``predict`` only
    ever returns -1 or +1, so the labels are expected to be encoded as -1 / +1.

    Attributes:
        D: array of shape (T + 1, m); row t holds the sample weights used in
            boosting round t (rows never reached keep the uniform value 1/m).
        alpha_list: 1-D array with the vote weight of every retained weak learner.
        h_list: list of fitted weak learners, aligned index-for-index with
            ``alpha_list``.
        error_rates: weighted training error of every retained weak learner.
        T: maximum number of boosting rounds.
        decisionTreeClassifier: estimator that is refitted at every round.
        regularization_power: None for plain AdaBoost, otherwise the exponent p of
            Psi_t(i) = (sum_{s<=t} alpha_s * D_s(i) / ||alpha||_2) ** p.
        regularization_C: multiplier applied to the regularization term
            (None is treated as 1).
        version: 0 -> the update adds C * (Psi_{t-1}(i) - Psi_t(i)) to the exponent;
            any other value -> it adds -C * Psi_t(i).
        random: when True, the first weight row is drawn at random and normalized.
    """

    # Class-level defaults; __init__ and fit() replace them on every instance.
    D = np.empty(0)
    alpha_list = np.empty(0)
    h_list = ()
    error_rates = ()
    T = 0
    decisionTreeClassifier = None
    regularization_power = None
    regularization_C = 1
    version = 0
    random = False

    def __init__(self, T, regularization_power=None, regularization_C=None, version=0, random=False):
        """Store the hyper-parameters and build the weak learner (a decision stump)."""
        self.version = version
        self.regularization_power = regularization_power
        self.regularization_C = regularization_C
        self.D = np.empty(0)
        self.alpha_list = np.empty(0)
        self.h_list = []
        self.error_rates = []
        self.T = T
        self.random = random
        # `presort` is no longer accepted by DecisionTreeClassifier, so it is not passed.
        self.decisionTreeClassifier = DecisionTreeClassifier(criterion="entropy", max_depth=1)

    @staticmethod
    def _psi(weights, alphas, power):
        """Return (sum_s alpha_s * D_s(i) / ||alpha||_2) ** power for every sample i."""
        norm = np.linalg.norm(alphas)
        if norm == 0:
            # All alphas are zero: the normalized combination is undefined, use no penalty.
            return np.zeros(weights.shape[1])
        return np.power(np.transpose(weights).dot(alphas / norm), power)

    def fit(self, trainFeatures, trainLabels):
        """Fit the ensemble with at most ``self.T`` boosting rounds.

        Any state left by a previous call to ``fit`` is discarded first.

        Args:
            trainFeatures: 2-D array, one row per training sample.
            trainLabels: 1-D array of labels (expected to be -1 / +1, see class doc).

        Returns:
            The weight matrix ``self.D`` of shape (T + 1, m).
        """
        trainLabels = np.asarray(trainLabels)
        m = trainFeatures.shape[0]

        # Start from a clean slate so that refitting does not stack learners.
        self.alpha_list = np.empty(0)
        self.h_list = []
        self.error_rates = []

        self.D = np.full((self.T + 1, m), 1.0 / m)
        if self.random:
            self.D[0] = np.random.rand(m)
            self.D[0] /= self.D[0].sum()

        regularized = self.regularization_power is not None
        reg_c = 1 if self.regularization_C is None else self.regularization_C

        for t in range(self.T):
            h_t = self.decisionTreeClassifier.fit(trainFeatures, trainLabels, self.D[t])

            predictions = h_t.predict(trainFeatures)
            epsilon_t = (np.where(predictions != trainLabels, 1.0, 0) * self.D[t]).sum()

            # A perfect weak learner would give log(1/0); it gets a fixed vote of 1.
            if epsilon_t == 0:
                alpha_t = 1.0
            else:
                alpha_t = np.log((1 - epsilon_t) / epsilon_t) / 2

            # Alphas of the retained learners plus the candidate of this round.
            alphas = np.append(self.alpha_list, alpha_t)

            regularization_vector = np.zeros(m)
            if regularized:
                regularization_vector -= self._psi(self.D[:t + 1], alphas, self.regularization_power)
                if t > 0 and self.version == 0:
                    regularization_vector += self._psi(self.D[:t], alphas[:-1], self.regularization_power)
                regularization_vector *= reg_c

            self.D[t + 1] = self.D[t] * np.exp(- alpha_t * trainLabels * predictions + regularization_vector)

            Z_t = 2 * np.sqrt(epsilon_t * (1 - epsilon_t))
            if regularized or Z_t == 0:
                # The closed-form Z_t is not valid here (or is zero): normalize by the sum.
                self.D[t + 1] /= np.linalg.norm(self.D[t + 1], ord=1)
            else:
                self.D[t + 1] /= Z_t

            # Same alpha as the previous round: stop, and do not keep this round's
            # alpha so that alpha_list and h_list stay aligned.
            if len(self.alpha_list) >= 1 and self.alpha_list[-1] == alpha_t:
                print (" Stoping the training at t = {0}, convergence.".format(t))
                break

            self.alpha_list = alphas
            # The estimator object is refitted next round, so keep an independent copy.
            self.h_list.append(copy.deepcopy(h_t))
            self.error_rates.append(epsilon_t)

            if epsilon_t == 0:
                break

        return self.D

    def predict_proba(self, x, T=0):
        """Return the raw ensemble score g(x) (not its sign).

        Args:
            x: 2-D array, one row per sample.
            T: number of leading weak learners to use; 0 or a value larger than
                the number of retained learners means "use all of them".
        """
        n_learners = min(len(self.alpha_list), len(self.h_list))
        if T == 0 or T > n_learners:
            T = n_learners

        scores = np.zeros(x.shape[0])
        for i in range(T):
            scores = scores + self.alpha_list[i] * self.h_list[i].predict(x)
        return scores

    def predict(self, x, T=0):
        """Predict a -1 / +1 label for every row of x.

        T selects the first T weak learners, so an ensemble trained with many
        rounds can be evaluated as if it had been trained with fewer.
        A score of exactly 0 is mapped to -1.
        """
        return np.where(self.predict_proba(x, T) > 0, 1, -1)

    def error(self, x, y, T=0):
        """Return the fraction of rows whose prediction times label is not 1."""
        return np.where(self.predict(x, T) * y == 1, 0, 1).sum() * 1.0 / x.shape[0]

    def score(self, x, y, T = None):
        """Return the fraction of rows whose prediction times label equals 1."""
        if T is None or T > self.T:
            T = self.T
        return np.where(self.predict(x, T) * y == 1, 1, 0).sum() * 1.0 / x.shape[0]

    def toString(self):
        """Print the configured number of rounds and the number of stored alphas."""
        print ("T : {0}, Alphas : {1}".format(self.T, len(self.alpha_list)))

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset files become reachable.
# NOTE(review): loadData() reads the relative path 'gdrive/My Drive/...'; that only
# resolves under this mount point when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
def loadData(data_root='gdrive/My Drive/HIS_Final'):
  """Load the Indian Pines data cube and its ground-truth map from .mat files.

  Args:
    data_root: directory that contains a ``data`` sub-directory holding
      ``Indian_pines_corrected.mat`` and ``Indian_pines_gt.mat``. The default
      is the relative Drive path this notebook has always used.

  Returns:
    A ``(data, labels)`` tuple: the arrays stored under the keys
    ``indian_pines_corrected`` and ``indian_pines_gt`` respectively.
  """
  data_path = os.path.join(data_root, 'data')
  data = sio.loadmat(os.path.join(data_path, 'Indian_pines_corrected.mat'))['indian_pines_corrected']
  labels = sio.loadmat(os.path.join(data_path, 'Indian_pines_gt.mat'))['indian_pines_gt']
  return data,labels

In [None]:
# Fraction of the samples held out for testing (passed to splitTrainTestSet below).
test_ratio = 0.2
# NOTE(review): windowSize is not referenced by any cell visible in this notebook —
# confirm whether it is still needed.
windowSize = 50

In [None]:
from tqdm import tqdm
import pandas as pd
def extract_pixels(dataset, ground_truth):
    """Flatten a (rows, cols, bands) cube into a one-row-per-pixel DataFrame.

    The frame is built with a single reshape instead of one ``pd.concat`` per
    band, which copied the whole frame at every iteration.

    Args:
        dataset: 3-D array indexed as [row, col, band].
        ground_truth: array with one label per pixel (rows * cols values).

    Returns:
        DataFrame with columns ``band-1`` .. ``band-N`` followed by ``class``;
        pixels appear in row-major order.

    Raises:
        ValueError: if ground_truth does not hold exactly one label per pixel.
    """
    cube = np.asarray(dataset)
    labels = np.asarray(ground_truth).ravel()
    n_bands = cube.shape[2]

    # Row k of `pixels` is the spectrum of pixel (k // cols, k % cols).
    pixels = cube.reshape(-1, n_bands)
    if labels.shape[0] != pixels.shape[0]:
        raise ValueError(
            "ground_truth has {0} labels but dataset has {1} pixels".format(
                labels.shape[0], pixels.shape[0]))

    # copy=True keeps the frame independent from the input cube.
    df = pd.DataFrame(pixels, columns=[f'band-{i}' for i in range(1, 1+n_bands)], copy=True)
    df['class'] = labels
    return df

In [None]:
# Load the data cube and its ground-truth map, then flatten them into a
# one-row-per-pixel table (one column per band plus a trailing 'class' column).
dataset, ground_truth = loadData()
df = extract_pixels(dataset, ground_truth)
df.head()

100%|██████████| 200/200 [00:00<00:00, 257.72it/s]


Unnamed: 0,band-1,band-2,band-3,band-4,band-5,band-6,band-7,band-8,band-9,band-10,band-11,band-12,band-13,band-14,band-15,band-16,band-17,band-18,band-19,band-20,band-21,band-22,band-23,band-24,band-25,band-26,band-27,band-28,band-29,band-30,band-31,band-32,band-33,band-34,band-35,band-36,band-37,band-38,band-39,band-40,...,band-162,band-163,band-164,band-165,band-166,band-167,band-168,band-169,band-170,band-171,band-172,band-173,band-174,band-175,band-176,band-177,band-178,band-179,band-180,band-181,band-182,band-183,band-184,band-185,band-186,band-187,band-188,band-189,band-190,band-191,band-192,band-193,band-194,band-195,band-196,band-197,band-198,band-199,band-200,class
0,3172,4142,4506,4279,4782,5048,5213,5106,5053,4750,4816,4769,4610,4805,4828,4861,4767,4624,4549,4463,4462,4446,4445,4336,4381,4319,4207,4305,4311,3991,4168,3942,4061,4362,4318,4252,4869,5284,5055,3591,...,1396,1381,1396,1381,1353,1346,1341,1332,1324,1310,1318,1330,1310,1292,1280,1275,1266,1264,1233,1241,1232,1215,1215,1187,1168,1171,1150,1134,1123,1135,1094,1090,1112,1090,1062,1069,1057,1020,1020,3
1,2580,4266,4502,4426,4853,5249,5352,5353,5347,5065,5141,5100,4994,5172,5290,5289,5217,5053,5033,4939,4931,4941,4902,4824,4859,4805,4698,4794,4806,4452,4628,4433,4643,4967,4853,4760,5449,5768,5684,3987,...,1421,1415,1428,1415,1379,1370,1360,1353,1352,1336,1346,1351,1330,1315,1305,1292,1282,1286,1259,1259,1250,1229,1232,1195,1177,1184,1153,1137,1138,1137,1108,1104,1117,1091,1079,1085,1064,1029,1020,3
2,3687,4266,4421,4498,5019,5293,5438,5427,5383,5132,5227,5172,5097,5313,5411,5412,5341,5191,5140,5069,5110,5119,5046,4981,5023,4987,4862,4965,4992,4595,4756,4529,4801,5077,4983,4868,5515,5972,5913,4027,...,1446,1440,1443,1425,1390,1379,1376,1363,1355,1347,1361,1356,1341,1330,1321,1304,1290,1289,1263,1269,1261,1245,1241,1214,1185,1188,1156,1147,1149,1144,1111,1114,1114,1100,1065,1092,1061,1030,1016,3
3,2749,4258,4603,4493,4958,5234,5417,5355,5349,5096,5147,5078,5040,5237,5321,5344,5255,5121,5035,4956,4994,4980,4905,4857,4900,4831,4720,4848,4847,4484,4613,4402,4674,4966,4848,4776,5473,5894,5789,4086,...,1432,1427,1426,1416,1386,1374,1375,1359,1343,1343,1354,1351,1333,1329,1313,1296,1280,1281,1251,1255,1253,1238,1223,1207,1188,1188,1154,1143,1144,1146,1122,1108,1109,1109,1071,1088,1060,1030,1006,3
4,2746,4018,4675,4417,4886,5117,5215,5096,5098,4834,4853,4857,4734,4879,4976,4958,4885,4754,4647,4532,4586,4591,4492,4453,4497,4398,4297,4408,4401,4102,4227,4075,4264,4529,4490,4438,5001,5378,5321,3779,...,1401,1397,1395,1390,1368,1349,1354,1340,1330,1324,1336,1332,1320,1307,1287,1283,1267,1265,1239,1240,1239,1229,1212,1202,1178,1178,1143,1135,1138,1135,1110,1107,1112,1094,1072,1087,1052,1034,1019,3


In [None]:
# Feature matrix: every column except the last one ('class').
X = df.iloc[:, :-1].values

# Target vector: the last column ('class').
y = df.iloc[:, -1].values
X.shape,y.shape

((21025, 200), (21025,))

In [None]:
def splitTrainTestSet(X, y, testRatio, randomState=345):
    """Split X and y into train and test parts, stratified on y.

    Args:
        X: feature matrix.
        y: labels, also used as the stratification key.
        testRatio: forwarded to train_test_split as ``test_size``.
        randomState: forwarded to train_test_split as ``random_state``.

    Returns:
        The tuple (X_train, X_test, y_train, y_test).
    """
    split_options = dict(test_size=testRatio, random_state=randomState, stratify=y)
    train_X, test_X, train_y, test_y = train_test_split(X, y, **split_options)
    return train_X, test_X, train_y, test_y

In [None]:
from sklearn.decomposition import PCA
# Project the band columns of X onto 50 whitened principal components.
# random_state is pinned because PCA may pick a randomized SVD solver for an
# input of this size, which would make the components differ between runs.
# NOTE(review): the projection is fitted on every pixel before the train/test
# split, so the test rows influence the transform — confirm this is acceptable.
kpca = PCA(n_components=50, whiten=True, random_state=345)
principalComponents = kpca.fit_transform(X)

principalComponents.shape

(21025, 50)

In [None]:
# Stratified split of the PCA-reduced features, holding out `test_ratio` of the samples.
Xtrain, Xtest, ytrain, ytest = splitTrainTestSet(principalComponents, y, test_ratio)

Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((16820, 50), (4205, 50), (16820,), (4205,))

In [None]:
# scikit-learn AdaBoost baseline: 100 estimators, learning rate 1.5.
# NOTE(review): no random_state is passed, so results may differ between runs —
# confirm whether a seed is wanted.
bdt_real = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.5,)

In [None]:
# Train the scikit-learn baseline on the training split.
bdt_real.fit(Xtrain, ytrain)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.5,
                   n_estimators=100, random_state=None)

In [None]:
# Predict labels for the held-out split with the scikit-learn baseline.
y_pred=bdt_real.predict(Xtest)

In [None]:
# Custom Adaboost: T=100 rounds, regularization_power=2, regularization_C=10, version=0.
# NOTE(review): Adaboost.predict only returns -1 / +1 and its weight update multiplies
# label * prediction, while ytrain is the raw 'class' column (df.head() shows values
# such as 3) — confirm the labels are binarized to -1 / +1 before this fit is relied on.
# NOTE(review): my_adaboost is fitted here but never evaluated in the cells shown.
my_adaboost = Adaboost(100, 2, 10, 0)
my_adaboost.fit(Xtrain, ytrain)

In [None]:
# Accuracy of the scikit-learn baseline (bdt_real) on the held-out split.
# accuracy_score takes (y_true, y_pred) in that order.
accuracy_score(ytest, y_pred)

0.5089179548156956