In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from tqdm import tqdm
import itertools

In [2]:
def get_clf_eval(y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred, average='micro')
    precision = precision_score(y_test, pred, average='micro')
    f1 = f1_score(y_test, pred, average='micro')
    print('Accuracy:{0:.4f}, Recall:{1:.4f}, Precision:{2:.4f}, F1-Score:{3:.4f}'.format(accuracy, recall, precision, f1))

In [3]:
def make_confusion(y_test, y_pred):
    confusion = confusion_matrix(y_test,y_pred)
    confusion_df = pd.DataFrame(confusion,columns=['Predicted_Negative','Predicted_Positive'],index=['Actual_Negative','Actual_Positive'])
    
    return confusion_df

In [4]:
class DiscriminentAnalysis():
    def __init__(self, alpha=0.0, beta=0.0):
        self.learned = False
        self.alpha = alpha
        self.beta = beta
        self.class_names = []
        self.class_priors = {}
        self.class_means = {}
        self.regularized_covariances = {}
        self.rda_covariances = {}
        self.reset()

    def reset(self):
        self.learned = False
        self.class_names = []
        self.class_priors = {}
        self.class_means = {}
        self.regularized_covariances = {}
        self.rda_covariances = {}

    def fit(self, X, y):
        self.class_names = np.unique(y)
        class_covariances = {}
        pooled_covariances = 0
        for i in self.class_names:
            class_indices = np.where(y == i)[0]
            class_samples = X[class_indices, :]
            self.class_priors[i] = float(len(class_indices)) / len(y)
            self.class_means[i] = np.mean(class_samples, axis=0)
            class_covariances[i] = np.cov(class_samples, rowvar=0)
            pooled_covariances += class_covariances[i] * self.class_priors[i]
        # Calculate RDA regularized covariance matricies for each class
        for i in self.class_names:
            self.regularized_covariances[i] = (self.beta * pooled_covariances) + ((1 - self.beta) *class_covariances[i])
            # self.regularized_covariances[i] = (self.beta * class_covariances[i]) + ((1 - self.beta) * pooled_covariances)

        for i in self.class_names:
            self.rda_covariances[i] = ((1-self.alpha) * self.regularized_covariances[i]) + (self.alpha * (1/self.class_priors[i]) * np.trace(self.regularized_covariances[i]) * np.eye(self.regularized_covariances[i].shape[0]))
        
        self.learned = True
        return self

    def predict(self, x):
        if not self.learned:
            raise NameError('Fit model first')
        # Determine probability of each class given input vector
        
        class_prob = {}
        for i in self.class_names:
            # Divid the class delta calculation into 3 parts
            part1 = -0.5 * np.linalg.det(self.rda_covariances[i])
            part2 = -0.5 * np.dot(np.dot((x - self.class_means[i]).T, np.linalg.pinv(self.rda_covariances[i])), (x - self.class_means[i]))
            part3 = np.log(self.class_priors[i])
            class_prob[i] = part1 + part2 + part3
        return max(class_prob, key=class_prob.get)

In [21]:
class GridSearchRDA():
    def __init__(self, model, param_grid):
        self.model = model
        self.param_grid = param_grid
        self.alpha = 0
        self.beta = 0
        self.best_covariance = {}
        self.best_score = 0


    def fit(self, X, y, cv=3):
        
        # metric_score = []

        alpha_list = self.param_grid['alpha']
        beta_list = self.param_grid['beta']

        cv_x = np.split(X, cv)
        cv_y = np.split(y, cv)

        for alpha in alpha_list:
            for beta in beta_list:
                score_list = []
                for i in range(cv):
                    self.model.reset()
                    self.model.alpha = alpha
                    self.model.beta = beta

                    test_x_cv = cv_x[i]
                    train_x_cv = np.array(cv_x[:i] + cv_x[i + 1:])
                    train_x_cv2 = train_x_cv.reshape(46, 22)
                    train_x_cv2 = np.array(train_x_cv)

                    test_y_cv = cv_y[i]
                    train_y_cv = np.array(cv_y[:i] + cv_y[i + 1:])
                    train_y_cv2 = train_y_cv.reshape(46,)

                    self.model.fit(train_x_cv2, train_y_cv2)

                    pred = []

                    for data in test_x_cv:
                        pred.append(self.model.predict(data))
                    
                    score_list.append(f1_score(test_y_cv, pred, average='micro'))
                
                mean_score = np.mean(np.array(score_list))
                print("alpha:{0:.1f}, beta:{1:.1f}, f1-score:{2:.4f}".format(alpha, beta,mean_score))

                if mean_score > self.best_score:
                    self.best_score = mean_score
                    self.alpha = alpha
                    self.beta = beta
                    self.best_covariance = self.model.rda_covariances

In [22]:
data = pd.read_excel('../data.xlsx')

In [23]:
y = data['Class Label'].to_numpy()
x = data.loc[:, data.columns != 'Class Label'].to_numpy()

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state=102)

In [25]:
dis = DiscriminentAnalysis()

In [26]:
params = {
    'alpha': np.linspace(0.0, 1.0, num=11, endpoint=True),
    'beta': np.linspace(0.0, 1.0, num=11, endpoint=True)
}

In [27]:
grid = GridSearchRDA(dis, params)

In [28]:
grid.fit(x_train, y_train, cv=3)

IndexError: index 3 is out of bounds for axis 0 with size 2

In [None]:
rda_cov = {int(k):v.tolist() for k,v in cov.items()}

In [None]:
# import json
# with open('covariance.json','w') as f:
#     json.dump(rda_cov, f)