In [85]:
!pip install xlsxwriter



In [86]:
import pandas as pd
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Softmax
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
import numpy as np
import math
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import os
import xlsxwriter
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score

In [87]:
def read_csv(path):
    all = pd.read_csv(path, sep=',', engine='python', quoting=csv.QUOTE_NONE)
    X, y = all.drop(all.columns[-1], 1).to_numpy(), all[all.columns[-1]].to_numpy()
    return X, y

In [88]:
class Net(nn.Module):
    def __init__(self, input_size, hidden1_size, hidden2_size, num_classes, epochs=3, lr=0.001):
        super(Net, self).__init__()
        self.input_size = input_size
        self.hidden1_size = hidden1_size
        self.hidden2_size = hidden2_size
        self.num_classes = num_classes
        self.epochs = epochs
        self.lr = lr
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = F.relu(out)
        out = self.fc3(out)
        out = F.softmax(out, dim=0)
        return out

    def fit(self, x, y):
        tx = torch.tensor(x).float()
        ty = torch.tensor(y)
        dataset = TensorDataset(tx, ty)
        data_loader = DataLoader(dataset, batch_size=10, shuffle=True)
        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=self.lr)

        for epoch in range(self.epochs):
            optimizer.zero_grad()
            for data in data_loader:  # `data` is a batch of data
                X, y = data  # X is the batch of features, y is the batch of targets.
                self.zero_grad()  # sets gradients to 0 before loss calc. You will do this likely every step.
                output = self(X.view(-1, self.input_size))  # pass in the reshaped batch (recall they are 28x28 atm)
                loss = loss_function(output, y)  # calc and grab the loss value
                loss.backward()  # apply this loss backwards thru the network's parameters
                optimizer.step()  # attempt to optimize weights to account for loss/gradients

    def predict(self, x):
        ans = []
        output = self(torch.tensor(x).view(-1, self.input_size).float())
        for idx, i in enumerate(output):
            ans.append(torch.argmax(i).item())
        return ans

    def predict_proba(self, x):
        ans = []
        output = self(torch.tensor(x).view(-1, self.input_size).float())
        for idx, i in enumerate(output):
            row = i.tolist()
            ans.append(row if sum(row) == 1 else [float(i)/sum(row) for i in row])
        return ans

    def get_params(self, deep):
      return {
          'input_size': self.input_size, 
          'hidden1_size': self.hidden1_size, 
          'hidden2_size': self.hidden2_size, 
          'num_classes': self.num_classes, 
          'epochs': self.epochs,
          'lr': self.lr
      }
    
    def set_params(self, **parameters):
      for parameter, value in parameters.items():
          setattr(self, parameter, value)
      return self


In [89]:
class NoiseNet(Net):
  
  def add_noise(self, X):
    for j in range(len(X[0])):
      std = np.std([x[j] for x in X])
      for i in range(len(X)):
        X[i][j] = X[i][j] + np.random.normal(0, std / 4)
    return X

  def fit(self, X, y):
    Net.fit(self, self.add_noise(X), y)

In [90]:
class AdvancedNoise(Net):

  def add_noise(self, X):
    iso = IsolationForest(contamination=0.1)
    yhat = iso.fit_predict(X)
    for j in range(len(X[0])):
      std = np.std([x[j] for x in X])
      for i in range(len(X)):
        if yhat[i] == 1:
          X[i][j] = X[i][j] + np.random.normal(0, std / 4)
    return X
    

  def fit(self, X, y):
    Net.fit(self, self.add_noise(X), y)

In [91]:
def evaluate(y_pred_p, y_test):
    y_pred = [pred_p.index(max(pred_p)) for pred_p in y_pred_p]
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred) 
    fpr = (np.mean([len([_ for (p, t) in list(zip(y_pred, y_test)) if p == cl and t != cl]) / len([t for t in y_test if t != cl]) for cl in np.unique(y_test)])) if len(np.unique(y_test)) > 1 else 0
    auc_score = (roc_auc_score(y_test, y_pred_p, multi_class='ovr') if len(np.unique(y_test)) > 2 else roc_auc_score(y_test, np.array(y_pred_p)[:, 1])) if len(np.unique(y_test)) > 1 else 1
    return {
        'precision': round(precision, 3),
        'tpr': round(recall, 3),
        'accuracy': round(accuracy, 3),
        'fpr': round(fpr, 3),
        'auc': round(auc_score, 3)
    }

In [92]:
def inner_cross_validation(algorithm, distributions, X, y):
  clf = RandomizedSearchCV(algorithm, distributions, n_iter=50, cv=3, scoring='accuracy')
  search = clf.fit(X, y)
  return search.best_estimator_, search.best_params_, round(np.mean(search.cv_results_['mean_fit_time']), 3), round(np.mean(search.cv_results_['mean_score_time']), 3)

In [93]:
def outer_cross_validation(algorithm, distributions, X, y, folds=10):
  kf = KFold(n_splits=folds)
  cycle = 0
  results = []
  for train_index, test_index in kf.split(X):
    cycle = cycle + 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    best_model, best_params, train_time, score_time = inner_cross_validation(algorithm, distributions, X_train, y_train)
    y_pred_p = best_model.predict_proba(X_test)
    y_pred = best_model.predict(X_test)
    evaluation = {**{'cycle': cycle, 'params': str(best_params), 'train time': train_time, 'score time': score_time}, 
                  **evaluate(y_pred_p, y_test)}
    print(evaluation)
    results.append(evaluation)
  return results

In [100]:
def write_excel(workbook_name, data):
  workbook = xlsxwriter.Workbook(workbook_name)
  worksheet = workbook.add_worksheet()
  for i in range(len(data)):
    for j in range(len(data[i])):
      worksheet.write(i, j, data[i][j])
  workbook.close()


all_results = [['dataset', 'algorithm', 'cv', 'hyper parameters', 'accuracy', 'tpr', 'fpr', 'precision', 'auc', 'train time', 'inference time']]
for f in os.listdir('datasets'):
  print(f"******** current file: {f} ********")
  if os.path.splitext(f'datasets/{f}')[1] != '.csv':
    continue
  try:
    X, y = read_csv(f"datasets/{f}")
    input_size = len(X[0])
    num_classes = len(set(y))
    algorithms = [
        {
          'name': 'FNN + Advanced Noise',
          'alg': AdvancedNoise(input_size, 64, 3, num_classes),
          'distributions': {
              'epochs': [3, 4, 5, 6, 7],
              'lr': [0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.005]
        }
        },
        {
          'name': 'FNN + Noise',
          'alg': NoiseNet(input_size, 64, 3, num_classes),
          'distributions': {
              'epochs': [3, 4, 5, 6, 7],
              'lr': [0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.005]
        }
        },
        {
          'name': 'FNN',
          'alg': Net(input_size, 64, 3, num_classes),
          'distributions': {
              'epochs': [3, 4, 5, 6, 7],
              'lr': [0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004, 0.0045, 0.005]
          }
        }
    ]
    for algorithm in algorithms:
      print("-"*80)
      print(" "*30 + algorithm['name'] + " "*30)
      print("-"*80)
      results = outer_cross_validation(algorithm['alg'], algorithm['distributions'], X, y)
      for r in results:
        all_results = all_results + [[f, algorithm['name'], r['cycle'], r['params'], r['accuracy'], r['tpr'], r['fpr'], r['precision'], r['auc'], r['train time'], r['score time']]]
  except Exception as e:
    print(f">>>>>>>>>>>>>>>>>> something inevitably went wrong: {e} <<<<<<<<<<<<<<<<<<<<<\n")

write_excel('results.xls', all_results)

******** current file: .ipynb_checkpoints ********
******** current file: parkinsons.csv ********
--------------------------------------------------------------------------------
                              FNN + Advanced Noise                              
--------------------------------------------------------------------------------
{'cycle': 1, 'params': "{'lr': 0.002, 'epochs': 4}", 'train time': 0.282, 'score time': 0.001, 'precision': 0.5, 'tpr': 0.825, 'accuracy': 0.65, 'fpr': 0, 'auc': 1}
{'cycle': 2, 'params': "{'lr': 0.0025, 'epochs': 6}", 'train time': 0.282, 'score time': 0.001, 'precision': 0.714, 'tpr': 0.714, 'accuracy': 0.6, 'fpr': 0.286, 'auc': 1.0}
{'cycle': 3, 'params': "{'lr': 0.004, 'epochs': 7}", 'train time': 0.28, 'score time': 0.001, 'precision': 0.698, 'tpr': 0.688, 'accuracy': 0.65, 'fpr': 0.312, 'auc': 0.885}
{'cycle': 4, 'params': "{'lr': 0.002, 'epochs': 7}", 'train time': 0.282, 'score time': 0.001, 'precision': 1.0, 'tpr': 1.0, 'accuracy': 1.0, 'fpr'

In [99]:
import xlrd

def read_excel():
    loc = "results.xls"
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)
    ans = []
    for i in range(1, sheet.nrows):
        current = []
        for j in range(sheet.ncols):
            current.append(sheet.cell_value(i, j))
        ans.append(current)
    return ans

def calculate_chi_score():
  accuracy_column = 4
  dataset_column = 0
  algorithm_column = 1
  all_results = read_excel()
  all_datasets = list(np.unique([r[dataset_column] for r in all_results]))
  all_algorithms = list(np.unique([r[algorithm_column] for r in all_results]))

  alg_all_ranks = {alg: [] for alg in all_algorithms}
  for dataset in all_datasets:
      alg_results = {}
      for alg in all_algorithms:
          result = np.mean([r[accuracy_column] for r in all_results if r[dataset_column] == dataset and r[algorithm_column] == alg])
          alg_results[alg] = result
      relative_order = [k for k, v in sorted(alg_results.items(), key=lambda item: item[1])]
      for i in range(len(relative_order)):
          alg_all_ranks[relative_order[i]].append(i + 1)

  alg_mean_ranks = {alg: np.mean(ranks) for alg, ranks in alg_all_ranks.items()}
  Rj = alg_mean_ranks.values()
  N = 20
  L = 3
  x_formula = 12 * N / (L * (L + 1)) * sum([(rj - ((L + 1) / 2))**2 for rj in Rj])
  print(alg_mean_ranks)
  print(x_formula)


calculate_chi_score()

{'FNN': 2.45, 'FNN + Advanced Noise': 1.95, 'FNN + Noise': 1.6}
7.3000000000000025
