In [None]:
# Import the dependencies in use
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pickle
from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
import zipfile

def print_tripple_arr(a1, a2, a3):
  """Print three sequences side by side, one space-separated row per index.

  The number of rows equals the length of the longest sequence; a sequence
  that is too short contributes None for the rows it cannot fill.
  """
  columns = (a1, a2, a3)
  row_count = max(len(col) for col in columns)
  for row in range(row_count):
    # Pad with None once a column has run out of elements.
    cells = [col[row] if row < len(col) else None for col in columns]
    print(f"{cells[0]} {cells[1]} {cells[2]}")

In [None]:
# The datasets are stored in Google Drive as zip files.
# This cell unzips them into ./lbl under the working directory of the Colab instance
# (goes from ~8 GB zipped to ~40 GB unzipped).

# Directory the dataset archive is read from.
# NOTE(review): nothing visible in this notebook mounts Google Drive - confirm it is mounted before running.
drive_dir = "/content/drive/MyDrive"
# File name of the dataset archive inside drive_dir.
dset_zip = "ExtraSensory.per_uuid_features_labels.zip"

def extract_xtrasensory_labels_to_pwd():
  """Unpack the dataset archive into the ./lbl directory.

  The archive location is built from the module-level `drive_dir` and
  `dset_zip`. A one-line progress message is printed before extraction starts.
  """
  archive_path = f"{drive_dir}/{dset_zip}"
  print(f"extracting {archive_path} to pwd")
  with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path="./lbl")

# Pattern matching the extracted, gzipped CSV files.
lbl_glob_str = "./lbl/*.features_labels.csv.gz"

# Sorted list of matching files; extract the archive first when none exist yet.
glb = sorted(glob.glob(lbl_glob_str))
if len(glb) == 0:
  extract_xtrasensory_labels_to_pwd()
  glb = sorted(glob.glob(lbl_glob_str))


extracting /content/drive/MyDrive/ExtraSensory.per_uuid_features_labels.zip to pwd


In [None]:
def get_cols_of_interest(csv_path="/content/lbl/00EABED2-271D-49D8-B599-1D4A09240601.features_labels.csv.gz"):
  """Pick the accelerometer feature columns and the label columns by name.

  Only the header row of the file is parsed (nrows=0), so the call is cheap
  even for large files.

  Args:
    csv_path: gzip-compressed CSV whose header is inspected. Defaults to the
      sample file the notebook has always used.

  Returns:
    A tuple (raw_ret, watch_ret, lbl_ret) of column-name lists, in header order:
      raw_ret:   names containing 'raw_acc:'
      watch_ret: names containing 'watch_acceleration:'
      lbl_ret:   names containing 'label:'
  """
  # nrows=0 yields an empty frame that still carries the column names.
  df_cols = pd.read_csv(csv_path, compression='gzip', nrows=0).columns.to_list()
  raw_ret = [col for col in df_cols if 'raw_acc:' in col]
  watch_ret = [col for col in df_cols if 'watch_acceleration:' in col]
  lbl_ret = [col for col in df_cols if 'label:' in col]
  return raw_ret, watch_ret, lbl_ret

def get_dat_and_lbl_df(dat_cols, lbl_cols, glb):
  """Load the requested feature and label columns from each gzipped CSV.

  For every file: missing label values are replaced by 0, then every row that
  still has a missing value (i.e. a missing feature) is dropped.

  Args:
    dat_cols: list of feature column names.
    lbl_cols: list of label column names.
    glb: iterable of paths to gzip-compressed CSV files.

  Returns:
    (dat_lst, lbl_lst): two lists with one DataFrame per file, row-aligned with
    each other; the first holds `dat_cols`, the second `lbl_cols`.

  Raises:
    ValueError: raised by pandas if a requested column is absent from a file.
  """
  dat_cols = list(dat_cols)
  lbl_cols = list(lbl_cols)
  wanted_cols = dat_cols + lbl_cols

  dat_lst = []
  lbl_lst = []

  for f_tgz in glb:
    # usecols makes the parser skip every other column instead of loading the
    # whole file and discarding most of it afterwards.
    df = pd.read_csv(f_tgz, compression='gzip', usecols=wanted_cols)
    df[lbl_cols] = df[lbl_cols].fillna(0)
    df = df.dropna()
    # usecols keeps file order, so select explicitly to get the requested order.
    dat_lst.append(df[dat_cols])
    lbl_lst.append(df[lbl_cols])
  return dat_lst, lbl_lst

def df_list_to_np_arr(df_lst):
  """Turn each DataFrame into a NumPy array and stack them all along axis 0."""
  arrays = [frame.to_numpy() for frame in df_lst]
  return np.concatenate(arrays, axis=0)

# Column names of interest: 'raw_acc:' features, 'watch_acceleration:' features and 'label:' columns.
raw_coi, watch_coi, lbl_coi = get_cols_of_interest()
# Quick-iteration variant: load only the first three files in glb.
# dat_df_lst, lbl_df_lst = get_dat_and_lbl_df(raw_coi + watch_coi, lbl_coi, glb[0:3])
# Full run: load both feature groups plus the labels from every file in glb.
dat_df_lst, lbl_df_lst = get_dat_and_lbl_df(raw_coi + watch_coi, lbl_coi, glb)

# Stack the per-file frames row-wise into one feature array and one label array.
dat_arr = df_list_to_np_arr(dat_df_lst)
lbl_arr = df_list_to_np_arr(lbl_df_lst)

In [None]:
def calculate_metrics(pred, target, threshold=0.5):
  """Score thresholded multi-label predictions against the true labels.

  Args:
    pred: array-like of scores or 0/1 predictions; entries strictly greater
      than `threshold` are treated as positive. Expected to have the same
      shape as `target`.
    target: array-like of true labels (compared for equality against the
      thresholded 0.0/1.0 predictions).
    threshold: cut-off applied to `pred` before scoring.

  Returns:
    dict with the keys
      'micro/f1', 'macro/f1', 'samples/f1': sklearn f1_score with that
        averaging (zero_division=0),
      'ba':      balanced accuracy, 0.5 * (tp_rate + tn_rate),
      'ss_a':    sklearn accuracy_score on the full label rows,
      'a':       fraction of individual label cells predicted correctly,
      'tp_rate': TP / (TP + FN), counted over all cells,
      'tn_rate': TN / (TN + FP), counted over all cells.
    A rate whose denominator is zero is reported as 0.0, mirroring the
    zero_division=0 convention used for the F1 scores.

  Raises:
    ValueError: if `pred` and `target` do not have the same shape.
  """
  def _safe_ratio(numerator, denominator):
    # Avoid ZeroDivisionError when a class (positive or negative) never occurs.
    return numerator / denominator if denominator else 0.0

  def _balanced_acc_and_acc(y_true, y_pred):
    '''
    Balanced accuracy = 1/2 * (tp/(tp+fn) + tn/(tn+fp))
    '''
    y_true = np.asarray(y_true)
    if y_true.shape != y_pred.shape:
      raise ValueError(
          f"pred and target must have the same shape, got {y_pred.shape} and {y_true.shape}")
    # Count all four confusion-matrix cells over the whole array at once.
    correct = y_pred == y_true
    predicted_pos = y_pred != 0
    TP_c = np.count_nonzero(correct & predicted_pos)
    TN_c = np.count_nonzero(correct & ~predicted_pos)
    FP_c = np.count_nonzero(~correct & predicted_pos)
    FN_c = np.count_nonzero(~correct & ~predicted_pos)
    sensitivity = _safe_ratio(TP_c, TP_c + FN_c)  # true-positive rate
    specificity = _safe_ratio(TN_c, TN_c + FP_c)  # true-negative rate
    return {
        'ba': 0.5 * (sensitivity + specificity),
        'a': _safe_ratio(TP_c + TN_c, TP_c + TN_c + FN_c + FP_c),
        'tp_rate': sensitivity,
        'tn_rate': specificity,
        }

  # np.asarray lets plain lists be thresholded as well as arrays.
  pred = np.array(np.asarray(pred) > threshold, dtype=float)
  tmp_dict = _balanced_acc_and_acc(y_true=target, y_pred=pred)
  return {
          'micro/f1': f1_score(y_true=target, y_pred=pred, average='micro', zero_division=0),
          'macro/f1': f1_score(y_true=target, y_pred=pred, average='macro', zero_division=0),
          'samples/f1': f1_score(y_true=target, y_pred=pred, average='samples', zero_division=0),
          'ba': tmp_dict['ba'],
          'ss_a': accuracy_score(y_true=target, y_pred=pred),
          'a': tmp_dict['a'],
          'tp_rate': tmp_dict['tp_rate'],
          'tn_rate': tmp_dict['tn_rate'],
          }

In [None]:
# Hold out a test set. A fixed random_state makes the split reproducible, so
# re-running the notebook trains and evaluates on the same rows every time.
# NOTE(review): models pickled before this seed was introduced were presumably
# fit on a different random split, so their scores on this X_test may include
# rows they were trained on - retrain and re-pickle for clean numbers.
X_train, X_test, y_train, y_test = train_test_split(dat_arr, lbl_arr, random_state=42)
# Quick-iteration variant on the first 20000 rows:
# X_train, X_test, y_train, y_test = train_test_split(dat_arr[0:20000], lbl_arr[0:20000], random_state=42)

# Training recipe (kept for reference): pick one base classifier, wrap it in a
# MultiOutputClassifier, fit on the training split and score on the test split.
# cls = GaussianNB()
# cls = DecisionTreeClassifier(max_depth=3)
# cls = RandomForestClassifier(max_depth=3, verbose=1)
# cls = AdaBoostClassifier(n_estimators=20)
# multi_cls = MultiOutputClassifier(cls, n_jobs=-1)
# multi_cls = multi_cls.fit(X_train, y_train)
# y_pred = multi_cls.predict(X_test)
# calculate_metrics(y_pred, y_test)

In [None]:
# To persist a freshly trained model:
# with open(f"{drive_dir}/multi_Adaboost20Estimator.pkl", "wb") as f:
#   pickle.dump(multi_cls, f)

def _load_pickled_model(file_name):
  """Unpickle and return the model stored as `file_name` inside drive_dir."""
  # NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
  with open(f"{drive_dir}/{file_name}", "rb") as f:
    return pickle.load(f)

multi_tree = _load_pickled_model("multi_dtree.pkl")
multi_gauss_nb = _load_pickled_model("multi_GaussNB.pkl")
multi_ada_10e = _load_pickled_model("multi_Adaboost10Estimator.pkl")
multi_ada_20e = _load_pickled_model("multi_Adaboost20Estimator.pkl")

# Score every loaded model on the held-out split, one line per model.
for model_label, model in (
    ("multi_tree", multi_tree),
    ("multi_gauss_nb", multi_gauss_nb),
    ("multi_ada_10e", multi_ada_10e),
    ("multi_ada_20e", multi_ada_20e),
):
  print(f"{model_label}:{calculate_metrics(model.predict(X_test), y_test)}")


multi_tree:{'micro/f1': 0.5222879955552883, 'macro/f1': 0.15201188265675494, 'samples/f1': 0.4125033282802891, 'ba': 0.7110329510902392, 'ss_a': 0.12266339869281045, 'a': 0.9465567730360118, 'tp_rate': 0.4393849593554698, 'tn_rate': 0.9826809428250086}
multi_gauss_nb:{'micro/f1': 0.26902548274174726, 'macro/f1': 0.15071347024371246, 'samples/f1': 0.2897431381353074, 'ba': 0.7454941092876233, 'ss_a': 4.901960784313725e-05, 'a': 0.7200144175317186, 'tp_rate': 0.7748818248839933, 'tn_rate': 0.7161063936912532}
multi_ada_10e:{'micro/f1': 0.5125710127236021, 'macro/f1': 0.11907784343018996, 'samples/f1': 0.41484303233077746, 'ba': 0.7025423346560074, 'ss_a': 0.10754901960784313, 'a': 0.9467807253620403, 'tp_rate': 0.42084315114368454, 'tn_rate': 0.98424151816833}
multi_ada_20e:{'micro/f1': 0.5274041120879379, 'macro/f1': 0.1336248260161289, 'samples/f1': 0.42552021000060214, 'ba': 0.7086032135192797, 'ss_a': 0.12727124183006536, 'a': 0.9485377418941433, 'tp_rate': 0.4318680100804217, 'tn_ra

In [None]:
# get_cols_of_interest()

In [None]:
# df_cols = pd.read_csv("/content/lbl/00EABED2-271D-49D8-B599-1D4A09240601.features_labels.csv.gz", compression='gzip').columns.to_list()
# df_cols