In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import itertools

In [2]:
# Read the experiment log stored at <current working dir>/results/result.csv
path = Path.cwd() / 'results' / 'result.csv'

# Columns kept for the completeness check: run timestamp, dataset name and
# the run settings that are compared further down.
column = [
    'Date_Time',
    'Dataset',
    'Epoch',
    'Slide_Win',
    'FC_Layer_Num',
    'Topk',
]

df = pd.read_csv(path)
df

Unnamed: 0,Date_Time,Dataset,Epoch,Slide_Win,Dim,Slide_Stride,FC_Layer_Num,FC_Dim,Decay,Val_Ratio,Topk,F1 Score,Precision,Recall,Confusion Matrix,Anomalies
0,"28-Sep-2022, 12:20:28",anomaly,100,30,64,2,2,128,0.10,0.1,2,0.059182,0.032661,0.317757,"[[33604, 1007], [73, 34]]",0.0031
1,"28-Sep-2022, 12:52:18",anomaly,100,30,64,2,2,128,0.10,0.1,20,0.081119,0.047776,0.271028,"[[34033, 578], [78, 29]]",0.0031
2,"28-Sep-2022, 13:24:07",anomaly,100,30,64,2,2,128,0.10,0.1,25,0.079618,0.048077,0.233645,"[[34116, 495], [82, 25]]",0.0031
3,"28-Sep-2022, 13:56:01",anomaly,100,30,64,2,2,128,0.01,0.1,2,0.060160,0.032421,0.420561,"[[33268, 1343], [62, 45]]",0.0031
4,"28-Sep-2022, 14:27:02",anomaly,100,30,64,2,2,128,0.01,0.1,20,0.054422,0.029630,0.336449,"[[33432, 1179], [71, 36]]",0.0031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2508,"02-Oct-2022, 16:14:13",mean,10,30,256,1,3,512,0.00,0.1,5,0.024369,0.013449,0.130841,"[[33584, 1027], [93, 14]]",0.0031
2509,"02-Oct-2022, 18:45:49",all,100,20,256,1,3,128,0.00,0.1,5,0.023241,0.012044,0.333333,"[[82361, 2953], [72, 36]]",0.0013
2510,"02-Oct-2022, 20:00:26",all_s,100,20,256,1,3,128,0.00,0.1,5,0.008027,0.004037,0.694444,"[[66810, 18504], [33, 75]]",0.0013
2511,"02-Oct-2022, 21:14:50",pca,100,20,256,1,3,128,0.00,0.1,5,0.036182,0.019411,0.268519,"[[83849, 1465], [79, 29]]",0.0013


In [3]:
# Narrow the full log down to the columns listed in `column`
# (Date_Time, Dataset, Epoch, Slide_Win, FC_Layer_Num, Topk).
df1 = pd.DataFrame(data=df, columns=column)
df1

Unnamed: 0,Date_Time,Dataset,Epoch,Slide_Win,FC_Layer_Num,Topk
0,"28-Sep-2022, 12:20:28",anomaly,100,30,2,2
1,"28-Sep-2022, 12:52:18",anomaly,100,30,2,20
2,"28-Sep-2022, 13:24:07",anomaly,100,30,2,25
3,"28-Sep-2022, 13:56:01",anomaly,100,30,2,2
4,"28-Sep-2022, 14:27:02",anomaly,100,30,2,20
...,...,...,...,...,...,...
2508,"02-Oct-2022, 16:14:13",mean,10,30,3,5
2509,"02-Oct-2022, 18:45:49",all,100,20,3,5
2510,"02-Oct-2022, 20:00:26",all_s,100,20,3,5
2511,"02-Oct-2022, 21:14:50",pca,100,20,3,5


In [4]:
# Keep every run whose Dataset name starts with one of the prefixes in the
# pattern below (suffixed variants such as 'all_s' match too); this is what
# drops the 'anomaly' runs.
all_data = re.compile('all.*|pca.*|first.*|last.*|mean.*|median.*')
sort_col = ['Dataset', 'Epoch', 'Slide_Win', 'FC_Layer_Num', 'Topk']

# Vectorised prefix test: Series.str.match anchors at the start of each
# string exactly like re.match, and na=False treats a missing Dataset value
# as "no match" instead of raising.
is_selected = df1['Dataset'].str.match(all_data.pattern, na=False)

# Boolean-mask filtering does not rely on the index being 0..n-1 and still
# yields a frame with all columns when nothing matches, so the sort below
# cannot fail with a KeyError.
df2 = df1.loc[is_selected].sort_values(by=sort_col).reset_index(drop=True)
df2

Unnamed: 0,Date_Time,Dataset,Epoch,Slide_Win,FC_Layer_Num,Topk
0,"05-Nov-2022, 01:33:23",all,100,5,2,5
1,"05-Nov-2022, 14:55:58",all,100,5,2,10
2,"06-Nov-2022, 04:57:19",all,100,5,2,20
3,"06-Nov-2022, 14:39:15",all,100,5,3,5
4,"07-Nov-2022, 03:27:31",all,100,5,3,10
...,...,...,...,...,...,...
260,"02-Oct-2022, 22:29:23",pca_s,100,20,3,5
261,"08-Nov-2022, 08:38:34",pca_s,100,20,3,10
262,"07-Nov-2022, 18:39:44",pca_s,100,20,3,20
263,"03-Oct-2022, 13:46:38",pca_s,100,30,2,5


In [5]:
# Number of logged runs per selected dataset, listed in alphabetical order
df2.loc[:, 'Dataset'].value_counts().sort_index()

all         21
all_s       23
first       25
first_s     21
last        20
last_s      21
mean        22
mean_s      22
median      23
median_s    21
pca         24
pca_s       22
Name: Dataset, dtype: int64

In [6]:
# One (Slide_Win, FC_Layer_Num, Topk) tuple per row of df2, in row order.
# Zipping the columns avoids a scalar .loc lookup per cell, does not depend on
# the index labels being 0..n-1, and yields plain Python ints.
sample_list = list(zip(df2['Slide_Win'], df2['FC_Layer_Num'], df2['Topk']))

# sample_dict maps each dataset name to the combinations logged for it
# (duplicates are kept here; keys follow first appearance in df2).
sample_dict = {dataset: [] for dataset in df2['Dataset'].unique()}

for dataset, combi in zip(df2['Dataset'], sample_list):
    sample_dict[dataset].append(combi)

# Attach the tuple to every row so it is visible next to the raw settings.
df2['combi'] = sample_list
df2
    

Unnamed: 0,Date_Time,Dataset,Epoch,Slide_Win,FC_Layer_Num,Topk,combi
0,"05-Nov-2022, 01:33:23",all,100,5,2,5,"(5, 2, 5)"
1,"05-Nov-2022, 14:55:58",all,100,5,2,10,"(5, 2, 10)"
2,"06-Nov-2022, 04:57:19",all,100,5,2,20,"(5, 2, 20)"
3,"06-Nov-2022, 14:39:15",all,100,5,3,5,"(5, 3, 5)"
4,"07-Nov-2022, 03:27:31",all,100,5,3,10,"(5, 3, 10)"
...,...,...,...,...,...,...,...
260,"02-Oct-2022, 22:29:23",pca_s,100,20,3,5,"(20, 3, 5)"
261,"08-Nov-2022, 08:38:34",pca_s,100,20,3,10,"(20, 3, 10)"
262,"07-Nov-2022, 18:39:44",pca_s,100,20,3,20,"(20, 3, 20)"
263,"03-Oct-2022, 13:46:38",pca_s,100,30,2,5,"(30, 2, 5)"


In [7]:
# Collapse repeated combinations per dataset and order them ascending
# (tuples compare element by element: window, then layer, then topk).
for dataset, logged in sample_dict.items():
    unique_combos = set(logged)
    sample_dict[dataset] = sorted(unique_combos)

sample_dict

{'all': [(5, 2, 5),
  (5, 2, 10),
  (5, 2, 20),
  (5, 3, 5),
  (5, 3, 10),
  (5, 3, 20),
  (10, 2, 5),
  (10, 2, 10),
  (10, 2, 20),
  (10, 3, 5),
  (10, 3, 10),
  (10, 3, 20),
  (20, 2, 5),
  (20, 2, 10),
  (20, 2, 20),
  (20, 3, 5),
  (20, 3, 10),
  (20, 3, 20),
  (30, 2, 5)],
 'all_s': [(5, 2, 5),
  (5, 2, 10),
  (5, 2, 20),
  (5, 3, 5),
  (5, 3, 10),
  (5, 3, 20),
  (10, 2, 5),
  (10, 2, 10),
  (10, 2, 20),
  (10, 3, 5),
  (10, 3, 10),
  (10, 3, 20),
  (20, 2, 5),
  (20, 2, 10),
  (20, 2, 20),
  (20, 3, 5),
  (20, 3, 10),
  (20, 3, 20),
  (30, 2, 5)],
 'first': [(5, 2, 5),
  (5, 2, 10),
  (5, 2, 20),
  (5, 3, 5),
  (5, 3, 10),
  (5, 3, 20),
  (10, 2, 5),
  (10, 2, 10),
  (10, 2, 20),
  (10, 3, 5),
  (10, 3, 10),
  (10, 3, 20),
  (20, 2, 5),
  (20, 2, 10),
  (20, 2, 20),
  (20, 3, 5),
  (20, 3, 10),
  (20, 3, 20),
  (30, 2, 5),
  (30, 3, 5)],
 'first_s': [(5, 2, 5),
  (5, 2, 10),
  (5, 2, 20),
  (5, 3, 5),
  (5, 3, 10),
  (5, 3, 20),
  (10, 2, 5),
  (10, 2, 10),
  (10, 2, 20),
  (10

In [8]:
# Setting values every dataset is required to cover
win = [5, 10, 20]
layer = [2, 3]
topk = [5, 10, 20]

# Every (win, layer, topk) combination, in itertools.product order
required = list(itertools.product(win, layer, topk))

# For each dataset: 1 where the required combination was logged, 0 otherwise
stat = {
    name: [int(combo in done) for combo in required]
    for name, done in sample_dict.items()
}

stat

{'all': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'all_s': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'first': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'first_s': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'last': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'last_s': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'mean': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'mean_s': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'median': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'median_s': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'pca': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'pca_s': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
def highlight_uncomplete(s):
    """Return one CSS string per element of ``s`` for use with ``Styler.apply``.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect; elements are compared against 0 element-wise.

    Returns
    -------
    list of str
        ``'background-color: red'`` where the element equals 0, and an empty
        string (no styling) everywhere else.
    """
    # An empty string is the "leave this cell alone" value for Styler;
    # 'background-color: None' is not a valid CSS declaration.
    return ['background-color: red' if is_zero else '' for is_zero in (s == 0)]

In [10]:
# Turn the completion flags into a table: one row per required
# (win, layer, topk) combination, one column per dataset.
required_index = list(itertools.product(win, layer, topk))
df_stat = pd.DataFrame(stat, index=required_index)

# Save a copy next to the notebook, then show the table with 0 cells in red.
df_stat.to_csv('stat1.csv')
df_stat.style.apply(highlight_uncomplete)

Unnamed: 0,all,all_s,first,first_s,last,last_s,mean,mean_s,median,median_s,pca,pca_s
"(5, 2, 5)",1,1,1,1,1,1,1,1,1,1,1,1
"(5, 2, 10)",1,1,1,1,1,1,1,1,1,1,1,1
"(5, 2, 20)",1,1,1,1,1,1,1,1,1,1,1,1
"(5, 3, 5)",1,1,1,1,1,1,1,1,1,1,1,1
"(5, 3, 10)",1,1,1,1,1,1,1,1,1,1,1,1
"(5, 3, 20)",1,1,1,1,1,1,1,1,1,1,1,1
"(10, 2, 5)",1,1,1,1,1,1,1,1,1,1,1,1
"(10, 2, 10)",1,1,1,1,1,1,1,1,1,1,1,1
"(10, 2, 20)",1,1,1,1,1,1,1,1,1,1,1,1
"(10, 3, 5)",1,1,1,1,1,1,1,1,1,1,1,1


In [11]:
# For every dataset, tabulate the combinations still flagged 0 in df_stat:
# one row per "(win, layer)" pair, one column per missing topk value.
uncomplete = {}
ordinal_names = {0: '1st', 1: '2nd', 2: '3rd'}

for data in df_stat.columns:
    flags = df_stat[data]
    pending = flags[flags == 0].index.to_list()

    # Group the missing topk values under their (win, layer) pair
    info = {}
    for combo in pending:
        info.setdefault(f"({combo[0]}, {combo[1]})", []).append(combo[2])

    df_info = (
        pd.DataFrame(list(info.values()), index=list(info.keys()))
        .rename(columns=ordinal_names)
        .rename_axis(data)
    )

    # Pairs with fewer missing values than the widest row are padded with '-'
    uncomplete[data] = df_info.fillna('-')


In [12]:
# Print the missing-combination table of every dataset. A header with the
# base name (the text before the first '_') is printed whenever that base
# name changes. The previous check, dataset.endswith('10'), is never true for
# the dataset names in df_stat, so no header was ever printed.
previous_base = None

for dataset in df_stat.columns:
    base = dataset.split('_')[0]

    if base != previous_base:
        print()
        print(base.upper())
        print('-'*15)
        previous_base = base

    print(uncomplete[dataset])
    print('_'*50)

Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Columns: []
Index: []
__________________________________________________
Empty DataFrame
Colum