In [1]:
import pandas as pb
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import os.path
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
import gzip

from saxpy.znorm import znorm
from saxpy.sax import ts_to_string
from saxpy.alphabet import cuts_for_asize

In [2]:
X_L = []
X_R = []
Y = []

for root, dirs, files in os.walk('Recording Data'):
    for file in files:
        if file.endswith(".txt") and file != 'Subject codes.txt':
            file_path = os.path.join(root, file)
            match root[-1]:
                case '1' | '2':
                    has_dyslexia = 1
                case _:
                    has_dyslexia = 0
            data = pd.read_csv(file_path, sep='\t', index_col=None, decimal=",")
            data['r_dist'] = ts_to_string(znorm(np.array((data['RX'].diff() ** 2 + data['RY'].diff() ** 2) ** 0.5)), cuts_for_asize(8))
            data['l_dist'] = ts_to_string(znorm(np.array((data['LX'].diff() ** 2 + data['LY'].diff() ** 2) ** 0.5)), cuts_for_asize(8))
            X_L.append(data['l_dist'][1:])
            X_R.append(data['r_dist'][1:])
            Y.append(has_dyslexia)

In [3]:
X_L_train, X_L_test, Y_L_train, Y_L_test = train_test_split(X_L, Y, test_size=0.3, random_state=42)

In [4]:
X_R_train, X_R_test, Y_R_train, Y_R_test = train_test_split(X_R, Y, test_size=0.3, random_state=42)

In [5]:
k_m = [1, 5, 10, 15, 20, 25]

In [9]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

f_scores = {}
for k in k_m:
    f_scores[k] = [[], []]
    
for k in k_m:
    for train_index, test_index in kf.split(X_L):
        X_L_train, X_L_test = [X_L[i] for i in train_index], [X_L[i] for i in test_index]
        X_R_train, X_R_test = [X_R[i] for i in train_index], [X_R[i] for i in test_index]
        Y_train, Y_test = [Y[i] for i in train_index], [Y[i] for i in test_index]

        predicts_L = []
        predicts_R = []
        
        for x1 in X_L_test:
            x1 = np.array(x1)
            Cx1 = len(gzip.compress(x1))
            distance_from_x1 = []
            for x2 in X_L_train:
                x2 = np.array(x2)
                Cx2 = len(gzip.compress(x2))
                x1x2 = np.concatenate((x1, x2))
                Cx1x2 = len(gzip.compress(np.array(x1x2)))
                ncd = (Cx1x2 - min(Cx1 , Cx2)) / max (Cx1 , Cx2 )
                distance_from_x1.append(ncd)
                        
            sorted_idx = np.argsort(np.array(distance_from_x1))
            top_k_class = np.array(Y_train)[sorted_idx[:k]].tolist()
            predict_class = max(set(top_k_class), key = top_k_class.count)
            predicts_L.append(predict_class)

        predicts_R = []
        for x1 in X_R_test:
            x1 = np.array(x1)
            Cx1 = len(gzip.compress(x1))
            distance_from_x1 = []
            for x2 in X_R_train:
                x2 = np.array(x2)
                Cx2 = len(gzip.compress(x2))
                x1x2 = np.concatenate((x1, x2))
                Cx1x2 = len(gzip.compress(np.array(x1x2)))
                ncd = (Cx1x2 - min(Cx1 , Cx2)) / max (Cx1 , Cx2 )
                distance_from_x1.append(ncd)
                        
            sorted_idx = np.argsort(np.array(distance_from_x1))
            top_k_class = np.array(Y_train)[sorted_idx[:k]].tolist()
            predict_class = max(set(top_k_class), key = top_k_class.count)
            predicts_R.append(predict_class)

        predicts = [(x + y) // 2 for x,y in zip(predicts_L, predicts_R)]

        sensitivity, specificity = calculate_sensitivity_specificity(Y_test, predicts)
        f1 = classification_report(Y_test, predicts, output_dict=True)['macro avg']['f1-score']
        f_scores[k][0].append(f1)
        f_scores[k][1].append([sensitivity, specificity])

In [10]:
sensitivity = {k: sum([scores[0] for scores in f_scores[k][1]]) / len([scores[0] for scores in f_scores[k][1]]) for k in k_m}
specificity = {k: sum([scores[1] for scores in f_scores[k][1]]) / len([scores[1] for scores in f_scores[k][1]]) for k in k_m}
data = [f_scores[k][0] + [sum(f_scores[k][0]) / len(f_scores[k][0]), sensitivity[k], specificity[k]] for k in k_m]

In [11]:
table = pd.DataFrame(data, index=k_m, columns=['1_split', '2_split', '3_split', '4_split', '5_split', 'avg_score', 'sensitivity', 'specificity'])

In [12]:
table

Unnamed: 0,1_split,2_split,3_split,4_split,5_split,avg_score,sensitivity,specificity
1,0.744828,0.701832,0.782353,0.729532,0.712733,0.734255,0.618758,0.875451
5,0.918681,0.805701,0.864469,0.834821,0.836765,0.852088,0.840458,0.875451
10,0.945906,0.885093,0.864865,0.782353,0.836765,0.862996,0.868333,0.864925
15,0.945906,0.885093,0.891813,0.808574,0.864469,0.879171,0.900556,0.864925
20,0.891176,0.831818,0.810256,0.808574,0.864469,0.841259,0.827026,0.864925
25,0.918681,0.805701,0.837719,0.810256,0.836765,0.841825,0.820458,0.87669


In [6]:
def calculate_sensitivity_specificity(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    
    tn, fp, fn, tp = cm.ravel()
    
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    
    return sensitivity, specificity