In [1]:
import pandas as pb
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import os.path
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
import gzip

In [2]:
X_L = []
X_R = []
Y = []

for root, dirs, files in os.walk('Recording Data'):
    for file in files:
        if file.endswith(".txt") and file != 'Subject codes.txt':
            file_path = os.path.join(root, file)
            match root[-1]:
                case '1' | '2':
                    has_dyslexia = 1
                case _:
                    has_dyslexia = 0
            data = pd.read_csv(file_path, sep='\t', index_col=None, decimal=",")
            data['r_dist'] = (data['RX'].diff() ** 2 + data['RY'].diff() ** 2) ** 0.5
            data['l_dist'] = (data['LX'].diff() ** 2 + data['LY'].diff() ** 2) ** 0.5
            X_L.append(data['l_dist'][1:])
            X_R.append(data['r_dist'][1:])
            Y.append(has_dyslexia)

In [3]:
X_L_train, X_L_test, Y_L_train, Y_L_test = train_test_split(X_L, Y, test_size=0.3, random_state=42)

In [4]:
X_R_train, X_R_test, Y_R_train, Y_R_test = train_test_split(X_R, Y, test_size=0.3, random_state=42)

In [5]:
k_m = [1, 5, 10, 15, 20, 25]

In [17]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

f_scores = []
for k in k_m:
    for train_index, test_index in kf.split(X_L):
        X_L_train, X_L_test = [X_L[i] for i in train_index], [X_L[i] for i in test_index]
        X_R_train, X_R_test = [X_R[i] for i in train_index], [X_R[i] for i in test_index]
        Y_train, Y_test = [Y[i] for i in train_index], [Y[i] for i in test_index]

        predicts_L = []
        predicts_R = []
        
        for x1 in X_L_test:
            x1 = np.array(x1)
            Cx1 = len(gzip.compress(x1))
            distance_from_x1 = []
            for x2 in X_L_train:
                x2 = np.array(x2)
                Cx2 = len(gzip.compress(x2))
                x1x2 = np.concatenate((x1, x2))
                Cx1x2 = len(gzip.compress(np.array(x1x2)))
                ncd = (Cx1x2 - min(Cx1 , Cx2)) / max (Cx1 , Cx2 )
                distance_from_x1.append(ncd)
                        
            sorted_idx = np.argsort(np.array(distance_from_x1))
            top_k_class = np.array(Y_train)[sorted_idx[:k]].tolist()
            predict_class = max(set(top_k_class), key = top_k_class.count)
            predicts_L.append(predict_class)

        predicts_R = []
        for x1 in X_R_test:
            x1 = np.array(x1)
            Cx1 = len(gzip.compress(x1))
            distance_from_x1 = []
            for x2 in X_R_train:
                x2 = np.array(x2)
                Cx2 = len(gzip.compress(x2))
                x1x2 = np.concatenate((x1, x2))
                Cx1x2 = len(gzip.compress(np.array(x1x2)))
                ncd = (Cx1x2 - min(Cx1 , Cx2)) / max (Cx1 , Cx2 )
                distance_from_x1.append(ncd)
                        
            sorted_idx = np.argsort(np.array(distance_from_x1))
            top_k_class = np.array(Y_train)[sorted_idx[:k]].tolist()
            predict_class = max(set(top_k_class), key = top_k_class.count)
            predicts_R.append(predict_class)

        predicts = [(x + y) // 2 for x,y in zip(predicts_L, predicts_R)]
        
        f1 = classification_report(Y_test, predicts, output_dict=True)['macro avg']['f1-score']
        f_scores.append(f1)

In [18]:
f_scores_split = np.array_split(f_scores, 6)

In [19]:
table = pd.DataFrame(f_scores_split, index=[1, 5, 10, 15, 20, 25], columns=['1_split', '2_split', '3_split', '4_split', '5_split'])

In [20]:
table['avg_score'] = table.sum(axis=1) / 5

In [21]:
table

Unnamed: 0,1_split,2_split,3_split,4_split,5_split,avg_score
1,0.810256,0.795902,0.701832,0.82764,0.782353,0.783597
5,0.864469,0.909388,0.805701,0.795902,0.679275,0.810947
10,0.945906,0.876667,0.834821,0.801533,0.805701,0.852926
15,0.891813,0.805944,0.805701,0.795902,0.744828,0.808838
20,0.864469,0.815,0.834821,0.82764,0.834821,0.83535
25,0.864469,0.815,0.834821,0.795902,0.834821,0.829003
