In [1]:
import os
import struct
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

from DataLoader import DataLoader
data_loader = DataLoader()

Вопросы на семинар:
1. Чьи именно голоса из .byt записаны в .wav?

### Part 1

In [3]:
proj_df_list, proj_raw_data, proj_stat_data, proj_stat_shared_data =\
    data_loader.read_projections('../../projections')

In [4]:
index_to_features = data_loader.get_preprocessed_data(proj_raw_data)

In [5]:
X = []
y = []

step = 50
delta = 25

for index, matrix in index_to_features.items():
    for i in range(0, matrix.shape[1]-step, delta):
        submatrix = matrix[:, i:i+step]
        subfeatures_mean = np.mean(submatrix, axis=1)
        subfeatures_std = np.std(submatrix, axis=1)
        subfeatures = []
        for m, s in zip(subfeatures_mean, subfeatures_std):
            subfeatures.append(m)
            subfeatures.append(s)
        subfeatures = np.array(subfeatures)
        X.append(subfeatures)
        y.append(index)

X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test size: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (360, 30)
y_train shape: (360,)
X_test size: (91, 30)
y_test shape: (91,)


In [6]:
X_train[:, :28].shape

(360, 28)

In [7]:
def train_catboost_and_get_acc(iterations=500, depth=6,
                               learning_rate=0.1,
                               num_of_features=15):
    model = CatBoostClassifier(iterations=iterations, depth=depth,
                               learning_rate=learning_rate,
                               loss_function='MultiClass', verbose=False)
    model.fit(X_train[:, :num_of_features*2], y_train)
    y_pred = model.predict(X_test[:, :num_of_features*2])
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

def get_best_acc(num_of_features, verbose=False):
    mean_best = 0
    retries = 5
    for _ in range(retries):
        best_acc = 0
        for lr in [0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]:
            accuracy = train_catboost_and_get_acc(1000, 5, lr, num_of_features)
            best_acc = max(best_acc, accuracy)
            if verbose:
                print(f'CatBoost: depth = {5}, learning_rate = {lr}, accuracy: {accuracy}')
        mean_best += best_acc
        if verbose:
            print('-'*20)
    
    return mean_best / retries

In [8]:
num_features_to_acc = {}

for num_features in tqdm(range(15, 0, -1)):
    acc = get_best_acc(num_features)
    num_features_to_acc[num_features] = acc
    print(f'Num of features: {num_features}, accuracy = {acc}')

  7%|▋         | 1/15 [02:45<38:33, 165.29s/it]

Num of features: 15, accuracy = 0.6153846153846154


 13%|█▎        | 2/15 [05:37<36:41, 169.38s/it]

Num of features: 14, accuracy = 0.6373626373626373


 20%|██        | 3/15 [08:30<34:13, 171.10s/it]

Num of features: 13, accuracy = 0.6153846153846154


 27%|██▋       | 4/15 [11:01<29:53, 163.04s/it]

Num of features: 12, accuracy = 0.5714285714285714


 33%|███▎      | 5/15 [13:26<26:06, 156.63s/it]

Num of features: 11, accuracy = 0.5934065934065934


 40%|████      | 6/15 [16:01<23:23, 155.89s/it]

Num of features: 10, accuracy = 0.6373626373626373


 47%|████▋     | 7/15 [18:20<20:04, 150.55s/it]

Num of features: 9, accuracy = 0.5824175824175825


 53%|█████▎    | 8/15 [20:32<16:53, 144.76s/it]

Num of features: 8, accuracy = 0.5934065934065934


 60%|██████    | 9/15 [22:19<13:17, 132.87s/it]

Num of features: 7, accuracy = 0.5494505494505495


 67%|██████▋   | 10/15 [24:04<10:21, 124.26s/it]

Num of features: 6, accuracy = 0.5054945054945055


 73%|███████▎  | 11/15 [25:40<07:42, 115.51s/it]

Num of features: 5, accuracy = 0.5274725274725275


 80%|████████  | 12/15 [26:51<05:06, 102.12s/it]

Num of features: 4, accuracy = 0.4725274725274725


 87%|████████▋ | 13/15 [27:56<03:01, 90.75s/it] 

Num of features: 3, accuracy = 0.45054945054945056


 93%|█████████▎| 14/15 [29:00<01:22, 82.59s/it]

Num of features: 2, accuracy = 0.38461538461538464


100%|██████████| 15/15 [29:53<00:00, 119.60s/it]

Num of features: 1, accuracy = 0.2087912087912088





In [9]:
sorted_data = sorted(num_features_to_acc.items(), key=lambda item: item[0], reverse=True)
x, y = zip(*sorted_data)

fig = go.Figure(data=go.Scatter(x=x, y=y))
fig.update_layout(
    title="Точность классификации в зависимости от числа компонент",
    xaxis_title="Число компонент",
    yaxis_title="Точность классификации",
)
fig.update_xaxes(tickmode='array', tickvals=list(range(1, 16)))

fig.show()

Part 2

In [11]:
file_to_meta = {}
file_to_data = {}
folder_path = '../../eeg/wernicke'

for i, filename in enumerate(os.listdir(folder_path)):
    if filename.endswith('.BYT'):
        file_path = os.path.join(folder_path, filename)
        eeg_raw_data = data_loader.read_eeg(file_path)
        meta = data_loader.parse_eeg(eeg_raw_data)
        if len(meta) > 0:
            file_to_meta[filename] = meta
            file_to_data[filename] = eeg_raw_data