In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np


In [2]:
from pathlib import Path
paths = ['respondent_1', 'respondent_2', 'respondent_3', 
         'respondent_4','respondent_5','respondent_6',
         'respondent_7','respondent_8','respondent_9',
         'respondent_10','respondent_11',]
file_paths = {}
for name in paths:
    folder = Path(name)
    files = [name + '/' + f.name for f in folder.iterdir() if f.is_file()]
    for i, file in enumerate(sorted(files, key=lambda x: x[-24:-5])):
        timestamp = file[-24:-5]
        file_paths[(name, timestamp)] = file

file_paths

{('respondent_1',
  '2025_04_21_20_20_49'): 'respondent_1/results_2025_04_21_20_20_49.json',
 ('respondent_1',
  '2025_04_22_22_21_04'): 'respondent_1/results_2025_04_22_22_21_04.json',
 ('respondent_1',
  '2025_04_23_14_36_06'): 'respondent_1/results_2025_04_23_14_36_06.json',
 ('respondent_1',
  '2025_04_28_10_32_05'): 'respondent_1/results_2025_04_28_10_32_05.json',
 ('respondent_1',
  '2025_04_28_21_30_12'): 'respondent_1/results_2025_04_28_21_30_12.json',
 ('respondent_1',
  '2025_04_29_11_47_33'): 'respondent_1/results_2025_04_29_11_47_33.json',
 ('respondent_1',
  '2025_04_29_23_14_39'): 'respondent_1/results_2025_04_29_23_14_39.json',
 ('respondent_1',
  '2025_04_30_10_43_23'): 'respondent_1/results_2025_04_30_10_43_23.json',
 ('respondent_1',
  '2025_04_30_23_25_57'): 'respondent_1/results_2025_04_30_23_25_57.json',
 ('respondent_1',
  '2025_05_01_15_54_55'): 'respondent_1/results_2025_05_01_15_54_55.json',
 ('respondent_1',
  '2025_05_05_17_37_13'): 'respondent_1/results_2025

In [3]:
def calculate_fatigue_score(num, answer):
    """
    Функция принимает DataFrame с ответами на вопросы опросника и вычисляет итоговый балл.

    Вопросы 1, 2, 5-16: "Да" = 2 балла, "Нет" = 0 баллов.
    Вопросы 3, 17, 18: "Нет" = 2 балла, "Да" = 0 баллов.

    :param df: pandas DataFrame, где строки - респонденты, столбцы - ответы ("Да" или "Нет").
    :return: pandas Series с итоговым баллом каждого респондента.
    """
    # Определяем номера вопросов для разных систем начисления баллов
    positive_scoring_questions = {1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
    negative_scoring_questions = {3, 17, 18}

    # Функция для подсчёта баллов для одного респондента
    if answer == "Затрудняюсь":
        return 1
    score = 0
    if num in positive_scoring_questions:
        score = 2 if answer == "Да" else 0
    elif num in negative_scoring_questions:
        score = 2 if answer == "Нет" else 0
    return score



In [4]:
import pandas as pd
import json

def transform_df(name: str, df: pd.DataFrame):
    df["id"] = name
    df['datetime'] = pd.to_datetime(date, format='%Y_%m_%d_%H_%M_%S')
    # Извлекаем день недели в новый столбец
    df['weekday'] = df['datetime'].dt.day_name()
    df['weekday_num'] = df['datetime'].dt.dayofweek
    df["hour"] = df['datetime'].dt.hour
    # добавляем логику что пользовтели могли засиживаться до 5 утра и это часть одного дня
    df["hour"] = df["hour"].apply(lambda h: h + 24 if h < 5 else h)

    df['date'] = df['datetime'].dt.date
    df['date'] =  df[['datetime', 'hour', 'date']].apply(lambda row: (row['datetime'] - pd.Timedelta(days=1)).date() if row['hour'] > 23 else row['date'], axis=1)
    return df

answersLog_df = pd.DataFrame()
typingAnswersLog_df = pd.DataFrame()
testAnswersLog_df = pd.DataFrame()
stressTestLog_df = pd.DataFrame()
pageMetaData_df = pd.DataFrame()
mouseTrack_df = pd.DataFrame()

# Читаем файлы
for ((name, date), file_path) in file_paths.items():
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    # JSON в датафреймы 
    # Ответы на математические вопросы
    if "answers" in data:
        temp_df = transform_df(name, pd.DataFrame(data["answers"]))
        answersLog_df = pd.concat([answersLog_df, temp_df], ignore_index=True)
    
    # Ответы на перепечтку
    if "typingAnswersLog" in data:
        temp_df = transform_df(name, pd.DataFrame(data["typingAnswersLog"]))
        typingAnswersLog_df = pd.concat([typingAnswersLog_df, temp_df], ignore_index=True)

    # Ответы на опросник Острого Умственного Утомления
    if "testAnswersLog" in data:
        temp_df = transform_df(name, pd.DataFrame(data["testAnswersLog"]))
        temp_df["score"] = temp_df.apply(lambda x: calculate_fatigue_score(x['num'], x['answer']), axis=1)
        testAnswersLog_df = pd.concat([testAnswersLog_df, temp_df], ignore_index=True)
    # Шкала стресса
    if "stressTestLog" in data:
        temp_df = transform_df(name, pd.DataFrame(data["stressTestLog"]))
        stressTestLog_df = pd.concat([stressTestLog_df, temp_df], ignore_index=True)
    # Данные о переходе между страницами
    if "pageMetaData" in data:
        temp_df = transform_df(name, pd.DataFrame(data["pageMetaData"]))
        pageMetaData_df = pd.concat([pageMetaData_df, temp_df], ignore_index=True)
    # данные о движениях мыши
    if "mouseTrack" in data:
        temp_df = transform_df(name, pd.DataFrame(data["mouseTrack"]))
        mouseTrack_df = pd.concat([mouseTrack_df, temp_df], ignore_index=True)

mouseTrack_df = mouseTrack_df

In [5]:
# В измерителе сломалось время старта - поправляем с помощью даты завершения предыдущего задания и данных о переходе между страницами

answersLog_df = answersLog_df.sort_values(['id', 'datetime', 'q_num'])

# Джойн начального времени страницы математического теста для первого вопроса
math_start = pageMetaData_df[pageMetaData_df['page'] == 'math-test'][['id', 'datetime', 'startTime']]
math_start = math_start.rename(columns={'startTime': 'startTime_1'})
answersLog_df = answersLog_df.merge(math_start, on=['id', 'datetime'], how='left')
print(answersLog_df.columns)
# Сдвигаем endTime назад для остальных q_num
answersLog_df['endTime_prev'] = answersLog_df.groupby(['id', 'datetime'])['endTime'].shift(1)
# Если q_num > 0 => используем endTime_prev как startTime_1
answersLog_df['startTime_1'] = answersLog_df.apply(
    lambda row: int(row['endTime_prev']) if row['q_num'] > 0 else row['startTime_1'],
    axis=1
).round(0)

# Удаляем временные колонки
answersLog_df['startTime'] = answersLog_df['startTime_1']
answersLog_df = answersLog_df.drop(columns=['endTime_prev', 'startTime_1'])
answersLog_df[['id', 'datetime', 'q_num', 'startTime', 'endTime']].head(20)


Index(['q_num', 'startTime', 'endTime', 'num1', 'num2', 'question', 'answer',
       'correct', 'correctFlg', 'backspaceCount', 'id', 'datetime', 'weekday',
       'weekday_num', 'hour', 'date', 'startTime_1'],
      dtype='object')


Unnamed: 0,id,datetime,q_num,startTime,endTime
0,respondent_1,2025-04-21 20:20:49,0,1745256049348,1745256059751
1,respondent_1,2025-04-21 20:20:49,1,1745256059751,1745256071623
2,respondent_1,2025-04-21 20:20:49,2,1745256071623,1745256081000
3,respondent_1,2025-04-21 20:20:49,3,1745256081000,1745256086244
4,respondent_1,2025-04-21 20:20:49,4,1745256086244,1745256094319
5,respondent_1,2025-04-21 20:20:49,5,1745256094319,1745256103948
6,respondent_1,2025-04-21 20:20:49,6,1745256103948,1745256112005
7,respondent_1,2025-04-21 20:20:49,7,1745256112005,1745256122887
8,respondent_1,2025-04-21 20:20:49,8,1745256122887,1745256132559
9,respondent_1,2025-04-21 20:20:49,9,1745256132559,1745256141713


### Обработка шкалы стресса

Обработка и интерпретация результатов. Подсчитывается сумма всех ответов – интегральный показатель психической напряженности (ППН).
Чем больше ППН, тем выше уровень психологического стресса.
1. Низкий уровень стресса – ППН меньше 99 баллов, свидетельствует о состоянии психологической адаптированности к рабочим нагрузкам.
2. ППН в интервале 155 – 100 баллов – средний уровень стресса. Вам требуется соблюдать режим труда и отдыха, наладить свой режим дня. Снизьте нагрузки, хотя бы на время. Возьмите паузу для отдыха, прогулки или сна.
3. ППН больше 155 баллов – высокий уровень стресса, свидетельствует о состоянии дезадаптации и психического дискомфорта, необходимости применения широкого спектра средств и методов для снижения нервно-психической напряженности, психологической разгрузки, изменения стиля мышления и жизни. Вам рекомендована консультация психолога.

In [6]:
# обработка шкалы: суммирование ответов и определение уровня
stress_test_scores = stressTestLog_df.groupby(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour']).agg(
    stress_score= ('answer', 'sum')
).reset_index()
stress_test_scores['stress_lvl'] = stress_test_scores['stress_score'].apply(lambda x: 3 if x > 155 else 2 if x > 99 else 1)
stress_test_scores

Unnamed: 0,id,datetime,date,weekday,weekday_num,hour,stress_score,stress_lvl
0,respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,82,1
1,respondent_1,2025-04-22 22:21:04,2025-04-22,Tuesday,1,22,95,1
2,respondent_1,2025-04-23 14:36:06,2025-04-23,Wednesday,2,14,42,1
3,respondent_1,2025-04-28 10:32:05,2025-04-28,Monday,0,10,62,1
4,respondent_1,2025-04-28 21:30:12,2025-04-28,Monday,0,21,67,1
...,...,...,...,...,...,...,...,...
121,respondent_9,2025-05-07 17:53:42,2025-05-07,Wednesday,2,17,56,1
122,respondent_9,2025-05-12 08:29:32,2025-05-12,Monday,0,8,52,1
123,respondent_9,2025-05-12 21:42:24,2025-05-12,Monday,0,21,55,1
124,respondent_9,2025-05-13 09:44:59,2025-05-13,Tuesday,1,9,59,1


In [7]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

for i, (name, group) in enumerate(stress_test_scores.groupby('id')):
    fig = make_subplots(rows=2 , cols=1, shared_xaxes=True, 
                        subplot_titles=("Скор стресса", "Уровень стресса"))
    row_i = 1
    
    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['stress_score'], name='Скор стресса'),
        row=row_i, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['stress_score'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),name='Скор стресса'),
        row=row_i, col=1
    )

    
    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['stress_lvl'], name='Уровень стресса'),
        row=1 + row_i, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['stress_lvl'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),name='Уровень стресса'),
        row=1 + row_i, col=1
    )
    
    fig.update_yaxes(title_text=f"Скор стресса", row=row_i, col=1)
    fig.update_yaxes(title_text=f"Уровень стресса", row=1 + row_i, col=1)
    fig.update_xaxes(title_text=f"{name}. Время", row=row_i + 1, col=1)


    fig.update_layout(height=400, width= 800, title_text=f'{name}. Динамика стресса', showlegend=False)

    fig.show()


### Обработка опросника острого умственного утомления

Индекс умственного утомления (ИУУ)
1) ИУУ < 10 баллов - Отсутствие признаков умственного утомления
2) 10 <= ИУУ < 16 баллов - Легкая степень умственного утомления
3) 16<= ИУУ < 28 баллов - Умеренная степень умственного утомления
4) ИУУ >= 28 баллов - Сильная степень умственного утомления

In [8]:
# Данные о предыдущей деятельности и самооценке работочпособности
activity_df = testAnswersLog_df[testAnswersLog_df['num'] == 19][['id', 'datetime', 'answer']]
activity_df.columns = ['id', 'datetime', 'activity_type']
activity_df['cog_load_flg'] = activity_df['activity_type'].str.startswith('умственная работа').astype(int)
self_capacity_df = testAnswersLog_df[testAnswersLog_df['num'] == 20][['id', 'datetime', 'answer']]
self_capacity_df.columns = ['id', 'datetime', 'self_score']
self_capacity_df['self_score'] = self_capacity_df['self_score'].astype(int)
display(activity_df.head())
self_capacity_df.head()

Unnamed: 0,id,datetime,activity_type,cog_load_flg
18,respondent_1,2025-04-21 20:20:49,умственная работа(включая комп игры)/работа с ...,1
38,respondent_1,2025-04-22 22:21:04,умственная работа(включая комп игры)/работа с ...,1
58,respondent_1,2025-04-23 14:36:06,другая работа,0
78,respondent_1,2025-04-28 10:32:05,"отдых (не у экрана, сон)",0
98,respondent_1,2025-04-28 21:30:12,умственная работа(включая комп игры)/работа с ...,1


Unnamed: 0,id,datetime,self_score
19,respondent_1,2025-04-21 20:20:49,3
39,respondent_1,2025-04-22 22:21:04,2
59,respondent_1,2025-04-23 14:36:06,4
79,respondent_1,2025-04-28 10:32:05,8
99,respondent_1,2025-04-28 21:30:12,5


In [9]:
fatigue_test_scores = testAnswersLog_df.groupby(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour']).agg(
    fatigue_score=('score', 'sum')
).reset_index()
fatigue_test_scores['fatigue_lvl'] = fatigue_test_scores['fatigue_score'].apply(lambda x: 4 if x >= 28 else 3 if x >= 16 else 2 if x >= 10 else 1)
fatigue_test_scores['fatigue_flg'] = fatigue_test_scores['fatigue_score'].apply(lambda x: 1 if x >= 16 else 0)
fatigue_test_scores = pd.merge(fatigue_test_scores, activity_df, on=['id', 'datetime'])
fatigue_test_scores = pd.merge(fatigue_test_scores, self_capacity_df, on=['id', 'datetime'])

fatigue_test_scores.head()

Unnamed: 0,id,datetime,date,weekday,weekday_num,hour,fatigue_score,fatigue_lvl,fatigue_flg,activity_type,cog_load_flg,self_score
0,respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,25,3,1,умственная работа(включая комп игры)/работа с ...,1,3
1,respondent_1,2025-04-22 22:21:04,2025-04-22,Tuesday,1,22,26,3,1,умственная работа(включая комп игры)/работа с ...,1,2
2,respondent_1,2025-04-23 14:36:06,2025-04-23,Wednesday,2,14,20,3,1,другая работа,0,4
3,respondent_1,2025-04-28 10:32:05,2025-04-28,Monday,0,10,7,1,0,"отдых (не у экрана, сон)",0,8
4,respondent_1,2025-04-28 21:30:12,2025-04-28,Monday,0,21,25,3,1,умственная работа(включая комп игры)/работа с ...,1,5


In [10]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
for i, (name, group) in enumerate(fatigue_test_scores.groupby('id')):

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                        subplot_titles=("Индекс умственного утомления", "Уровень утомления", "Самооценка работоспособности"))

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['fatigue_score'], name='Индекс острого умственного утомления'),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(x=group['datetime'], 
                y=group['fatigue_score'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),
                name='Индекс острого умственного утомления'
                ),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['fatigue_lvl'], name='Уровень острого умственного утомления'),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], 
                y=group['fatigue_lvl'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),
                name='Уровень острого умственного утомления'),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['self_score'], name='Самооценка работоспособности'),
        row=3, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], 
                y=group['self_score'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),
                name='Самооценка работоспособности'),
        row=3, col=1
    )

    # Общий заголовок
    fig.update_layout(height=400, width=800, title_text=f'{name}. Динамика утомления', showlegend=False)

    fig.update_xaxes(title_text="Время", row=3, col=1)

    fig.show()


## Аримфметические задачи

In [11]:
answersLog_df['response_time'] = (answersLog_df['endTime'] - answersLog_df['startTime'])/1000 # переводим в секунды

Расчет метрик по времени и корректности

In [12]:
meta_df = answersLog_df.groupby(['id', 'date','weekday',  'datetime', 'hour', 'weekday_num']).agg({
    'response_time': ['mean', 'std', 'median'],
    'correctFlg': 'mean'
}).reset_index()
meta_df.columns = ['_'.join(col).strip('_') for col in meta_df.columns.values]

merge_1 = pd.merge(fatigue_test_scores, stress_test_scores , on=['id', 'date', 'hour', 'datetime', 'weekday', 'weekday_num'], how='inner')
q_df = pd.merge(meta_df, merge_1, on=['id', 'date', 'hour','datetime', 'weekday', 'weekday_num'], how='inner')
q_df.set_index(['id', 'date', 'weekday']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datetime,hour,weekday_num,response_time_mean,response_time_std,response_time_median,correctFlg_mean,fatigue_score,fatigue_lvl,fatigue_flg,activity_type,cog_load_flg,self_score,stress_score,stress_lvl
id,date,weekday,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
respondent_1,2025-04-21,Monday,2025-04-21 20:20:49,20,0,9.2365,1.827715,9.503,1.0,25,3,1,умственная работа(включая комп игры)/работа с ...,1,3,82,1
respondent_1,2025-04-22,Tuesday,2025-04-22 22:21:04,22,1,6.495,1.138248,6.168,0.9,26,3,1,умственная работа(включая комп игры)/работа с ...,1,2,95,1
respondent_1,2025-04-23,Wednesday,2025-04-23 14:36:06,14,2,7.8542,2.030773,7.931,1.0,20,3,1,другая работа,0,4,42,1
respondent_1,2025-04-28,Monday,2025-04-28 10:32:05,10,0,6.9324,1.107334,6.9085,1.0,7,1,0,"отдых (не у экрана, сон)",0,8,62,1
respondent_1,2025-04-28,Monday,2025-04-28 21:30:12,21,0,6.4134,1.335141,6.631,1.0,25,3,1,умственная работа(включая комп игры)/работа с ...,1,5,67,1


In [13]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
q_df = q_df.sort_values(['id', 'datetime'])
for i, (name, group) in enumerate(q_df.groupby('id')):

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                        subplot_titles=("Response time mean", "Response time std", "Accuracy"))

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['response_time_mean'], name='response_time_mean'),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(x=group['datetime'], 
                y=group['response_time_mean'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),
                name='response_time_mean'
                ),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['response_time_std'], name='response_time_std'),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], 
                y=group['response_time_std'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),
            ),
        row=2, col=1
    )


    fig.add_trace(
        go.Scatter(x=group['datetime'], y=group['correctFlg_mean'], name='accuracy'),
        row=3, col=1
    )

    fig.add_trace(
        go.Scatter(x=group['datetime'], 
                y=group['correctFlg_mean'], 
                    mode='markers',            
                    marker=dict(opacity=0.6),
            ),
        row=3, col=1
    )

    fig.update_layout(height=300, width=800, title_text=f'{name}. Динамика времени на ответ', showlegend=False)

    fig.show()


### Все метрики и утомление

In [14]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats

ids = q_df['id'].unique()
num_ids = len(ids)

numeric_columns = ['weekday_num', 'hour', 'stress_score', 'stress_lvl', 'fatigue_score', 
                   'self_score', 'cog_load_flg', 'response_time_mean', 'response_time_median', 'response_time_std', 'correctFlg_mean']

colors = [
    '#FF0000',  # Bright Red
    '#00FF00',  # Bright Green
    '#0000FF',  # Bright Blue
    '#FF00FF',  # Magenta
    '#00FFFF',  # Cyan
    '#FFFF00',  # Yellow
    '#FFA500',  # Orange
    '#800080',  # Purple
    '#008000',  # Green
    '#000080',  # Navy
    '#800000',  # Maroon
    '#008080',  # Teal
    '#FF1493',  # Deep Pink
    '#1E90FF',  # Dodger Blue
    '#7CFC00',  # Lawn Green
    '#FFD700',  # Gold
    '#FF4500'   # Orange Red
]

metrics = [col for col in numeric_columns if col != 'fatigue_score']
n_metrics = len(metrics)
cols = 3
rows = (n_metrics + 2) // cols


fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"{metric} vs Fatigue Score" for metric in metrics],
    horizontal_spacing=0.1,
    vertical_spacing=0.1
)


for i, metric in enumerate(metrics):
    row = i // cols + 1
    col = i % cols + 1
    
    
    for j, respondent_id in enumerate(ids):
        respondent_data = q_df[q_df['id'] == respondent_id].sort_values('datetime')

        # Calculate correlations
        pearson_corr = np.nan
        spearman_corr = np.nan
        kendall_corr = np.nan
        if respondent_data[metric].nunique() > 1:
            pearson_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='pearson')
            spearman_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='spearman')
            kendall_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='kendall')
            
        scatter = go.Scatter(
            y=respondent_data[metric],
            x=respondent_data['fatigue_score'],
            mode='markers',
            name=f'ID: {respondent_id}',
            marker=dict(
                color=colors[j % len(colors)],
                size=8,
                opacity=0.8
            ),
            showlegend=(i == 0),
            legendgroup=f'{respondent_id}',
            hovertemplate=(
                f"ID: {respondent_id}<br>" +
                "DateTime: %{customdata}<br>" +
                f"{metric}: %{{x:.2f}}<br>" +
                "Score: %{y:.2f}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>"
            ),
            customdata=respondent_data['datetime']
        )
        fig.add_trace(scatter, row=row, col=col)

        # Add trendline
        y_vals = respondent_data[metric].values
        x_vals = respondent_data['fatigue_score'].values.reshape(-1, 1)

        if len(x_vals) > 1:
            model = LinearRegression().fit(x_vals, y_vals)
            x_range = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
            y_pred = model.predict(x_range)

            # Create annotation text with correlation values
            annotation_text = (
                f"ID: {respondent_id}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>" 
            )

            fig.add_trace(
                go.Scatter(
                    x=x_range.flatten(),
                    y=y_pred,
                    mode='lines',
                    name=f'Trend {respondent_id}',
                    line=dict(
                        color=colors[j % len(colors)],
                        width=2,
                        dash='dash'
                    ),
                    showlegend=False,
                    legendgroup=f'{respondent_id}',
                    hovertemplate=annotation_text
                ),
                row=row, col=col
            )

# Update layout
fig.update_layout(
    height=300 * rows,
    width=1200,
    title_text="Metrics vs Score by Respondent",
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.05
    )
)

# Update axes labels
for i in range(1, rows * cols + 1):
    if i > len(metrics):
        continue
    fig['layout'][f'xaxis{i}'].update(title="Fatigue score")
    fig['layout'][f'yaxis{i}'].update(title=metrics[i-1])

fig.show()

По общим метрикам:
- самооценка очень хорошо коррелирует с утомлением
- не вариативности стресса

По метрикам решения задач:
- ошибки не информативные, но можно их использовать в виде штрафа по времени (прибавить к среднему времени coef * кол-во ошибок)
- время ответа может быть хорошим показателем


### Корреляции

In [15]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

ids = q_df['id'].unique()
num_ids = len(ids)

rows, cols = num_ids, 3

numeric_df = q_df.select_dtypes(include=[np.number])
numeric_columns = numeric_df.columns

selected_rows = ['fatigue_lvl', 'fatigue_score', 'cog_load_flg', 'self_score', 'stress_lvl', 'stress_score', 'weekday_num', 'hour']

fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"ID {id} - {method}" for id in ids for method in ['Pearson', 'Spearman', 'Kendall']],
    horizontal_spacing=0.1,
    vertical_spacing=0.05
)

for i, (name, sub_df) in enumerate(q_df.groupby('id')):
    pearson_corr = sub_df[numeric_columns].corr(method='pearson')
    spearman_corr = sub_df[numeric_columns].corr(method='spearman')
    kendall_corr = sub_df[numeric_columns].corr(method='kendall')
    for j, (corr, method) in enumerate([
        (pearson_corr, 'Pearson'),
        (spearman_corr, 'Spearman'),
        (kendall_corr, 'Kendall')
    ]):
        corr_values = corr.values.copy()
        np.fill_diagonal(corr_values, None)
    
        corr_values = corr_values[[list(corr.index).index(row) for row in selected_rows]]
        corr_index = selected_rows
        
        heatmap = go.Heatmap(
            z=corr_values,
            x=corr.columns,
            y=corr_index,
            colorscale='RdBu_r',
            zmid=0,
            colorbar=dict(title="r", len=0.3),
            showscale=(i == num_ids - 1 and j == 2) 
        )
    
        fig.add_trace(heatmap, row=i+1, col=j+1)

fig.update_layout(
    height=300 * num_ids, 
    width=900,
    title_text="Correlation Matrices by Respondent and Method",
    font=dict(size=10),
    margin=dict(l=50, r=80, t=100, b=50)
)

for i in range(1, rows * cols + 1):
    fig['layout'][f'xaxis{i}'].update(tickangle=45)

fig.show()

- у большинства респондентов есть корреляция (в основном положительная) между урвнем утомления и средним временем

In [16]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(q_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=q_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'response_time_mean'
temp = q_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
display(temp[temp['level_1'] == group_col].sort_values(metric, ascending=False))
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == group_col) & (temp[metric] > 0.1)][metric].mean())

for i, name in enumerate(q_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(q_df[group_col].dropna().unique()):
        group_df = q_df[(q_df[group_col] == group_value) & (q_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(q_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

fig.update_layout(
    height=1200,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric}",
    margin=dict(t=100),
    font=dict(size=12)
)
for i in range(1, rows * cols + 1):
    fig.update_xaxes(title_text=group_col, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)
    fig.update_yaxes(title_text=metric, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)

fig.show()

Unnamed: 0,id,level_1,response_time_mean
16,respondent_7,fatigue_lvl,0.63901
12,respondent_5,fatigue_lvl,0.620752
2,respondent_10,fatigue_lvl,0.580948
0,respondent_1,fatigue_lvl,0.367406
8,respondent_3,fatigue_lvl,0.289499
10,respondent_4,fatigue_lvl,0.178422
18,respondent_8,fatigue_lvl,0.119523
4,respondent_11,fatigue_lvl,0.082479
6,respondent_2,fatigue_lvl,-0.093934
20,respondent_9,fatigue_lvl,-0.42051


Корреляция response_time_mean со скором утомления по Spearman r = 0.39936552250063667


In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(q_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=q_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'correctFlg_mean'
temp = q_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
display(temp[temp['level_1'] == group_col].sort_values(metric, ascending=False))
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == group_col) & (temp[metric] > 0.15)][metric].mean())

for i, name in enumerate(q_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(q_df[group_col].dropna().unique()):
        group_df = q_df[(q_df[group_col] == group_value) & (q_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(q_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(
    height=1200,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric}",
    margin=dict(t=100),
    font=dict(size=12)
)
for i in range(1, rows * cols + 1):
    fig.update_xaxes(title_text=group_col, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)
    fig.update_yaxes(title_text=metric, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)


fig.show()

Unnamed: 0,id,level_1,correctFlg_mean
12,respondent_5,fatigue_lvl,0.806381
2,respondent_10,fatigue_lvl,0.424334
8,respondent_3,fatigue_lvl,0.247436
6,respondent_2,fatigue_lvl,0.184482
10,respondent_4,fatigue_lvl,0.097991
16,respondent_7,fatigue_lvl,0.050508
18,respondent_8,fatigue_lvl,-0.3
0,respondent_1,fatigue_lvl,-0.344337
20,respondent_9,fatigue_lvl,-0.60553
4,respondent_11,fatigue_lvl,


Корреляция correctFlg_mean со скором утомления по Spearman r = 0.4156582148179534


- respondent_9 - корреляция со средним временем отличная от остальных
- respondent_2 - 2 точки в левом ящике могут быть выбросами
- respondent_11 и respondent_6 - нет вариативности уровней утомлений
- у остальных видно четкое деление между разными уровнями утомлений

In [18]:
import plotly.express as px
# ta_df['']
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
q_df['rt_scaled'] = q_df.groupby('id')['response_time_mean'].transform(
    lambda x: (x - x.mean()) / x.std()
)
temp = q_df.groupby('id')[['fatigue_score', 'rt_scaled']]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()
display(temp[temp['level_1'] == 'fatigue_score']['rt_scaled'].abs().mean())

fig = px.scatter(data_frame=q_df, x='fatigue_score', y='rt_scaled', color='id', trendline='ols',
                 height=400,
                 width=800
                 )
fig.show()

0.34714319958341855

#### Рассчет средних корреляций

In [19]:
temp = q_df.groupby('id')[['stress_score', 'response_time_mean']]\
    .corr(method='spearman')\
    .drop(columns='stress_score')\
    .reset_index()

print('Корреляция response_time_mean со скором стресса по Spearman r =', temp[temp['level_1'] == 'stress_score']['response_time_mean'].mean())

temp = q_df.groupby('id')[['stress_score', 'correctFlg_mean']]\
    .corr(method='spearman')\
    .drop(columns='stress_score')\
    .reset_index()

print('Корреляция correctFlg_mean со скором стресса утомления по Spearman r =', temp[temp['level_1'] == 'stress_score']['correctFlg_mean'].mean())

Корреляция response_time_mean со скором стресса по Spearman r = 0.22770736512139428
Корреляция correctFlg_mean со скором стресса утомления по Spearman r = 0.13115502177546107


In [20]:
# Среднии корреляции по модулю
temp = q_df.groupby('id')[['fatigue_score', 'response_time_mean']]\
    .corr(method='pearson')\
    .drop(columns='fatigue_score')\
    .reset_index()
print('Корреляция по модулю среднего времени ответ со скором утомления по Pearson r =', temp[temp['level_1'] == 'fatigue_score']['response_time_mean'].abs().mean())

temp = q_df.groupby('id')[['fatigue_score', 'response_time_mean']]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()

print('Корреляция по модулю среднего времени ответ со скором утомления по Spearman r =', temp[temp['level_1'] == 'fatigue_score']['response_time_mean'].abs().mean())

temp = q_df.groupby('id')[['fatigue_score', 'response_time_mean']]\
    .corr(method='kendall')\
    .drop(columns='fatigue_score')\
    .reset_index()

print('Корреляция по модулю среднего времени ответ со скором утомления по Kendall τ =', temp[temp['level_1'] == 'fatigue_score']['response_time_mean'].abs().mean())

Корреляция по модулю среднего времени ответ со скором утомления по Pearson r = 0.35050433086834104
Корреляция по модулю среднего времени ответ со скором утомления по Spearman r = 0.34714319958341855
Корреляция по модулю среднего времени ответ со скором утомления по Kendall τ = 0.25132316771246677


In [21]:
temp = q_df.groupby('id')[['fatigue_score', 'correctFlg_mean']]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()

print('Корреляция по модулю корректности со скором утомления по Spearman r =', temp[temp['level_1'] == 'fatigue_score']['correctFlg_mean'].abs().mean())

Корреляция по модулю корректности со скором утомления по Spearman r = 0.26878575635793156


In [22]:
temp = q_df.groupby('id')[['cog_load_flg', 'response_time_mean']]\
    .corr(method='spearman')\
    .drop(columns='cog_load_flg')\
    .reset_index()

print('Корреляция по модулю когнитивной нагрузки в виде работы со скором утомления по Spearman r =', temp[temp['level_1'] == 'cog_load_flg']['response_time_mean'].abs().mean())

Корреляция по модулю когнитивной нагрузки в виде работы со скором утомления по Spearman r = 0.3027746751058862


In [23]:
temp = fatigue_test_scores.groupby('id')[['fatigue_score', 'self_score']]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()

print('Корреляция самооценки со скором утомления по Spearman r =', temp[temp['level_1'] == 'fatigue_score']['self_score'].mean())

Корреляция самооценки со скором утомления по Spearman r = -0.6720203330930201


## Перепечатка

In [25]:
def damerau_levenshtein_distance(s1, s2):
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)
    for i in range(-1, lenstr1 + 1):
        d[(i, -1)] = i + 1
    for j in range(-1, lenstr2 + 1):
        d[(-1, j)] = j + 1

    for i in range(lenstr1):
        for j in range(lenstr2):
            cost = 0 if s1[i] == s2[j] else 1
            d[(i, j)] = min(
                d[(i - 1, j)] + 1,      # deletion
                d[(i, j - 1)] + 1,      # insertion
                d[(i - 1, j - 1)] + cost,  # substitution
            )
            if i and j and s1[i] == s2[j - 1] and s1[i - 1] == s2[j]:
                d[(i, j)] = min(d[(i, j)], d[(i - 2, j - 2)] + 1)  # transposition

    return d[(lenstr1 - 1, lenstr2 - 1)]

In [26]:
def smith_waterman(seq1, seq2, match=2, mismatch=-1, gap_penalty=-1):
    # Инициализация матрицы
    n, m = len(seq1), len(seq2)
    score_matrix = np.zeros((n + 1, m + 1))

    # Заполнение матрицы
    max_score = 0
    max_pos = None
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            match_score = match if seq1[i - 1] == seq2[j - 1] else mismatch
            diagonal = score_matrix[i - 1, j - 1] + match_score
            up = score_matrix[i - 1, j] + gap_penalty
            left = score_matrix[i, j - 1] + gap_penalty
            score_matrix[i, j] = max(0, diagonal, up, left)

            # Обновление максимального значения
            if score_matrix[i, j] > max_score:
                max_score = score_matrix[i, j]
                max_pos = (i, j)

    return max_score

# Пример использования
seq1 = "AGCTG"
seq2 = "GCT"
score = smith_waterman(seq1, seq2)
print(f"Smith-Waterman score: {score}")


Smith-Waterman score: 6.0


In [27]:
typingAnswersLog_df['distance'] = typingAnswersLog_df.apply(lambda rec: damerau_levenshtein_distance(rec['reference'], rec['answer']), axis=1)
typingAnswersLog_df['answer_len'] = typingAnswersLog_df.apply(lambda rec:  len(rec['answer']), axis=1)
display(typingAnswersLog_df[typingAnswersLog_df['answer_len'] ==0])

typingAnswersLog_df['sec_per_char'] = typingAnswersLog_df.apply(lambda rec: (rec['endTime'] - rec['startTime']) / 1000 / max((rec['answer_len']), 1), axis=1)
typingAnswersLog_df['response_time'] = typingAnswersLog_df.apply(lambda rec: (rec['endTime'] - rec['startTime']) / 1000, axis=1)
typingAnswersLog_df.head()

Unnamed: 0,q_num,reference,answer,backspaceCount,startTime,endTime,charPool,textLength,id,datetime,weekday,weekday_num,hour,date,distance,answer_len
109,9,2ц1ыв7з 1йы5шП5хжж еЕтюго.,,0,1746456140172,1746456153197,294,26,respondent_1,2025-05-05 17:37:13,Monday,0,17,2025-05-05,26,0
1096,6,ф2х шыЩо7т 4Кр иншеЖпЙ л.,,0,1745910801019,1745910801399,294,25,respondent_10,2025-04-29 10:04:51,Tuesday,1,10,2025-04-29,25,0
1097,7,7 9 8Дшфс8йс резА з йнщЯ.,,0,1745910801399,1745910802674,294,25,respondent_10,2025-04-29 10:04:51,Tuesday,1,10,2025-04-29,25,0
1098,8,ш д2ь2 ъЕск 5кй ж жъ ч.,,0,1745910802674,1745910802836,294,23,respondent_10,2025-04-29 10:04:51,Tuesday,1,10,2025-04-29,23,0
1099,9,оПпх57 4 Ч 1 ы дшсо4 з4д.,,0,1745910802836,1745910807326,294,25,respondent_10,2025-04-29 10:04:51,Tuesday,1,10,2025-04-29,25,0
1139,9,мш уюп1 Щн ру фС ж4ц7я8 ц.,,0,1746256951019,1746256953699,294,26,respondent_10,2025-05-03 10:15:48,Saturday,5,10,2025-05-03,26,0
1179,9,то5182ррзб й мк7 сйь2к.,,0,1746631766512,1746631767867,294,23,respondent_10,2025-05-07 18:23:57,Wednesday,2,18,2025-05-07,23,0


Unnamed: 0,q_num,reference,answer,backspaceCount,startTime,endTime,charPool,textLength,id,datetime,weekday,weekday_num,hour,date,distance,answer_len,sec_per_char,response_time
0,0,в1Юх х АяеМищ8н ъ о2бч.,в1Юх х АяеМищ8н ъ о2бч.,0,1745256183522,1745256221605,294,23,respondent_1,2025-04-21 20:20:49,Monday,0,20,2025-04-21,0,23,1.655783,38.083
1,1,5фГгсп вг мйнЦШ Р СщЪВЯцы.,5фГгсп вг мйнЦШ Р СщЪВЯцы.,1,1745256221605,1745256251960,294,26,respondent_1,2025-04-21 20:20:49,Monday,0,20,2025-04-21,0,26,1.1675,30.355
2,2,фкя7й мц ы2зи чхрЛз2р9ъ.,фкя7й мц ы2зи чхрЛз2р9ъ.,0,1745256251960,1745256279268,294,24,respondent_1,2025-04-21 20:20:49,Monday,0,20,2025-04-21,0,24,1.137833,27.308
3,3,э5грк 2йо дфЩ д евкрэ7.,э5грк 2йо дфЩ д евкрэ7.,0,1745256279268,1745256302362,294,23,respondent_1,2025-04-21 20:20:49,Monday,0,20,2025-04-21,0,23,1.004087,23.094
4,4,ю н кыИфж9Зжз4нХд4юд4юж8.,ю н кыИфж93жз4нХд4юд4юж8.,1,1745256302362,1745256343396,294,25,respondent_1,2025-04-21 20:20:49,Monday,0,20,2025-04-21,1,25,1.64136,41.034


In [28]:
ta_temp_df = typingAnswersLog_df.groupby(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour']).agg({
    'backspaceCount':['sum', 'mean', 'max'],
    'startTime':'min',
    'endTime':'max',
    'answer': 'sum',
    'reference': 'sum',
    'answer_len': 'sum',
    'sec_per_char':['mean', 'max', 'min', 'std'],
    'response_time':['mean', 'max', 'min', 'std'],
    'q_num': 'count',
    'distance': ['sum', 'mean', 'max'],
}).reset_index()
ta_temp_df.columns = ['_'.join(col).strip('_') for col in ta_temp_df.columns.values]
ta_temp_df = ta_temp_df[~((ta_temp_df['datetime'] == pd.to_datetime('2025-04-29 10:04:51')) & (ta_temp_df['id'] == 'respondent_10'))]
ta_temp_df['sec_per_char_mean'] = ta_temp_df['sec_per_char_mean']
ta_temp_df['distance_sw'] = ta_temp_df.apply(lambda rec: smith_waterman(rec['reference_sum'], rec['answer_sum']), axis=1)
ta_temp_df['distance_sum'] = ta_temp_df.apply(lambda rec: damerau_levenshtein_distance(rec['reference_sum'], rec['answer_sum']), axis=1)

ta_temp_df.set_index(['id', 'date', 'weekday', 'hour']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,datetime,weekday_num,backspaceCount_sum,backspaceCount_mean,backspaceCount_max,startTime_min,endTime_max,answer_sum,reference_sum,answer_len_sum,...,sec_per_char_std,response_time_mean,response_time_max,response_time_min,response_time_std,q_num_count,distance_sum,distance_mean,distance_max,distance_sw
id,date,weekday,hour,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
respondent_1,2025-04-21,Monday,20,2025-04-21 20:20:49,0,6,0.6,2,1745256183522,1745256497271,в1Юх х АяеМищ8н ъ о2бч.5фГгсп вг мйнЦШ Р СщЪВЯ...,в1Юх х АяеМищ8н ъ о2бч.5фГгсп вг мйнЦШ Р СщЪВЯ...,244,...,0.253178,31.3749,41.034,18.779,7.033154,10,1,0.1,1,485.0
respondent_1,2025-04-22,Tuesday,22,2025-04-22 22:21:04,1,3,0.3,2,1745349732181,1745350013251,ьъшщмл ГЬМ1УгЧкм 2яп нгях.ывнцъезичб4рз г1с Цу...,ьъшщмл ГЬМ1УгЧкм 2яп нгях.ывнцъезичб4рз г1с Цу...,251,...,0.218283,28.107,42.484,21.888,6.204222,10,1,0.1,1,499.0
respondent_1,2025-04-23,Wednesday,14,2025-04-23 14:36:06,2,5,0.5,3,1745408254949,1745408547317,89нс ч 84вйл 7л сяивь.хД 1ъ вэж ре92з ъ5ся7г.7...,89нс ч 84вйл 7л сяивь.хД 1ъ вэж ре92з ъ5ся7г.7...,242,...,0.218116,29.2367,36.068,19.06,5.280396,10,1,0.1,1,481.0
respondent_1,2025-04-28,Monday,10,2025-04-28 10:32:05,0,1,0.1,1,1745825595713,1745825847068,пИЖ21к й8ржнХт4пзп йксГ5.е2Эфу дар жвК9ъуф 9ш1...,пИЖ21к й8ржнХт4пзп йксГ5.е2Эфу дар жвК9ъуф 9ш1...,252,...,0.204673,25.1355,35.319,18.709,5.524814,10,2,0.2,1,498.0
respondent_1,2025-04-28,Monday,21,2025-04-28 21:30:12,0,8,0.8,5,1745865079755,1745865343669,тщи9х4 2Фш 5эк км нтыыьг.Ц1 юштО ыбжьа фццэСая...,тщи9х4 2Фш 5эк км нтыыьг.Ц1 юштО ыбжьа фццэСая...,248,...,0.20447,26.3914,36.5,19.235,5.362754,10,0,0.0,0,496.0


In [29]:
merge_1 = pd.merge(fatigue_test_scores, ta_temp_df , on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'] , how='inner')
ta_df = pd.merge(stress_test_scores, merge_1, on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'], how='inner')
ta_df['distance_backspace'] = ta_df['distance_sum'] + ta_df['backspaceCount_sum']
ta_df['response_time'] = (ta_df['endTime_max'] - ta_df['startTime_min'])/1000
ta_temp_df['id'].unique(), fatigue_test_scores['id'].unique(), stress_test_scores['id'].unique()

(array(['respondent_1', 'respondent_10', 'respondent_11', 'respondent_2',
        'respondent_3', 'respondent_4', 'respondent_5', 'respondent_6',
        'respondent_7', 'respondent_8', 'respondent_9'], dtype=object),
 array(['respondent_1', 'respondent_10', 'respondent_11', 'respondent_2',
        'respondent_3', 'respondent_4', 'respondent_5', 'respondent_6',
        'respondent_7', 'respondent_8', 'respondent_9'], dtype=object),
 array(['respondent_1', 'respondent_10', 'respondent_11', 'respondent_2',
        'respondent_3', 'respondent_4', 'respondent_5', 'respondent_6',
        'respondent_7', 'respondent_8', 'respondent_9'], dtype=object))

### Все метрики и утомление

In [30]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats

ids = ta_df['id'].unique()
num_ids = len(ids)

numeric_columns = [
       'backspaceCount_mean',
       'sec_per_char_mean', 
       'sec_per_char_std', 'distance_sum', 'distance_sw', 'distance_backspace',
       'response_time']
colors = [
    '#FF0000',  # Bright Red
    '#00FF00',  # Bright Green
    '#0000FF',  # Bright Blue
    '#FF00FF',  # Magenta
    '#00FFFF',  # Cyan
    '#FFFF00',  # Yellow
    '#FFA500',  # Orange
    '#800080',  # Purple
    '#008000',  # Green
    '#000080',  # Navy
    '#800000',  # Maroon
    '#008080',  # Teal
    '#FF1493',  # Deep Pink
    '#1E90FF',  # Dodger Blue
    '#7CFC00',  # Lawn Green
    '#FFD700',  # Gold
    '#FF4500'   # Orange Red
]

metrics = [col for col in numeric_columns if col != 'fatigue_score']
n_metrics = len(metrics)
cols = 3
rows = (n_metrics + 2) // cols


fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"{metric} vs Fatigue Score" for metric in metrics],
    horizontal_spacing=0.1,
    vertical_spacing=0.1
)


for i, metric in enumerate(metrics):
    row = i // cols + 1
    col = i % cols + 1
    
    
    for j, respondent_id in enumerate(ids):
        respondent_data = ta_df[ta_df['id'] == respondent_id].sort_values('datetime')

        # Calculate correlations
        pearson_corr = np.nan
        spearman_corr = np.nan
        kendall_corr = np.nan
        if respondent_data[metric].nunique() > 1:
            pearson_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='pearson')
            spearman_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='spearman')
            kendall_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='kendall')
            
        scatter = go.Scatter(
            y=respondent_data[metric],
            x=respondent_data['fatigue_score'],
            mode='markers',
            name=f'ID: {respondent_id}',
            marker=dict(
                color=colors[j % len(colors)],
                size=8,
                opacity=0.8
            ),
            showlegend=(i == 0),
            legendgroup=f'{respondent_id}',
            hovertemplate=(
                f"ID: {respondent_id}<br>" +
                "DateTime: %{customdata}<br>" +
                f"{metric}: %{{x:.2f}}<br>" +
                "Score: %{y:.2f}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>"
            ),
            customdata=respondent_data['datetime']
        )
        fig.add_trace(scatter, row=row, col=col)

        # Add trendline
        y_vals = respondent_data[metric].values
        x_vals = respondent_data['fatigue_score'].values.reshape(-1, 1)

        if len(x_vals) > 1:
            model = LinearRegression().fit(x_vals, y_vals)
            x_range = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
            y_pred = model.predict(x_range)

            # Create annotation text with correlation values
            annotation_text = (
                f"ID: {respondent_id}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>" 
            )

            fig.add_trace(
                go.Scatter(
                    x=x_range.flatten(),
                    y=y_pred,
                    mode='lines',
                    name=f'Trend {respondent_id}',
                    line=dict(
                        color=colors[j % len(colors)],
                        width=2,
                        dash='dash'
                    ),
                    showlegend=False,
                    legendgroup=f'{respondent_id}',
                    hovertemplate=annotation_text
                ),
                row=row, col=col
            )

# Update layout
fig.update_layout(
    height=300 * rows,
    width=1200,
    title_text="Metrics vs Score by Respondent",
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.05
    )
)


- у респондентов с большой вариативностью уровня утомления заметна корреляция среднего времени на символ и уровеня утомления
- так же у ряда респондентов видно зависимость с расстоянием между оригинальными и введеными строками 

### Корреляции

In [31]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

ids = ta_df['id'].unique()
num_ids = len(ids)

rows, cols = num_ids, 3

numeric_columns = ['fatigue_lvl', 'fatigue_score', 'cog_load_flg', 'self_score', 'stress_lvl', 'stress_score', 'weekday_num', 'hour',
       'backspaceCount_mean',
       'sec_per_char_mean', 
       'sec_per_char_std', 'distance_sum', 'distance_sw', 'distance_backspace',
       'response_time']

selected_rows = ['fatigue_lvl', 'fatigue_score', 'cog_load_flg', 'self_score', 'stress_lvl', 'stress_score', 'weekday_num', 'hour']

fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"ID {id} - {method}" for id in ids for method in ['Pearson', 'Spearman', 'Kendall']],
    horizontal_spacing=0.1,
    vertical_spacing=0.05
)

for i, (name, sub_df) in enumerate(ta_df.groupby('id')):
    pearson_corr = sub_df[numeric_columns].corr(method='pearson')
    spearman_corr = sub_df[numeric_columns].corr(method='spearman')
    kendall_corr = sub_df[numeric_columns].corr(method='kendall')
    for j, (corr, method) in enumerate([
        (pearson_corr, 'Pearson'),
        (spearman_corr, 'Spearman'),
        (kendall_corr, 'Kendall')
    ]):
        corr_values = corr.values.copy()
        np.fill_diagonal(corr_values, None)
    
        corr_values = corr_values[[list(corr.index).index(row) for row in selected_rows]]
        corr_index = selected_rows
        
        heatmap = go.Heatmap(
            z=corr_values,
            x=corr.columns,
            y=corr_index,
            colorscale='RdBu_r',
            zmid=0,
            colorbar=dict(title="r", len=0.3),
            showscale=(i == num_ids - 1 and j == 2) 
        )
    
        fig.add_trace(heatmap, row=i+1, col=j+1)

fig.update_layout(
    height=300 * num_ids, 
    width=900,
    title_text="Correlation Matrices by Respondent and Method",
    font=dict(size=10),
    margin=dict(l=50, r=80, t=100, b=50)
)

for i in range(1, rows * cols + 1):
    fig['layout'][f'xaxis{i}'].update(tickangle=45)

fig.show()

- у большинства респондентов есть корреляция (в основном положительная) между урвнем утомления и средним временем на символ

In [33]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(ta_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=q_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'sec_per_char_mean'
temp = ta_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
display(temp[temp['level_1'] == group_col].sort_values(metric, ascending=False))
resp_abnormal = ['respondent_9', 'respondent_2', 'respondent_11', 'respondent_6', 'respondent_10']
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == group_col) & (~temp['id'].isin(resp_abnormal))][metric].mean())


for i, name in enumerate(ta_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(ta_df[group_col].dropna().unique()):
        group_df = ta_df[(ta_df[group_col] == group_value) & (ta_df['id'] == name)]

        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(ta_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=1200,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric}",
    margin=dict(t=100),
    font=dict(size=12)
)
for i in range(1, rows * cols + 1):
    fig.update_xaxes(title_text=group_col, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)
    fig.update_yaxes(title_text=metric, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)


fig.show()

Unnamed: 0,id,level_1,sec_per_char_mean
0,respondent_1,fatigue_lvl,0.573153
18,respondent_8,fatigue_lvl,0.537853
10,respondent_4,fatigue_lvl,0.414758
12,respondent_5,fatigue_lvl,0.392534
8,respondent_3,fatigue_lvl,0.197386
4,respondent_11,fatigue_lvl,0.082479
2,respondent_10,fatigue_lvl,0.0
16,respondent_7,fatigue_lvl,0.0
20,respondent_9,fatigue_lvl,-0.052564
6,respondent_2,fatigue_lvl,-0.187867


Корреляция sec_per_char_mean со скором утомления по Spearman r = 0.35261396766095293


- respondent_2 - 2 точки в левом ящике могут быть выбросами
- respondent_11 и respondent_6 - нет вариативности уровней утомлений

In [34]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(ta_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=q_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'distance_sum'
temp = ta_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
display(temp[temp['level_1'] == group_col].sort_values(metric, ascending=False))
resp_abnormal = ['respondent_2', 'respondent_11', 'respondent_6', 'respondent_1', 'respondent_3']
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == group_col) & (~temp['id'].isin(resp_abnormal))][metric].mean())

temp = ta_df.groupby('id')[['fatigue_score', metric]]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()
display(temp[temp['level_1'] == 'fatigue_score'].sort_values(metric, ascending=False))
resp_abnormal = ['respondent_2', 'respondent_11', 'respondent_6', 'respondent_1', 'respondent_3']
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == 'fatigue_score') & (~temp['id'].isin(resp_abnormal))][metric].mean())

for i, name in enumerate(ta_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(ta_df[group_col].dropna().unique()):
        group_df = ta_df[(ta_df[group_col] == group_value) & (ta_df['id'] == name)]

        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(ta_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(
    height=1200,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric}",
    margin=dict(t=100),
    font=dict(size=12)
)
for i in range(1, rows * cols + 1):
    fig.update_xaxes(title_text=group_col, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)
    fig.update_yaxes(title_text=metric, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)


fig.show()

Unnamed: 0,id,level_1,distance_sum
2,respondent_10,fatigue_lvl,0.390434
16,respondent_7,fatigue_lvl,0.366679
20,respondent_9,fatigue_lvl,0.318686
10,respondent_4,fatigue_lvl,0.285282
18,respondent_8,fatigue_lvl,0.273861
12,respondent_5,fatigue_lvl,0.208013
0,respondent_1,fatigue_lvl,-0.037849
8,respondent_3,fatigue_lvl,-0.13405
4,respondent_11,fatigue_lvl,-0.340195
6,respondent_2,fatigue_lvl,-0.414885


Корреляция distance_sum со скором утомления по Spearman r = 0.3071591483285246


Unnamed: 0,id,level_1,distance_sum
18,respondent_8,fatigue_score,0.46791
2,respondent_10,fatigue_score,0.433766
14,respondent_6,fatigue_score,0.419355
20,respondent_9,fatigue_score,0.359064
16,respondent_7,fatigue_score,0.339606
10,respondent_4,fatigue_score,0.318243
12,respondent_5,fatigue_score,0.283264
8,respondent_3,fatigue_score,0.037156
4,respondent_11,fatigue_score,-0.043486
6,respondent_2,fatigue_score,-0.141678


Корреляция distance_sum со скором утомления по Spearman r = 0.36697559999338064


In [35]:
ta_df.columns

Index(['id', 'datetime', 'date', 'weekday', 'weekday_num', 'hour',
       'stress_score', 'stress_lvl', 'fatigue_score', 'fatigue_lvl',
       'fatigue_flg', 'activity_type', 'cog_load_flg', 'self_score',
       'backspaceCount_sum', 'backspaceCount_mean', 'backspaceCount_max',
       'startTime_min', 'endTime_max', 'answer_sum', 'reference_sum',
       'answer_len_sum', 'sec_per_char_mean', 'sec_per_char_max',
       'sec_per_char_min', 'sec_per_char_std', 'response_time_mean',
       'response_time_max', 'response_time_min', 'response_time_std',
       'q_num_count', 'distance_sum', 'distance_mean', 'distance_max',
       'distance_sw', 'distance_backspace', 'response_time'],
      dtype='object')

In [36]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(ta_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=q_df['id'].unique(), shared_xaxes=True)

# ta_df['spc_with_errors'] = 2 * ta_df['sec_per_char_mean'] * (1 + ta_df['distance_sum'] / ta_df['answer_len_sum']) 
ta_df['spc_with_errors'] = ta_df['sec_per_char_mean'] * (1 + 5 * ta_df['distance_sum'] / ta_df['answer_len_sum']) 

group_col = 'fatigue_lvl'
metric = 'spc_with_errors'

temp = ta_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
display(temp[temp['level_1'] == group_col].sort_values(metric, ascending=False))

resp_abnormal = ['respondent_2', 'respondent_11', 'respondent_6']
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == group_col) & (~temp['id'].isin(resp_abnormal))][metric].mean())

temp = ta_df.groupby('id')[['fatigue_score', metric]]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()
display(temp[temp['level_1'] == 'fatigue_score'].sort_values(metric, ascending=False))
print(f'Корреляция {metric} со скором утомления по Spearman r =', temp[(temp['level_1'] == 'fatigue_score') & (~temp['id'].isin(resp_abnormal))][metric].mean())

for i, name in enumerate(ta_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(ta_df[group_col].dropna().unique()):
        group_df = ta_df[(ta_df[group_col] == group_value) & (ta_df['id'] == name)]

        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(ta_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(
    height=1200,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric}",
    margin=dict(t=100),
    font=dict(size=12)
)
for i in range(1, rows * cols + 1):
    fig.update_xaxes(title_text=group_col, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)
    fig.update_yaxes(title_text=metric, row=(i - 1) // cols + 1, col=(i - 1) % cols + 1)


fig.show()

Unnamed: 0,id,level_1,spc_with_errors
20,respondent_9,fatigue_lvl,0.676161
0,respondent_1,fatigue_lvl,0.611363
18,respondent_8,fatigue_lvl,0.597614
10,respondent_4,fatigue_lvl,0.426458
12,respondent_5,fatigue_lvl,0.346891
8,respondent_3,fatigue_lvl,0.250022
2,respondent_10,fatigue_lvl,0.231455
16,respondent_7,fatigue_lvl,0.091287
4,respondent_11,fatigue_lvl,-0.082479
6,respondent_2,fatigue_lvl,-0.25049


Корреляция spc_with_errors со скором утомления по Spearman r = 0.4039064162253953


Unnamed: 0,id,level_1,spc_with_errors
18,respondent_8,fatigue_score,0.702731
20,respondent_9,fatigue_score,0.584481
10,respondent_4,fatigue_score,0.510761
0,respondent_1,fatigue_score,0.401651
2,respondent_10,fatigue_score,0.380952
12,respondent_5,fatigue_score,0.288177
16,respondent_7,fatigue_score,0.245955
8,respondent_3,fatigue_score,0.145897
6,respondent_2,fatigue_score,-0.042329
4,respondent_11,fatigue_score,-0.072294


Корреляция spc_with_errors со скором утомления по Spearman r = 0.4075756497614484


- respondent_9 не показал чувтвительность к времени, но предположительно есть зависимость с количеством ошибок(расстояния строк)

#### Рассчет средних корреляций

In [129]:
temp = ta_df.groupby('id')[['stress_score', 'sec_per_char_mean']]\
    .corr(method='spearman')\
    .drop(columns='stress_score')\
    .reset_index()

print('Корреляция sec_per_char со скором утомления по Spearman r =', temp[temp['level_1'] == 'stress_score']['sec_per_char_mean'].mean())

temp = ta_df.groupby('id')[['stress_score', 'distance_sum']]\
    .corr(method='spearman')\
    .drop(columns='stress_score')\
    .reset_index()

print('Корреляция distance_sum со скором утомления по Spearman r =', temp[temp['level_1'] == 'stress_score']['distance_sum'].mean())

Корреляция sec_per_char со скором утомления по Spearman r = 0.13660144449342015
Корреляция distance_sum со скором утомления по Spearman r = 0.074057975356157


In [37]:
group_col = 'fatigue_lvl'
metric = 'sec_per_char_mean'
temp = ta_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
rt_df = temp[(temp['level_1'] == group_col)]
group_col = 'fatigue_lvl'
metric = 'distance_sum'
temp = ta_df.groupby('id')[[group_col, metric]]\
    .corr(method='spearman')\
    .drop(columns=group_col)\
    .reset_index()
err_df = temp[(temp['level_1'] == group_col)]
print(rt_df.columns)
print(err_df.columns)
df_quad = pd.merge(rt_df, err_df, on=['id', 'level_1'], how='inner')
display(df_quad)

import plotly.express as px

fig = px.scatter(data_frame=df_quad, x='sec_per_char_mean', y='distance_sum', color='id', trendline='ols',
                 height=400,
                 width=800,)
fig.update_layout(
    xaxis=dict(zeroline=True, zerolinewidth=2, zerolinecolor='blue'),
    yaxis=dict(zeroline=True, zerolinewidth=2, zerolinecolor='blue')
)

fig.show()


Index(['id', 'level_1', 'sec_per_char_mean'], dtype='object')
Index(['id', 'level_1', 'distance_sum'], dtype='object')


Unnamed: 0,id,level_1,sec_per_char_mean,distance_sum
0,respondent_1,fatigue_lvl,0.573153,-0.037849
1,respondent_10,fatigue_lvl,0.0,0.390434
2,respondent_11,fatigue_lvl,0.082479,-0.340195
3,respondent_2,fatigue_lvl,-0.187867,-0.414885
4,respondent_3,fatigue_lvl,0.197386,-0.13405
5,respondent_4,fatigue_lvl,0.414758,0.285282
6,respondent_5,fatigue_lvl,0.392534,0.208013
7,respondent_6,fatigue_lvl,,
8,respondent_7,fatigue_lvl,0.0,0.366679
9,respondent_8,fatigue_lvl,0.537853,0.273861


#### Отнормированные значения

In [39]:
import plotly.express as px
# ta_df['']
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
ta_df['rt_scaled'] = ta_df.groupby('id')['sec_per_char_mean'].transform(
    lambda x: (x - x.mean()) / x.std()
)
temp = ta_df.groupby('id')[['fatigue_score', 'rt_scaled']]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()
display(temp[temp['level_1'] == 'fatigue_score']['rt_scaled'].abs().mean())

fig = px.scatter(data_frame=ta_df, x='fatigue_score', y='rt_scaled', color='id', trendline='ols',
                 height=400,
                 width=800
                 )
fig.show()


0.25403611824973527

In [40]:
import plotly.express as px
# ta_df['']
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
ta_df['distance_scaled'] = ta_df.groupby('id')['distance_sum'].transform(
    lambda x: (x - x.mean()) / x.std()
)
temp = ta_df.groupby('id')[['fatigue_score', 'distance_scaled']]\
    .corr(method='spearman')\
    .drop(columns='fatigue_score')\
    .reset_index()
display(temp[temp['level_1'] == 'fatigue_score']['distance_scaled'].abs().mean())

fig = px.scatter(data_frame=ta_df, 
                 x='fatigue_score', 
                 y='distance_scaled', 
                 color='id', trendline='ols',
                 height=400,
                 width=800
                 )
fig.show()


0.27396076280003456

## Трекинг мыши

In [41]:
import json
from pathlib import Path
from math import atan2, degrees, sqrt, log2, pi
from typing import Dict, List

import numpy as np
import pandas as pd


def angle(v1, v2) -> float:
    # Беззнаковый угол между двумя 2-D векторами
    dot = v1[0] * v2[0] + v1[1] * v2[1]
    norm = sqrt(v1[0] ** 2 + v1[1] ** 2) * sqrt(v2[0] ** 2 + v2[1] ** 2)
    if norm == 0:
        return 0.0
    return degrees(np.arccos(max(min(dot / norm, 1.0), -1.0)))

In [42]:
def compute_segment_metrics_extended(data: pd.DataFrame,
                                     pause_speed: float = 10.0,
                                     sharp_turn: float = 90.0,
                                     pause_duration_threshold: float = 0.2) -> Dict[str, float]:

    start_dttm = pageMetaData_df[(pageMetaData_df['page'] == 'math-test') & 
                                  (pageMetaData_df['id'] == data['id'].min()) & 
                                  (pageMetaData_df['datetime'] == data['datetime'].min()
                                  )]['startTime'].min() 
    end_dttm = pageMetaData_df[(pageMetaData_df['page'] == 'typing-instruction') & 
                                  (pageMetaData_df['id'] == data['id'].min()) & 
                                  (pageMetaData_df['datetime'] == data['datetime'].min()
                                  )]['startTime'].min() 
    data = data[(data['time'] >= start_dttm) & (data['time'] < end_dttm)].sort_values(by=['time']).reset_index(drop=True)
    dx, dy = data["x"].diff(), data["y"].diff()
    dt = data["time"].diff() / 1000
    dt.replace(0, np.nan, inplace=True)

    dist = np.hypot(dx, dy)
    speed = dist / dt
    accel = speed.diff() / dt
    jerk = accel.diff() / dt

    angles = [angle((dx[i], dy[i]), (dx[i + 1], dy[i + 1]))
              for i in range(1, len(data) - 1)]

    total_dist = dist.sum()
    straight_dist = np.hypot(data["x"].iloc[-1] - data["x"].iloc[0],
                             data["y"].iloc[-1] - data["y"].iloc[0])

    pause_durations = dt[speed < pause_speed].groupby((speed >= pause_speed).cumsum()).sum()

    metrics = {
        "duration_s": (data["time"].iloc[-1] - data["time"].iloc[0]) / 1000,
        "total_distance": total_dist,
        "straight_distance": straight_dist,
        "path_efficiency": total_dist / straight_dist if straight_dist else np.nan,
        "mean_speed": speed.mean(),
        "speed_std": speed.std(),
        "mean_accel": accel.mean(),
        "mean_jerk": jerk.mean(),
        "num_pauses": int(((speed < pause_speed) & (speed.shift() >= pause_speed)).sum()),
        "pause_time_s": pause_durations.sum(),
        "sharp_turns": sum(a > sharp_turn for a in angles),
        "mean_turn_angle": np.mean(angles),
        "acceleration_std": accel.std(),
        "high_speed_ratio": dt[speed > speed.quantile(0.9)].sum() / dt.sum(),
        "mean_pause_duration": pause_durations.mean(),
    }

    return metrics


In [43]:
mt_temp = mouseTrack_df.groupby(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour']).apply(lambda group: pd.Series(compute_segment_metrics_extended(group))).reset_index()
merge_1 = pd.merge(fatigue_test_scores, mt_temp , on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'], how='inner')
mt_df = pd.merge(stress_test_scores, merge_1, on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'], how='inner')
mt_df = mt_df[mt_df['datetime'] != pd.to_datetime('2025-04-21 20:20:49')]
mt_df.set_index(['id', 'date', 'weekday']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datetime,weekday_num,hour,stress_score,stress_lvl,fatigue_score,fatigue_lvl,fatigue_flg,activity_type,cog_load_flg,...,speed_std,mean_accel,mean_jerk,num_pauses,pause_time_s,sharp_turns,mean_turn_angle,acceleration_std,high_speed_ratio,mean_pause_duration
id,date,weekday,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
respondent_1,2025-04-22,Tuesday,2025-04-22 22:21:04,1,22,95,1,26,3,1,умственная работа(включая комп игры)/работа с ...,1,...,471.493409,1290.403562,309976.480599,45.0,35.795,31.0,14.950294,21875.962229,0.037342,0.795444
respondent_1,2025-04-23,Wednesday,2025-04-23 14:36:06,2,14,42,1,20,3,1,другая работа,0,...,507.481513,891.588152,178451.065122,58.0,38.799,48.0,16.459666,17964.402391,0.045445,0.668948
respondent_1,2025-04-28,Monday,2025-04-28 10:32:05,0,10,62,1,7,1,0,"отдых (не у экрана, сон)",0,...,483.375381,597.183178,32280.000077,39.0,35.669,39.0,16.384793,16699.445074,0.041987,0.91459
respondent_1,2025-04-28,Monday,2025-04-28 21:30:12,0,21,67,1,25,3,1,умственная работа(включая комп игры)/работа с ...,1,...,447.864564,912.545832,185800.687723,46.0,33.616,29.0,16.739807,16128.692264,0.041202,0.730783
respondent_1,2025-04-29,Tuesday,2025-04-29 11:47:33,1,11,60,1,15,2,0,другая работа,0,...,479.619846,1721.975974,457919.046129,43.0,44.358,36.0,17.535497,22163.680812,0.030602,1.031581


In [44]:
mt_df.columns

Index(['id', 'datetime', 'date', 'weekday', 'weekday_num', 'hour',
       'stress_score', 'stress_lvl', 'fatigue_score', 'fatigue_lvl',
       'fatigue_flg', 'activity_type', 'cog_load_flg', 'self_score',
       'duration_s', 'total_distance', 'straight_distance', 'path_efficiency',
       'mean_speed', 'speed_std', 'mean_accel', 'mean_jerk', 'num_pauses',
       'pause_time_s', 'sharp_turns', 'mean_turn_angle', 'acceleration_std',
       'high_speed_ratio', 'mean_pause_duration'],
      dtype='object')

### Все метрики и утомление

In [45]:
mt_df.columns

Index(['id', 'datetime', 'date', 'weekday', 'weekday_num', 'hour',
       'stress_score', 'stress_lvl', 'fatigue_score', 'fatigue_lvl',
       'fatigue_flg', 'activity_type', 'cog_load_flg', 'self_score',
       'duration_s', 'total_distance', 'straight_distance', 'path_efficiency',
       'mean_speed', 'speed_std', 'mean_accel', 'mean_jerk', 'num_pauses',
       'pause_time_s', 'sharp_turns', 'mean_turn_angle', 'acceleration_std',
       'high_speed_ratio', 'mean_pause_duration'],
      dtype='object')

In [46]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats

ids = mt_df['id'].unique()
num_ids = len(ids)

numeric_columns = ['total_distance', 'straight_distance', 'path_efficiency',
       'mean_speed', 'speed_std', 'mean_accel', 'mean_jerk', 'num_pauses',
       'pause_time_s', 'sharp_turns', 'mean_turn_angle', 'acceleration_std',
       'high_speed_ratio', 'mean_pause_duration']

colors = [
    '#FF0000',  # Bright Red
    '#00FF00',  # Bright Green
    '#0000FF',  # Bright Blue
    '#FF00FF',  # Magenta
    '#00FFFF',  # Cyan
    '#FFFF00',  # Yellow
    '#FFA500',  # Orange
    '#800080',  # Purple
    '#008000',  # Green
    '#000080',  # Navy
    '#800000',  # Maroon
    '#008080',  # Teal
    '#FF1493',  # Deep Pink
    '#1E90FF',  # Dodger Blue
    '#7CFC00',  # Lawn Green
    '#FFD700',  # Gold
    '#FF4500'   # Orange Red
]

metrics = [col for col in numeric_columns if col != 'fatigue_score']
n_metrics = len(metrics)
cols = 3
rows = (n_metrics + 2) // cols


fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"{metric} vs Fatigue Score" for metric in metrics],
    horizontal_spacing=0.1,
    vertical_spacing=0.1
)


for i, metric in enumerate(metrics):
    row = i // cols + 1
    col = i % cols + 1
    
    
    for j, respondent_id in enumerate(ids):
        respondent_data = mt_df[mt_df['id'] == respondent_id].sort_values('datetime')

        # Calculate correlations
        pearson_corr = np.nan
        spearman_corr = np.nan
        kendall_corr = np.nan
        if respondent_data[metric].nunique() > 1:
            pearson_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='pearson')
            spearman_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='spearman')
            kendall_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='kendall')
            
        scatter = go.Scatter(
            y=respondent_data[metric],
            x=respondent_data['fatigue_score'],
            mode='markers',
            name=f'ID: {respondent_id}',
            marker=dict(
                color=colors[j % len(colors)],
                size=8,
                opacity=0.8
            ),
            showlegend=(i == 0),
            legendgroup=f'{respondent_id}',
            hovertemplate=(
                f"ID: {respondent_id}<br>" +
                "DateTime: %{customdata}<br>" +
                f"{metric}: %{{x:.2f}}<br>" +
                "Score: %{y:.2f}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>"
            ),
            customdata=respondent_data['datetime']
        )
        fig.add_trace(scatter, row=row, col=col)

        # Add trendline
        y_vals = respondent_data[metric].values
        x_vals = respondent_data['fatigue_score'].values.reshape(-1, 1)

        if len(x_vals) > 1:
            model = LinearRegression().fit(x_vals, y_vals)
            x_range = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
            y_pred = model.predict(x_range)

            # Create annotation text with correlation values
            annotation_text = (
                f"ID: {respondent_id}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>" 
            )

            fig.add_trace(
                go.Scatter(
                    x=x_range.flatten(),
                    y=y_pred,
                    mode='lines',
                    name=f'Trend {respondent_id}',
                    line=dict(
                        color=colors[j % len(colors)],
                        width=2,
                        dash='dash'
                    ),
                    showlegend=False,
                    legendgroup=f'{respondent_id}',
                    hovertemplate=annotation_text
                ),
                row=row, col=col
            )

# Update layout
fig.update_layout(
    height=300 * rows,
    width=1200,
    title_text="Metrics vs Score by Respondent",
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.05
    )
)

# Update axes labels
for i in range(1, rows * cols + 1):
    if i > len(metrics):
        continue
    fig['layout'][f'xaxis{i}'].update(title="Fatigue score")
    fig['layout'][f'yaxis{i}'].update(title=metrics[i-1])

fig.show()

- high_speed_ratio -
- total_distance
- sharp_turns
- pause_time_s -
- mean_speed -
- mean_accel -

### Корреляции

In [47]:

numeric_columns = ['total_distance', 'straight_distance', 'path_efficiency',
       'mean_speed', 'speed_std', 'mean_accel', 'mean_jerk', 'num_pauses',
       'pause_time_s', 'sharp_turns', 'mean_turn_angle', 'acceleration_std',
       'high_speed_ratio', 'mean_pause_duration']
metrics = {}
for metric in numeric_columns:
    temp = mt_df.groupby('id')[['fatigue_score', metric]]\
        .corr(method='pearson')\
        .drop(columns='fatigue_score')\
        .reset_index()
    metrics[metric] = temp[temp['level_1'] == 'fatigue_score'][metric].abs().mean()
df_metrics = pd.DataFrame(list(metrics.items()), columns=['metric', 'value'])
df_metrics.sort_values('value')

Unnamed: 0,metric,value
1,straight_distance,0.178555
7,num_pauses,0.228391
5,mean_accel,0.233365
10,mean_turn_angle,0.238785
6,mean_jerk,0.248828
11,acceleration_std,0.249795
3,mean_speed,0.255077
9,sharp_turns,0.312553
2,path_efficiency,0.313524
13,mean_pause_duration,0.330925


In [48]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(mt_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mt_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'mean_accel'

for i, name in enumerate(mt_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(mt_df[group_col].dropna().unique()):
        group_df = mt_df[(mt_df[group_col] == group_value) & (mt_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(mt_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=800,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric} при наведении",
    margin=dict(t=100),
    font=dict(size=12)
)

fig.show()

In [49]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(mt_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mt_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'pause_time_s'

for i, name in enumerate(mt_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(mt_df[group_col].dropna().unique()):
        group_df = mt_df[(mt_df[group_col] == group_value) & (mt_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(mt_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=800,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric} при наведении",
    margin=dict(t=100),
    font=dict(size=12)
)

fig.show()

In [50]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(mt_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mt_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'high_speed_ratio'

for i, name in enumerate(mt_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(mt_df[group_col].dropna().unique()):
        group_df = mt_df[(mt_df[group_col] == group_value) & (mt_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(mt_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=800,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric} при наведении",
    margin=dict(t=100),
    font=dict(size=12)
)

fig.show()

In [51]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(mt_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mt_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'sharp_turns'

for i, name in enumerate(mt_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(mt_df[group_col].dropna().unique()):
        group_df = mt_df[(mt_df[group_col] == group_value) & (mt_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(mt_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=800,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric} при наведении",
    margin=dict(t=100),
    font=dict(size=12)
)

fig.show()

- все индивидуально - для разных респондентов разные метрики коррелируют с утомлением

### По задачам

In [52]:
merged = mouseTrack_df.merge(
    answersLog_df,
    on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'] ,
    how='inner'
)

mouse_with_task = merged[
    (merged['time'] >= merged['startTime']) & (merged['time'] < merged['endTime'])
].copy()
mouse_with_task['start_diff'] = mouse_with_task['time'] - mouse_with_task['startTime']
mouse_with_task['end_diff'] = mouse_with_task['time'] - mouse_with_task['endTime']


In [53]:
mt_q_df_temp = mouse_with_task.groupby(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour', 'q_num']).apply(lambda group: pd.Series(compute_segment_metrics_extended(group))).reset_index()
mt_q_df_temp.set_index(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour', 'q_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,duration_s,total_distance,straight_distance,path_efficiency,mean_speed,speed_std,mean_accel,mean_jerk,num_pauses,pause_time_s,sharp_turns,mean_turn_angle,acceleration_std,high_speed_ratio,mean_pause_duration
id,datetime,date,weekday,weekday_num,hour,q_num,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,0,9.981,2662.256863,235.238177,11.317282,536.113696,704.834262,553.028208,2.192894e+04,8.0,4.485,4.0,16.673425,21275.019338,0.049494,0.560625
respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,1,9.839,1443.399174,200.481919,7.199648,474.283690,360.124753,535.184045,3.516372e+04,4.0,6.402,2.0,13.844229,13642.395594,0.030389,1.280400
respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,2,3.655,1237.448415,75.059976,16.486129,538.482008,497.696090,275.733078,-3.238323e+04,1.0,0.433,5.0,18.430381,19285.200727,0.063475,0.433000
respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,3,4.764,706.346818,17.117243,41.265222,447.843214,352.697777,1052.736272,2.336111e+05,5.0,3.246,1.0,15.699579,17194.890747,0.034845,0.649200
respondent_1,2025-04-21 20:20:49,2025-04-21,Monday,0,20,4,4.772,1740.215737,130.969462,13.287187,478.180966,355.796499,70.180274,-2.056895e+04,2.0,0.734,6.0,16.789278,12795.369607,0.076488,0.367000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
respondent_9,2025-05-13 16:59:33,2025-05-13,Tuesday,1,16,5,5.976,2257.392427,241.174211,9.360008,618.296169,966.160377,1935.023729,2.711036e+05,48.0,2.693,2.0,9.076734,99661.495070,0.059739,0.054959
respondent_9,2025-05-13 16:59:33,2025-05-13,Tuesday,1,16,6,5.037,1178.644990,193.126901,6.102956,494.550095,512.366381,985.715903,1.810476e+05,24.0,2.873,0.0,13.042637,45852.610796,0.046258,0.119708
respondent_9,2025-05-13 16:59:33,2025-05-13,Tuesday,1,16,7,9.389,2406.525115,182.222392,13.206528,531.051866,544.974143,2496.876607,7.328328e+05,42.0,5.168,2.0,11.445220,48991.404686,0.047183,0.123048
respondent_9,2025-05-13 16:59:33,2025-05-13,Tuesday,1,16,8,6.823,1851.294115,315.425110,5.869203,536.286441,700.920301,2269.553557,5.065942e+05,27.0,3.580,3.0,10.850320,57472.613595,0.050271,0.132593


In [54]:
mt_q_df_temp_2 = mt_q_df_temp.groupby(['id', 'datetime','date', 'weekday', 'weekday_num', 'hour']).agg(
    duration_s_mean = ('duration_s','mean' ), 
    total_distance_mean = ('total_distance','mean' ), 
    straight_distance_mean = ('straight_distance','mean' ), 
    path_efficiency_mean = ('path_efficiency','mean'),
    mean_speed_mean = ('mean_speed','mean' ), 
    speed_std_mean = ('speed_std','mean' ), 
    mean_accel_mean = ('mean_accel','mean' ), 
    mean_jerk_mean = ('mean_jerk','mean'),     
    num_pauses_mean = ('num_pauses','mean' ), 
    pause_time_s_mean = ('pause_time_s','mean' ), 
    sharp_turns_mean = ('sharp_turns','mean'),
    high_speed_ratio_mean = ('high_speed_ratio', 'mean')
).reset_index()

In [55]:

merge_1 = pd.merge(fatigue_test_scores, mt_q_df_temp_2 , on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'], how='inner')
mt_q_df = pd.merge(stress_test_scores, merge_1, on=['id', 'datetime','date', 'weekday', 'weekday_num', 'hour'], how='inner')
mt_q_df = mt_q_df[mt_q_df['datetime'] != pd.to_datetime('2025-04-21 20:20:49')]
mt_q_df.set_index(['id', 'date', 'weekday']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datetime,weekday_num,hour,stress_score,stress_lvl,fatigue_score,fatigue_lvl,fatigue_flg,activity_type,cog_load_flg,...,straight_distance_mean,path_efficiency_mean,mean_speed_mean,speed_std_mean,mean_accel_mean,mean_jerk_mean,num_pauses_mean,pause_time_s_mean,sharp_turns_mean,high_speed_ratio_mean
id,date,weekday,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
respondent_1,2025-04-22,Tuesday,2025-04-22 22:21:04,1,22,95,1,26,3,1,умственная работа(включая комп игры)/работа с ...,1,...,177.12047,8.491407,524.359599,446.672964,1104.561327,297202.633923,3.6,1.9348,2.8,0.052969
respondent_1,2025-04-23,Wednesday,2025-04-23 14:36:06,2,14,42,1,20,3,1,другая работа,0,...,158.935994,35.760464,497.282689,484.650761,785.476766,109055.288732,5.1,3.2173,4.6,0.051828
respondent_1,2025-04-28,Monday,2025-04-28 10:32:05,0,10,62,1,7,1,0,"отдых (не у экрана, сон)",0,...,178.025143,14.089168,518.700296,442.86818,385.283838,1305.408483,3.0,1.3745,3.5,0.063378
respondent_1,2025-04-28,Monday,2025-04-28 21:30:12,0,21,67,1,25,3,1,умственная работа(включая комп игры)/работа с ...,1,...,200.692863,8.604197,509.93475,432.192102,580.821694,108153.029443,3.8,1.9604,2.6,0.057471
respondent_1,2025-04-29,Tuesday,2025-04-29 11:47:33,1,11,60,1,15,2,0,другая работа,0,...,174.880602,7.15637,491.071239,452.06392,1207.099958,255717.292385,3.5,3.3195,2.9,0.039886


#### Все метрики и утомление

In [56]:
mt_q_df.columns

Index(['id', 'datetime', 'date', 'weekday', 'weekday_num', 'hour',
       'stress_score', 'stress_lvl', 'fatigue_score', 'fatigue_lvl',
       'fatigue_flg', 'activity_type', 'cog_load_flg', 'self_score',
       'duration_s_mean', 'total_distance_mean', 'straight_distance_mean',
       'path_efficiency_mean', 'mean_speed_mean', 'speed_std_mean',
       'mean_accel_mean', 'mean_jerk_mean', 'num_pauses_mean',
       'pause_time_s_mean', 'sharp_turns_mean', 'high_speed_ratio_mean'],
      dtype='object')

In [57]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression

ids = mt_q_df['id'].unique()
num_ids = len(ids)

numeric_columns = [
        'total_distance_mean', 'straight_distance_mean',
       'path_efficiency_mean', 'mean_speed_mean', 'speed_std_mean',
       'mean_accel_mean', 'mean_jerk_mean', 'num_pauses_mean',
       'pause_time_s_mean', 'sharp_turns_mean', 'high_speed_ratio_mean']


colors = [
    '#FF0000',  # Bright Red
    '#00FF00',  # Bright Green
    '#0000FF',  # Bright Blue
    '#FF00FF',  # Magenta
    '#00FFFF',  # Cyan
    '#FFFF00',  # Yellow
    '#FFA500',  # Orange
    '#800080',  # Purple
    '#008000',  # Green
    '#000080',  # Navy
    '#800000',  # Maroon
    '#008080',  # Teal
    '#FF1493',  # Deep Pink
    '#1E90FF',  # Dodger Blue
    '#7CFC00',  # Lawn Green
    '#FFD700',  # Gold
    '#FF4500'   # Orange Red
]

metrics = [col for col in numeric_columns if col != 'fatigue_score']
n_metrics = len(metrics)
cols = 3
rows = (n_metrics + 2) // cols


fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"{metric} vs Fatigue Score" for metric in metrics],
    horizontal_spacing=0.1,
    vertical_spacing=0.1
)


for i, metric in enumerate(metrics):
    row = i // cols + 1
    col = i % cols + 1
    
    
    for j, respondent_id in enumerate(ids):
        respondent_data = mt_q_df[mt_q_df['id'] == respondent_id].sort_values('datetime')

        # Calculate correlations
        pearson_corr = np.nan
        spearman_corr = np.nan
        kendall_corr = np.nan
        if respondent_data[metric].nunique() > 1:
            pearson_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='pearson')
            spearman_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='spearman')
            kendall_corr = respondent_data[metric].corr(respondent_data['fatigue_score'], method='kendall')
            
        scatter = go.Scatter(
            y=respondent_data[metric],
            x=respondent_data['fatigue_score'],
            mode='markers',
            name=f'ID: {respondent_id}',
            marker=dict(
                color=colors[j % len(colors)],
                size=8,
                opacity=0.8
            ),
            showlegend=(i == 0),
            legendgroup=f'{respondent_id}',
            hovertemplate=(
                f"ID: {respondent_id}<br>" +
                "DateTime: %{customdata}<br>" +
                f"{metric}: %{{x:.2f}}<br>" +
                "Score: %{y:.2f}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>"
            ),
            customdata=respondent_data['datetime']
        )
        fig.add_trace(scatter, row=row, col=col)

        # Add trendline
        y_vals = respondent_data[metric].values
        x_vals = respondent_data['fatigue_score'].values.reshape(-1, 1)

        if len(x_vals) > 1:
            model = LinearRegression().fit(x_vals, y_vals)
            x_range = np.linspace(x_vals.min(), x_vals.max(), 100).reshape(-1, 1)
            y_pred = model.predict(x_range)

            # Create annotation text with correlation values
            annotation_text = (
                f"ID: {respondent_id}<br>" +
                f"Pearson r: {pearson_corr:.3f}<br>" +
                f"Spearman r: {spearman_corr:.3f}<br>" +
                f"Kendall τ: {kendall_corr:.3f}<br>" 
            )

            fig.add_trace(
                go.Scatter(
                    x=x_range.flatten(),
                    y=y_pred,
                    mode='lines',
                    name=f'Trend {respondent_id}',
                    line=dict(
                        color=colors[j % len(colors)],
                        width=2,
                        dash='dash'
                    ),
                    showlegend=False,
                    legendgroup=f'{respondent_id}',
                    hovertemplate=annotation_text
                ),
                row=row, col=col
            )

# Update layout
fig.update_layout(
    height=300 * rows,
    width=1200,
    title_text="Metrics vs Score by Respondent",
    font=dict(size=12),
    margin=dict(l=50, r=50, t=100, b=50),
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.05
    )
)

# Update axes labels
for i in range(1, rows * cols + 1):
    if i > len(metrics):
        continue
    fig['layout'][f'xaxis{i}'].update(title="Fatigue score")
    fig['layout'][f'yaxis{i}'].update(title=metrics[i-1])

fig.show()

#### Корреляция

In [58]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(mt_q_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mt_q_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'total_distance_mean'

for i, name in enumerate(mt_q_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(mt_q_df[group_col].dropna().unique()):
        group_df = mt_q_df[(mt_q_df[group_col] == group_value) & (mt_q_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(mt_q_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=800,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric} при наведении",
    margin=dict(t=100),
    font=dict(size=12)
)

fig.show()

In [59]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

rows, cols = (len(mt_q_df['id'].unique()) + 1) // 2, 2
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mt_q_df['id'].unique(), shared_xaxes=True)
group_col = 'fatigue_lvl'
metric = 'high_speed_ratio_mean'

for i, name in enumerate(mt_q_df['id'].unique()):
    row = i // cols + 1
    col = i % cols + 1

    for group_value in sorted(mt_q_df[group_col].dropna().unique()):
        group_df = mt_q_df[(mt_q_df[group_col] == group_value) & (mt_q_df['id'] == name)]
        # print(group_df.shape, group_value)

        # Boxplot (без hover)
        fig.add_trace(
            go.Box(
                y=group_df[metric],
                name=str(group_value),
                boxmean=True,
                marker=dict(opacity=0),
                showlegend=False
            ),
            row=row, col=col
        )

        # Scatter с datetime в hover
        fig.add_trace(
            go.Scatter(
                x=[str(group_value)] * len(group_df),
                y=group_df[metric],
                mode='markers',
                marker=dict(size=5, color='black', opacity=0.4),
                text=group_df['datetime'].astype(str),
                hovertemplate=(
                    f"{group_col}: {group_value}<br>" +
                    f"{metric}: %{{y}}<br>" +
                    "Время: %{text}<extra></extra>"
                ),
                showlegend=False
            ),
            row=row, col=col
        )
ordered_groups = sorted(mt_q_df[group_col].dropna().unique())
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))
fig.update_layout(xaxis2=dict(categoryorder='array', categoryarray=[str(v) for v in ordered_groups]))

# Общие настройки
fig.update_layout(
    height=800,
    width=800,
    title=f"Распределения метрик по группам {group_col} по метрике {metric} при наведении",
    margin=dict(t=100),
    font=dict(size=12)
)

fig.show()

# Общая информация

In [60]:
q_df.groupby(['id']).agg(
    mean_time=('response_time_mean', 'mean'),
    rt_25=('response_time_mean', lambda x: x.quantile(0.25)),
    rt_50=('response_time_mean', lambda x: x.quantile(0.5)),
    rt_75=('response_time_mean', lambda x: x.quantile(0.75)),
).sort_index(key=lambda x: x.str.split('_').str[1].astype(int)).round(2)

Unnamed: 0_level_0,mean_time,rt_25,rt_50,rt_75
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
respondent_1,7.59,6.93,7.5,8.1
respondent_2,9.69,9.0,9.6,10.22
respondent_3,9.31,8.75,9.48,9.83
respondent_4,7.35,6.67,7.05,7.6
respondent_5,6.85,6.49,6.81,7.38
respondent_6,8.19,7.48,7.7,8.39
respondent_7,9.37,8.72,9.1,9.66
respondent_8,12.27,9.16,9.54,10.4
respondent_9,8.35,7.83,8.24,8.85
respondent_10,8.19,7.65,7.72,9.02


In [61]:
ta_df.groupby(['id']).agg(
    # mean_time=('response_time', 'mean'),
    # rt_25=('response_time', lambda x: x.quantile(0.25)),
    # rt_50=('response_time', lambda x: x.quantile(0.5)),
    # rt_75=('response_time', lambda x: x.quantile(0.75)),
    # rt_=('answer_len_sum', 'mean'),
    
    smean_time=('sec_per_char_mean', 'mean'),
    srt_25=('sec_per_char_mean', lambda x: x.quantile(0.25)),
    srt_50=('sec_per_char_mean', lambda x: x.quantile(0.5)),
    srt_75=('sec_per_char_mean', lambda x: x.quantile(0.75)),
    srt_=('sec_per_char_mean', 'mean'),
    # dsan_time=('distance_sum', 'mean'),
    # ds_25=('distance_sum', lambda x: x.quantile(0.25)),
    # ds_50=('distance_sum', lambda x: x.quantile(0.5)),
    # ds_75=('distance_sum', lambda x: x.quantile(0.75)),
).sort_index(key=lambda x: x.str.split('_').str[1].astype(int)).round(2)


Unnamed: 0_level_0,smean_time,srt_25,srt_50,srt_75,srt_
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
respondent_1,1.21,1.07,1.13,1.21,1.21
respondent_2,1.18,1.11,1.15,1.21,1.18
respondent_3,1.2,1.1,1.15,1.29,1.2
respondent_4,0.9,0.86,0.89,0.93,0.9
respondent_5,0.82,0.78,0.83,0.84,0.82
respondent_6,0.73,0.69,0.71,0.77,0.73
respondent_7,0.95,0.92,0.99,1.02,0.95
respondent_8,1.37,1.26,1.35,1.47,1.37
respondent_9,0.97,0.88,0.91,1.04,0.97
respondent_10,1.22,1.09,1.24,1.28,1.22
