In [None]:
# | echo: false



## Purpose of This Notebook

This notebook is an exploratory tool for examining the log files produced during the Cognitive Bias horse behavioural experiments conducted in October and November 2023. It is used to experiment with text-parsing techniques on the files before they are imported into a database. The primary objective is to reconcile which log files should be included in, or excluded from, the analysis.


## Experiment details and naming conventions

There are two main types of Experiment:

- Reward Prediction (RPE)
- Cognitive Bias (CB)

#### Logfile Exclusion rules

Log files from test runs, and log files containing bad data, need to be excluded from the analysis.

Rules are **case-insensitive**. Files which satisfy the following conditions are excluded:

- TODO
  
#### Problems with log file names during experiments

- TODO

#### Time differences

For each trial we calculate time differences; the definitions are listed per experiment type below.


### Cognitive Bias Experiments

- Extract "RIGHT" or "LEFT" from the Comment field.
- Also extract details from log file name
1. Training experiments:
   - Type 1
   - Type 2
2. Testing experiments:
   - Type 1
   - Type 2
   - Type 3
   - Type 4 (re-uses Type 1, with an indicator in the Comment field to distinguish it)

#### Time differences

For each trial: 

Training Type 1 (randomised versus fixed):
- `Start` datetime = Green button pressed and horse is released
- Capture positive (`GO`) / negative (`NOGO`) response time subject to maximum cutoff time (e.g. 30 seconds)
- In addition to left/right positioning of feed, there are also median, near positive and near negative positions.

*TODO check with CH: do these additional positions apply to Test sessions only, or should they be in all sessions?*

In [None]:
# | echo: false

import textwrap
from pathlib import Path

import pandas as pd
from IPython.display import Markdown, display
from loguru import logger

from horse_logic.logfiles import Logs
from horse_logic.utils import ProjectInfo, export_data_to_csv, set_custom_logger_format

In [None]:
# | echo: false

import itables.options as opt
from itables import init_notebook_mode

# Render every pandas DataFrame as an interactive, paginated itables table.
# NOTE(review): connected=False presumably bundles the JS so the notebook works offline - confirm in the itables docs.
init_notebook_mode(all_interactive=True, connected=False)
opt.pageLength = 20  # Display 20 rows per page


In [None]:
# | echo: false

# Project root as reported by the project helper; DATA_DIR is later built from it.
PROJECT_DIR = ProjectInfo.get_root_dir()
# Helper from horse_logic.utils; presumably configures the loguru output format - confirm there.
set_custom_logger_format()

In [None]:
# | echo: false

# Custom notebook functions

def display_sessions_by_subject(df_out, subject_df):
    """Display each subject's sessions and return a per-subject session count.

    For every name in ``subject_df["subject_name"]`` (in order), the matching
    rows of ``df_out`` are shown under a Markdown heading, with an extra
    ``time_diff`` column holding the gap between consecutive sessions.
    Subjects with no rows are only reported through the logger.

    Args:
        df_out: DataFrame with at least the columns ``subject_name``,
            ``suffix`` and ``datetime``; rows are shown in their existing order.
        subject_df: DataFrame with a ``subject_name`` column whose values use
            the same case as ``df_out["subject_name"]``.

    Returns:
        DataFrame with columns ``Subject number`` (1-based position in
        ``subject_df``), ``Subject name`` (capitalised) and ``Session count``.
    """
    session_summary = []
    for subject_number, subject_name in enumerate(subject_df["subject_name"], start=1):
        subject_sessions = df_out.loc[df_out["subject_name"] == subject_name].drop(
            columns=["suffix", "subject_name"]
        )
        n_session = len(subject_sessions)
        session_summary.append((subject_number, subject_name.capitalize(), n_session))

        if n_session == 0:
            logger.info(f"Subject {subject_name}: No experiments conducted")
            continue

        # .assign returns a new frame, so nothing is written onto a filtered
        # slice (avoids SettingWithCopyWarning). The column is named
        # "time_diff" to match the experiment summary table.
        subject_sessions = subject_sessions.assign(time_diff=subject_sessions["datetime"].diff())
        display(Markdown(f"### {subject_number}. {subject_name.capitalize()}: {n_session} session(s)"))
        display(subject_sessions)

    return pd.DataFrame(session_summary, columns=["Subject number", "Subject name", "Session count"])

### Load Subject order

In [None]:
# | echo: false

def get_subject_info():
    """Load the horse (subject) order spreadsheet into a DataFrame.

    The workbook is looked up relative to the current working directory at
    ``../docs/from_CH/Cohort data for MB.xlsx``.

    Returns:
        The spreadsheet as a DataFrame with the ``No`` and ``Horse`` columns
        renamed to ``subject_number`` and ``subject_name`` (names lower-cased
        for later lookups), or ``None`` when the workbook does not exist (an
        error is logged in that case).
    """
    workbook_path = Path("../docs/from_CH") / "Cohort data for MB.xlsx"

    # Guard clause: nothing to load, report and bail out.
    if not workbook_path.exists():
        logger.error(f"Horse order info not found: {workbook_path}")
        return None

    subjects = pd.read_excel(workbook_path)
    logger.info(f"Loaded horse order info from: {workbook_path}")

    subjects = subjects.rename(columns={"No": "subject_number", "Horse": "subject_name"})
    # Lower-case names so they can be matched against names parsed from log file names.
    subjects["subject_name"] = subjects["subject_name"].str.lower()
    return subjects

In [None]:
# | echo: false

subject_df = get_subject_info()

# get_subject_info() returns None (after logging an error) when the cohort
# spreadsheet is missing. Stop here with a clear message instead of failing
# on len(None) in the next statement.
assert subject_df is not None, "Subject info could not be loaded - see the logged error for the expected path"

# Subject names are already lower-cased inside get_subject_info(), so no
# further normalisation is needed before they are used for lookups.
logger.info(f"Subjects: {len(subject_df)}")
logger.info(subject_df["subject_name"].to_list())

## Log file reconciliation

### Directory information: log file data and outputs

In [None]:
# | echo: false

# Directory holding the Cognitive Bias log files, relative to the notebook's
# working directory (kept as a string, as passed to Logs(path=...) below).
LOGFILES_DIR = "../data/results/zips/cb_data"

# The path is cwd-relative, so report the resolved location on failure to make
# a wrong working directory obvious.
assert Path(LOGFILES_DIR).exists(), f"Logfiles dir not found: {Path(LOGFILES_DIR).resolve()}"
logger.info(f"Logfiles dir: {LOGFILES_DIR}")

In [None]:
# | echo: false

# Data directory and experiment selection used by the cells below.
DATA_DIR = PROJECT_DIR / "data"
EXPERIMENT_TYPE = "CB"  # Cognitive Bias; the other experiment type is "RPE"
NOTEBOOK_DIR = Path.cwd()  # Current notebooks directory

assert DATA_DIR.exists(), f"Data dir not found: {DATA_DIR}"
assert NOTEBOOK_DIR.exists(), f"Notebook dir not found: {NOTEBOOK_DIR}"

In [None]:
# | echo: false

# All CSV exports from this notebook are written below <DATA_DIR>/results/<EXPERIMENT_TYPE>.
OUTPUT_DIR = DATA_DIR / "results" / EXPERIMENT_TYPE

# The directory is expected to exist already; it is not created here.
assert OUTPUT_DIR.exists()
logger.info(f"Outputs dir: {OUTPUT_DIR.resolve()}")

### Initial exclusions (rule-based)

Rules:

- Ignore all CBF1 files - data will not be analysed (*CH: What does CBF1 mean? and similar*)
- Ignore all Olive files (6 log files)
- Ignore Maple CBT1 on 9 Oct (*CH: Did Maple have a different name?*)
- Run check to see how many N bucket GO responses exceed 30s in first 3 days (*CH: Please explain*)


- 29 `*test*.log` files (exclude)

In [None]:
# Exploratory: select the log files matching "*test*.log" (test runs).
# NOTE(review): no `include` argument is passed here, unlike the final `logs`
# selection - presumably the default treats the patterns as files to include; confirm in horse_logic.logfiles.
test_logs = Logs(path=LOGFILES_DIR, patterns=["*test*.log"])

In [None]:
# List the file name of every log file in the test-run selection.
[logfile.file_name for logfile in test_logs.logfiles]

In [None]:
# Show the Logs object itself (whatever summary its repr provides).
test_logs

In [None]:
# Exploratory: select the log files matching "*olive*.log" (the rules above exclude all Olive files).
olive_logs = Logs(path=LOGFILES_DIR, patterns=["*olive*.log"])

In [None]:
# Show the Logs object itself (whatever summary its repr provides).
olive_logs

In [None]:
# List the file name of every log file in the Olive selection.
[logfile.file_name for logfile in olive_logs.logfiles]

In [None]:
# Exploratory: select the log files matching "*maple*.log" to inspect the Maple sessions mentioned in the rules above.
maple_logs = Logs(path=LOGFILES_DIR, patterns=["*maple*.log"])

In [None]:
# List the file name of every log file in the Maple selection.
[logfile.file_name for logfile in maple_logs.logfiles]

In [None]:
# | echo: false

# Define the default, rule-based set of log files to exclude; specific
# exclusions can be fine-tuned afterwards.
# include=False is passed so the patterns name files to leave out; the result
# exposes `included_files` and `excluded_files`, which the cells below use.
# NOTE(review): the pattern here is "*_test*.log", which is narrower than the
# "*test*.log" pattern explored above and quoted in the rules - confirm which is intended.

logs = Logs(path=LOGFILES_DIR, patterns=["*_test*.log", "*olive*.log"], include=False)


In [None]:
# | echo: false

# Report the selection itself, then how many files ended up on each side of it.
logger.info(logs)
for label, files in (("Included", logs.included_files), ("Excluded", logs.excluded_files)):
    logger.info(f"{label} logfiles: {len(files)}")

In [None]:
# | echo: false

# One-column table ("Excluded") of the log files left out by the rules.
df_excluded = pd.DataFrame(logs.excluded_files, columns=["Excluded"])

In [None]:
# | echo: false

# Display the excluded files in alphabetical order with a fresh 0..n-1 index.
df_excluded.sort_values(by="Excluded").reset_index(drop=True)

#### Included log files

In [None]:
# | echo: false

# One-column table ("Included") of the log files kept for analysis.
df_included = pd.DataFrame(logs.included_files, columns=["Included"])

In [None]:
# | echo: false

# Write the alphabetically sorted list of included files to the outputs directory.
included_sorted = df_included.sort_values(by="Included").reset_index(drop=True)
included_csv_path = Path(OUTPUT_DIR) / "included_files_CB.csv"

export_data_to_csv(included_sorted, "List of files to be included in analysis", included_csv_path)

In [None]:
# | echo: false

# Display the included files in alphabetical order with a fresh 0..n-1 index.
df_included.sort_values(by="Included").reset_index(drop=True)

In [None]:
# | echo: false

# Break each included log file name into its components. The cells below rely
# on the columns original_filename, subject_name, experiment_type,
# session_number, datetime and suffix being present in the result.
df_file_components = logs.create_filename_components_dataframe_cog_bias(logs.included_files)

In [None]:
# | echo: false

# Order the sessions by subject and then by session number, with a fresh 0..n-1 index.
df_out = df_file_components.sort_values(by=["subject_name", "session_number"]).reset_index(drop=True)

### Sessions summary by subject name

In [None]:
# | echo: false

# Show each subject's sessions (in cohort order) and collect the per-subject session counts.
# Must run before the "suffix" column is removed from df_out further down.
session_summary_df = display_sessions_by_subject(df_out, subject_df)

In [None]:
# Display the per-subject session counts.
session_summary_df

In [None]:
# | echo: false

# Save the per-subject session counts next to the other CB outputs.
session_overview_csv_path = Path(OUTPUT_DIR) / "session_overview_CB.csv"

export_data_to_csv(session_summary_df, "Session overview (subject session counts)", session_overview_csv_path)

In [None]:
# | echo: false

# Remove the "suffix" column by rebinding rather than mutating in place, so
# the cell can be re-run safely: errors="ignore" turns a second run into a
# no-op instead of a KeyError.
df_out = df_out.drop(columns=["suffix"], errors="ignore")

In [None]:
# | echo: false

# Keep only the columns wanted in the experiment summary, in report order.
summary_columns = ["original_filename", "subject_name", "experiment_type", "session_number", "datetime"]
df_out = df_out[summary_columns]

In [None]:
# | echo: false

# Calculate time differences between Experiments to look for anomalies, and
# convert session numbers to nullable integers.
#
# Both columns are set through .assign, which returns a new frame; df_out is a
# column selection of an earlier frame, so assigning onto it directly can
# raise SettingWithCopyWarning.
#
# Why some session numbers were floats: pandas stores an integer column that
# contains missing values as float64 (NaN is a float). Int64 is the nullable
# integer dtype and keeps the gaps as <NA>.
#
# NOTE(review): the diff runs over the whole frame, which is ordered by subject
# and session, so the first row of each subject is compared with the last
# session of the previous subject - confirm whether a per-subject diff
# (groupby("subject_name")) is wanted instead.
df_out = df_out.assign(
    time_diff=df_out["datetime"].diff(),
    session_number=df_out["session_number"].astype(pd.Int64Dtype()),
)

In [None]:
# | echo: false

# Display the experiment summary (one row per included log file).
df_out

In [None]:
# | echo: false

# Export list of all excluded files - by rules and specific exclusions.
# The single column keeps its existing "col1" header in the CSV.
excluded_files_sorted = (
    pd.DataFrame(logs.excluded_files, columns=["col1"])
    .sort_values(by="col1")
    .reset_index(drop=True)
)
excluded_csv_path = Path(OUTPUT_DIR) / "all_excluded_files_CB.csv"

export_data_to_csv(excluded_files_sorted, "List of log files excluded", excluded_csv_path)

In [None]:
# | echo: false

# Save the experiment summary table (included log files only).
experiment_summary_csv_path = Path(OUTPUT_DIR) / "experiment_summary_CB_2023_included.csv"

export_data_to_csv(df_out, "Experiment summary", experiment_summary_csv_path)

In [None]:
# | echo: false

# Export list of all included files - by rules and specific exclusions (inc Bonnie).
# The single column keeps its existing "col1" header in the CSV.
included_files_sorted = (
    pd.DataFrame(logs.included_files, columns=["col1"])
    .sort_values(by="col1")
    .reset_index(drop=True)
)
all_included_csv_path = Path(OUTPUT_DIR) / "all_included_files_CB.csv"

export_data_to_csv(included_files_sorted, "File list of log files included", all_included_csv_path)