## Purpose of This Notebook

This notebook serves as an exploratory tool for examining the log files produced during the horse behavioural experiments conducted in October and November 2023. It supports experimentation with text-parsing techniques on the files before they are imported into a database. The primary objective is:


> To conduct experiments with regular expressions ([`regex`](https://docs.python.org/3/howto/regex.html)) aimed at extracting pertinent data and fields from the log files.

In [None]:
# | echo: false

from pathlib import Path
from pprint import pprint
from typing import Dict, List

import duckdb
import pandas as pd
from IPython.display import Markdown, display
from loguru import logger

from horse_logic.logfiles import EventCB, ExperimentCB, Logfile, Logs, TrialCB
from horse_logic.utils import (
    create_tables_from_sql_file,
    display_class_definition,
    export_data_to_csv,
    set_custom_logger_format,
)

In [None]:
# | echo: false

import itables.options as opt
from itables import init_notebook_mode

init_notebook_mode(all_interactive=True, connected=False)  # Display Pandas dataframes in a more friendly paginated manner
opt.pageLength = 20  # Display 20 rows per page


In [None]:
# | echo: false

set_custom_logger_format()

In [None]:
# | echo: false

# Custom notebook analysis functions

def display_sessions_by_subject(df_out, subject_df):
    """Show each subject's sessions in the notebook and return a per-subject tally.

    For every name in ``subject_df["subject_name"]`` the matching rows of
    ``df_out`` are selected, with the ``suffix`` and ``subject_name`` columns
    removed. A subject with no rows is reported through the logger. Otherwise
    a ``time_dff`` column holding the row-to-row difference of ``datetime``
    is added and the table is displayed under a Markdown heading.

    Returns:
        A DataFrame with the columns "Subject number", "Subject name" and
        "Session count", one row per subject, numbered from 1 in input order.
    """
    tally = []
    for position, name in enumerate(subject_df["subject_name"], start=1):
        # Drop and filter in a single expression so the intermediate frame is
        # not kept alive alongside the per-subject slice.
        sessions = df_out.drop(columns=["suffix", "subject_name"])[df_out["subject_name"] == name]
        count = len(sessions)
        display_name = name.capitalize()
        tally.append((position, display_name, count))

        if count == 0:
            logger.info(f"Subject {name}: No experiments conducted")
            continue

        sessions.loc[:, "time_dff"] = sessions["datetime"].diff()
        display(Markdown(f"### {position}. {display_name}: {count} session(s)"))
        display(sessions)

    return pd.DataFrame(tally, columns=["Subject number", "Subject name", "Session count"])

### Load Subject information

In [None]:
# | echo: false

def get_subject_info():
    """Load the horse cohort spreadsheet as a DataFrame, or return None if it is missing.

    Reads ``../docs/from_CH/Cohort data for MB.xlsx``, renames the ``No`` and
    ``Horse`` columns to ``subject_number`` and ``subject_name``, and
    lower-cases the names so they can be matched in later subject lookups.
    A missing file is logged as an error rather than raised.
    """
    spreadsheet = Path("../docs/from_CH") / "Cohort data for MB.xlsx"

    if not spreadsheet.exists():
        logger.error(f"Horse order info not found: {spreadsheet}")
        return None

    subject_df = pd.read_excel(spreadsheet)
    logger.info(f"Loaded horse order info from: {spreadsheet}")

    subject_df = subject_df.rename(columns={"No": "subject_number", "Horse": "subject_name"})
    # Lower-case names so later lookups by subject name match.
    subject_df["subject_name"] = subject_df["subject_name"].str.lower()
    return subject_df

In [None]:
subject_df = get_subject_info()

## Log file reconciliation

### Directory information: log file data and outputs

In [None]:
# | echo: false

# Directory handed to `Logs(path=...)` further down; relative to the notebook's working directory.
LOGFILES_DIR = "../data/results/zips/cb_data"

logger.info(f"Logfiles dir: {LOGFILES_DIR}")
# Fail fast if the log directory is missing.
assert Path(LOGFILES_DIR).exists()



In [None]:
# | echo: false

DATA_DIR = Path("../data")
EXPERIMENT_TYPE = "CB"    # experiment type tag embedded in the database file name below

assert DATA_DIR.exists()

DATA_DB  = DATA_DIR / f"Experiments_{EXPERIMENT_TYPE}_2023-Q4.ddb"  # DuckDB database name
# NOTE(review): db_exists is not read anywhere else in the visible part of this notebook.
db_exists = DATA_DB.exists()

logger.info(f"Database file: {DATA_DB.resolve()}")

In [None]:
# | echo: false

# Output location for this experiment type; it must already exist (nothing in this cell creates it).
OUTPUT_DIR = DATA_DIR / f"results/{EXPERIMENT_TYPE}"

assert OUTPUT_DIR.exists()
logger.info(f"Outputs dir: {OUTPUT_DIR.resolve()}")

### Load logfiles from CSV list of included files

As determined in `logfiles-reconciliation-cb.ipynb`

In [None]:
# | echo: false

# Build the log collection rooted at LOGFILES_DIR.
logs = Logs(path=LOGFILES_DIR)
# Presumably limits the collection to the files named in the CSV — confirm against `Logs` in logfiles.py.
# OUTPUT_DIR is already a Path, so the Path(...) wrapper is a no-op here.
logs.load_specific_logfiles_from_csv(Path(OUTPUT_DIR) / "all_included_files_CB.csv")

In [None]:
# | echo: false

logger.info(logs)

### Explore `regex` parsing of a single example log file

Analysis code is in `logfiles.py`. Explore the parsing of log information for the specific example defined below.

In [None]:
# Just choose one of the file names for testing

log_eg = [log.file_name for log in logs.logfiles][0]


In [None]:
# Override with specific log file

log_eg = "Experiment_2023-11-15T00:39:50.244185_clover_nan_Test Type 1.log"

In [None]:
# | echo: false

logfile_eg = f"{logs.path}/{log_eg}"

In [None]:
logfile_eg

In [None]:
# | echo: false

# Load example log file into Logfile class for processing
log_eg = Logfile(logfile_eg)

In [None]:
log_eg

In [None]:
# | echo: false

print(f"Example log file for regex parsing: {log_eg.file_name}")

### Data structures

These are created as `dataclasses` in Python (in `logfiles.py`) and TABLES in SQL (`sql/create_experiment_tables_ddb.sql`).

#### Experiment table

In [None]:
# | echo: false

display_class_definition(ExperimentCB)

#### e.g. Parse experiment details and comment from log file

In [None]:
# | echo: false

pprint(log_eg.parse_filename_components_cog_bias())

#### Trials and Events tables

In [None]:
log_eg.parse_filename_components_cog_bias().LogFileName

In [None]:
# | echo: false

display_class_definition(TrialCB)

In [None]:
# | echo: false

display_class_definition(EventCB)

#### e.g. Parsing of trials and events info from log file

In [None]:
log_eg.parse_trials_and_events_cog_bias()

In [None]:
log_eg.parse_comments()

In [None]:
from dataclasses import asdict

def convert_parsed_data_to_dataframes(parsed_data):
    """Turn a parsed-logfile dict of dataclass records into pandas DataFrames.

    ``parsed_data`` supplies a single dataclass instance under
    ``'experiment'`` and iterables of dataclass instances under ``'trials'``,
    ``'responses'`` and ``'events'``.

    Returns:
        A dict with the same four keys, each mapped to a DataFrame with one
        row per record, or ``None`` when ``parsed_data`` is ``None``.
    """
    if parsed_data is None:
        return None

    def records_to_frame(records):
        # One row per dataclass instance, one column per dataclass field.
        return pd.DataFrame([asdict(record) for record in records])

    # The experiment is a single record, so wrap it to get a one-row frame.
    frames = {'experiment': records_to_frame([parsed_data['experiment']])}
    for key in ('trials', 'responses', 'events'):
        frames[key] = records_to_frame(parsed_data[key])
    return frames

In [None]:
# Parse the example log file and convert each group of records to a DataFrame.
parsed_data = log_eg.parse_trials_and_events_cog_bias()
# NOTE: convert_parsed_data_to_dataframes returns None when parsed_data is None,
# in which case the subscripts below fail with a TypeError.
dataframes = convert_parsed_data_to_dataframes(parsed_data)

# Now you can access each DataFrame
experiment_df = dataframes['experiment']
trials_df = dataframes['trials']
responses_df = dataframes['responses']
events_df = dataframes['events']

In [None]:
experiment_df

In [None]:
trials_df

In [None]:
events_df

In [None]:
responses_df

In [None]:
# | echo: false

# Export an example of the current parsing of the Trial/Event info for one sample experiment

# CSV name is derived from the parsed LogFileName: spaces removed and ".log" swapped for ".csv".
export_data_to_csv(trials_df, "Example of parsing of the Trial/Event info for one sample experiment", log_eg.parse_filename_components_cog_bias().LogFileName.replace(" ", "").replace(".log", ".csv"), trials_df.columns.to_list())

# NOTE(review): the Excel file name is hard-coded (it does not follow `log_eg`) and has no
# directory component, so it is not written under OUTPUT_DIR.
trials_df.to_excel("Experiment_2023-11-15T00:39:50.244185_clover_nan_Test Type 1.log.xlsx")

### Bringing this all together and putting the data in DuckDB database

In [None]:
# | echo: false

# Start from a clean database on every run: remove any existing file first.
if DATA_DB.exists():
    DATA_DB.unlink()
    # Log only after the unlink has succeeded, so the message cannot be false.
    logger.info(f"Deleted existing database file: {DATA_DB}")

# Connecting creates a fresh database file at DATA_DB.
con = duckdb.connect(database=str(DATA_DB))

In [None]:
# | echo: false

create_tables_from_sql_file(con, '../sql/create_cb_experiment_tables_ddb.sql')

In [None]:
# | echo: false

# These files were all excluded by the parsing function (and have been added here explicitly)

extra_excludes = [
    'Experiment_2023-10-05T00:33:20.982985_bonnie_34_Training - fixed.log',
    'Experiment_2023-10-05T00:34:54.463462_bonnie_35_Training - fixed.log',
    'Experiment_2023-10-05T00:36:53.748319_bonnie_36_Training randomised Type 1.log',
    'Experiment_2023-10-05T00:37:20.038515_bonnie_37_Training randomised Type 2.log',
    'Experiment_2023-10-05T00:38:53.793295_bonnie_38_Training - fixed.log',
    'Experiment_2023-10-05T00:40:07.655743_bonnie_39_Training - fixed.log',
    'Experiment_2023-10-05T00:40:27.842920_bonnie_40_Test Type 1.log',
    'Experiment_2023-10-05T00:40:48.563909_bonnie_41_Test Type 2.log',
    'Experiment_2023-10-06T08:17:38.338641_bonnie_49_Training randomised Type 1.log',
    'Experiment_2023-10-06T11:59:46.930869_bonnie_51_Training - fixed.log',
    'Experiment_2023-10-06T13:17:11.425590_ash_1_Training - fixed.log',
    'Experiment_2023-10-09T10:02:29.985144_bonnie_57_Training - fixed.log',
    'Experiment_2023-10-09T10:03:02.623587_bonnie_58_Training randomised Type 2.log',
    'Experiment_2023-10-09T15:08:04.475922_filly_7_Training - fixed.log',
    'Experiment_2023-10-09T15:18:10.013061_dougie_4_Training randomised Type 2.log',
    'Experiment_2023-10-09T15:24:15.757940_dougie_6_Training randomised Type 2.log',
    'Experiment_2023-10-09T17:03:22.507500_bonnie_62_Training - fixed.log',
    'Experiment_2023-10-09T18:29:49.279855_molly_5_Test Type 1.log',
    'Experiment_2023-10-10T08:33:44.845499_bonnie_65_Test Type 1.log',
    'Experiment_2023-10-10T08:38:11.132513_bonnie_66_Test Type 1.log',
    'Experiment_2023-10-10T08:39:34.167268_bonnie_67_Test Type 1.log',
    'Experiment_2023-10-10T08:42:47.519718_bonnie_68_Test Type 1.log',
    'Experiment_2023-10-10T08:43:09.979183_bonnie_69_Test Type 2.log',
    'Experiment_2023-10-16T10:38:24.993736_ash_7_Test Type 1.log',
    'Experiment_2023-11-12T18:50:43.726156_pumba_nan_Training - fixed.log',
    'Experiment_2023-11-13T19:16:57.507097_freya_nan_Training randomised Type 1.log',
    'Experiment_2023-11-14T00:11:57.637594_george_4.0_Training randomised Type 2.log',
    'Experiment_2023-11-14T11:03:25.088331_nix_6.0_Training - fixed.log',
    'Experiment_2023-11-14T12:05:21.440157_dusty_5.0_Training - fixed.log',
    'Experiment_2023-11-15T03:51:01.169253_yoshi_7.0_Training - fixed.log'
 ]

In [None]:
len(extra_excludes)

In [None]:
# | echo: false

export_data_to_csv(pd.DataFrame(extra_excludes),
                   "Additional log files excluded as bad data",
                   f"{Path(OUTPUT_DIR) / 'extra_excludes.csv'}")

In [None]:
# | echo: false

# Keep every loaded log file name that is not in the explicit exclusion list.
logfiles_to_process = [logfile.file_name for logfile in logs.logfiles if logfile.file_name not in extra_excludes]

# Record the final list of files alongside the other outputs.
export_data_to_csv(pd.DataFrame(logfiles_to_process),
                   "List of logfiles to be loaded to database",
                   f"{Path(OUTPUT_DIR) / 'logfiles-to-database.csv'}")

In [None]:
def parse_and_load_logfiles_to_database(logfiles_to_process: List[str], con, subject_df: pd.DataFrame):
    """Parse each named log file and insert its records into the database.

    Files are handled in sorted file-name order. A file is skipped, and its
    name recorded, when its file name cannot be parsed (``ValueError``), its
    subject name is missing or not present in ``subject_df``, or its
    trial/event parse returns ``None``. Nothing is inserted for a skipped
    file. Experiment, trial, response and event IDs are assigned from running
    counters that start at 0, and the trial IDs found in each file are
    remapped to the new running trial IDs.

    The directory of the log files is read from the module-level ``logs``
    object (``logs.path``), not from a parameter.

    Args:
        logfiles_to_process: Log file names relative to ``logs.path``.
        con: Database connection handed to ``Logfile.insert_record_to_database``.
        subject_df: Subject table with ``subject_name``, ``subject_number``
            and ``Cohort`` columns.

    Returns:
        A tuple ``(file_count, trial_count, response_count, event_count,
        skipped_logfiles)``. ``file_count`` is the number of names passed in,
        including skipped ones; the other counts are the numbers of records
        inserted.
    """
    next_experiment_id = 0
    next_trial_id = 0
    next_response_id = 0
    next_event_id = 0
    skipped_logfiles = []  # names of files that could not be loaded

    for log_filename in sorted(logfiles_to_process):
        log_path = f"{logs.path}/{log_filename}"
        logger.info(f"Processing {next_experiment_id}: {log_path}")
        logfile = Logfile(log_path)

        # Experiment details come from the file name itself.
        try:
            exp_details = logfile.parse_filename_components_cog_bias()
        except ValueError as e:
            logger.warning(f"Skipping {log_filename}: {e!s}")
            skipped_logfiles.append(log_filename)
            continue

        exp_details.ExperimentID = next_experiment_id

        # Enrich with cohort and subject number from the subject table.
        if not exp_details.SubjectName:
            logger.warning(f"Unable to determine SubjectName for {log_filename}. Skipping this file.")
            skipped_logfiles.append(log_filename)
            continue

        subject_row = subject_df[subject_df["subject_name"] == exp_details.SubjectName]
        if subject_row.empty:
            logger.warning(f"Subject {exp_details.SubjectName} not found in subject_df. Skipping this file.")
            skipped_logfiles.append(log_filename)
            continue

        exp_details.Cohort = subject_row["Cohort"].iloc[0]
        exp_details.SubjectNumber = int(subject_row["subject_number"].iloc[0])

        exp_details.Parameters = logfile.parse_parameters()

        # All parsing happens before the first insert, so a skipped file
        # leaves no partial records behind.
        parsed_data = logfile.parse_trials_and_events_cog_bias()
        if parsed_data is None:
            logger.warning(f"Skipping {log_filename} due to parsing error")
            skipped_logfiles.append(log_filename)
            continue

        trials = parsed_data['trials']
        responses = parsed_data['responses']
        events = parsed_data['events']

        logfile.insert_record_to_database(con, exp_details)

        # Trials go in first so responses and events can be pointed at the
        # newly assigned trial IDs.
        remapped_trial_ids = {}
        for trial in trials:
            trial_id_in_file = trial.TrialID
            trial.TrialID = next_trial_id
            trial.ExperimentID = next_experiment_id
            logfile.insert_record_to_database(con, trial)
            remapped_trial_ids[trial_id_in_file] = next_trial_id
            next_trial_id += 1

        for response in responses:
            response.ResponseID = next_response_id
            response.TrialID = remapped_trial_ids[response.TrialID]
            logfile.insert_record_to_database(con, response)
            next_response_id += 1

        for event in events:
            event.EventID = next_event_id
            # Events without a trial keep TrialID as None.
            if event.TrialID is not None:
                event.TrialID = remapped_trial_ids[event.TrialID]
            logfile.insert_record_to_database(con, event)
            next_event_id += 1

        next_experiment_id += 1

    return len(logfiles_to_process), next_trial_id, next_response_id, next_event_id, skipped_logfiles

In [None]:
num_files, num_trials, num_responses, num_events, skipped_files = parse_and_load_logfiles_to_database(logfiles_to_process, con, subject_df)


In [None]:
# num_files is the number of file names passed to the loader (skipped files included),
# so the experiments actually loaded are num_files minus the skipped ones.
logger.info(f"# Loaded - Experiments: {num_files-len(skipped_files)}, Trials: {num_trials}, Events: {num_events}, Responses: {num_responses}")

# NOTE: "files processed" below therefore also counts the skipped files.
print(f"Number of files processed: {num_files}")
print(f"Number of trials: {num_trials}")
print(f"Number of responses: {num_responses}")
print(f"Number of events: {num_events}")
print(f"Number of skipped files: {len(skipped_files)}")

In [None]:
# | echo: false

def export_database_tables(con, output_dir, output_format="xlsx"):
    """Export every table in the database to files under ``output_dir``.

    With ``"xlsx"`` all tables go into a single ``all_tables.xlsx`` workbook,
    one sheet per table. With ``"csv"`` or ``"parquet"`` each table is written
    to its own ``<table_name>.<format>`` file. ``output_dir`` is created if it
    does not exist.

    The function never raises: problems are reported through the logger, with
    a message that distinguishes an unsupported format, a failure to list the
    tables, and a failure while exporting.

    Args:
        con: Database connection providing ``sql(...)`` and ``table(...)``.
        output_dir: Directory to write the export files into.
        output_format: One of ``"xlsx"``, ``"csv"`` or ``"parquet"``.
    """
    if output_format not in ("xlsx", "csv", "parquet"):
        # Reported directly: raising inside the old blanket handler made this
        # show up as a database-connection problem.
        logger.error(f"Unsupported output format: {output_format}")
        return

    try:
        table_names = [row[0] for row in con.sql("SHOW TABLES;").fetchall()]
    except Exception as e:
        logger.error(f"{e} - Database connection not available.")
        return

    try:
        # A missing output directory should not abort the export.
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        if output_format == "xlsx":
            # One workbook, one sheet per table.
            with pd.ExcelWriter(f"{output_dir}/all_tables.xlsx") as writer:
                for table_name in table_names:
                    df = con.table(table_name).to_df()
                    df.to_excel(writer, sheet_name=table_name, index=False)
            logger.info(f"Exported database tables to: {output_dir}/all_tables.xlsx")
            return

        # csv / parquet: one file per table.
        for table_name in table_names:
            df = con.table(table_name).to_df()
            target = f"{output_dir}/{table_name}.{output_format}"
            if output_format == "csv":
                df.to_csv(target, index=False)
            else:
                df.to_parquet(target, index=False)
            logger.info(f"Exported database table to: {target}")
    except Exception as e:
        # Keep the best-effort contract, but say what actually failed.
        logger.error(f"Failed to export database tables to {output_dir}: {e}")

In [None]:
# | echo: false

export_database_tables(con, OUTPUT_DIR / "export_tables", output_format="xlsx")

In [None]:
# Cross-check queries

con.sql("SHOW TABLES;")

In [None]:
con.sql("SELECT COUNT(*) FROM ExperimentCBs")

In [None]:
con.sql("SELECT COUNT(*) FROM TrialCBs")

In [None]:
con.sql("SELECT COUNT(*) FROM ResponseCBs")

In [None]:
con.sql("SELECT COUNT(*) FROM EventCBs")

In [None]:
experiments_df = con.sql("SELECT * FROM ExperimentCBs").df()

In [None]:
# | echo: false

experiments_df

In [None]:
con.sql("SELECT DISTINCT ExperimentType FROM ExperimentCBs ORDER BY ExperimentType")

In [None]:
con.sql("SELECT DISTINCT SubjectName FROM ExperimentCBs ORDER BY SubjectName")

In [None]:
con.sql("SELECT * FROM TrialCBs")

In [None]:
con.sql("SELECT DISTINCT TrialStartTime FROM TrialCBs")

In [None]:
events_ddb = con.sql("SELECT * FROM EventCBs").df()

In [None]:
event_type_ddb = con.sql("SELECT DISTINCT EventType FROM EventCBs ORDER BY EventType").df()

In [None]:
# | echo: false

event_type_ddb

In [None]:
con.close()