## Purpose of This Notebook

This notebook loads the manually recorded corrections from the corrections spreadsheet into tables in the CB database,
so that the corrections can be applied to the data loaded from the log files.


In [None]:
# | echo: false

from datetime import datetime
from pathlib import Path
from pprint import pprint

import duckdb
import openpyxl
import pandas as pd
from IPython.display import display
from loguru import logger
from openpyxl.utils.dataframe import dataframe_to_rows

from horse_logic.logfiles import (
    EventCB,
    ExperimentCB,
    Logfile,
    Logs,
    ResponseCB,
    TrialCB,
)
from horse_logic.utils import (
    create_tables_from_sql_file,
    export_data_to_csv,
    set_custom_logger_format,
)

In [None]:
# | echo: false

import itables.options as opt
from itables import init_notebook_mode

# Configure itables for the whole notebook: every DataFrame output becomes an
# interactive table, shown 20 rows at a time.
# NOTE(review): connected=False presumably embeds the table assets in the
# notebook instead of loading them from the network — confirm in the itables docs.
init_notebook_mode(all_interactive=True, connected=False)  # Display Pandas dataframes in a more friendly paginated manner
opt.pageLength = 20  # Display 20 rows per page


In [None]:
# | echo: false

# Apply the project's logger format (helper imported from horse_logic.utils).
set_custom_logger_format()

### Load Subject information

In [None]:
# | echo: false

def get_subject_info():
    """Load the subject (horse) lookup table from the cohort workbook.

    Reads ``../docs/from_CH/Cohort data for MB.xlsx``, renames the ``No``
    column to ``subject_number`` and the ``Horse`` column to ``subject_name``,
    and lower-cases the subject names so later lookups can match on them.

    Returns:
        The prepared DataFrame, or ``None`` (after logging an error) when the
        workbook does not exist.
    """
    workbook_path = Path("../docs/from_CH") / "Cohort data for MB.xlsx"

    # Guard clause: nothing to load if the workbook is missing.
    if not workbook_path.exists():
        logger.error(f"Horse order info not found: {workbook_path}")
        return None

    subjects = pd.read_excel(workbook_path)
    logger.info(f"Loaded horse order info from: {workbook_path}")

    # Give the spreadsheet columns the names used for subject lookup.
    subjects = subjects.rename(
        columns={"No": "subject_number", "Horse": "subject_name"}
    )
    # Lower-case names so the later subject lookup is case-insensitive.
    subjects["subject_name"] = subjects["subject_name"].str.lower()
    return subjects

In [None]:
# Subject lookup table; None when the cohort workbook could not be found.
subject_df = get_subject_info()

### Directory information

In [None]:
# | echo: false

# Root of the data files, given relative to the notebook's working directory.
DATA_DIR = Path("../data")
EXPERIMENT_TYPE = "CB"    # NOTE(review): the previous comment read "or CB" — confirm which other experiment types are valid

# Stop here if the data directory is missing.
assert DATA_DIR.exists()

DATA_DB  = DATA_DIR / f"Experiments_{EXPERIMENT_TYPE}_2023-Q4.ddb"  # DuckDB database name
db_exists = DATA_DB.exists()  # Whether the database file is already on disk; not read in the cells visible here

logger.info(f"Database file: {DATA_DB.resolve()}")

In [None]:
def lookup_log_file_name(horse_name, date, db_path):
    """Return the log file names recorded for one horse on one day.

    Queries the ``ExperimentCBs`` table of the DuckDB database at *db_path*
    for rows whose ``SubjectName`` equals *horse_name* (compared
    case-insensitively) and whose ``DateTime`` falls on *date*.

    Args:
        horse_name: Subject name to match.
        date: Calendar date compared against ``CAST(DateTime AS DATE)``.
        db_path: Database location, passed straight to ``duckdb.connect``.

    Returns:
        A list of ``LogFileName`` values. The list is empty when no row
        matches, and also when the lookup fails; failures are logged as
        errors rather than raised, so callers get a best-effort answer.
    """
    try:
        # Prepare the query
        query = """
        SELECT LogFileName
        FROM ExperimentCBs
        WHERE LOWER(SubjectName) = LOWER(?)
          AND CAST(DateTime AS DATE) = ?
        """

        # Use the connection as a context manager so it is closed even when
        # the query raises; previously a failing query skipped con.close()
        # and leaked the connection.
        with duckdb.connect(db_path) as con:
            result = con.execute(query, [horse_name, date]).fetchall()
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to "no match".
        logger.error(f"Error in lookup_log_file_name: {str(e)}")
        return []

    # Each result row holds a single column: the log file name.
    log_file_names = [row[0] for row in result]
    logger.debug(f"Query result for {horse_name} on {date}: {log_file_names}")
    return log_file_names

In [None]:
# | echo: false

# Results directory for this experiment type; it must already exist.
OUTPUT_DIR = DATA_DIR / f"results/{EXPERIMENT_TYPE}"

assert OUTPUT_DIR.exists()
logger.info(f"Outputs dir: {OUTPUT_DIR.resolve()}")

## Corrections workbook

In [None]:
def add_log_filename_column(df, db_path):
    """Add a ``LogFilename`` column holding the first matching log file per row.

    For every row, the lower-cased ``Horse`` value and the date part of
    ``Date`` are looked up with ``lookup_log_file_name``. The first name found
    is stored; rows without a match get ``None``.

    Args:
        df: Frame with a string ``Horse`` column and a datetime ``Date`` column.
            It is modified in place.
        db_path: Database location forwarded to ``lookup_log_file_name``.

    Returns:
        The same *df* object, now carrying the ``LogFilename`` column.
    """
    # NOTE(review): this column is spelled 'LogFilename', whereas
    # preprocess_dataframe writes 'LogFileName' — confirm which one is intended.

    def first_match(record):
        # 'Date' is expected to hold datetime-like values already.
        matches = lookup_log_file_name(
            record['Horse'].lower(),
            record['Date'].date(),
            db_path,
        )
        if not matches:
            return None
        return matches[0]

    df['LogFilename'] = df.apply(first_match, axis=1)
    return df


In [None]:
# Workbook holding the manually recorded corrections.
CORRECTIONS_WORKBOOK = Path("../docs/from_CH") / "Exp1 Errors.xlsx"

# Worksheets of that workbook to process.
WORKSHEET_NAME = [
    "CBHD_Times",
    "CBCSU_Times",
]

In [None]:
def preprocess_dataframe(df, db_path):
    """Clean one corrections worksheet and tag every row with its log file.

    Steps, in order:

    1. Forward-fill ``Date``, ``Session`` and ``Horse`` so blank cells take
       the value from the nearest filled row above (presumably the sheet only
       fills these in on the first row of each group — confirm with the
       workbook).
    2. Drop the ``Unnamed: 9`` column when it is present.
    3. Replace missing values in object-typed columns with ``''``.
    4. Parse ``Date`` with ``pd.to_datetime``.
    5. Add a ``LogFileName`` column: the matching log file names joined with
       ``', '``, or an explanatory message when none is found.

    Args:
        df: Raw worksheet as read by ``pd.read_excel``. It is not modified.
        db_path: Database location forwarded to ``lookup_log_file_name``.

    Returns:
        A new, cleaned DataFrame.
    """
    # Work on a copy: the forward-fill below assigns into the frame and would
    # otherwise alter the caller's object.
    df = df.copy()

    columns_to_ffill = ['Date', 'Session', 'Horse']
    df[columns_to_ffill] = df[columns_to_ffill].ffill()

    df = df.drop(columns=['Unnamed: 9'], errors='ignore')
    object_columns = df.select_dtypes(include=['object']).columns
    df[object_columns] = df[object_columns].fillna('')

    df['Date'] = pd.to_datetime(df['Date'])

    # After forward-filling, many rows share the same (horse, date) pair, so
    # remember each answer instead of querying the database once per row.
    lookup_cache = {}

    def get_log_filename(row):
        horse_name = row['Horse'].lower()
        date = row['Date'].date()
        key = (horse_name, date)
        if key not in lookup_cache:
            log_file_names = lookup_log_file_name(horse_name, date, db_path)
            lookup_cache[key] = (
                ', '.join(log_file_names)
                if log_file_names
                else f"No log file in CB database matches: {horse_name} / {date}"
            )
        return lookup_cache[key]

    # Add LogFileName column
    df['LogFileName'] = df.apply(get_log_filename, axis=1)

    return df

In [None]:
def process_excel_sheet(input_file, sheet_name, output_file, db_path):
    """Preprocess one worksheet and save it as an extra sheet in a workbook.

    The worksheet *sheet_name* of *input_file* is read and cleaned with
    ``preprocess_dataframe``. The whole input workbook is then loaded, a sheet
    named ``"<sheet_name>_processed"`` is added (replacing any existing sheet
    of that name) and filled with the cleaned rows, and the workbook is saved
    to *output_file*.

    Args:
        input_file: Path of the source workbook.
        sheet_name: Name of the worksheet to process.
        output_file: Path the resulting workbook is saved to.
        db_path: Database location forwarded to ``preprocess_dataframe``.

    Returns:
        The cleaned DataFrame.
    """
    # Read and clean the requested worksheet.
    cleaned = preprocess_dataframe(
        pd.read_excel(input_file, sheet_name=sheet_name),
        db_path,
    )

    # Load the source workbook so the output keeps all of its sheets.
    workbook = openpyxl.load_workbook(input_file)

    # Start from a fresh "<sheet>_processed" sheet, discarding any old one.
    target_title = f"{sheet_name}_processed"
    if target_title in workbook.sheetnames:
        workbook.remove(workbook[target_title])
    target_sheet = workbook.create_sheet(target_title)

    # Header row first, then one worksheet row per DataFrame row.
    for record in dataframe_to_rows(cleaned, index=False, header=True):
        target_sheet.append(record)

    workbook.save(output_file)

    print(f"Processed data saved to '{output_file}' in sheet '{target_title}'")

    return cleaned

In [None]:
# Process every configured worksheet. Each result is saved to its own workbook
# next to the source workbook and kept here, keyed by worksheet name.
processed_sheet = {}
for sheet_name in WORKSHEET_NAME:
    # Build "<stem>_<sheet><suffix>" from the file name only. A plain
    # str.replace(".xlsx", ...) on the whole path would also rewrite matching
    # directory names, and for any other suffix it would leave the path
    # unchanged, so the save would overwrite the source workbook.
    output_file = CORRECTIONS_WORKBOOK.with_name(
        f"{CORRECTIONS_WORKBOOK.stem}_{sheet_name}{CORRECTIONS_WORKBOOK.suffix}"
    )
    processed_sheet[sheet_name] = process_excel_sheet(CORRECTIONS_WORKBOOK, sheet_name, output_file, str(DATA_DB))

In [None]:
# Show the processed CBHD_Times sheet as this cell's output.
processed_sheet["CBHD_Times"]

In [None]:
# Show the processed CBCSU_Times sheet as this cell's output.
processed_sheet["CBCSU_Times"]

### Bringing this all together and putting the data in the DuckDB database

In [None]:
# | echo: false

# create_tables_from_sql_file(con, '../sql/create_cb_correction_tables_ddb.sql')