# Naive dataset compilation
We have received data for naive animals from the Corbo. To access the data with the tools we've developed, we need to follow the workflow we established for compiling data from `DATASET1`. The workflow is roughly the following:
1. Create SQL database from CellTrialTable
2. Create index on `Experiment` names in the SQL database
3. Convert the SQL database to experiment specific CSVs
4. Change permissions of the directory holding the CSVs
5. Use MATLAB to decode data in experiment specific CSVs to HDF5 files

We'll also create the `ExperimentTable` dataframe.

## Imports

In [None]:
# Move the working directory two levels up from where the notebook starts
# (presumably the repository root, so that `src` imports and the relative
# `./data` path used later resolve -- TODO confirm).
%cd ../../
# Load IPython's autoreload extension and use mode 2, which re-imports
# modules before each cell runs so edits to source files are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
# SQL imports
import sqlite3
import pandas as pd
from tqdm import tqdm

# `ExperimentTable` creation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# CSV creation
from src.data_compilation import NAIVE
from src.data_compilation import query_CellTrialTable_dataframe, check_CellTrialTable_existence, save_CellTrialTable_df
from src.data_compilation import retrieve_experiment_stats, compute_CellTrialTable_df_stats

## Create SQL database

In [None]:
def _quote_sqlite_identifier(name):
    """Return `name` as a double-quoted SQLite identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def create_sqlite_db_from_csv(csv_file, db_file, table_name):
    """
    Create an SQLite table from a CSV file, preserving all columns and column types.

    The column types are inferred from the first 1000 rows of the CSV; any
    pandas dtype without an explicit mapping is stored as TEXT. The CSV is then
    streamed into the table in chunks so the whole file never has to fit in
    memory.

    Note: the table is created with ``IF NOT EXISTS`` and rows are always
    appended, so running this twice against the same database and table
    inserts the data twice.

    Args:
    - csv_file (str): Path to the CSV file.
    - db_file (str): Path to the SQLite database file.
    - table_name (str): Name of the table to create in the SQLite database.
    """
    # Step 1: Read a sample of the CSV to infer the schema
    sample_size = 1000  # Number of rows to read for inferring the schema
    sample_df = pd.read_csv(csv_file, nrows=sample_size)

    # Step 2: Map pandas dtypes to SQLite types
    dtype_mapping = {
        'object': 'TEXT',
        'int64': 'INTEGER',
        'float64': 'REAL',
        'datetime64[ns]': 'TEXT',  # Dates can be stored as TEXT in SQLite
        'bool': 'INTEGER',  # SQLite does not have a separate BOOLEAN type
    }

    # Step 3: Infer the SQLite column types based on the sample
    # (default to TEXT if the dtype is not in the mapping)
    column_types = {
        column: dtype_mapping.get(str(dtype), 'TEXT')
        for column, dtype in sample_df.dtypes.items()
    }

    # Step 4: Build the CREATE TABLE statement. Both the table name and the
    # column names are quoted so that names containing spaces, quotes or SQL
    # keywords are accepted.
    columns_with_types = ', '.join(
        f'{_quote_sqlite_identifier(col)} {sql_type}'
        for col, sql_type in column_types.items()
    )
    create_table_sql = (
        f'CREATE TABLE IF NOT EXISTS {_quote_sqlite_identifier(table_name)} '
        f'({columns_with_types});'
    )

    # Step 5: Connect to the SQLite database and create the table
    conn = sqlite3.connect(db_file)
    try:
        conn.execute(create_table_sql)
        conn.commit()

        print(f"Table '{table_name}' created successfully with columns: {list(column_types.keys())}")

        # Step 6: Read and insert the CSV data into SQLite in chunks
        chunk_size = 100000  # Number of rows per chunk (adjust based on your memory constraints)
        for chunk in tqdm(pd.read_csv(csv_file, chunksize=chunk_size)):
            chunk.to_sql(table_name, conn, if_exists='append', index=False)
    finally:
        # Step 7: Always close the connection, even when table creation or an
        # insert fails, so the database file is not left open.
        conn.close()

    print(f"Data from '{csv_file}' has been successfully imported into '{db_file}' as table '{table_name}'.")

In [None]:
# Source CSV and destination SQLite database for the naive CellTrialTable.
csv_file = '/home/jovyan/work/task-priors/data/Naive_CellTrialTable.csv'
db_file = '/home/jovyan/work/task-priors/data/Naive_CellTrialTable.db'
table_name = 'CellTrialTable'  # Table that receives the CSV rows

# Import the CSV into the database.
create_sqlite_db_from_csv(
    csv_file=csv_file,
    db_file=db_file,
    table_name=table_name,
)

## Create SQL index

In [None]:
# Add an index on the `Experiment` column of CellTrialTable (no-op if the
# index already exists).
conn = sqlite3.connect('/home/jovyan/work/task-priors/data/Naive_CellTrialTable.db')
try:
    conn.execute("CREATE INDEX IF NOT EXISTS idx_experiment ON CellTrialTable (Experiment);")
    conn.commit()
finally:
    # Close the connection even if the statement fails (e.g. the table is
    # missing) so the kernel does not keep the database file open.
    conn.close()

## Create `ExperimentTable`

In [None]:
# Read the cell table and the trial table from the paths registered in NAIVE.
df_celltable, df_trialtable = (
    pd.read_csv(NAIVE[path_key])
    for path_key in ('CellTable_path', 'TrialTable_path')
)
tables = (df_celltable, df_trialtable)

# Experiment identifiers, in order of first appearance in the trial table.
experiment_ids = [*df_trialtable['Experiment'].unique()]

In [None]:
# Build one summary record per experiment from the trial table.
experiment_info = []

for id_ in experiment_ids:
    # Trials that belong to this experiment.
    is_this_experiment = df_trialtable['Experiment'] == id_
    df_experiment = df_trialtable[is_this_experiment]

    # First unique behavioural condition listed for the experiment.
    behav_cond = df_experiment['Behav_Cond'].unique()[0]

    # Sorted unique stimuli shown in 'Visual' blocks: the first is recorded as
    # the Go stimulus and the second as the NoGo stimulus.
    in_visual_block = df_experiment['Block'] == 'Visual'
    task_stim = np.sort(df_experiment.loc[in_visual_block, 'Visual_Stim'].unique())
    go_stim, nogo_stim = task_stim[0], task_stim[1]

    experiment_info.append(dict(
        Experiment=id_,
        Mouse=id_[:3],  # first three characters of the experiment id
        Day=int(behav_cond[1]),  # second character of the condition label
        Behav_Cond=behav_cond,
        Go_stim=go_stim,
        NoGo_stim=nogo_stim,
    ))

In [None]:
# Turn the per-experiment records into a table, write it to CSV (the
# DataFrame index is written as well) and preview the first five rows.
df_experiments = pd.DataFrame(data=experiment_info)
df_experiments.to_csv(path_or_buf='./data/Naive_ExperimentTable.csv')
df_experiments.head(n=5)

## Create CSVs

In [None]:
# Export one CSV per experiment from the SQL database.
for experiment_id in tqdm(experiment_ids):
    try:
        # Raises FileExistsError for experiments that should be skipped
        # (presumably because their CSV was already written -- confirm).
        check_CellTrialTable_existence(experiment_id, DATASET=NAIVE)
    except FileExistsError as err:
        print(f"Error: {err}")
    else:
        df_experiment = query_CellTrialTable_dataframe(experiment_id, DATASET=NAIVE)

        # Sanity check: stats derived from the cell/trial tables must equal
        # the stats computed from the rows returned by the query.
        experiment_stats = retrieve_experiment_stats(experiment_id, tables)
        db_experiment_stats = compute_CellTrialTable_df_stats(df_experiment)
        assert experiment_stats == db_experiment_stats, f'Stats for {experiment_id} do not match'

        save_CellTrialTable_df(df_experiment, DATASET=NAIVE)

## Set directory permissions
The MATLAB account needs permission to write and delete files in the directory containing the CSVs. Set the permissions as follows:

In [None]:
%chmod -R 777 /home/ktmurray/task-priors/data/Naive_CellTrialTable

Note that this will not work in a Jupyter notebook run through a docker image.