In [16]:
import os
import pandas as pd
import numpy as np

# Names of the two experimental groups. Each one is also used as a
# sub-directory name under the input and output directories below.
CONTROL = 'TERBINAFINE- (control)'
TREATED = 'TERBINAFINE+'
# Input root: the batch loop reads the per-treatment CSV files from here.
PREPROCESSED_DIR = 'data/preprocessed'
# NOTE(review): TRAIN_DIR and TEST_DIR are not used in the visible cells;
# presumably they hold a later train/test split — confirm.
TRAIN_DIR = 'data/train'
TEST_DIR = 'data/test'
# Output root: the batch loop writes the cleaned CSV files here.
PROCESSED_DIR = 'data/processed'

In [17]:
def cap_speed(df, speed_threshold=10):
    """Cap the 'Speed' column at an upper bound.
    Values above the threshold are replaced by the threshold; NaNs and
    values below it are left as they are.
    The input DataFrame is not modified: a new DataFrame is returned, which
    also avoids SettingWithCopyWarning when `df` is a filtered slice.
    Arguments:
        df: pandas DataFrame with column 'Speed'
        speed_threshold: maximum value allowed in 'Speed'
    Returns:
        new pandas DataFrame with the capped 'Speed' column
    """
    return df.assign(Speed=df['Speed'].clip(upper=speed_threshold))

def drop_frames_after_death(df, frame_of_death):
    """Discard the rows recorded after the worm's death.
    Rows are kept while 'GlobalFrame' is at most `frame_of_death + 1`.
    Arguments:
        df: pandas DataFrame with column 'GlobalFrame'
        frame_of_death: frame number at which the worm died
    Returns:
        pandas DataFrame restricted to the kept rows (original index preserved)
    """
    last_kept_frame = frame_of_death + 1
    is_kept = df['GlobalFrame'].le(last_kept_frame)
    return df[is_kept]

def interpolate_missing_data(df, interpolation_limit=10):
    """Interpolate missing values in the X, Y and Speed columns.
    At most `interpolation_limit` consecutive NaNs are filled. In a longer
    run of NaNs only the first `interpolation_limit` values are filled; every
    row that still misses X, Y or Speed afterwards is dropped.
    Only the columns X, Y and Speed are interpolated.
    IMPORTANT: this function has to be applied at a segment level, otherwise
    values would be interpolated across segment boundaries.
    The input DataFrame is not modified.
    Arguments:
        df: pandas DataFrame with columns 'X', 'Y', 'Speed'
        interpolation_limit: maximum number of consecutive NaNs to interpolate
    Returns:
        new pandas DataFrame with interpolated data (original index labels kept)
    """
    columns = ['X', 'Y', 'Speed']
    # Work on a copy so the caller's frame keeps its original NaNs.
    interpolated = df.copy()
    interpolated[columns] = interpolated[columns].interpolate(limit=interpolation_limit)
    return interpolated.dropna(subset=columns)

def normalize_coordinates(df):
    """Min-max normalize the X and Y coordinates to the range [0, 1].
    This is done at a file level.
    A column whose values are all equal has a zero range; it is mapped to 0.0
    instead of being turned into NaN by a division by zero. NaNs already
    present in a column stay NaN.
    The input DataFrame is not modified.
    Arguments:
        df: pandas DataFrame with columns 'X', 'Y'
    Returns:
        new pandas DataFrame with normalized coordinates
    """
    scaled = {}
    for axis in ('X', 'Y'):
        lowest = df[axis].min()
        value_range = df[axis].max() - lowest
        shifted = df[axis] - lowest
        if value_range != 0:
            scaled[axis] = shifted / value_range
        else:
            # Constant column: every shifted value is already 0.
            scaled[axis] = shifted.astype(float)
    return df.assign(**scaled)

def normalize_speed(df):
    """Min-max normalize the Speed to the range [0, 1].
    This is done at a file level.
    If every Speed value is equal the range is zero; the column is then
    mapped to 0.0 instead of being turned into NaN by a division by zero.
    NaNs already present in the column stay NaN.
    The input DataFrame is not modified.
    Arguments:
        df: pandas DataFrame with column 'Speed'
    Returns:
        new pandas DataFrame with normalized Speed
    """
    lowest = df['Speed'].min()
    value_range = df['Speed'].max() - lowest
    shifted = df['Speed'] - lowest
    if value_range != 0:
        normalized = shifted / value_range
    else:
        # Constant column: every shifted value is already 0.
        normalized = shifted.astype(float)
    return df.assign(Speed=normalized)

def preprocess_file(file_path, speed_threshold=10, interpolation_limit=10,
                    lifespan_summary_path='data/lifespan_summary.csv'):
    """Load one tracking CSV file and clean it.
    The steps are, in order:
      1. look up the worm's lifespan (in frames) in the lifespan summary,
      2. drop the frames recorded after death,
      3. cap the speed at `speed_threshold`,
      4. interpolate missing X / Y / Speed values segment by segment.
    Arguments:
        file_path: path of the CSV file to process; its base name without
            extension is used as the worm identifier
        speed_threshold: upper bound applied to the 'Speed' column
        interpolation_limit: maximum number of consecutive NaNs to interpolate
        lifespan_summary_path: CSV file with columns 'Filename' and
            'LifespanInFrames'
    Returns:
        pandas DataFrame with the cleaned rows of all segments and a fresh
        0..n-1 index; empty (same columns) when no row survives
    Raises:
        IndexError: if the worm has no entry in the lifespan summary
    """
    df = pd.read_csv(file_path)
    lifespan_summary = pd.read_csv(lifespan_summary_path)
    worm_id = os.path.splitext(os.path.basename(file_path))[0]
    # The summary is looked up with the file stem prefixed by '/'.
    summary_key = '/' + worm_id
    lifespans = lifespan_summary.loc[lifespan_summary['Filename'] == summary_key, 'LifespanInFrames']
    if lifespans.empty:
        raise IndexError(
            f"No lifespan entry for '{summary_key}' in {lifespan_summary_path}"
        )
    frame_of_death = lifespans.iloc[0]
    df = drop_frames_after_death(df, frame_of_death)
    df = cap_speed(df, speed_threshold)
    # Normalization is currently disabled:
    # df = normalize_coordinates(df)
    # df = normalize_speed(df)
    # sort=False keeps the segments in their order of first appearance.
    # Each group is copied before interpolation so the helper never works
    # on a slice of `df`.
    processed_segments = [
        interpolate_missing_data(segment_df.copy(), interpolation_limit)
        for _, segment_df in df.groupby('Segment', sort=False)
    ]
    if not processed_segments:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(processed_segments, ignore_index=True)

In [18]:
# Apply the preprocessing to every CSV file of each treatment group and
# save the result under PROCESSED_DIR, mirroring the input layout.
for treatment in (CONTROL, TREATED):
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    saving_dir = os.path.join(PROCESSED_DIR, treatment)
    os.makedirs(saving_dir, exist_ok=True)
    for file_name in os.listdir(treatment_dir):
        # Skip anything that is not a CSV file.
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        processed_df = preprocess_file(file_path)
        processed_df.to_csv(os.path.join(saving_dir, file_name), index=False)