In [2]:
import os
import pandas as pd
import numpy as np

CONTROL = 'TERBINAFINE- (control)'
TREATED = 'TERBINAFINE+'
PREPROCESSED_DIR = 'data/preprocessed'


In [4]:
# Check that X is NaN if and only if Y is NaN
def check_nan_consistency(segment_df):
    """
    Check that 'X' and 'Y' are missing together: in every row, X is NaN if and
    only if Y is NaN.

    Args:
        segment_df (pandas.DataFrame): DataFrame containing 'X' and 'Y' columns.

    Returns:
        bool: True if the condition holds for all rows (an empty frame also
        yields True), False otherwise.
    """
    # Compare the two NaN masks column-wise instead of walking the frame with
    # iterrows(), which builds one Series per row and is slow on long files.
    x_is_nan = segment_df['X'].isna()
    y_is_nan = segment_df['Y'].isna()
    # bool() converts the numpy boolean into a plain Python bool.
    return bool((x_is_nan == y_is_nan).all())

# Validate every raw CSV of both groups with check_nan_consistency.
# Only entries ending in '.csv' are read, consistent with the other cells of
# this notebook, so that any other entry in the data folders is not handed to
# pd.read_csv.
for group_label, treatment in (('CONTROL', CONTROL), ('TREATED', TREATED)):
    raw_dir = os.path.join('data', treatment)
    for file in os.listdir(raw_dir):
        if not file.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(raw_dir, file))
        assert check_nan_consistency(df), f"Inconsistent NaN values in file {file} of {group_label} group."

In [None]:
# Add the raw files to the preprocessed directory, renaming each one according to its worm ID. For example, coordinates_highestspeed_20231010_12_01_...csv should be renamed to 20231010_piworm12_1.csv
# Save the renamed files in PREPROCESSED_DIR/CONTROL and PREPROCESSED_DIR/TREATED

# Copy each raw CSV into PREPROCESSED_DIR/<treatment>/ under a name assembled
# from the underscore-separated fields of the original file name.
for treatment in (CONTROL, TREATED):
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    os.makedirs(treatment_dir, exist_ok=True)
    for file_name in os.listdir('data/'+treatment):
        if not file_name.endswith('.csv'):
            continue
        parts = file_name.split('_')
        # Left-pad a one-character fourth field with '0' (e.g. '9' -> '09').
        if len(parts[3]) == 1:
            parts[3] = '0' + parts[3]
        # Third field, then 'piworm' + fourth field, then fifth field.
        worm_id = "_".join([parts[2], "piworm" + parts[3], parts[4]])
        df = pd.read_csv(os.path.join('data', treatment, file_name))
        df.to_csv(os.path.join(treatment_dir, worm_id + ".csv"), index=False)

In [None]:
# Verify that each preprocessed directory holds as many CSV files as its raw
# counterpart (PREPROCESSED_DIR/CONTROL vs data/CONTROL, and likewise for TREATED).
def _list_csv_files(directory):
    """Return the entries of `directory` whose name ends with '.csv'."""
    return [name for name in os.listdir(directory) if name.endswith('.csv')]

original_control_files = _list_csv_files('data/'+CONTROL)
original_treated_files = _list_csv_files('data/'+TREATED)
processed_control_files = _list_csv_files(os.path.join(PREPROCESSED_DIR, CONTROL))
processed_treated_files = _list_csv_files(os.path.join(PREPROCESSED_DIR, TREATED))

assert len(processed_control_files) == len(original_control_files), "Mismatch in CONTROL files count"
assert len(processed_treated_files) == len(original_treated_files), "Mismatch in TREATED files count"
print("File counts match between original and processed directories.")

File counts match between original and processed directories.


In [None]:
# Drop the first row if it contains NaN in the 'Timestamp' column or if the absolute difference between the first two timestamps is larger than 1 hour
for treatment in [CONTROL, TREATED]:
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        df = pd.read_csv(file_path)
        if df.empty:
            # No first row to inspect; df.loc[0, ...] would raise KeyError.
            continue
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        first_timestamp = df.loc[0, 'Timestamp']
        drop_first_row = pd.isna(first_timestamp)
        if not drop_first_row and len(df) >= 2:
            # Compare with the second timestamp only when a second row exists;
            # df.loc[1, ...] would raise KeyError on a single-row file.
            gap_seconds = abs((df.loc[1, 'Timestamp'] - first_timestamp).total_seconds())
            # 3600 s = 1 hour. A NaN gap (second timestamp missing) compares
            # False, so the first row is kept in that case.
            drop_first_row = gap_seconds > 3600
        if drop_first_row:
            # The file is rewritten only when a row was actually dropped.
            df = df.drop(index=0).reset_index(drop=True)
            df.to_csv(file_path, index=False)


In [None]:
# Sanity check the timestamp of the beginning of our dfs
def sanity_check_first_timestamps(file):
    """
    Check that a recording starts with two timestamps at most one hour apart.

    If the timestamp of the first row is missing, that row is skipped and the
    second and third rows are compared instead.

    Args:
        file: CSV source passed to pd.read_csv; must have a 'Timestamp' column.

    Returns:
        bool: True if the two compared timestamps are at most 3600 seconds
        apart. False if the gap is larger, if one of the compared timestamps
        is missing, or if the file has too few rows for the comparison.
    """
    df = pd.read_csv(file)
    timestamps = pd.to_datetime(df['Timestamp'])

    # Position of the first row to compare: skip a leading missing timestamp.
    start = 1 if len(timestamps) > 0 and pd.isna(timestamps.iloc[0]) else 0
    if len(timestamps) < start + 2:
        # Not enough rows to compare two timestamps.
        return False

    time_diff = abs((timestamps.iloc[start + 1] - timestamps.iloc[start]).total_seconds())
    # 3600 s = 1 hour. A NaN difference (one of the two timestamps is NaT)
    # fails the comparison and is therefore reported as a failed check.
    return bool(time_diff <= 3600)

# Run sanity_check_first_timestamps on every preprocessed CSV and count the
# files for which it returns a falsy result.
insanity_counts = 0
for treatment in (CONTROL, TREATED):
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        if sanity_check_first_timestamps(file_path):
            continue
        print(f"Sanity check failed for {file_name}")
        insanity_counts += 1
print(f"Total files failing sanity check: {insanity_counts}")

Total files failing sanity check: 0


In [None]:
# See in which file there are missing timestamps
# Collect every preprocessed CSV whose 'Timestamp' column contains at least
# one missing value after parsing with pd.to_datetime.
count_missing_timestamps = 0
file_with_missing_timestamps = []
for treatment in (CONTROL, TREATED):
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        df = pd.read_csv(file_path)
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        has_missing_timestamp = df['Timestamp'].isna().any()
        if not has_missing_timestamp:
            continue
        print(f"Missing timestamps found in {treatment}/{file_name}")
        count_missing_timestamps += 1
        file_with_missing_timestamps.append(f"{treatment_dir}/{file_name}")
print(f"Total files with missing timestamps: {count_missing_timestamps}")
print(f"Files with missing timestamps: {file_with_missing_timestamps}")

Missing timestamps found in TERBINAFINE- (control)/20250205_piworm10_2.csv
Missing timestamps found in TERBINAFINE- (control)/20240924_piworm12_5.csv
Missing timestamps found in TERBINAFINE+/20250205_piworm09_5.csv
Missing timestamps found in TERBINAFINE+/20250205_piworm11_5.csv
Total files with missing timestamps: 4
Files with missing timestamps: ['data/processed/TERBINAFINE- (control)/20250205_piworm10_2.csv', 'data/processed/TERBINAFINE- (control)/20240924_piworm12_5.csv', 'data/processed/TERBINAFINE+/20250205_piworm09_5.csv', 'data/processed/TERBINAFINE+/20250205_piworm11_5.csv']


In [None]:
# Add Segment column to our DF. A segment is composed of exactly 900 frames. Hence the first 900 frames are segment 0, the next 900 frames are segment 1, and so on.

def add_segment_column_fixed_frames(file, frames_per_segment=900):
    """
    Add a 'Segment' column to the CSV at `file` and write it back in place.

    Rows are numbered by position: the first `frames_per_segment` rows get
    segment 0, the next `frames_per_segment` rows get segment 1, and so on.
    The last segment is shorter when the row count is not a multiple of
    `frames_per_segment`.

    Args:
        file: Path of the CSV; it is read and then overwritten.
        frames_per_segment (int): Number of consecutive rows per segment.
    """
    frames = pd.read_csv(file)
    # Integer-divide each row position by the segment length.
    with_segments = frames.assign(
        Segment=np.arange(len(frames)) // frames_per_segment
    )
    with_segments.to_csv(file, index=False)

# Apply add_segment_column_fixed_frames (900 rows per segment) to every
# preprocessed CSV of both groups.
for treatment in (CONTROL, TREATED):
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        add_segment_column_fixed_frames(file_path, frames_per_segment=900)

print("Added Segment column to all files based on fixed frame counts.")

Added Segment column to all files based on fixed frame counts.


In [3]:
# Find the lengths of consecutive gaps (runs of NaN) in our csv. This could be for only the speed column, for X or Y, or for all of them
def find_consecutive_gaps(df, column):
    """
    Return the lengths of all runs of consecutive missing values in a column.

    Args:
        df (pandas.DataFrame): Frame containing `column`.
        column (str): Name of the column to scan.

    Returns:
        list[int]: Length of each NaN run, in row order. Empty when the
        column has no missing value (or no rows).
    """
    is_na = df[column].isna().to_numpy()
    # Pad with a non-missing sentinel on both sides so that runs touching the
    # first or last row still have a start and an end edge. Cast to int
    # because np.diff on booleans does not produce signed differences.
    padded = np.concatenate(([False], is_na, [False])).astype(np.int8)
    edges = np.diff(padded)
    run_starts = np.flatnonzero(edges == 1)   # value -> NaN transitions
    run_ends = np.flatnonzero(edges == -1)    # NaN -> value transitions
    return (run_ends - run_starts).tolist()

# Column whose NaN runs are measured. The summary message below is built from
# this name so that it always reports the column that was actually analysed.
GAP_COLUMN = 'X'

# Maps file name -> list of gap lengths found in GAP_COLUMN, segment by segment.
gap_lengths_speed = {}
for treatment in [CONTROL, TREATED]:
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        df = pd.read_csv(file_path)
        list_gap = []
        # groupby splits the frame once; filtering with df['Segment'] == segment
        # for every segment would rescan the whole file each time.
        # sort=False keeps the segments in order of first appearance.
        for _, segment_df in df.groupby('Segment', sort=False):
            # Gaps are measured inside each segment, so a NaN run that crosses
            # a segment boundary is reported as two separate gaps.
            list_gap.extend(find_consecutive_gaps(segment_df, GAP_COLUMN))
        gap_lengths_speed[file_name] = list_gap
print(f"Consecutive gap lengths in '{GAP_COLUMN}' column for each file:")
for file_name, gaps in gap_lengths_speed.items():
    print(f"{file_name}: {gaps}")

KeyboardInterrupt: 

In [5]:
# Compute the percentage of rows with speed > 10 in each file
def count_high_speed_entries(file, speed_threshold=10):
    """
    Return the percentage of rows whose 'Speed' exceeds `speed_threshold`.

    Despite the function name, the result is a percentage (0-100) of all rows
    in the file, not an absolute count.

    Args:
        file: CSV source passed to pd.read_csv; must have a 'Speed' column.
        speed_threshold: Rows with Speed strictly greater than this are counted.

    Returns:
        float: 100 * (rows above the threshold) / (total number of rows).
    """
    speeds = pd.read_csv(file)['Speed']
    rows_above_threshold = (speeds > speed_threshold).sum()
    return rows_above_threshold / len(speeds) * 100

# Threshold used both for the computation and for the printed header, so the
# two cannot drift apart.
SPEED_THRESHOLD = 10

# Maps file name -> percentage of rows with Speed above SPEED_THRESHOLD
# (count_high_speed_entries returns a percentage, not an absolute count).
high_speed_counts = {}
for treatment in [CONTROL, TREATED]:
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        count = count_high_speed_entries(file_path, speed_threshold=SPEED_THRESHOLD)
        high_speed_counts[file_name] = count
print(f"Percentage of entries with Speed > {SPEED_THRESHOLD} in each file:")
for file_name, count in high_speed_counts.items():
    print(f"{file_name}: {count}")

Number of entries with Speed > 10 in each file:
20250311_piworm18_4.csv: 4.864639637032836
20250311_piworm18_5.csv: 2.6910033680945378
20250311_piworm18_6.csv: 1.379645597749974
20250311_piworm18_2.csv: 1.9444669498489566
20250311_piworm18_1.csv: 2.2546557251820047
20250415_piworm10_1.csv: 3.192651715772514
20250415_piworm12_3.csv: 3.284434979298668
20250415_piworm12_2.csv: 1.949104152841865
20250311_piworm19_6.csv: 4.1420392290004475
20250311_piworm19_4.csv: 1.7407568588598044
20250415_piworm10_2.csv: 3.5304699797616372
20250415_piworm12_1.csv: 1.3000240745198985
20250415_piworm10_3.csv: 2.185214090133467
20250311_piworm19_5.csv: 1.8621102822299365
20250415_piworm12_5.csv: 5.181285466738978
20250415_piworm12_4.csv: 2.533102289712827
20250415_piworm10_6.csv: 3.5423748991388773
20250311_piworm19_2.csv: 1.998166649691201
20250415_piworm10_4.csv: 7.338076204879705
20250415_piworm12_6.csv: 3.1230571832960754
20250415_piworm10_5.csv: 4.381010330824481
20240924_piworm12_2.csv: 2.074106081883

In [26]:

# Sanity check each segment each worms based on:
# - Difference between two consecutive timestamps should be less than 1 hour, or if there are missing timestamps, then flag the segment
# - Find some statistical anomalies in the segments based on statistical properties (mean, std, etc.) of the coordinates  and of the speed within each segment
# - Flag also all the missing values in the segments for X, Y and Speed

def sanity_check_segment(file, segment, df=None):
    """
    Flag anomalies in one segment of a recording.

    Args:
        file: CSV source passed to pd.read_csv. Not read when `df` is given.
        segment: Value of the 'Segment' column selecting the rows to check.
        df (pandas.DataFrame, optional): Already-loaded content of the CSV.
            Callers that check many segments of the same file can load it once
            and pass it here to avoid re-reading the CSV on every call.
            The frame is not modified.

    Returns:
        dict: Flag name -> bool, in this order:
            - "timestamp_na": a timestamp is missing in the segment.
            - "timestamp_diff": two consecutive timestamps of the segment are
              more than 3600 seconds apart.
            - "<col>_na" for each of 'X', 'Y', 'Speed' present in the data:
              the column has a missing value in the segment.
            - "<col>_stat_anomaly" for the same columns: a value lies outside
              mean +/- 3 * std computed over the segment.
    """
    if df is None:
        df = pd.read_csv(file)

    in_segment = df['Segment'] == segment
    seg_df = df[in_segment]
    # Parse into a local Series instead of assigning back to the frame, so a
    # caller-supplied `df` is left untouched.
    seg_timestamps = pd.to_datetime(df['Timestamp'])[in_segment]

    anomalies = {}

    # Missing timestamps.
    anomalies["timestamp_na"] = bool(seg_timestamps.isna().any())

    # Gaps between consecutive timestamps (3600 s = 1 hour). NaN differences
    # (first row, or next to a missing timestamp) compare False here.
    time_diffs = seg_timestamps.diff().dt.total_seconds().abs()
    anomalies["timestamp_diff"] = bool((time_diffs > 3600).any())

    value_columns = [col for col in ('X', 'Y', 'Speed') if col in seg_df.columns]

    # Missing values in X, Y, Speed.
    for col in value_columns:
        anomalies[f"{col}_na"] = bool(seg_df[col].isna().any())

    # Statistical check: any value outside mean +/- 3 * std of the segment.
    for col in value_columns:
        values = seg_df[col]
        mean = values.mean()
        std = values.std()
        is_outlier = (values > mean + 3 * std) | (values < mean - 3 * std)
        anomalies[f"{col}_stat_anomaly"] = bool(is_outlier.any())

    return anomalies

In [None]:
# Build a matrix where each line represents a worm and each column represents a segment. Each cell contains a dictionary of anomalies found in that segment.
# All worms don't have the same number of segments so we can fill missing segments with None

# anomaly_matrix maps file name -> {segment number -> anomaly flags dict}.
anomaly_matrix = {}
for treatment in (CONTROL, TREATED):
    treatment_dir = os.path.join(PREPROCESSED_DIR, treatment)
    for file_name in os.listdir(treatment_dir):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(treatment_dir, file_name)
        df = pd.read_csv(file_path)
        max_segment = df['Segment'].max()
        worm_anomalies = {}
        # Segments are checked from 0 up to and including the largest one.
        for segment in range(max_segment + 1):
            worm_anomalies[segment] = sanity_check_segment(file_path, segment)
        anomaly_matrix[file_name] = worm_anomalies

# Save the results in an excel file: one row per (file name, segment) pair,
# one column per anomaly flag.
flat_anomalies = {}
for worm_file, per_segment in anomaly_matrix.items():
    for segment_id, flags in per_segment.items():
        flat_anomalies[(worm_file, segment_id)] = flags
anomaly_df = pd.DataFrame.from_dict(flat_anomalies, orient='index')
anomaly_df.to_excel('segment_anomalies.xlsx')

In [31]:
# We know that the first row of each csv is missing its speed value, so we can set it to 0
# We can also scale the coordinates to be between 0 and 1 based on the min and max values of X and Y in each file
# We can also clip the speed that could be weird

In [7]:
# Check whether there are NaN values in the classifier csv files, and print which columns they come from
# Print also the number of nan values per file
classifier_dir = 'preprocessed_data_for_classifier'
for file in os.listdir(classifier_dir):
    if not file.endswith('.csv'):
        continue
    file_path = os.path.join(classifier_dir, file)
    df = pd.read_csv(file_path)
    # Per-column NaN counts, then their grand total for the file.
    nan_counts = df.isna().sum()
    total_nans = nan_counts.sum()
    if total_nans <= 0:
        # Nothing to report for files without missing values.
        continue
    print(f"File: {file}")
    print(f"Total NaN values: {total_nans}")
    print("NaN counts per column:")
    # Show only the columns that actually contain NaN values.
    print(nan_counts[nan_counts > 0])
    print("\n")
            