In [4]:
import pandas as pd
import subprocess
import sys
import os
import pathlib
import time
import pandas as pd
import numpy as np
from dataclasses import dataclass
from datetime import datetime
import argparse

# Level 2: Processing CSV

## Globals

In [9]:
DIR = "../data_processed/"

## Tools

### Loading data

In [15]:
def load_csv(filename: str) -> pd.DataFrame:
    """
    Load a CSV file into a pandas DataFrame
    """

    df = pd.read_csv(filename)
    df['time'] = pd.to_datetime(df['time'])
    return df

### Data fixing

In [19]:
def fix_df_time(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the time column to datetime and floor it to seconds, in place.
    """
    df["time"] = pd.to_datetime(df['time']).dt.floor('S')

    return df

def fix_df_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Find and remove duplicates from the dataframe, in place.
    """
    df.drop_duplicates(inplace=True, keep="first")
    return df

def fix_sorting_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Sort the dataframe by time, in place.
    """
    df.sort_values("time", inplace=True)
    return df

def fix_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fix the dataframe in place.
    """
    fix_df_time(df)
    fix_df_duplicates(df)
    fix_sorting_df(df)
    return df

### Data validation

In [22]:
def verify_df_column_names(df: pd.DataFrame) -> None:
    """
    Verify that the dataframe has the required columns
    """

    REQUIRED_COLS = ['time', 'event_type', 'channel', 'value']

    cols = df.columns
    for col in REQUIRED_COLS:
        if col not in cols:
            raise ValueError(f"Column {col} not found in dataframe")

def verify_df_sorted(df: pd.DataFrame) -> None:
    """
    Verify that the dataframe is sorted by "time"
    """
    # Find rows where the 'time' is decreasing from the previous row
    not_sorted_mask = df['time'].diff().dt.total_seconds() < 0

    # The first row can't be "not sorted" by definition, so we can exclude it from the mask
    not_sorted_mask.iloc[0] = False

    # Filter the DataFrame to find the not sorted rows
    not_sorted_rows = df[not_sorted_mask]

    if not df['time'].is_monotonic_increasing:
        raise ValueError(f"Dataframe is not sorted by time:\n{not_sorted_rows}")

def verify_df_time_diffs(df: pd.DataFrame, 
                         max_diff_tolerance: np.timedelta64 = np.timedelta64(90, 's'), 
                         min_diff_tolerance: np.timedelta64 = np.timedelta64(500, 'ms')) -> None:
    """
    Verify that the time differences between events are within tolerance.
    If time diff >= max_diff_tolerance, just prints the warning (data holes are permitted).
    If time diff <= min_diff_tolerance, raises an exception (possible floating point errors).
    
    Assumes that the dataframe is non-decreasingly sorted by "time".  
    
    There may me multiple groups of events with the same time.
    
    Args:
        df (pd.DataFrame): input dataframe with "time" column
        max_diff_tolerance (np.timedelta64, optional): max time difference tolerance in ms (warning only)
        min_diff_tolerance (np.timedelta64, optional): min time difference tolerance in ms (exception)

    Raises:
        ValueError: when time differences < min_diff_tolerance (possible floating point errors)
    """

    # get all unique "time" values in df
    times = df['time'].unique()

    # calc time diffs
    time_diffs = np.diff(times)

    # check if all time diffs are not larger than the tolerance
    checks = max_diff_tolerance > time_diffs
    if not all(checks):
        # find all indexes of unmet conditions
        indexes = np.where(checks == False)[0]

        # create a dataframe of times
        df_times = pd.DataFrame(times, columns=["time"])

        # find all holes
        holes = [f"{df_times.iloc[i]['time']} and {df_times.iloc[i + 1]['time']}" for i in indexes]
        
        print("Found time holes out of tolerance at times:", *holes, sep='\n\t')


    # check if all time diffs are not smaller than the tolerance
    # (possible floating point errors)
    checks = min_diff_tolerance < time_diffs
    if not all(checks):
        # find all indexes of unmet conditions
        indexes = np.where(checks == False)[0]

        # create a dataframe of times
        df_times = pd.DataFrame(times, columns=["time"])

        # find all too close values
        too_close = [f"{df_times.iloc[i]['time']} and {df_times.iloc[i + 1]['time']}" for i in indexes]
        
        raise ValueError(
            "Found time values too close to each other at times " +
            "(possible floating point errors):\n\t" +
            "\n\t".join(too_close))


def verify_df_time_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Verify that the dataframe has the correct number of events (49) for each time.
    """
    if not all(df["time"].value_counts() == 49):
        raise ValueError("Incorrect number of events for some times")
    return df

def verify_df_time_p_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Verify that the dataframe has the correct number of proton events (9) for each time.
    """
    result = df.groupby('time').apply(lambda group: (group['event_type'] == 'p').sum() == 9)
    if not all(result):
        raise ValueError("Incorrect number of proton events for some times")
    return df

def verify_df_time_e_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Verify that the dataframe has the correct number of electron events (9) for each time.
    """
    result = df.groupby('time').apply(lambda group: (group['event_type'] == 'e').sum() == 9)
    if not all(result):
        raise ValueError("Incorrect number of electron events for some times")
    return df

def verify_df_time_d_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Verify that the dataframe has the correct number of directional events (31) for each time.
    """
    result = df.groupby('time').apply(lambda group: (group['event_type'] == 'd').sum() == 31)
    if not all(result):
        raise ValueError("Incorrect number of directional events for some times")
    return df

def verify_df(df: pd.DataFrame) -> None:
    """
    Verify the integrity of the dataframe
    """

    if df.empty:
        raise ValueError("Dataframe is empty")

    print("Verifying column names")
    verify_df_column_names(df)

    print("Verifying sorting")
    verify_df_sorted(df)

    print("Verifying time diffs")
    verify_df_time_diffs(df)

    print("Verifying time counts")
    verify_df_time_counts(df)
    print("Verifying time p counts")
    verify_df_time_d_counts(df)
    print("Verifying time e counts")
    verify_df_time_e_counts(df)
    print("Verifying time d counts")
    verify_df_time_p_counts(df)

    # df.groupby('time').apply(verify_df_time_group)

    


### Loading and verifying

In [27]:
def load_and_verify_csv(filename: str) -> pd.DataFrame:
    """
    Load a CSV file into a pandas DataFrame and verify its integrity
    """

    df = load_csv(filename)

    verify_df(df)

    return df

def load_and_verify_csvs(filenames: list) -> list:
    """
    Load a list of CSV files into a list of pandas DataFrames and verify their integrity
    """

    dfs = []
    for filename in filenames:
        print("Verifying", filename, "...")
        df = load_and_verify_csv(filename)
        dfs.append(df)

    print("SUCCESS")
    return dfs

## Usage

In [28]:
dfs = [load_csv(DIR + str(i) + ".csv") for i in range(0, 363)]
df = pd.concat(dfs)

In [29]:
fix_df(df)

Unnamed: 0,time,event_type,channel,value
0,2023-04-16 14:05:35,d,1,0
27,2023-04-16 14:05:35,d,28,0
28,2023-04-16 14:05:35,d,29,0
29,2023-04-16 14:05:35,d,30,1
30,2023-04-16 14:05:35,d,31,12
...,...,...,...,...
42600,2024-04-02 14:29:30,d,20,0
42601,2024-04-02 14:29:30,d,21,1
42602,2024-04-02 14:29:30,d,22,3
42591,2024-04-02 14:29:30,d,11,9


In [26]:
verify_df(df)

Verifying column names
Verifying sorting
Verifying time diffs
Found time holes out of tolerance at times:
	2023-04-16 14:07:25 and 2023-04-16 14:32:35
	2023-04-16 15:02:52 and 2023-04-16 15:58:35
	2023-04-16 16:00:25 and 2023-04-16 16:07:35
	2023-04-16 16:37:52 and 2023-04-16 16:54:35
	2023-04-16 16:56:25 and 2023-04-16 17:10:35
	2023-04-16 17:40:52 and 2023-04-18 16:08:48
	2023-04-18 16:08:48 and 2023-04-18 16:17:08
	2023-04-18 16:17:08 and 2023-04-18 16:25:28
	2023-04-18 16:25:28 and 2023-04-18 16:33:48
	2023-04-18 16:33:48 and 2023-04-18 16:42:08
	2023-04-18 16:42:08 and 2023-04-18 16:50:28
	2023-04-18 16:50:28 and 2023-04-18 16:58:48
	2023-04-18 16:58:48 and 2023-04-18 17:07:08
	2023-04-18 17:07:08 and 2023-04-18 17:15:28
	2023-04-18 17:15:28 and 2023-04-18 17:23:48
	2023-04-18 17:23:48 and 2023-04-18 17:32:08
	2023-04-18 17:32:08 and 2023-04-18 17:40:52
	2023-04-18 17:40:52 and 2023-04-18 17:49:12
	2023-04-18 17:49:12 and 2023-04-18 17:57:32
	2023-04-18 17:57:32 and 2023-04-18 18: