In [6]:
import pandas as pd
import numpy as np

def modify_time_column(df):
    """
    Adds a random number to the 'time' column for rows where the 'type' column
    is 0, 1, or 2, grouped by 'record_id'.

    Args:
        df (pd.DataFrame): Input DataFrame with columns 'time', 'record_id', and 'type'.

    Returns:
        pd.DataFrame: Modified DataFrame with updated 'time' values.
    """
    rng = np.random.default_rng()  # Random number generator

    def apply_random_time_adjustment(group):
        # Add a random value only to rows where type is 0, 1, or 2
        mask = group['type'].isin([0, 1, 2])
        group.loc[mask, 'time'] += rng.uniform(0, 100000, size=mask.sum())  # Add random numbers
        return group

    # Apply the modification grouped by 'record_id'
    modified_df = df.groupby('record_id').apply(apply_random_time_adjustment)
    return modified_df

# Example usage with your dataset
data = {
    "time": [0, 0, 1230.520131, 1232.213440, 1104.965687, 9.962088, 773.729854, 596.038141, 971.174044, 143.454326],
    "string_id": [4, 4, 4, 4, 4, 13, 13, 13, 13, 13],
    "module_id": [0, 0, 0, 0, 0, 19, 19, 19, 19, 19],
    "pmt_id": [0, 0, 0, 2, 4, 0, 0, 6, 10, 13],
    "record_id": [14, 14, 14, 14, 14, 14, 14, 14, 14, 14],
    "type": [0, 0, 0, 0, 0, 20, 20, 20, 20, 20]
}
df = pd.DataFrame(data)

# Apply the function
modified_df = modify_time_column(df)
display(modified_df)


Unnamed: 0,time,string_id,module_id,pmt_id,record_id,type
0,33486.417291,4,0,0,14,0
1,10274.705504,4,0,0,14,0
2,56203.45926,4,0,0,14,0
3,95135.977161,4,0,2,14,0
4,27212.25705,4,0,4,14,0
5,9.962088,13,19,0,14,20
6,773.729854,13,19,0,14,20
7,596.038141,13,19,6,14,20
8,971.174044,13,19,10,14,20
9,143.454326,13,19,13,14,20


In [7]:
df

Unnamed: 0,time,string_id,module_id,pmt_id,record_id,type
0,0.0,4,0,0,14,0
1,0.0,4,0,0,14,0
2,1230.520131,4,0,0,14,0
3,1232.21344,4,0,2,14,0
4,1104.965687,4,0,4,14,0
5,9.962088,13,19,0,14,20
6,773.729854,13,19,0,14,20
7,596.038141,13,19,6,14,20
8,971.174044,13,19,10,14,20
9,143.454326,13,19,13,14,20


In [26]:
import pandas as pd
import numpy as np

def modify_time_by_record_and_type(df):
    """
    Adds the same random number to the 'time' column for rows with the same
    'record_id' and 'type' combination, where 'type' is 0, 1, or 2.

    Args:
        df (pd.DataFrame): Input DataFrame with columns 'time', 'record_id', and 'type'.

    Returns:
        pd.DataFrame: Modified DataFrame with updated 'time' values.
        
    """
    df=df.copy()
    rng = np.random.default_rng()  # Random number generator

    # Filter relevant combinations of record_id and type
    relevant_combinations = df[df['type'].isin([0, 1, 2])][['record_id', 'type']].drop_duplicates()

    # Generate random numbers for each combination
    random_offsets = {
        (row['record_id'], row['type']): rng.uniform(0, 10000)
        for _, row in relevant_combinations.iterrows()
    }

    # Apply the random offsets to the DataFrame
    def apply_random_time_adjustment(row):
        key = (row['record_id'], row['type'])
        if key in random_offsets:
            return row['time'] + random_offsets[key]
        return row['time']

    # Modify the time column
    df['time'] = df.apply(apply_random_time_adjustment, axis=1)
    return df

# Example usage with your dataset

df = pd.DataFrame(data)

# Apply the function
display(df)
modified_df = modify_time_by_record_and_type(df.copy())
display(modified_df)


Unnamed: 0,time,string_id,module_id,pmt_id,record_id,type
0,1104.965874,4,0,0,14,0
1,1122.459863,4,0,0,14,0
2,1230.520131,4,0,0,14,0
3,1232.21344,4,0,2,14,0
4,1104.965687,4,0,4,14,0
5,9.962088,13,19,0,14,20
6,773.729854,13,19,0,14,20
7,596.038141,13,19,6,14,20
8,971.174044,13,19,10,14,20
9,143.454326,13,19,13,14,20


Unnamed: 0,time,string_id,module_id,pmt_id,record_id,type
0,6551.868851,4,0,0,14,0
1,6569.36284,4,0,0,14,0
2,6677.423108,4,0,0,14,0
3,6679.116417,4,0,2,14,0
4,6551.868664,4,0,4,14,0
5,9.962088,13,19,0,14,20
6,773.729854,13,19,0,14,20
7,596.038141,13,19,6,14,20
8,971.174044,13,19,10,14,20
9,143.454326,13,19,13,14,20


In [27]:
df['time']-modified_df['time']

0   -5446.902977
1   -5446.902977
2   -5446.902977
3   -5446.902977
4   -5446.902977
5       0.000000
6       0.000000
7       0.000000
8       0.000000
9       0.000000
Name: time, dtype: float64

In [28]:
import h5py
import pandas as pd
import numpy as np

def modify_time_by_record_and_type(df):
    """
    Adds the same random number to the 'time' column for rows with the same
    'record_id' and 'type' combination, where 'type' is 0, 1, or 2.

    Args:
        df (pd.DataFrame): Input DataFrame with columns 'time', 'record_id', and 'type'.

    Returns:
        pd.DataFrame: Modified DataFrame with updated 'time' values.
    """
    rng = np.random.default_rng()  # Random number generator

    # Filter relevant combinations of record_id and type
    relevant_combinations = df[df['type'].isin([0, 1, 2])][['record_id', 'type']].drop_duplicates()

    # Generate random numbers for each combination
    random_offsets = {
        (row['record_id'], row['type']): rng.uniform(0, 10)
        for _, row in relevant_combinations.iterrows()
    }

    # Apply the random offsets to the DataFrame
    def apply_random_time_adjustment(row):
        key = (row['record_id'], row['type'])
        if key in random_offsets:
            return row['time'] + random_offsets[key]
        return row['time']

    # Modify the time column
    df['time'] = df.apply(apply_random_time_adjustment, axis=1)
    return df

def process_hdf5_file(input_file, output_file):
    """
    Opens an HDF5 file, applies modifications to the dataset, and saves it as a new file.

    Args:
        input_file (str): Path to the input HDF5 file.
        output_file (str): Path to save the modified HDF5 file.
    """
    with h5py.File(input_file, "r") as infile, h5py.File(output_file, "w") as outfile:
        for dataset_name in infile:
            # Read dataset into a DataFrame
            data = infile[dataset_name][()]
            column_names = infile[dataset_name].attrs.get("column_names", None)

            if column_names is not None:
                column_names = column_names.decode("utf-8").split(",")
                df = pd.DataFrame(data, columns=column_names)
            else:
                raise ValueError(f"Column names not found for dataset '{dataset_name}'.")

            # Apply the modification
            #display(df.head)
            #modified_df = modify_time_by_record_and_type(df)

            # Save the modified dataset to the new file
            #dset = outfile.create_dataset(dataset_name, data=modified_df.values)
            # Preserve attributes
            #outfile[dataset_name].attrs["column_names"] = ",".join(column_names)




In [39]:
# Example usage
input_hdf5_file = "data/AllMerge10.h5"
output_hdf5_file = "data/output.h5"

process_hdf5_file(input_hdf5_file, output_hdf5_file)

TypeError: Accessing a group is done with bytes or str,  not <class 'tuple'>

In [38]:
with h5py.File(input_hdf5_file,'r') as infile:
    for datasetnames in infile:
        #print(datasetnames)
        data = infile
        display(data)

<HDF5 group "/detector" (1 members)>

<HDF5 group "/hits" (2 members)>

<HDF5 group "/records" (2 members)>

<HDF5 group "/sources" (2 members)>

In [1]:
import pandas as pd
pd.read_parquet('data/graphnet_out/15records_copy4/hits/1_hits.parquet')

Unnamed: 0_level_0,time,string_id,module_id,pmt_id
event_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,786.813049,0,0,0
0,786.433145,0,0,0
0,788.604878,0,0,0
0,783.700004,0,0,0
0,786.487287,0,0,0
...,...,...,...,...
21,938.964799,11,15,15
21,940.499721,11,15,15
21,939.901409,11,15,15
21,938.708949,11,15,15


In [3]:
pd.read_parquet('data/graphnet_out/15records_copy4/truth/1_truth.parquet')

Unnamed: 0_level_0,type
event_no,Unnamed: 1_level_1
0,1
0,1
0,1
0,1
0,1
...,...
21,1
21,1
21,1
21,1
