In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import time
from tqdm import tqdm


In [2]:
# Project root is taken to be the parent of the current working directory.
base_dir = Path.cwd().parent

# Input / output folders, all located under <base_dir>/data
tij_dir = base_dir.joinpath("data", "tij")
readers_dir = base_dir.joinpath("data", "tij_readers")
results_dir = base_dir.joinpath("data", "TIJ_with_reader_list")

# Bare file names (no directory part). The two lists are index-aligned:
# position k of `tij_files` goes with position k of `reader_files`.
tij_files = [f"tij_{experiment}.dat" for experiment in ("ECSS18", "ECIR19", "WS16", "ICCSS17")]

reader_files = [
    "ECSS18_w_readers_20s.csv",
    "ECIR19_w_readers_20s.csv",
    "WS16_w_readers_20s.csv",
    "ICCSS17_w_readers.csv",
]

# Resolve every file name against its directory to obtain full paths
file_path_tij = list(map(tij_dir.joinpath, tij_files))
file_path_reader = list(map(readers_dir.joinpath, reader_files))


In [3]:

def compute_tij_with_readers_list(tij_path, reader_path, result_dir, pooling_interval=20):
    """
    Build, for every pooled contact (t, i, j), the sorted list of readers that
    detected node i or node j during the same time window, and save it as CSV.

    Timestamps of both inputs are floored to multiples of ``pooling_interval``
    before matching. The result is written to
    ``<result_dir>/tij_with_readers_<experiment>.dat`` where ``<experiment>`` is
    the stem of ``tij_path`` with ``"tij_"`` removed. The file has the columns
    ``t, i, j, readers``; ``readers`` holds the string form of a Python list.

    Contacts for which neither endpoint has any reader detection in the window
    are NOT written to the output; their number is reported with a warning.

    Load and save failures are printed rather than raised. A tqdm progress bar
    is shown while readers are grouped per contact.

    Args:
        tij_path (str or Path): Tab-separated, header-less contact file with
            exactly three columns, interpreted as t, i, j.
        reader_path (str or Path): Tab-separated reader detection file with a
            header row containing at least the columns 't', 'id', 'reader'.
        result_dir (str or Path): Output directory (created if missing).
        pooling_interval (int): Size of the time window for pooling
            (default: 20 seconds).

    Returns:
        None. The function returns early (after printing a message) if either
        input file cannot be loaded.
    """

    print(f"\n▶️ Processing: {tij_path} + {reader_path}")
    start_time = time.time()

    experiment_name = Path(tij_path).stem.replace("tij_", "")
    Path(result_dir).mkdir(parents=True, exist_ok=True)
    output_filename = f"tij_with_readers_{experiment_name}.dat"
    output_path = Path(result_dir) / output_filename

    # === Load reader data
    try:
        reader_data = pd.read_csv(reader_path, delimiter="\t", dtype={8: str}, low_memory=False)
    except Exception as e:
        print(f"❌ Failed to load reader file: {e}")
        return

    # Pool the timestamps FIRST and de-duplicate AFTERWARDS: several raw
    # detections of the same (id, reader) usually fall into one window, and
    # every surviving duplicate would be multiplied again by the merge below.
    # `.assign` builds a new frame, so no SettingWithCopyWarning is triggered.
    reader_columns = reader_data[['t', 'id', 'reader']]
    reader_data_reduced = reader_columns.assign(
        t=(reader_columns['t'] // pooling_interval) * pooling_interval
    ).drop_duplicates()

    # === Load TIJ data
    try:
        tij_data = pd.read_csv(tij_path, delimiter="\t", header=None)
        tij_data.columns = ['t', 'i', 'j']
        tij_data['t'] = (tij_data['t'] // pooling_interval) * pooling_interval
    except Exception as e:
        print(f"❌ Failed to load TIJ file: {e}")
        return

    # === Long format: one row per (contact, endpoint)
    # Pooling can map several raw contacts onto the same (t, i, j); one copy is
    # enough for the reader lookup. `tij_data` itself is left untouched so the
    # missing-contact check below still counts the original rows.
    contact_keys = tij_data.drop_duplicates()
    endpoints = pd.concat(
        [
            contact_keys.assign(id=contact_keys['i']),
            contact_keys.assign(id=contact_keys['j']),
        ],
        ignore_index=True,
    )

    # A single merge on the endpoint replaces two chained merges on i and j,
    # which produced (#readers of i) x (#readers of j) rows per contact only to
    # have them collapsed by the set() below. The left join is kept so that
    # unmatched endpoints yield NaN readers, which are dropped before grouping.
    long_readers = endpoints.merge(reader_data_reduced, on=['t', 'id'], how='left')

    # === Group with tqdm progress
    print("⏳ Grouping readers by contact...")
    tqdm.pandas(desc="⏳ Processing contacts")
    contacts_with_readers = (
        long_readers.dropna(subset=['reader'])
        .groupby(['t', 'i', 'j'])['reader']
        .progress_apply(lambda x: sorted(set(x)))
        .reset_index(name='readers')
    )

    # === Check for missing contacts
    # Contacts without any reader were removed by the dropna above; report them.
    merged_check = tij_data.merge(contacts_with_readers, on=['t', 'i', 'j'], how='left', indicator=True)
    missing_rows = merged_check[merged_check['_merge'] == 'left_only']

    if not missing_rows.empty:
        print(f"⚠️ {len(missing_rows)} contact(s) from the original file are missing in the output!")
        print(missing_rows.head())
    else:
        print("✅ All original (t, i, j) contacts are preserved in the output.")

    # === Save output
    try:
        contacts_with_readers.to_csv(output_path, index=False)
        print(f"✅ File successfully saved to: {output_path}")
    except Exception as e:
        print(f"❌ Failed to save file: {e}")

    end_time = time.time()
    print(f"⏱️  Done in {end_time - start_time:.2f} seconds\n")


In [4]:
# Process every experiment: the k-th contact file is paired with the k-th
# reader file, and all results are written to the shared `results_dir`.
for tij_path, reader_path in zip(file_path_tij, file_path_reader):
    compute_tij_with_readers_list(
        tij_path=tij_path,
        reader_path=reader_path,
        result_dir=results_dir,
    )


▶️ Processing: C:\Users\BE\Documents\Corentin\data\tij\tij_ECSS18.dat + C:\Users\BE\Documents\Corentin\data\tij_readers\ECSS18_w_readers_20s.csv
⏳ Grouping readers by contact...


⏳ Processing contacts: 100%|██████████| 96362/96362 [00:03<00:00, 28858.40it/s]


✅ All original (t, i, j) contacts are preserved in the output.
✅ File successfully saved to: C:\Users\BE\Documents\Corentin\data\TIJ_with_reader_list\tij_with_readers_ECSS18.dat
⏱️  Done in 9.17 seconds


▶️ Processing: C:\Users\BE\Documents\Corentin\data\tij\tij_ECIR19.dat + C:\Users\BE\Documents\Corentin\data\tij_readers\ECIR19_w_readers_20s.csv
⏳ Grouping readers by contact...


⏳ Processing contacts: 100%|██████████| 132949/132949 [00:04<00:00, 30425.79it/s]


✅ All original (t, i, j) contacts are preserved in the output.
✅ File successfully saved to: C:\Users\BE\Documents\Corentin\data\TIJ_with_reader_list\tij_with_readers_ECIR19.dat
⏱️  Done in 19.49 seconds


▶️ Processing: C:\Users\BE\Documents\Corentin\data\tij\tij_WS16.dat + C:\Users\BE\Documents\Corentin\data\tij_readers\WS16_w_readers_20s.csv
⏳ Grouping readers by contact...


⏳ Processing contacts: 100%|██████████| 153371/153371 [00:04<00:00, 30810.06it/s]


✅ All original (t, i, j) contacts are preserved in the output.
✅ File successfully saved to: C:\Users\BE\Documents\Corentin\data\TIJ_with_reader_list\tij_with_readers_WS16.dat
⏱️  Done in 10.30 seconds


▶️ Processing: C:\Users\BE\Documents\Corentin\data\tij\tij_ICCSS17.dat + C:\Users\BE\Documents\Corentin\data\tij_readers\ICCSS17_w_readers.csv
⏳ Grouping readers by contact...


⏳ Processing contacts: 100%|██████████| 199309/199309 [00:09<00:00, 21539.68it/s]


✅ All original (t, i, j) contacts are preserved in the output.
✅ File successfully saved to: C:\Users\BE\Documents\Corentin\data\TIJ_with_reader_list\tij_with_readers_ICCSS17.dat
⏱️  Done in 51.77 seconds

