In [1]:
import ast
import json
import os
import tempfile
import shutil
import csv
from tqdm import tqdm

Tokenizer

In [2]:


# Keys that load_all_events() drops from every event's details dict before the
# event is tokenized, so they never become part of a token.
# NOTE(review): presumably excluded because they are high-cardinality
# (timestamps, numeric values, free-text titles) — confirm with the data owner.
REMOVED_FIELDS = {
    "intime", "outtime", "admittime", "dischtime",
    "starttime", "charttime", "chartdate", "ordertime", "valuenum", "long_title"
}

def create_token(event_type, details_dict):
    """Build a single string token from an event type and its details.

    The token has the form ``<event_type>_<k1>=<v1>_<k2>=<v2>...`` with the
    key/value pairs in the dict's iteration order. A string ``icd_code``
    value is shortened to its first three characters inside the token.

    Args:
        event_type: Label placed at the start of the token.
        details_dict: Mapping of detail names to values. It is only read;
            the caller's dict is never modified.

    Returns:
        The token string. For an empty ``details_dict`` this is
        ``f"{event_type}_"``.
    """
    parts = []
    for key, value in details_dict.items():
        # Truncate the ICD code only in the token text; writing it back into
        # details_dict would silently change the caller's event data.
        if key == 'icd_code' and isinstance(value, str):
            value = value[:3]
        parts.append(f"{key}={value}")

    details_token = "_".join(parts)
    return f"{event_type}_{details_token}"

def safe_json_save(obj, path):
    """Write *obj* as indented JSON to *path* without exposing a partial file.

    The data is first written to a temporary file in the same directory as
    *path* and then swapped into place with ``os.replace``. If anything fails,
    the temporary file is removed and an existing file at *path* is left
    untouched.

    Args:
        obj: JSON-serializable object.
        path: Destination file path.

    Raises:
        TypeError, ValueError: If *obj* cannot be serialized by ``json.dump``.
        OSError: If the temporary file cannot be created, written or moved.
    """
    # The temp file must live next to the target: a rename within one
    # directory is a single replace operation, whereas a temp file in the
    # system temp dir may force a copy (and thus a window with a partial file).
    target_dir = os.path.dirname(os.path.abspath(path))
    tmp_fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix='.tmp')
    try:
        with os.fdopen(tmp_fd, 'w', encoding='utf-8') as tmp_file:
            json.dump(obj, tmp_file, indent=4)
        # os.replace overwrites an existing destination on every platform.
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temp file behind on failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
def load_all_events(folder_path):
    """Load all events from the ``.csv`` files directly inside *folder_path*.

    Each CSV row must provide an ``event_type`` column and a ``details``
    column holding a Python dict literal. Rows with either value missing or
    empty, rows whose ``details`` cannot be parsed, and rows whose ``details``
    is not a dict are skipped. Every key listed in ``REMOVED_FIELDS`` is
    removed from the parsed details.

    Loading is best-effort: a file that cannot be read is reported and
    skipped, keeping any events already collected from it.

    Args:
        folder_path: Directory containing the CSV files.

    Returns:
        A list of ``(event_type, details_dict)`` tuples, in sorted file-name
        order and row order within each file.
    """
    all_events = []
    # Sorted so the event order does not depend on the OS directory listing.
    all_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    file_paths = [os.path.join(folder_path, f) for f in all_files]

    for file_path in tqdm(file_paths, desc="Loading All Timeline Files", unit="file"):
        try:
            # newline='' is what the csv module expects, so that newlines
            # embedded in quoted fields are interpreted correctly.
            with open(file_path, 'r', encoding='utf-8', newline='') as f:
                for row in csv.DictReader(f):
                    event_type = row.get("event_type")
                    details_str = row.get("details")

                    if not event_type or not details_str:
                        continue

                    try:
                        details_dict = ast.literal_eval(details_str)
                    except (ValueError, TypeError, SyntaxError,
                            MemoryError, RecursionError) as e:
                        print(f"Skipping malformed details in {file_path}: {e}")
                        continue  # Skip malformed details

                    # A valid literal that is not a dict (list, string, number)
                    # must skip only this row; otherwise .pop() below would
                    # raise and the outer handler would drop the rest of the file.
                    if not isinstance(details_dict, dict):
                        print(
                            f"Skipping non-dict details in {file_path}: "
                            f"got {type(details_dict).__name__}"
                        )
                        continue

                    # Strip the fields that must not end up in tokens
                    for key in REMOVED_FIELDS:
                        details_dict.pop(key, None)

                    all_events.append((event_type, details_dict))
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable file must not
            # abort a run over many files.
            print(f"Failed to process {file_path}: {e}")
            continue

    return all_events

def generate_tokens(event_list):
    """Return the distinct tokens produced for the given events.

    Args:
        event_list: Iterable of ``(event_type, details_dict)`` pairs.

    Returns:
        A set holding one ``create_token`` result per distinct token.
    """
    return {create_token(kind, details) for kind, details in event_list}

def build_token_map(token_set):
    """Assign a dense integer id to every distinct token.

    Tokens are numbered ``0..n-1`` in sorted order, so the same collection of
    tokens always yields the same mapping.

    Args:
        token_set: Iterable of tokens. Duplicates are tolerated and collapsed.

    Returns:
        A dict mapping each distinct token to its id.
    """
    # De-duplicate before numbering: assigning len(token_map) per item would
    # hand the same id to two different tokens whenever the input repeats one.
    unique_sorted = sorted(set(token_set))  # sorted for reproducibility
    return {token: index for index, token in enumerate(unique_sorted)}

def generate_token_map_from_folder(folder_path, token_map_filepath):
    """Build a token map from the events in a folder and write it to disk.

    Runs the pipeline load -> tokenize -> number -> save, printing a progress
    line before each stage and after saving.

    Args:
        folder_path: Directory passed to ``load_all_events``.
        token_map_filepath: Destination passed to ``safe_json_save``.

    Returns:
        The token map that was saved.
    """
    # Stage 1: collect every event from the folder.
    print("Loading all events (streaming)...")
    events = load_all_events(folder_path)

    # Stage 2: reduce the events to their distinct tokens.
    print(f"Tokenizing {len(events)} events...")
    unique_tokens = generate_tokens(events)

    # Stage 3: number the tokens.
    print(f"Building token map from {len(unique_tokens)} unique tokens...")
    mapping = build_token_map(unique_tokens)

    # Stage 4: persist the result.
    safe_json_save(mapping, token_map_filepath)
    print(f"Token map saved to: {token_map_filepath}")

    return mapping

# === Usage ===
folder_path = r'D:\CourseworkFolder\DPSynthData\Data Manipulation\timelines'
token_map_filepath = r'D:\CourseworkFolder\DPSynthData\Data Manipulation\token_map.json'

token_map = generate_token_map_from_folder(folder_path, token_map_filepath)


Loading all events (streaming)...


Loading All Timeline Files: 100%|██████████| 107101/107101 [14:31<00:00, 122.83file/s]


Tokenizing 12079048 events...
Building token map from 15566 unique tokens...
Token map saved to: D:\CourseworkFolder\DPSynthData\Data Manipulation\token_map.json
