In [2]:
import polars as pl
import numpy as np
import pandas as pd
import os
import glob

# View Intermediate Data (Unsorted and Sorted)

In [3]:
def print_parquet_sample(directory_path: str, sample_size: int = 5) -> None:
    """Print the first rows of one Parquet file found in a directory.

    The directory listing is sorted before it is searched, so the sampled file
    is always the alphabetically first entry whose name ends in ``.parquet``.
    (``os.listdir`` on its own returns entries in arbitrary order, which made
    the choice of file vary between runs.)

    Problems are reported with ``print`` instead of being raised, so a bad
    path does not abort the calling cell.

    Args:
        directory_path: Directory to search; sub-directories are not descended.
        sample_size: Number of leading rows to print.

    Returns:
        None. The sample and any error messages are written to standard output.
    """
    try:
        if not os.path.isdir(directory_path):
            print(f"Error: Directory not found at '{directory_path}'")
            return

        # sorted() makes the pick deterministic; next() stops at the first match.
        parquet_file = next(
            (
                os.path.join(directory_path, filename)
                for filename in sorted(os.listdir(directory_path))
                if filename.endswith(".parquet")
            ),
            None,
        )

        if parquet_file is None:
            print(f"No .parquet files found in the directory: '{directory_path}'")
            return

        print(f"Reading sample from: {parquet_file}\n")

        # The whole file is loaded; only the head is displayed.
        df = pd.read_parquet(parquet_file)

        print("--- Sample of Parquet File ---")
        print(df.head(sample_size))
        print("-----------------------------\n")

    except ImportError:
        # pandas raises ImportError when no Parquet engine is installed.
        print("Error: The 'pandas' and 'pyarrow' (or 'fastparquet') libraries are required.")
        print("Please install them using: pip install pandas pyarrow")
    except FileNotFoundError as e:
        # E.g. the file disappeared between listing and reading.
        print(e)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


In [14]:
# Directories holding the intermediate Parquet output, before and after sorting.
unsorted_path, sorted_path = (
    '/data/scratch/qc25022/liver/intermediate_unsorted/',
    '/data/scratch/qc25022/liver/intermediate_sorted/',
)

In [15]:
# Peek at one file from the unsorted intermediate output.
print_parquet_sample(directory_path=unsorted_path)

Reading sample from: /data/scratch/qc25022/liver/intermediate_unsorted/136.parquet

--- Sample of Parquet File ---
         e_patid        time                       code  numeric_value  \
0  1470645551464  2012-06-19       medcodeid//411633013            NaN   
1  1470645551464  2010-11-08       medcodeid//285214011            NaN   
2  1470645551464  2013-07-15  medcodeid//63521000000111            NaN   
3  1470645551464  2012-08-21       medcodeid//411633013            NaN   
4  1470645551464  2014-04-28       medcodeid//457003010            NaN   

   numunitid  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  
-----------------------------



In [17]:
# Peek at one file from the sorted intermediate output.
print_parquet_sample(directory_path=sorted_path)

Reading sample from: /data/scratch/qc25022/liver/intermediate_sorted/136.parquet

--- Sample of Parquet File ---
     subject_id        time                        code  numeric_value  \
0  373308350009  2006-09-08        medcodeid//216043013            NaN   
1  373308350009  2006-09-08  medcodeid//158341000000117            NaN   
2  373308350009  2006-09-08        medcodeid//459936017            NaN   
3  373308350009  2006-09-08        medcodeid//286005016            NaN   
4  373308350009  2006-09-08        medcodeid//453390010            NaN   

   numunitid  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  
-----------------------------



# View Shards: Patient-Level Data

In [4]:
def view_patient_data_from_shards(patient_id, output_dir, splits_df, split_map=None):
    """
    Print every row stored for one patient in the sharded Parquet output.

    The patient's split is looked up in ``splits_df``, translated to a
    sub-directory of ``output_dir`` through ``split_map``, and the
    ``shard_*.parquet`` files in that sub-directory are scanned in sorted
    order until one containing the patient is found.

    Args:
        patient_id: Value matched against the ``subject_id`` column of both
            ``splits_df`` and the shard files.
        output_dir: Root directory that contains one sub-directory per split.
        splits_df: Polars DataFrame with ``subject_id`` and ``split`` columns.
        split_map: Optional mapping from split label to sub-directory name.
            When omitted, the notebook-level ``SPLIT_MAP`` is used, so
            existing three-argument calls behave as before.

    Returns:
        None. Results and error messages are written to standard output.
    """
    print(f"\n--- Searching for Patient ID: {patient_id} in sharded output ---")

    # --- Step 1: Find the patient's split ---
    patient_split_info = splits_df.filter(pl.col("subject_id") == patient_id)

    if patient_split_info.is_empty():
        print(f"Error: Patient ID {patient_id} not found in the subject information file.")
        return

    # Only the first matching row is consulted.
    split = patient_split_info.get_column("split")[0]

    # Resolve the global lazily, at the same point the original code first touched it.
    if split_map is None:
        split_map = SPLIT_MAP

    if split not in split_map:
        print(f"Error: Unknown split '{split}' for patient {patient_id}.")
        return

    # --- Step 2: Search for the patient in the correct split directory ---
    search_dir = os.path.join(output_dir, split_map[split])

    # glob returns matches in arbitrary order; sort so the scan order is reproducible.
    shard_files = sorted(glob.glob(os.path.join(search_dir, "shard_*.parquet")))

    if not shard_files:
        print(f"Error: No shard files found in directory '{search_dir}'.")
        return

    patient_found = False
    for shard_path in shard_files:
        try:
            # Lazy scan + filter, then collect only this patient's rows.
            patient_df = (
                pl.scan_parquet(shard_path)
                .filter(pl.col('subject_id') == patient_id)
                .collect()
            )

            if not patient_df.is_empty():
                print(f"Found patient in shard: {os.path.basename(shard_path)}")
                # tbl_rows=-1 lifts the row limit so the full history is printed.
                with pl.Config(tbl_rows=-1):
                    print(patient_df)
                patient_found = True
                break  # Stop searching once the patient is found
        except Exception as e:
            # Best effort: report the unreadable shard and move on to the next one.
            print(f"An error occurred while reading shard '{shard_path}': {e}")

    if not patient_found:
        # Report the directory actually searched; the split label can differ from
        # the directory name (e.g. a label mapped to a differently named folder).
        print(f"Error: Patient ID {patient_id} was not found in any shard in the '{search_dir}' directory.")

    print("---------------------------------------------------\n")

In [5]:
# Subject-level CSV; the patient lookup reads each subject's split from it.
SPLITS_PATH = '/data/home/qc25022/cancer-extraction-pipeline/output/liver_study/subject_information.csv'
OUTPUT_DIR = os.path.join('/data/scratch/qc25022/liver/', "intermediate_sorted")
# Split label found in the CSV -> sub-directory that holds that split's shards.
SPLIT_MAP = {'train': 'train', 'val': 'tuning', 'test': 'held_out'}

# Parse the CSV once. `splits` stays bound as an independent (cheap) copy for
# any code that refers to that name.
all_subjects_df = pl.read_csv(SPLITS_PATH)
splits = all_subjects_df.clone()

final_output_dir = '/data/scratch/qc25022/liver/event_streams/'
view_patient_data_from_shards(1000242250487, final_output_dir, all_subjects_df)


--- Searching for Patient ID: 1000242250487 in sharded output ---
Found patient in shard: shard_3.parquet
shape: (1_295, 6)
┌───────────────┬──────────────┬──────────────────────────┬───────────────┬────────────┬───────────┐
│ subject_id    ┆ time         ┆ code                     ┆ numeric_value ┆ text_value ┆ numunitid │
│ ---           ┆ ---          ┆ ---                      ┆ ---           ┆ ---        ┆ ---       │
│ i64           ┆ datetime[μs] ┆ str                      ┆ f32           ┆ str        ┆ i64       │
╞═══════════════╪══════════════╪══════════════════════════╪═══════════════╪════════════╪═══════════╡
│ 1000242250487 ┆ 1948-01-01   ┆ MEDS_BIRTH               ┆ null          ┆ null       ┆ null      │
│               ┆ 00:00:00     ┆                          ┆               ┆            ┆           │
│ 1000242250487 ┆ 1977-01-01   ┆ MEDICAL//Oesteoarthritis ┆ null          ┆ null       ┆ null      │
│               ┆ 00:00:00     ┆ //N05z…                  ┆        

In [6]:
# Switch to the cleaned-events output and print the same patient's rows.
final_output_dir = '/data/scratch/qc25022/liver/final_cleaned_events/'
view_patient_data_from_shards(
    patient_id=1000242250487,
    output_dir=final_output_dir,
    splits_df=all_subjects_df,
)


--- Searching for Patient ID: 1000242250487 in sharded output ---
Found patient in shard: shard_3.parquet
shape: (1_295, 5)
┌───────────────┬─────────────────────┬───────────────────────────────┬───────────────┬────────────┐
│ subject_id    ┆ time                ┆ code                          ┆ numeric_value ┆ text_value │
│ ---           ┆ ---                 ┆ ---                           ┆ ---           ┆ ---        │
│ i64           ┆ datetime[μs]        ┆ str                           ┆ f32           ┆ str        │
╞═══════════════╪═════════════════════╪═══════════════════════════════╪═══════════════╪════════════╡
│ 1000242250487 ┆ 1948-01-01 00:00:00 ┆ MEDS_BIRTH                    ┆ null          ┆ null       │
│ 1000242250487 ┆ 1977-01-01 00:00:00 ┆ MEDICAL//Oesteoarthritis//N05 ┆ null          ┆ null       │
│               ┆                     ┆ z…                            ┆               ┆            │
│ 1000242250487 ┆ 1984-01-01 00:00:00 ┆ MEDICAL//TK0..00//119641000