# Imports

In [None]:
# Imports
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
from matplotlib_venn.layout import venn2 as venn2_layout

# Read data

In [None]:
# Read data: the treatment-centred parquet file and the cleaned/filtered OACC CSV.
# Both names are reassigned further down the notebook (`df` in the next cell,
# `oacc` in a later cell), so these frames are only live until then.
treatment_centered_path = "/cluster/projects/gliugroup/2BLAST/data/final/data_2025-03-29/processed/treatment_centered_data.parquet"
oacc_cleaned_path = "/cluster/home/t128190uhn/datasets/oacc/cleaned/cleaned_and_filtered_oacc_data.csv"

df = pd.read_parquet(treatment_centered_path)
oacc = pd.read_csv(oacc_cleaned_path)

In [None]:
# Rebind `df` to the interim chemo table.
chemo_interim_path = "/cluster/projects/gliugroup/2BLAST/data/final/data_2025-03-29/interim/chemo.parquet"
df = pd.read_parquet(chemo_interim_path)

In [None]:
# Collect every column of `df` whose name contains "date" (case-insensitive)
# and print the resulting list.
date_columns = list(filter(lambda column: 'date' in column.lower(), df.columns))
print(date_columns)

In [None]:
import os

# NOTE(review): this path ends in ".../interim/t", which looks like a truncated
# folder name left over from exploration — confirm the intended directory.
folder_path = "/cluster/projects/gliugroup/2BLAST/data/final/data_2025-03-29/interim/t"

# Check for a directory specifically: os.path.exists() is also true for a
# regular file, on which os.listdir() would raise NotADirectoryError.
if os.path.isdir(folder_path):
    # os.listdir() returns entries in arbitrary order; sort for a stable listing.
    print(sorted(os.listdir(folder_path)))
else:
    print("Folder not found:", folder_path)

In [None]:
import pandas as pd
import os

# Directory holding the interim parquet tables.
base_path = "/cluster/projects/gliugroup/2BLAST/data/final/data_2025-03-29/interim/"


def _read_interim(filename):
    """Read one parquet file located under `base_path` and return it as a DataFrame."""
    return pd.read_parquet(os.path.join(base_path, filename))


# One notebook-level frame per interim table.
acute_care_use = _read_interim("acute_care_use.parquet")
lab = _read_interim("lab.parquet")
radiation = _read_interim("radiation.parquet")
symptom = _read_interim("symptom.parquet")
reports = _read_interim("reports.parquet")
demographic = _read_interim("demographic.parquet")
chemo = _read_interim("chemo.parquet")
last_seen_dates = _read_interim("last_seen_dates.parquet")

In [None]:
# Map a short dataset name to each interim frame.
datasets = {
    "acute_care_use": acute_care_use,
    "lab": lab,
    "radiation": radiation,
    "symptom": symptom,
    "reports": reports,
    "demographic": demographic,
    "chemo": chemo,
    "last_seen_dates": last_seen_dates
}

# Print, per dataset, the columns whose name contains "date" (case-insensitive).
# The loop variable is intentionally not named `df`: a top-level loop variable
# stays bound after the loop, so using `df` here would silently replace the
# notebook-level `df` with the last frame in `datasets`.
for name, frame in datasets.items():
    date_cols = [col for col in frame.columns if "date" in col.lower()]
    print(f"{name}: {date_cols}")

In [None]:
# `ct` is used in `dfs` below but is otherwise only loaded in a later cell, so
# on a fresh kernel (Restart & Run All) this cell would raise NameError.
# Load it here so the cell is self-sufficient.
ct = pd.read_parquet("/cluster/home/t128190uhn/datasets/clinical_trials/cleaning/ct.parquet")

# Dataset name -> DataFrame.
dfs = {
    "acute_care_use": acute_care_use,
    "lab": lab,
    "radiation": radiation,
    "symptom": symptom,
    "reports": reports,
    "demographic": demographic,
    "chemo": chemo,
    "last_seen_dates": last_seen_dates,
    "ct": ct
}

# Dataset name -> date column(s) to coerce and summarise.
# "last_seen_dates" has no entry here, so it is not processed by code that
# iterates over this mapping.
date_columns = {
    "acute_care_use": ["admission_date_raw"],
    "lab": ["obs_date"],
    "radiation": ["treatment_start_date"],
    "symptom": ["obs_date"],
    "reports": ["initial_report_date"],
    "demographic": ["diagnosis_date"],
    "chemo": ["treatment_date"],
    "ct":["ae_grade_start_date"]
}
def coerce_dates_and_report(dfs, date_columns):
    """Coerce the listed columns to datetime in place and print a per-column report.

    Parameters
    ----------
    dfs : dict
        Maps dataset name -> DataFrame. The frames are modified in place: every
        listed column that exists is overwritten with the result of
        ``pd.to_datetime(..., errors='coerce')``.
    date_columns : dict
        Maps dataset name -> list of column names to coerce.

    Returns
    -------
    None
        The report (min, max, count of valid values and of NaT) is printed.
        A dataset name missing from ``dfs`` or a column missing from its frame
        is reported as "not found" instead of raising.
    """
    for name, cols in date_columns.items():
        print(f"\n{name.upper()}:")
        if name not in dfs:
            # Mirror the missing-column handling below instead of raising KeyError.
            print("  dataset not found")
            continue
        df = dfs[name]
        for col in cols:
            if col not in df.columns:
                print(f"  {col}: not found")
                continue
            # Coerce to datetime; unparseable values become NaT
            df[col] = pd.to_datetime(df[col], errors='coerce')
            n_total = len(df[col])
            n_valid = df[col].notna().sum()
            n_na = n_total - n_valid
            print(f"  {col}: {df[col].min()} → {df[col].max()}  | valid={n_valid}, NaT={n_na}")
# Coerce the configured date columns in place and print each column's range.
coerce_dates_and_report(dfs=dfs, date_columns=date_columns)

In [None]:
import pandas as pd

# Build one summary row per dataset: date range of its first configured date
# column, row count, and number of distinct MRNs.
summary_rows = []

for name, cols in date_columns.items():
    # NOTE: this loop rebinds the notebook-level `df`; once the loop finishes,
    # `df` refers to the last dataset listed in `date_columns`.
    df = dfs[name]

    # MRN column = first column whose name contains "mrn" (case-insensitive), if any
    mrn_matches = [c for c in df.columns if "mrn" in c.lower()]
    mrn_col = mrn_matches[0] if mrn_matches else None

    # Only the first date column listed for the dataset is summarised
    col = cols[0] if cols else None
    min_date = max_date = None
    if col and col in df.columns:
        # Unparseable values become NaT and are ignored by min()/max()
        df[col] = pd.to_datetime(df[col], errors='coerce')
        min_date, max_date = df[col].min(), df[col].max()

    unique_mrn = df[mrn_col].nunique() if mrn_col else "N/A"
    summary_rows.append(
        dict(
            dataset=name,
            min_date=min_date,
            max_date=max_date,
            records=len(df),
            unique_mrn=unique_mrn,
        )
    )

summary_columns = ["dataset", "min_date", "max_date", "records", "unique_mrn"]
summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df[summary_columns]
display(summary_df)


In [None]:
# Date range of chemo treatments.
# Read from `chemo` explicitly: `treatment_date` is the chemo date column, while
# the bare name `df` is rebound by earlier top-level loops and no longer
# refers to the chemo frame by the time this cell runs.
chemo['treatment_date'].min(), chemo['treatment_date'].max()

In [None]:
# Rebind `oacc` to the raw OACC parquet export.
oacc_raw_path = "/cluster/home/t128190uhn/datasets/oacc/raw/raw_oacc_db.parquet"
oacc = pd.read_parquet(oacc_raw_path)

In [None]:
# Load the clinical-trials table from the cleaning folder.
ct_path = "/cluster/home/t128190uhn/datasets/clinical_trials/cleaning/ct.parquet"
ct = pd.read_parquet(ct_path)

In [None]:
# Parse `ae_grade_start_date` as datetime; values that cannot be parsed become NaT.
ct["ae_grade_start_date"] = pd.to_datetime(ct["ae_grade_start_date"], errors="coerce")

# Per `source`: non-null MRN count, distinct MRN count, and earliest/latest date.
source_aggregations = {
    "records": pd.NamedAgg(column="mrn", aggfunc="count"),
    "unique_mrn": pd.NamedAgg(column="mrn", aggfunc="nunique"),
    "min_date": pd.NamedAgg(column="ae_grade_start_date", aggfunc="min"),
    "max_date": pd.NamedAgg(column="ae_grade_start_date", aggfunc="max"),
}
grouped_by_source = ct.groupby("source")
summary_by_source = grouped_by_source.agg(**source_aggregations).reset_index()

summary_by_source

In [None]:
# Preview the first five rows of the clinical-trials table.
ct.head(n=5)

In [None]:
# Parse `DateReferred` as datetime; values that cannot be parsed become NaT.
oacc["DateReferred"] = pd.to_datetime(oacc["DateReferred"], errors="coerce")

# One-row overview of the OACC table: referral date range, row count, distinct MRNs.
referred = oacc["DateReferred"]
earliest_referral = referred.min()
latest_referral = referred.max()

summary = pd.DataFrame(
    {
        "min_date": [earliest_referral],
        "max_date": [latest_referral],
        "records": [len(oacc)],
        "unique_mrn": [oacc["MRN"].nunique()],
    }
)

summary

# Venn diagram

In [None]:
# --- Define sets ---
# Resolve the OACC MRN column case-insensitively: another cell in this notebook
# reads it as "MRN", so a hard-coded lowercase 'mrn' lookup can raise KeyError.
# An exact 'mrn' column is still preferred when present.
if 'mrn' in oacc.columns:
    oacc_mrn_col = 'mrn'
else:
    oacc_mrn_col = next((c for c in oacc.columns if str(c).lower() == 'mrn'), None)
    if oacc_mrn_col is None:
        raise KeyError("No MRN column found in `oacc`")

# Use the clinical-trials frame `ct` explicitly rather than the bare name `df`,
# which is rebound by top-level loops elsewhere in the notebook.
# Missing MRNs are dropped so NaN is not counted as a patient in either set.
# NOTE(review): the intersection is only meaningful if both MRN columns share a
# dtype/format (e.g. both str or both int) — confirm.
oacc_mrns = set(oacc[oacc_mrn_col].dropna().unique())
df_mrns = set(ct['mrn'].dropna().unique())
common_mrns = oacc_mrns & df_mrns

# --- Create figure ---
fig, ax = plt.subplots(figsize=(10, 10))

# --- Use the new layout API ---
venn = venn2(
    subsets=(
        len(oacc_mrns - df_mrns),
        len(df_mrns - oacc_mrns),
        len(common_mrns)
    ),
    set_labels=('OACC', 'Clinical Trials DF'),
    set_colors=('#A7C7E7', '#FFB6B6'),
    alpha=0.7,
    ax=ax,
    layout_algorithm=venn2_layout.DefaultLayoutAlgorithm(normalize_to=1.0)
)

# --- Customize intersection color to purple ---
overlap_patch = venn.get_patch_by_id('11')
if overlap_patch is not None:
    overlap_patch.set_color('#CBA0E3')
    overlap_patch.set_alpha(0.8)

# --- Add legend outside the plot ---
# Pair every label with its region patch explicitly. Passing labels alone
# matches them positionally to whatever patches exist, which misaligns the
# legend when a region patch is absent.
legend_entries = [
    ('10', f"OACC (n={len(oacc_mrns)})"),
    ('01', f"Clinical Trials DF (n={len(df_mrns)})"),
    ('11', f"Common MRNs (n={len(common_mrns)})"),
]
legend_handles = []
legend_labels = []
for region_id, label in legend_entries:
    patch = venn.get_patch_by_id(region_id)
    if patch is not None:
        legend_handles.append(patch)
        legend_labels.append(label)

ax.legend(
    legend_handles,
    legend_labels,
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    frameon=True,
    title='Sets Summary'
)

ax.set_title('Overlap of MRNs between OACC and Clinical Trials DF', fontsize=13)
# Leave the right 20% of the figure free for the legend.
fig.tight_layout(rect=[0, 0, 0.8, 1])
plt.show()