# Imports and setup

In [None]:
# Imports
import pandas as pd
from src.eda import (
    plot_records_per_grade,
    plot_records_per_year_side_by_side,
    plot_ae_category_distribution,
    build_rank_table,
    summarize_other_specify_terms,
    plot_top_terms_per_category,
    plot_number_of_records_and_studies_by_disease_site
)

In [None]:
# Lift pandas' display limits so DataFrames are shown with every column and row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read data

In [None]:
# Load the two cleaned parquet tables: the full record set (`ct`) and
# `ct_high_grades`, which later cells report as the grade 3+ records.
cleaning_dir = "/cluster/home/t128190uhn/datasets/clinical_trials/cleaning"
ct = pd.read_parquet(f"{cleaning_dir}/ct.parquet")
ct_high_grades = pd.read_parquet(f"{cleaning_dir}/ct_high_grades.parquet")

In [None]:
# Columns to keep; the list order is the column order used when frames are
# re-indexed against it. Rows below are wrapped for readability only.
cols_of_interest = [
    "study_name", "study_title",
    "sponsor_type", "investigator_initiated", "primary_investigator",
    "phase", "detected_ctcae_version",
    "mrn", "birthdate", "age_at_enrolment",
    "disease_site_group",
    "ae_category", "ae_term", "ae_specific",
    "ae_grade", "ae_grade_start_date",
    "mapped_soc", "mapped_term", "mapped_grade",
]

In [None]:
# Tag each input frame with its origin, restrict it to the columns of interest
# (plus `source`), and stack the two frames into a single `ct` DataFrame.
# NOTE(review): `mapped_ct_crr` and `mapped_ct_epic` are not defined in any cell
# visible in this notebook, and this cell rebinds `ct`, which was already loaded
# from parquet — confirm this cell is still meant to run at this point.
cols_all = cols_of_interest + ['source']


def _select_columns(frame, source_label):
    """Return `frame` tagged with `source_label` and aligned to `cols_all`.

    Columns absent from `frame` are created empty by the reindex and then
    removed again by the all-NA column drop, so the result only carries
    columns that hold at least one value.
    """
    tagged = frame.assign(source=source_label)
    aligned = tagged.reindex(columns=cols_all)
    return aligned.dropna(axis=1, how='all')


ct_crr_sel = _select_columns(mapped_ct_crr, 'crr')
ct_epic_sel = _select_columns(mapped_ct_epic, 'epic')

# Row-bind both selections; columns present in only one frame are filled with
# missing values by the concat.
ct = pd.concat([ct_crr_sel, ct_epic_sel], ignore_index=True)

In [None]:
# Display the dtype of every column in `ct` (shown as the cell output)
ct.dtypes

# Exploratory Data Analysis (EDA)

In [None]:
# Print the total record count, the grade 3+ record count, and the share of
# grade 3+ records among all records.
n_records = len(ct)
n_high_grade = len(ct_high_grades)

print(f"Total number of records: {n_records:,}")
print(f"Number of grade 3+ records: {n_high_grade:,}")
if n_records:
    print(f"Percentage of grade 3+ records: {n_high_grade / n_records * 100:.1f}%")
else:
    # An empty `ct` would make the division raise ZeroDivisionError; report
    # the percentage as unavailable instead of aborting the cell.
    print("Percentage of grade 3+ records: n/a (no records)")

## Number of records per AE grade

In [None]:
# Call the src.eda plotting helper for records per AE grade on the full `ct` frame
plot_records_per_grade(ct)

## Number of records per year

In [None]:
# Call the src.eda helper for records per year, passing both the full frame
# and the grade 3+ frame (side-by-side layout, per the helper's name)
plot_records_per_year_side_by_side(ct, ct_high_grades)

## Number of records per AE category

In [None]:
# Call the src.eda helper for the AE category distribution, passing both the
# full frame and the grade 3+ frame
plot_ae_category_distribution(ct, ct_high_grades)

## Rank change 

In [None]:
# Build the rank table from the full and grade 3+ frames, using "mapped_soc"
# as the category column; the CSV path below is handed to the helper as
# `save_path`.
rank_table_csv = "/cluster/home/t128190uhn/datasets/clinical_trials/exploratory_data_analysis/rank_table.csv"
rank_table = build_rank_table(ct, ct_high_grades, category_col="mapped_soc", save_path=rank_table_csv)

In [None]:
# Show the first 15 rows of the rank table, leaving out the two count columns
hidden_count_cols = ["n_all_grades", "n_grade_3plus"]
rank_table.head(15).drop(columns=hidden_count_cols)

## Number of records with AE terms that end with "Other, specify"

In [None]:
# Call the src.eda helper that summarizes the "Other, specify" AE terms in `ct`
summarize_other_specify_terms(ct)

## Top 7 AE terms per category - all records

In [None]:
# Plot the top AE terms per category for all records.
# `df_name` passes the dataset's name to the helper — presumably used for
# labelling the output; TODO confirm in src.eda.
plot_top_terms_per_category(ct, df_name="ct")

## Top 7 AE terms per category - grade 3+

In [None]:
# Plot the top AE terms per category for the high-grade (grade 3+) records.
# `df_name` passes the dataset's name to the helper — presumably used for
# labelling the output; TODO confirm in src.eda.
plot_top_terms_per_category(ct_high_grades, df_name="ct_high_grades")

## Top disease site locations

Note: The disease location group is assigned at the study level, not per patient.

In [None]:
# Call the src.eda helper for the number of records and studies per disease
# site on the full `ct` frame
plot_number_of_records_and_studies_by_disease_site(ct)