# Imports

In [None]:
# Imports
import pandas as pd
from src.eda import (
    plot_records_per_grade,
    plot_records_per_year_side_by_side,
    plot_ae_category_distribution,
    build_rank_table,
    summarize_other_specify_terms,
    plot_top_terms_per_category,
    plot_number_of_records_and_studies_by_disease_site,
    plot_top_ae_terms
)

# Read data

In [None]:
# Load the two parquet tables used throughout this notebook:
# the full record set and the grade 3+ record set.
ct = pd.read_parquet(
    "/cluster/home/t128190uhn/datasets/clinical_trials/cleaning/ct.parquet"
)
ct_high_grades = pd.read_parquet(
    "/cluster/home/t128190uhn/datasets/clinical_trials/cleaning/ct_high_grades.parquet"
)

# Exploratory Data Analysis (EDA)

In [None]:
# Report the dataset size and the share of records that are grade 3 or higher.
total_records = len(ct)
high_grade_records = len(ct_high_grades)
print(f"Total number of records: {total_records:,}")
print(f"Number of grade 3+ records: {high_grade_records:,}")
# The ratio is computed inside the final print so the two count lines above
# are always emitted first, exactly as before.
print(f"Percentage of grade 3+ records: {high_grade_records / total_records * 100:.1f}%")

## #Records per AE grade

In [None]:
# Plot how many records fall into each AE grade, using the full record set.
plot_records_per_grade(ct)

## #Records per year

This section shows the number of records per year for all records and grade ≥3 records side by side.

Note that there are separate axes for each.

In [None]:
# Yearly record counts, all records next to the grade ≥3 records.
plot_records_per_year_side_by_side(ct, ct_high_grades)

## #Records per AE category

This section compares the distribution of AE categories between all records and grade ≥3 records.

In [None]:
# Compare the AE category distribution of all records with that of grade ≥3 records.
plot_ae_category_distribution(ct, ct_high_grades)

## Rank change 

This section compares AE category rankings between all records and only those with grade ≥3 events.

In [None]:
# Rank AE categories by frequency twice (all records vs. grade ≥3 records),
# keep the comparison in `rank_table`, and write it to the CSV path below.
rank_table_csv = "/cluster/home/t128190uhn/datasets/clinical_trials/exploratory_data_analysis/rank_table.csv"
rank_table = build_rank_table(
    ct,
    ct_high_grades,
    category_col="mapped_soc",
    save_path=rank_table_csv,
)

Here is a short description of what each column in the table below means:

- **`ae_category`** – Name of the adverse event (AE) category.  
- **`rank_all_grades`** – Rank of the AE category based on frequency across *all grades*.  
- **`rank_grade_3plus`** – Rank of the AE category based on frequency among *grade ≥3* records.  
- **`rank_difference`** – Change in rank position from all grades to grade ≥3 (positive = dropped, negative = improved).  
- **`rank_trend`** – Direction of rank change (↑ improved, ↓ dropped, → same).  


In [None]:
# Show the first 15 rows of the rank table without the two n_* columns.
# The bare trailing expression is what makes the notebook render the result.
hidden_columns = ["n_all_grades", "n_grade_3plus"]
rank_table.head(15).drop(columns=hidden_columns)

## #Records with AE terms that end with "Other, specify"

In the CTCAE classification system, adverse events follow a hierarchy of `category → term → specifics`. Terms ending with `Other, specify` are general groupings that cover several specific events. A detailed analysis needs the specific information recorded for these terms; however, this data pull did not retrieve those specifics, so that detail is lost. These `Other, specify` terms make up nearly 12% of all records, so this limitation is important to consider.

In [None]:
# Summarize the records whose AE term ends with "Other, specify".
# NOTE(review): the exact contents of the summary are defined in src.eda — confirm there.
summarize_other_specify_terms(ct)

## Top 7 AE terms per category

### All records

This section visualizes the most frequent AE terms within each AE category **for all records**, highlighting those that end with `Other, specify`.

In [None]:
# Most frequent AE terms within each AE category, for all records.
# NOTE(review): df_name presumably labels the output for this dataframe — confirm in src.eda.
plot_top_terms_per_category(ct, df_name="ct")

### Grade 3+

This section visualizes the most frequent AE terms within each AE category **for records with grade ≥3**, highlighting those that end with `Other, specify`.


In [None]:
# Most frequent AE terms within each AE category, for grade ≥3 records only.
# NOTE(review): df_name presumably labels the output for this dataframe — confirm in src.eda.
plot_top_terms_per_category(ct_high_grades, df_name="ct_high_grades")

## Top disease site locations

Note: The disease location group is assigned at the study level, not per patient.

In [None]:
# Plot the number of records and the number of studies for each disease site.
plot_number_of_records_and_studies_by_disease_site(ct)

## Top adverse event terms

This section visualizes the most frequent adverse event (AE) terms across all records and those with grade ≥ 3.

### All records

In [None]:
# Most frequent AE terms across all records.
plot_top_ae_terms(ct)

### Grade 3+

In [None]:
# Most frequent AE terms among grade ≥3 records only.
plot_top_ae_terms(ct_high_grades)