In [1]:
from ingest_data import fetch_data
from segmentation import segmentation
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Identifier handed to the project-local `fetch_data` helper.
dataset_name = "louisbrulenaudet/clinical-trials"
# NOTE(review): presumably a Hugging Face dataset id that yields a DatasetDict
# (a 'train' split is indexed from `ds` further down) — confirm in ingest_data.
ds = fetch_data(dataset_name)

In [None]:
# Register tqdm's pandas integration, labelling its progress bars with this text.
# NOTE(review): presumably consumed inside `segmentation()` — confirm there.
tqdm.pandas(desc="Segmenting Criteria")

In [None]:
# Display the fetched dataset object to inspect what was loaded.
ds

In [None]:
# Hold out 10% of the 'train' split as a test portion; the seed is fixed (42)
# so the partition is the same on every run.
split = ds['train'].train_test_split(test_size=0.1, seed=42)
# Display the resulting split object.
split

In [None]:
# Convert the training portion of the split to a pandas DataFrame.
df=split["train"].to_pandas()
# Preview the first rows.
df.head()

In [None]:
# List the available columns before choosing which ones to keep.
df.columns

In [None]:
# Columns retained from the raw trial records, in output order.
# Both the train and the test frames are subset with this list.
COLUMNS_TO_KEEP = [
    "nct_id",
    "eligibility_criteria",
    "overall_status",
    "phases",
    "study_type",
    "minimum_age",
    "maximum_age",
    "sex",
    "healthy_volunteers",
    "conditions",
    "keywords",
    "interventions",
    "mesh_terms",
    "locations",
    "brief_title",
    "official_title",
    "brief_summary",
]

In [None]:
# Keep only the columns of interest and discard trials that have no
# eligibility text. Selection and row filtering are chained into a single
# expression that returns a new DataFrame, instead of calling
# `dropna(inplace=True)` on a frame that was itself just produced by column
# selection; later column assignments on `df` then act on an independent frame.
df = df[COLUMNS_TO_KEEP].dropna(subset=['eligibility_criteria'])

In [None]:
# Character count of each trial's eligibility text, stored as a new column.
df['criteria_length'] = df['eligibility_criteria'].str.len()

# Histogram (50 bins, with a KDE overlay) of those lengths across trials.
plt.figure(figsize=(12, 6))
sns.histplot(df['criteria_length'].dropna(), bins=50, kde=True)
# The title previously ended with a stray, unmatched ')'.
plt.title('Distribution of Eligibility Criteria Text Length')
plt.xlabel('No of Char')
plt.ylabel('No of Trials')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

In [None]:
# Run the project-local segmentation step over the training frame.
# NOTE(review): presumably adds 'eligibility_criteria_clean' and
# 'segmented_criteria' (both appear among the columns of the saved train.csv)
# — confirm in the segmentation module.
df=segmentation(df)

In [None]:
# Build the held-out frame the same way as the training frame: convert to
# pandas, keep the selected columns, and drop trials without eligibility text.
# The steps are chained so that no in-place `dropna` is applied to a frame
# obtained by column selection.
df1 = (
    split["test"]
    .to_pandas()[COLUMNS_TO_KEEP]
    .dropna(subset=['eligibility_criteria'])
)
# Apply the same project-local segmentation step used for the training frame.
df1 = segmentation(df1)

In [11]:
# Load the train and test frames from the processed-data directory (train
# first, then test).
# NOTE(review): this rebinds `df` and `df1`, discarding the frames built by the
# earlier cells, and the same two paths are overwritten by the notebook's last
# cell with frames that no longer carry 'segmented_criteria' — confirm that a
# fresh Restart & Run All still finds the expected columns here.
df, df1 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/pro/train.csv", "../data/pro/test.csv")
)

In [12]:
# Print the (rows, columns) shape of the train frame followed by the test frame.
print(f"{df.shape}, {df1.shape}")

(486804, 21), (54190, 20)


In [13]:
# Show the columns of the reloaded training frame.
df.columns

Index(['Unnamed: 0', 'nct_id', 'eligibility_criteria', 'overall_status',
       'phases', 'study_type', 'minimum_age', 'maximum_age', 'sex',
       'healthy_volunteers', 'conditions', 'keywords', 'interventions',
       'mesh_terms', 'locations', 'brief_title', 'official_title',
       'brief_summary', 'criteria_length', 'eligibility_criteria_clean',
       'segmented_criteria'],
      dtype='object')

In [14]:
# One row per criterion: explode 'segmented_criteria', expose it under the name
# 'criterion_text', and discard rows where no criterion text is present.
# NOTE(review): when `df` comes from read_csv the 'segmented_criteria' values
# are presumably strings rather than lists, in which case explode leaves the
# row count unchanged (the recorded shapes show 486804 rows before and after)
# — confirm whether the lists should be parsed first.
df_exploded = (
    df.explode('segmented_criteria')
    .rename(columns={'segmented_criteria': 'criterion_text'})
    .dropna(subset=['criterion_text'])
)

# Output layout: every original column except the raw/intermediate text
# fields, with 'criterion_text' placed in the second position.
final_columns = [
    col
    for col in df.columns
    if col not in ('eligibility_criteria', 'criteria_length', 'segmented_criteria')
]
final_columns[1:1] = ['criterion_text']

# Independent copy restricted to the chosen columns.
df_final = df_exploded.loc[:, final_columns].copy()

In [15]:
# Report the (rows, columns) shape of the final training frame.
df_final.shape

(486804, 19)

In [17]:
# Same reshaping as for the training frame, applied to the held-out frame:
# explode 'segmented_criteria' into 'criterion_text' and discard rows where no
# criterion text is present.
# NOTE(review): when `df1` comes from read_csv the 'segmented_criteria' values
# are presumably strings rather than lists, in which case explode leaves the
# row count unchanged (the recorded shapes show 54190 rows before and after)
# — confirm whether the lists should be parsed first.
df_exploded1 = (
    df1.explode('segmented_criteria')
    .rename(columns={'segmented_criteria': 'criterion_text'})
    .dropna(subset=['criterion_text'])
)

# Output layout: every column of `df1` except the raw/intermediate text fields,
# with 'criterion_text' placed in the second position.
final_columns = [
    col
    for col in df1.columns
    if col not in ('eligibility_criteria', 'criteria_length', 'segmented_criteria')
]
final_columns[1:1] = ['criterion_text']

# Independent copy restricted to the chosen columns.
df_final1 = df_exploded1.loc[:, final_columns].copy()

In [18]:
# Report the (rows, columns) shape of the final held-out frame.
df_final1.shape

(54190, 19)

In [19]:
# Persist the final train/test frames, creating the output directory if needed.
os.makedirs("../data/pro", exist_ok=True)
# `index=False`: by default to_csv also writes the row index as an unnamed
# leading column, which comes back as 'Unnamed: 0' on the next read_csv (that
# column is already visible in the frames loaded from these files). Omitting
# the index stops another such column from being added on every save/load.
# NOTE(review): this removes the leading unnamed column from the written files
# — confirm no downstream reader relies on it (e.g. via `index_col=0`).
df_final.to_csv("../data/pro/train.csv", index=False)
df_final1.to_csv("../data/pro/test.csv", index=False)