In [97]:
import os
import pandas as pd
import re
import unicodedata

In [98]:
# Get the PROJECT ROOT (biomed-extractor/)
PROJECT_ROOT = 'c:\\Users\\elena.jolkver\\Documents\\github\\biomed_extractor'

# Data directory at top level
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

def load_trials_json(filepath, filename):
    path = os.path.join(filepath, filename)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"File not found: {path}")
    df = pd.read_json(path)
    print(f"Loaded {len(df)} records from {filename}")
    return df

df_json = load_trials_json(filepath = DATA_DIR, filename='example_trials.json')
print(df_json.head())

Loaded 100 records from example_trials.json
                                     protocolSection  \
0  {'identificationModule': {'nctId': 'NCT0313227...   
1  {'identificationModule': {'nctId': 'NCT0642423...   
2  {'identificationModule': {'nctId': 'NCT0313145...   
3  {'identificationModule': {'nctId': 'NCT0525613...   
4  {'identificationModule': {'nctId': 'NCT0256551...   

                                   annotationSection  \
0  {'annotationModule': {'unpostedAnnotation': {'...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                      derivedSection  hasResults  \
0  {'miscInfoModule': {'versionHolder': '2025-07-...       False   
1  {'miscInfoModule': {'versionHolder': '2025-07-...        True   
2  {'miscInfoModule': {'versionHolder': '2025-07-...        True   
3  {'miscI

In [99]:
# Helper function to flatten list of dictionaries
def flatten_list_of_dicts(lst, keys):
    if isinstance(lst, list):
        return ['; '.join(f"{k}: {d.get(k, '')}" for k in keys if isinstance(d, dict)) for d in lst]
    return lst

# Define a function to clean the text
def clean_text(text):
    if pd.isna(text):
        return ''
    # Normalize encoding
    text = unicodedata.normalize('NFKD', text)
    # Remove HTML/XML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove non-printable characters
    text = ''.join(c for c in text if c.isprintable()) 
    # Remove asterisks
    text = text.replace('*', '; ')
    # Remove asterisks at beginning of sentences
    text = text.replace(':;', ':')
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # drop trademark symbols
    text = re.sub(r'[\u00AE\u2122\u2120]', '', text)
    # Convert to lowercase
    text = text.lower() 

    return text



# Function to split eligibility into inclusion and exclusion criteria
def split_eligibility(text):
    inclusion = ''
    exclusion = ''
    if isinstance(text, str):
        text_lower = text.lower()
        inc_start = text_lower.find('inclusion criteria:')
        exc_start = text_lower.find('exclusion criteria:')
        if inc_start != -1 and exc_start != -1:
            inclusion = text[inc_start + len('inclusion criteria:'):exc_start].strip()
            exclusion = text[exc_start + len('exclusion criteria:'):].strip()
        elif inc_start != -1:
            inclusion = text[inc_start + len('inclusion criteria:'):].strip()
        elif exc_start != -1:
            exclusion = text[exc_start + len('exclusion criteria:'):].strip()
    return pd.Series([inclusion, exclusion])



def extract_elegibility_trtm_from_clinicaltrials(df):
    """    Extracts eligibility and treatment information from clinical trials DataFrame. """
    # Normalize from json to pandas df
    protocol_df = pd.json_normalize(df['protocolSection']) 
    # Apply flattening using .loc
    protocol_df.loc[:, 'interventions_clean'] = protocol_df['armsInterventionsModule.interventions'].apply(
        lambda x: flatten_list_of_dicts(x, ['type', 'name', 'description'])
    )

    protocol_df.loc[:, 'primaryOutcomes_clean'] = protocol_df['outcomesModule.primaryOutcomes'].apply(
        lambda x: flatten_list_of_dicts(x, ['measure', 'description', 'timeFrame'])
    )
    # Explode each list-based column
    protocol_df = protocol_df.explode('interventions_clean') \
                        .explode('primaryOutcomes_clean') \
                        .reset_index(drop=True)
    # extract relevant information from the cleaned columns
    protocol_df['intervention_name'] = protocol_df['interventions_clean'].str.extract(r'name:\s*(.*?)\s*;?\s*description:')
    protocol_df['outcomes_name'] = protocol_df['primaryOutcomes_clean'].str.extract(r'measure:\s*(.*?)\s*;?\s*description:')
    protocol_df['intervention_name_clean'] = protocol_df['intervention_name'].str.replace(r'\s*\d+\s*(mg|mcg|g|ml)', '', case=False, regex=True) 


    # Group by study ID and aggregate other columns
    combined_df = protocol_df.groupby('identificationModule.nctId').agg({
        'descriptionModule.briefSummary': lambda x: '; '.join(pd.unique(x.dropna())),
        'descriptionModule.detailedDescription': lambda x: '; '.join(pd.unique(x.dropna())),
        'eligibilityModule.eligibilityCriteria': lambda x: '; '.join(pd.unique(x.dropna())),
        'intervention_name_clean': lambda x: '; '.join(pd.unique(x.dropna())),
        'outcomes_name': lambda x: '; '.join(pd.unique(x.dropna()))
    }).reset_index()  

    # Apply the cleaning function to the column
    combined_df['eligibility_clean'] = combined_df['eligibilityModule.eligibilityCriteria'].apply(clean_text)

    # Apply the function to split the column
    combined_df[['inclusion_criteria', 'exclusion_criteria']] = combined_df['eligibility_clean'].apply(split_eligibility)

        # Drop unnecessary columns cleaned DataFrame
    cleaned_df = combined_df[[
        'identificationModule.nctId',
        'descriptionModule.briefSummary',
        'descriptionModule.detailedDescription',
        'inclusion_criteria', 
        'exclusion_criteria',
        'intervention_name_clean',
        'outcomes_name'
    ]].copy()

    return cleaned_df

#protocol_df = pd.json_normalize(df_json['protocolSection'])
#protocol_df.head()

mydf = extract_elegibility_trtm_from_clinicaltrials(df_json)
mydf.head()

Unnamed: 0,identificationModule.nctId,descriptionModule.briefSummary,descriptionModule.detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00105105,The purpose of this study is to evaluate the e...,"This will be a double blind, placebo controlle...",diagnosis of alzheimer's disease; women must h...,women with an intact uterus; a clinically sign...,Mifepristone,effects on cognition
1,NCT00160147,This is a 10-week study with bifeprunox and pl...,,diagnosis of dementia of the alzheimer's type,history of seizure disorder; clinically signif...,bifeprunox; Placebo,Brief Psychiatric Rating Scale (BPRS) Total Score
2,NCT00299988,The overall goal of this double-blind Phase II...,Abnormal processing of the beta-amyloid protei...,1. diagnosis of probable alzheimer's disease (...,1. non-alzheimer dementia.2. active renal dise...,Intravenous Immunoglobulin; Placebo,ADAS-Cog; CGIC
3,NCT00334568,Clinical features in patients with the familia...,,meets the national institute of neurological a...,has a history of or suffers from claustrophobi...,Rosiglitazone XR (extended release) oral table...,Change in global and regional cerebral glucose...
4,NCT00362024,MK0952 is a phosphodiesterase type IV (PDE4) i...,,"male or females; age \>/= 55 years, with mild-...",patients must not be living in nursing home or...,MK0952; Comparator: Placebo,


## Continue with single-trial processing

In [100]:
# Get the PROJECT ROOT (biomed-extractor/)
PROJECT_ROOT = 'c:\\Users\\elena.jolkver\\Documents\\github\\biomed_extractor'

# Data directory at top level
DATA_DIR = os.path.join(PROJECT_ROOT, 'data\\annotated\\ctg-studies_for_gold_individual')
#data\annotated\ctg-studies_for_gold_individual\NCT00667810.json

df_json_single = load_trials_json(filepath = DATA_DIR, filename='NCT00667810.json')
print(df_json_single.head())
#mydf_single = extract_elegibility_trtm_from_clinicaltrials(df_json_single)
#mydf_single.head()

Loaded 20 records from NCT00667810.json
                                                              protocolSection  \
identificationModule        {'nctId': 'NCT00667810', 'nctIdAliases': ['NCT...   
statusModule                {'statusVerifiedDate': '2015-12', 'overallStat...   
sponsorCollaboratorsModule  {'responsibleParty': {'type': 'SPONSOR'}, 'lea...   
oversightModule                                     {'oversightHasDmc': True}   
descriptionModule           {'briefSummary': 'This is a study to evaluate ...   

                           resultsSection derivedSection  hasResults  
identificationModule                  NaN            NaN        True  
statusModule                          NaN            NaN        True  
sponsorCollaboratorsModule            NaN            NaN        True  
oversightModule                       NaN            NaN        True  
descriptionModule                     NaN            NaN        True  


In [101]:
# Normalize from json to pandas df
protocol_df = pd.json_normalize(df_json_single['protocolSection']) 
protocol_df.head()

Unnamed: 0,nctId,nctIdAliases,secondaryIdInfos,briefTitle,officialTitle,orgStudyIdInfo.id,organization.fullName,organization.class,statusVerifiedDate,overallStatus,...,eligibilityCriteria,healthyVolunteers,sex,minimumAge,maximumAge,stdAges,overallOfficials,locations,references,seeAlsoLinks
0,NCT00667810,[NCT00909623],"[{'id': 'B2521001', 'type': 'OTHER', 'domain':...",Study Evaluating The Efficacy And Safety Of Ba...,"A Phase 3, Multicenter, Randomized, Double-bli...",3133K1-3000,Pfizer,INDUSTRY,,,...,,,,,,,,,,
1,,,,,,,,,2015-12,TERMINATED,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [102]:
protocol_df.columns

Index(['nctId', 'nctIdAliases', 'secondaryIdInfos', 'briefTitle',
       'officialTitle', 'orgStudyIdInfo.id', 'organization.fullName',
       'organization.class', 'statusVerifiedDate', 'overallStatus',
       'whyStopped', 'studyFirstSubmitDate', 'studyFirstSubmitQcDate',
       'resultsFirstSubmitDate', 'resultsFirstSubmitQcDate',
       'lastUpdateSubmitDate', 'expandedAccessInfo.hasExpandedAccess',
       'startDateStruct.date', 'primaryCompletionDateStruct.date',
       'primaryCompletionDateStruct.type', 'completionDateStruct.date',
       'completionDateStruct.type', 'studyFirstPostDateStruct.date',
       'studyFirstPostDateStruct.type', 'resultsFirstPostDateStruct.date',
       'resultsFirstPostDateStruct.type', 'lastUpdatePostDateStruct.date',
       'lastUpdatePostDateStruct.type', 'responsibleParty.type',
       'leadSponsor.name', 'leadSponsor.class', 'oversightHasDmc',
       'briefSummary', 'conditions', 'keywords', 'studyType', 'phases',
       'designInfo.allocation',

In [103]:
# Apply flattening using .loc
# THIS NEEDED TO CHANGE
protocol_df.loc[:, 'interventions_clean'] = protocol_df['interventions'].apply(
    lambda x: flatten_list_of_dicts(x, ['type', 'name', 'description'])
)

protocol_df.loc[:, 'primaryOutcomes_clean'] = protocol_df['primaryOutcomes'].apply(
    lambda x: flatten_list_of_dicts(x, ['measure', 'description', 'timeFrame'])
    )

# Explode each list-based column
protocol_df = protocol_df.explode('interventions_clean') \
                    .explode('primaryOutcomes_clean') \
                    .reset_index(drop=True)
# extract relevant information from the cleaned columns
protocol_df['intervention_name'] = protocol_df['interventions_clean'].str.extract(r'name:\s*(.*?)\s*;?\s*description:')
protocol_df['outcomes_name'] = protocol_df['primaryOutcomes_clean'].str.extract(r'measure:\s*(.*?)\s*;?\s*description:')
protocol_df['intervention_name_clean'] = protocol_df['intervention_name'].str.replace(r'\s*\d+\s*(mg|mcg|g|ml)', '', case=False, regex=True) 


# Fill NaNs in each row with the first non-null value from that row

non_null_value = protocol_df['nctId'].dropna().unique()
if len(non_null_value) == 1:
    protocol_df['nctId'] = protocol_df['nctId'].fillna(non_null_value[0])


#protocol_df['nctId'] = protocol_df.apply(lambda row: row[row.first_valid_index()] if pd.isna(row['nctId']) else row['nctId'], axis=1)
print(protocol_df.head())


         nctId   nctIdAliases  \
0  NCT00667810  [NCT00909623]   
1  NCT00667810            NaN   
2  NCT00667810            NaN   
3  NCT00667810            NaN   
4  NCT00667810            NaN   

                                    secondaryIdInfos  \
0  [{'id': 'B2521001', 'type': 'OTHER', 'domain':...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                          briefTitle  \
0  Study Evaluating The Efficacy And Safety Of Ba...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                       officialTitle orgStudyIdInfo.id  \
0  A Phase 3, Multicenter, Randomized, Double-bli... 

In [104]:
# make sure that all required columns are present
required_columns = [
    'nctId',
    'briefSummary',
    'detailedDescription',
    'eligibilityCriteria',
    'intervention_name_clean',
    'outcomes_name'
]

for col in required_columns:
    if col not in protocol_df.columns:
        protocol_df[col] = pd.NA


In [105]:
# Group by study ID and aggregate other columns
combined_df = protocol_df.groupby('nctId').agg({
    'briefSummary': lambda x: '; '.join(pd.unique(x.dropna())),
    'detailedDescription': lambda x: '; '.join(pd.unique(x.dropna())),
    'eligibilityCriteria': lambda x: '; '.join(pd.unique(x.dropna())),
    'intervention_name_clean': lambda x: '; '.join(pd.unique(x.dropna())),
    'outcomes_name': lambda x: '; '.join(pd.unique(x.dropna()))
}).reset_index() 
#combined_df 




print(combined_df)
# Apply the cleaning function to the column
combined_df['eligibility_clean'] = combined_df['eligibilityCriteria'].apply(clean_text)

# Apply the function to split the column
combined_df[['inclusion_criteria', 'exclusion_criteria']] = combined_df['eligibility_clean'].apply(split_eligibility)

    # Drop unnecessary columns cleaned DataFrame
cleaned_df = combined_df[[
    'nctId',
    'briefSummary',
    'detailedDescription',
    'inclusion_criteria', 
    'exclusion_criteria',
    'intervention_name_clean',
    'outcomes_name'
]].copy()

cleaned_df


         nctId                                       briefSummary  \
0  NCT00667810  This is a study to evaluate the efficacy and s...   

  detailedDescription                                eligibilityCriteria  \
0                      Inclusion Criteria:\n\n* Diagnosis of probable...   

  intervention_name_clean                                      outcomes_name  
0   bapineuzumab; placebo  The Change From Baseline in the Alzheimer's Di...  


Unnamed: 0,nctId,briefSummary,detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00667810,This is a study to evaluate the efficacy and s...,,"diagnosis of probable alzheimer disease (ad), ...",significant neurological disease other than ad...,bapineuzumab; placebo,The Change From Baseline in the Alzheimer's Di...


In [106]:
# wrap into function
def extract_elegibility_trtm_from_clinicaltrial(df):
    protocol_df = pd.json_normalize(df['protocolSection']) 
    # Apply flattening using .loc
    # THIS NEEDED TO CHANGE
    protocol_df.loc[:, 'interventions_clean'] = protocol_df['interventions'].apply(
        lambda x: flatten_list_of_dicts(x, ['type', 'name', 'description'])
    )

    protocol_df.loc[:, 'primaryOutcomes_clean'] = protocol_df['primaryOutcomes'].apply(
        lambda x: flatten_list_of_dicts(x, ['measure', 'description', 'timeFrame'])
        )

    # Explode each list-based column
    protocol_df = protocol_df.explode('interventions_clean') \
                        .explode('primaryOutcomes_clean') \
                        .reset_index(drop=True)
    # extract relevant information from the cleaned columns
    protocol_df['intervention_name'] = protocol_df['interventions_clean'].str.extract(r'name:\s*(.*?)\s*;?\s*description:')
    protocol_df['outcomes_name'] = protocol_df['primaryOutcomes_clean'].str.extract(r'measure:\s*(.*?)\s*;?\s*description:')
    protocol_df['intervention_name_clean'] = protocol_df['intervention_name'].str.replace(r'\s*\d+\s*(mg|mcg|g|ml)', '', case=False, regex=True) 


    # Fill NaNs in each row with the first non-null value from that row

    non_null_value = protocol_df['nctId'].dropna().unique()
    if len(non_null_value) == 1:
        protocol_df['nctId'] = protocol_df['nctId'].fillna(non_null_value[0])

    # make sure that all required columns are present
    required_columns = [
        'nctId',
        'briefSummary',
        'detailedDescription',
        'eligibilityCriteria',
        'intervention_name_clean',
        'outcomes_name'
    ]

    for col in required_columns:
        if col not in protocol_df.columns:
            protocol_df[col] = pd.NA

    # Group by study ID and aggregate other columns
    combined_df = protocol_df.groupby('nctId').agg({
        'briefSummary': lambda x: '; '.join(pd.unique(x.dropna())),
        'detailedDescription': lambda x: '; '.join(pd.unique(x.dropna())),
        'eligibilityCriteria': lambda x: '; '.join(pd.unique(x.dropna())),
        'intervention_name_clean': lambda x: '; '.join(pd.unique(x.dropna())),
        'outcomes_name': lambda x: '; '.join(pd.unique(x.dropna()))
    }).reset_index() 
    # Apply the cleaning function to the column
    combined_df['eligibility_clean'] = combined_df['eligibilityCriteria'].apply(clean_text)

    # Apply the function to split the column
    combined_df[['inclusion_criteria', 'exclusion_criteria']] = combined_df['eligibility_clean'].apply(split_eligibility)

        # Drop unnecessary columns cleaned DataFrame
    cleaned_df = combined_df[[
        'nctId',
        'briefSummary',
        'detailedDescription',
        'inclusion_criteria', 
        'exclusion_criteria',
        'intervention_name_clean',
        'outcomes_name'
    ]].copy()

    return cleaned_df

In [107]:
mydf = extract_elegibility_trtm_from_clinicaltrial(df_json_single)
mydf

Unnamed: 0,nctId,briefSummary,detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00667810,This is a study to evaluate the efficacy and s...,,"diagnosis of probable alzheimer disease (ad), ...",significant neurological disease other than ad...,bapineuzumab; placebo,The Change From Baseline in the Alzheimer's Di...


### Refactoring both functions to increase mainatinability and clarity

In [108]:
def flatten_and_extract(df, intervention_col, outcome_col):
    df['interventions_clean'] = df[intervention_col].apply(
        lambda x: flatten_list_of_dicts(x, ['type', 'name', 'description'])
    )
    df['primaryOutcomes_clean'] = df[outcome_col].apply(
        lambda x: flatten_list_of_dicts(x, ['measure', 'description', 'timeFrame'])
    )
    df = df.explode('interventions_clean').explode('primaryOutcomes_clean').reset_index(drop=True)
    df['intervention_name'] = df['interventions_clean'].str.extract(r'name:\s*(.*?)\s*;?\s*description:')
    df['outcomes_name'] = df['primaryOutcomes_clean'].str.extract(r'measure:\s*(.*?)\s*;?\s*description:')
    df['intervention_name_clean'] = df['intervention_name'].str.replace(r'\s*\d+\s*(mg|mcg|g|ml)', '', case=False, regex=True)
    return df

def ensure_columns(df, required_columns):
    for col in required_columns:
        if col not in df.columns:
            df[col] = pd.NA
    return df

def aggregate_and_clean(df, id_col, summary_col, desc_col, elig_col):
    combined_df = df.groupby(id_col).agg({
        summary_col: lambda x: '; '.join(pd.unique(x.dropna())),
        desc_col: lambda x: '; '.join(pd.unique(x.dropna())),
        elig_col: lambda x: '; '.join(pd.unique(x.dropna())),
        'intervention_name_clean': lambda x: '; '.join(pd.unique(x.dropna())),
        'outcomes_name': lambda x: '; '.join(pd.unique(x.dropna()))
    }).reset_index()

    combined_df['eligibility_clean'] = combined_df[elig_col].apply(clean_text)
    combined_df[['inclusion_criteria', 'exclusion_criteria']] = combined_df['eligibility_clean'].apply(split_eligibility)

    return combined_df[[id_col, summary_col, desc_col, 'inclusion_criteria', 'exclusion_criteria', 'intervention_name_clean', 'outcomes_name']]

def extract_from_clinicaltrial(df):
    protocol_df = pd.json_normalize(df['protocolSection'])
    protocol_df = flatten_and_extract(protocol_df, 'interventions', 'primaryOutcomes')

    if protocol_df['nctId'].dropna().nunique() == 1:
        protocol_df['nctId'] = protocol_df['nctId'].fillna(protocol_df['nctId'].dropna().iloc[0])

    required = ['nctId', 'briefSummary', 'detailedDescription', 'eligibilityCriteria']
    protocol_df = ensure_columns(protocol_df, required)

    return aggregate_and_clean(protocol_df, 'nctId', 'briefSummary', 'detailedDescription', 'eligibilityCriteria')

def extract_from_clinicaltrials(df):
    protocol_df = pd.json_normalize(df['protocolSection'])
    protocol_df = flatten_and_extract(protocol_df, 'armsInterventionsModule.interventions', 'outcomesModule.primaryOutcomes')

    return aggregate_and_clean(
        protocol_df,
        'identificationModule.nctId',
        'descriptionModule.briefSummary',
        'descriptionModule.detailedDescription',
        'eligibilityModule.eligibilityCriteria'
    )


In [109]:
mydf = extract_from_clinicaltrial(df_json_single)
mydf

Unnamed: 0,nctId,briefSummary,detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00667810,This is a study to evaluate the efficacy and s...,,"diagnosis of probable alzheimer disease (ad), ...",significant neurological disease other than ad...,bapineuzumab; placebo,The Change From Baseline in the Alzheimer's Di...


In [110]:
mydf = extract_from_clinicaltrials(df_json)
mydf

Unnamed: 0,identificationModule.nctId,descriptionModule.briefSummary,descriptionModule.detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00105105,The purpose of this study is to evaluate the e...,"This will be a double blind, placebo controlle...",diagnosis of alzheimer's disease; women must h...,women with an intact uterus; a clinically sign...,Mifepristone,effects on cognition
1,NCT00160147,This is a 10-week study with bifeprunox and pl...,,diagnosis of dementia of the alzheimer's type,history of seizure disorder; clinically signif...,bifeprunox; Placebo,Brief Psychiatric Rating Scale (BPRS) Total Score
2,NCT00299988,The overall goal of this double-blind Phase II...,Abnormal processing of the beta-amyloid protei...,1. diagnosis of probable alzheimer's disease (...,1. non-alzheimer dementia.2. active renal dise...,Intravenous Immunoglobulin; Placebo,ADAS-Cog; CGIC
3,NCT00334568,Clinical features in patients with the familia...,,meets the national institute of neurological a...,has a history of or suffers from claustrophobi...,Rosiglitazone XR (extended release) oral table...,Change in global and regional cerebral glucose...
4,NCT00362024,MK0952 is a phosphodiesterase type IV (PDE4) i...,,"male or females; age \>/= 55 years, with mild-...",patients must not be living in nursing home or...,MK0952; Comparator: Placebo,
...,...,...,...,...,...,...,...
95,NCT05670912,In clinical trials of preclinical pharmacodyna...,Wei Li Bai Capsule is composed of sodium ferul...,1. age 50 to 80 years old (including 50 and 80...,"1. during screening, mri examination showed si...",Wei Li Bai capsules; Placebo Comparator of Wei...,Alzheimer's Disease Assessment Scale-Cognitive...
96,NCT05744401,A long-term extension study to evaluate the sa...,"This is a Phase 2, parallel-group, long-term e...",completion of the planned treatment period in ...,participants deemed not able to provide consen...,AL002,Safety and tolerability as measured by number ...
97,NCT06424236,The purpose of this study is to assess the saf...,Alzheimer's disease (AD) is defined by the pre...,between 18-80 years of age; individuals who kn...,history or presence of brain mri scans indicat...,Gantenerumab,Change From Baseline in Composite [11C] Pittsb...
98,NCT06677203,The main purpose of this study is to evaluate ...,,1. male or female age 50 to 80 years.2. a clin...,1. any medical or neurological/neurodegenerati...,ASN51; Placebo,Number of Participants With Adverse Events (AE...


## Process clinical trial csv file

In [113]:
# Get the PROJECT ROOT (biomed-extractor/)
PROJECT_ROOT = 'c:\\Users\\elena.jolkver\\Documents\\github\\biomed_extractor'

# Data directory at top level
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

def load_trials_csv(filepath, filename):
    path = os.path.join(filepath, filename)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"File not found: {path}")
    df = pd.read_csv(path)
    print(f"Loaded {len(df)} records from {filename}")
    df_csv_focused = df[['NCT Number', 'Brief Summary','Interventions', 'Primary Outcome Measures']].copy()
    required = ['detailedDescription', 'inclusion_criteria', 'exclusion_criteria']
    df_csv_focused = ensure_columns(df_csv_focused, required)

    standardized_columns = {
    'NCT Number': 'nctId',
    'Brief Summary': 'briefSummary',
    'descriptionModule.detailedDescription': 'detailedDescription',
    'inclusion_criteria': 'inclusion_criteria',
    'exclusion_criteria': 'exclusion_criteria',
    'Interventions': 'intervention_name_clean',
    'Primary Outcome Measures': 'outcomes_name'
    }

    df_csv_focused = df_csv_focused.rename(columns=standardized_columns)
    
    desired_order = [
        'nctId',
        'briefSummary',
        'detailedDescription',
        'inclusion_criteria',
        'exclusion_criteria',
        'intervention_name_clean',
        'outcomes_name'
    ]

    df_csv_focused = df_csv_focused[desired_order]

    return df_csv_focused


df_csv = load_trials_csv(filepath = DATA_DIR, filename ='example_trials.csv')
df_csv

Loaded 100 records from example_trials.csv


Unnamed: 0,nctId,briefSummary,detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT03132272,Efficacy of immunoadsorption for treatment of ...,,,,DEVICE: Immunoadsorption with Globaffin,"Changes in cerebral blood flow, estimated by A..."
1,NCT06424236,The purpose of this study is to assess the saf...,,,,DRUG: Gantenerumab,Change From Baseline in Composite [11C] Pittsb...
2,NCT03131453,The purpose of this study is to determine the ...,,,,DRUG: CNP520 50mg|DRUG: CNP520 15mg|OTHER: Mat...,Time to Event (Diagnosis of Mild Cognitive Imp...
3,NCT05256134,A study to evaluate the efficacy and safety of...,,,,DRUG: Gantenerumab|DRUG: Placebo,"Change From Baseline in PACC-5 Score, The PACC..."
4,NCT02565511,The purpose of this study was to test whether ...,,,,BIOLOGICAL: CAD106 Immunotherapy|OTHER: Placeb...,Time to Event (Diagnosis of Mild Cognitive Imp...
...,...,...,...,...,...,...,...
95,NCT00299988,The overall goal of this double-blind Phase II...,,,,DRUG: Intravenous Immunoglobulin|OTHER: Placebo,"ADAS-Cog, The Alzheimer's Disease Assessment S..."
96,NCT02221947,This study is being done to evaluate the safet...,,,,DRUG: Bryostatin 1|DRUG: Placebo,Number of Participants With Adverse Events as ...
97,NCT00679627,The purpose of this study is to compare the ef...,,,,DRUG: Galantamine|DRUG: Placebo,Change From Baseline in the Mini-Mental State ...
98,NCT02670083,"This randomized, double-blind, placebo-control...",,,,DRUG: Crenezumab|DRUG: Placebo,Change From Baseline to Week 105 in Clinical D...


### Extract the protocol section from the single trial csv file

```python

In [114]:
# Get the PROJECT ROOT (biomed-extractor/)
PROJECT_ROOT = 'c:\\Users\\elena.jolkver\\Documents\\github\\biomed_extractor'

# Data directory at top level
DATA_DIR = os.path.join(PROJECT_ROOT, 'data\\annotated\\ctg-studies_for_gold_individual_csv')


mydf_csv = load_trial_csv(filepath = DATA_DIR, filename='NCT00667810.csv')
mydf_csv

Loaded 1 records from NCT00667810.csv


Unnamed: 0,nctId,briefSummary,Conditions,intervention_name_clean,outcomes_name,detailedDescription,inclusion_criteria,exclusion_criteria
0,NCT00667810,This is a study to evaluate the efficacy and s...,Alzheimer Disease,DRUG: bapineuzumab|DRUG: bapineuzumab|DRUG: pl...,The Change From Baseline in the Alzheimer's Di...,,,
