In [53]:
import requests
import json
import os
import sys
import pandas as pd
import logging

In [22]:
class clindata_downloader:
    def __init__(self, base_url, params, outpath):
        self.base_url = base_url   # Specify the API Base URL for ClinicalTrials.gov API
        self.params = params        # Specify the query parameters
        self.outpath = outpath
        self.data_path = None
        self.combined_studies = None
        
    def fetch_data(self):
        # Initialize an empty list to store the data
        data_list = []

        while True:
            if next_page_token:
                # Add the nextPageToken to the parameters for subsequent requests
                params["pageToken"] = next_page_token

            # Sending the request
            response = requests.get(self.base_url, params=self.params)

            # Handling the response
            if response.status_code == 200:
                # Parse the JSON response
                data = response.json()
                studies = data.get('studies', [])  # Extract the list of studies

                data_path = os.path.join(outpath, "rawdata")
                
                # path where all the downloaded data from API will be saved in a folder named as rawdata
                os.makedirs(self.outpath, exist_ok=True)
                # Append studies to the data list
                
                data_list.extend(studies)

                page_filename = f"studies_page_{next_page_token}.json" if next_page_token else "studies_page_1.json"
                # dump to a json file

                with open(os.path.join(data_path, page_filename), "w") as file:
                    json.dump(data, file, indent=4)

            #   Check for the nextPageToken
                next_page_token = data.get("nextPageToken")
                print(next_page_token)

                # Update the parameters with the nextPageToken
                params["pageToken"] = next_page_token

                if not next_page_token:
                    print("No more pages to fetch.")
                    break

            else:
                print(f"Error: {response.status_code} - {response.text}")
                break

        # After all pages are fetched, save all data into a single file
        with open(os.path.join(data_path, "all_studies.json"), "w") as file:
            json.dump(data_list, file, indent=4)

        print(f"Fetched a total of {len(data_list)} studies.")
        self.data_path = data_path
        return data_list
    
    def load_json(self):
        with open(os.path.join(self.data_path, "all_studies.json")) as f:
            combined_studies = json.load(f)
            self.combined_studies = combined_studies
            return self.combined_studies
        
    
            
        
        
base_url = "https://clinicaltrials.gov/api/v2/studies"


# Parameters for the query
params = {
    "format": "json",  # Requesting JSON format
    "query.term": "AREA[LastUpdatePostDate]RANGE[2024-10-20,2024-10-21]",  # Essie expression
}

example_schema = {
 "trialId": "NCT00560521",
 "title": "Effect of Continuous Positive Airway Pressure on Fluid Absorption Among Patients With Pleural Effusion Due to Tuberculosis",
 "startDate": "2005-03-01",
 "endDate": "2007-03-01",
 "phase": "Other",
 "principalInvestigator": {
 "name": "Juliana F Oliveira",
 "affiliation": "Universidade Federal do Rio de Janeiro"
 },
 "locations": [
 {
 "facility": "Federal University of Rio de Janeiro",
 "city": "Rio de Janeiro",
 "country": "Brazil"
 }
 ],
 "eligibilityCriteria": "Inclusion Criteria:\n\nConfirmed diagnosis of pleural tuberculosis.\nPatients 18 years of age and older.\n\nExclusion criteria:\n\nBe under previous treatment of respirat ory physiotherapy.\nIrregular use or abandonment of the anti-TB standard regimen.\nTo fail one or more physiotherapy section.\nTo fail one or more radiological evaluation."
}



In [81]:
import re
    
class clindata_parser:
    def __init__(self, data_struct, schema, comb_studies, field_dict = None):
        self.data_struct = data_struct
        self.comb_studies = comb_studies
        self.target_schema = schema
        self.relevant_fields_df = None
        
        # Use the provided field_dict or set a default empty dictionary
        self.field_dict = field_dict if field_dict is not None else {}
        
        self.super_dict = None
        
        
    def xpath(a: str) -> str:
        '''function to modify XPath strings.'''
        if '/' in a:
            b = a.replace('/', '.')
            if 'Study' in b:
                return b.replace('.Study.', '')
        else:
            return a

    def map_datastruct(self):
        relevant_fields = {}        # relevant fields that look like keys in example schema 
        for field in self.target_schema.keys():
        #     print(field)
            if field == 'trialId':
                print(field)
                field_original = field
                field = 'id'
                result = [y for y in self.data_struct['Piece Name'] if re.search(f"{field}", str(y), re.IGNORECASE)]
                print(result)
        #         print(self.data_struct[self.data_struct['Piece Name'].isin(result)][['Piece Name', 'Classic XPath']])
                relevant_fields.update({field_original: self.data_struct[self.data_struct['Piece Name'].isin(result)][['Piece Name', 'Classic XPath']]})

            elif field == 'endDate':
                print(field)
                field_original = field
                field = 'CompletionDate'
                result = [y for y in self.data_struct['Piece Name'] if re.search(f"{field}", str(y), re.IGNORECASE)]
                print(result)
                relevant_fields.update({field_original: self.data_struct[self.data_struct['Piece Name'].isin(result)][['Piece Name', 'Classic XPath']]})

            elif field == 'principalInvestigator':
                print(field)
                field_original = field
                field = 'Investigator'
                result = [y for y in self.data_struct['Piece Name'] if re.search(f"{field}", str(y), re.IGNORECASE)]
                print(result)
                relevant_fields.update({field_original: self.data_struct[self.data_struct['Piece Name'].isin(result)][['Piece Name', 'Classic XPath']]})
    
    
            else:
                print(field)
                result = [y for y in self.data_struct['Piece Name'] if re.search(f"{field}", str(y), re.IGNORECASE)]
                print(result)
                relevant_fields.update({field: self.data_struct[self.data_struct['Piece Name'].isin(result)][['Piece Name', 'Classic XPath']]})
            print()
    
        relevant_fields_df = pd.concat(relevant_fields.values())
        self.relevant_fields_df = relevant_fields_df
        #     relevant_fields_df[relevant_fields_df['Piece Name'] == 'completionDateStruct']
        return self.relevant_fields_df


    def flatten_json(y, parent_key='', sep='.'):
        """
        Flattens a nested JSON object into a single level.
        Keys will be in the form 'parent.child.grandchild'.
        """
        items = []
        for k, v in y.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_json(v, new_key, sep=sep).items())
            elif isinstance(v, list):
                for i, item in enumerate(v):
                    items.extend(flatten_json({f"{k}[{i}]": item}, parent_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)
    
    
    def map_schema(self):
        super_dict = {}
        for n, x in enumerate(self.comb_studies):
        #     print(n)
            mapped_data = {}
            flat_json_1 = flatten_json(self.comb_studies[n])
        #     print(flat_json_1.keys())
            for item in self.field_dict:
                i = self.field_dict[item]
        #         print(item, i, '__________________')

                if isinstance(i, list):
        #             print(i)
                    try:
                        investigater = {}
                        for f in i:
                            col_a = xpath(self.relevant_fields_df[self.relevant_fields_df['Piece Name'] == f]['Classic XPath'].values[0])
        #                     print(col_a)
                            try:
                                investigater.update({f: flat_json_1[col_a]})
                            except:
                                pass
                        mapped_data.update({item : investigater})
                    except:
                        pass

                else:
                    if item == 'locations':
                        col_a = xpath(self.relevant_fields_df[self.relevant_fields_df['Piece Name'] == i]['Classic XPath'].values[0])
        #                 print(col_a)

                        try:
                            location = [a for a in flat_json_1.keys() if a.startswith(str(col_a))]
                            if len(location) == 1:
                                mapped_data.update({item : flat_json_1[col_a]})
                            else:
                #             print(location)
                                location_data = pd.Series({v: flat_json_1[v] for v in location})
                    #             print(location_data)
                                location_df = pd.DataFrame([range(1, len(location_data.index)+1), location_data, location_data.index]).T

                                location_df['keyid'] = [re.findall(r'\d+', string)[0] for string in location_df[2]]
                                location_df['subfield'] = [string.split('].')[1] for string in location_df[2]]
                                loc = {}
                    #             print(location_df)
                                for num, (g, h) in enumerate(location_df.groupby(by='keyid')):
                                    loc.update({num: {key: val for key, val in zip(h['subfield'], h[1])}})
                #                     print()
            #                     print(loc)
                                mapped_data.update({item : loc})
                        except:
                            pass

                    elif item == 'startDate':
                        col_a = xpath(self.relevant_fields_df[self.relevant_fields_df['Piece Name'] == self.field_dict[item]]['Classic XPath'].values[0])
                        try:
                            sdate = [a for a in flat_json_1.keys() if a.startswith(str(col_a))][0]
                            mapped_data.update({item : flat_json_1[sdate]})
        #                     print(sdate)
                        except:
                            pass
        #                 
                    elif item == 'endDate':
                        col_a = xpath(self.relevant_fields_df[self.relevant_fields_df['Piece Name'] == self.field_dict[item]]['Classic XPath'].values[0])
                        try:
                            edate = [a for a in flat_json_1.keys() if a.startswith(str(col_a))][0]
                            mapped_data.update({item : flat_json_1[edate]})
        #                     print(edate)
                        except:
                            pass

                    else:
        #                 print(i)
                        try:
                            col_a = xpath(self.relevant_fields_df[self.relevant_fields_df['Piece Name'] == self.field_dict[item]]['Classic XPath'].values[0])
        #                     print(col_a, '***')
        #                     print()
                            mapped_data.update({item : flat_json_1[col_a]})
                        except:
                            pass

            super_dict.update({n : mapped_data})
      
        self.super_dict = super_dict
        return self.super_dict
    
    # Function to clean text in criteria
    def clean_criteria(section):
        # Remove bullet points (e.g., '* ', '1. ', etc.), colons, and split into lines
        lines = re.split(r'\n+', section)
        cleaned_lines = [re.sub(r'^[\*\d\.\s]+|[:]', '', line).strip() for line in lines]
        # Filter out empty strings
        return [line for line in cleaned_lines if line]
    
    
    def inclusion_criteria(self):
        processed_data = {}
        
        # Initialize an empty list to collect error messages
        error_log = []
        
        for i, j in list(self.super_dict.items()):
        #     print(i, j.keys())
        #     print(j['eligibilityCriteria'])
            try:
                criteria_text = j['eligibilityCriteria']
        #         print(criteria_text)
                # Splitting the text into inclusion and exclusion sections
                sections = criteria_text.split("Exclusion Criteria:")
                inclusion_section = [y.split("Inclusion Criteria")[1].strip('\n') for y in [x for x in sections if x.startswith("Inclusion Criteria")]]
#                 print(inclusion_section)
                inclusion_criteria = clean_criteria("\n".join(inclusion_section))
#                 print(inclusion_criteria)
                processed_data.update({i: inclusion_criteria})
            except Exception as e:
                error_log.append(f"Error encountered with value i={i}: {e}")
                pass

        # Write all collected errors to a log file at the end of the loop
        if error_log:
            with open("error_log.txt", "w") as log_file:
                log_file.write("\n".join(error_log))
            print("Errors have been logged to 'error_log.txt'.")
        else:
            print("No errors encountered during the loop.")
            
            # Convert dictionary to DataFrame
        df = pd.DataFrame.from_dict(processed_data, orient="index").reset_index()
        df.columns = ["ID"] + [f"Sentence_{i}" for i in range(1, len(df.columns))]

        # df.columns = ["ID", "Sentence"]

        df['merged'] = df.apply(lambda row: ' '.join([str(val) for val in row if val not in [None, '']]), axis=1)

        print(df)
        return df
    
    

            
## Manually written dict based on the keywords found from searching keys from example schema, required as the field names are different.
# field_list = {'trialId':'nctId', 'title':'officialTitle', 'startDate':'startDateStruct', 'endDate':'completionDateStruct', 'phase':'Phase', 'principalInvestigator':['investigatorFullName',  'investigatorAffiliation'], 'locations':'locations\xa0⤷', 'eligibilityCriteria':'eligibilityCriteria'}


In [29]:
data_structure = pd.read_csv(os.getcwd()+'\\datastruct\\Protocolsection.csv', header=0, skiprows=1)
print(data_structure)


                   Piece Name        Unnamed: 1 Unnamed: 2 Unnamed: 3  \
0             protocolSection  Protocol Section        NaN     STRUCT   
1             ProtocolSection               NaN        NaN        NaN   
2        identificationModule               NaN        NaN     STRUCT   
3        IdentificationModule               NaN        NaN        NaN   
4                       nctId               NaN     NCT-ID       TEXT   
..                        ...               ...        ...        ...   
467       IPDSharingTimeFrame               NaN        NaN        NaN   
468            accessCriteria               NaN        NaN     MARKUP   
469  IPDSharingAccessCriteria               NaN        NaN        NaN   
470                       url               NaN        NaN       TEXT   
471             IPDSharingURL               NaN        NaN        NaN   

               Unnamed: 4                   Unnamed: 5 Unnamed: 6 Unnamed: 7  \
0         ProtocolSection               Stu

In [82]:

field_list = {'trialId':'nctId', 'title':'officialTitle', 'startDate':'startDateStruct', 'endDate':'completionDateStruct', 'phase':'Phase', 'principalInvestigator':['investigatorFullName',  'investigatorAffiliation'], 'locations':'locations\xa0⤷', 'eligibilityCriteria':'eligibilityCriteria'}
parse = clindata_parser(data_structure, example_schema, combined_studies, field_dict=field_list)


In [83]:
r_fields = parse.map_datastruct()
super_dict = parse.map_schema()

trialId
['identificationModule', 'IdentificationModule', 'nctId', 'NCTId', 'nctIdAliases', 'NCTIdAlias', 'orgStudyIdInfo', 'OrgStudyIdInfo', 'id', 'OrgStudyId', 'OrgStudyIdType', 'OrgStudyIdLink', 'secondaryIdInfos', 'SecondaryIdInfo', 'id', 'SecondaryId', 'SecondaryIdType', 'SecondaryIdDomain', 'SecondaryIdLink', 'numSecondaryIds\xa0✗', 'NumSecondaryIds', 'nctId', 'ExpandedAccessNCTId', 'statusForNctId', 'ExpandedAccessStatusForNCTId', 'nPtrsToThisExpAccNctId', 'NPtrsToThisExpAccNCTId', 'individual', 'ExpAccTypeIndividual', 'pmid', 'ReferencePMID', 'pmid', 'RetractionPMID', 'id', 'AvailIPDId']

title
['briefTitle', 'BriefTitle', 'officialTitle', 'OfficialTitle', 'investigatorTitle', 'ResponsiblePartyInvestigatorTitle', 'oldNameTitle', 'ResponsiblePartyOldNameTitle']

startDate
['startDateStruct', 'StartDateStruct', 'StartDate', 'StartDateType']

endDate
['primaryCompletionDateStruct', 'PrimaryCompletionDateStruct', 'PrimaryCompletionDate', 'PrimaryCompletionDateType', 'completionDateS

  location_data = pd.Series({v: flat_json_1[v] for v in location})


In [78]:
# data_path = os.getcwd()+'\\rawdata\\'
# with open(os.path.join(data_path, "all_studies.json")) as f:
#     combined_studies = json.load(f)

  location_data = pd.Series({v: flat_json_1[v] for v in location})


In [77]:
super_dict[0].keys()

dict_keys(['trialId', 'title', 'startDate', 'endDate', 'principalInvestigator', 'locations', 'eligibilityCriteria'])

In [84]:
incl_data = parse.inclusion_criteria()

Errors have been logged to 'error_log.txt'.
      ID                                         Sentence_1  \
0      0  Patient or legally authorized representative p...   
1      1  Non-small cell lunch cancer (NSLC) with untrea...   
2      2                                            Age≥18y   
3      3  Clinical diagnosis of mild to severe OSA in th...   
4      4  Has localized high-risk or very high-risk pros...   
..   ...                                                ...   
565  567  patients with planned extubation assigned to t...   
566  568                                home resident older   
567  569  Written informed consent and HIPAA authorizati...   
568  570                       A patient age of 14-70 years   
569  571  Patients with head and neck cancer scheduled f...   

                                            Sentence_2  \
0    Patient has or is intended to receive or be tr...   
1                NSLC lacks oncogenic driver mutations   
2    Histologically or cy

In [52]:
print(len(super_dict))

572


In [3]:
datastruct_protocol = pd.read_csv(os.getcwd()+'\\datastruct\\Protocolsection.csv', header=0, skiprows=1)
# print(datastruct_protocol)

# f.close()
data_path = os.getcwd()+'\\rawdata\\'
with open(os.path.join(data_path, "all_studies.json")) as f:
    combined_studies = json.load(f)
#     print(combined_studies)
len(combined_studies)


572

In [30]:
example_schema = {
 "trialId": "NCT00560521",
 "title": "Effect of Continuous Positive Airway Pressure on Fluid Absorption Among Patients With Pleural Effusion Due to Tuberculosis",
 "startDate": "2005-03-01",
 "endDate": "2007-03-01",
 "phase": "Other",
 "principalInvestigator": {
 "name": "Juliana F Oliveira",
 "affiliation": "Universidade Federal do Rio de Janeiro"
 },
 "locations": [
 {
 "facility": "Federal University of Rio de Janeiro",
 "city": "Rio de Janeiro",
 "country": "Brazil"
 }
 ],
 "eligibilityCriteria": "Inclusion Criteria:\n\nConfirmed diagnosis of pleural tuberculosis.\nPatients 18 years of age and older.\n\nExclusion criteria:\n\nBe under previous treatment of respirat ory physiotherapy.\nIrregular use or abandonment of the anti-TB standard regimen.\nTo fail one or more physiotherapy section.\nTo fail one or more radiological evaluation."
}

trialId
['identificationModule', 'IdentificationModule', 'nctId', 'NCTId', 'nctIdAliases', 'NCTIdAlias', 'orgStudyIdInfo', 'OrgStudyIdInfo', 'id', 'OrgStudyId', 'OrgStudyIdType', 'OrgStudyIdLink', 'secondaryIdInfos', 'SecondaryIdInfo', 'id', 'SecondaryId', 'SecondaryIdType', 'SecondaryIdDomain', 'SecondaryIdLink', 'numSecondaryIds\xa0✗', 'NumSecondaryIds', 'nctId', 'ExpandedAccessNCTId', 'statusForNctId', 'ExpandedAccessStatusForNCTId', 'nPtrsToThisExpAccNctId', 'NPtrsToThisExpAccNCTId', 'individual', 'ExpAccTypeIndividual', 'pmid', 'ReferencePMID', 'pmid', 'RetractionPMID', 'id', 'AvailIPDId']

title
['briefTitle', 'BriefTitle', 'officialTitle', 'OfficialTitle', 'investigatorTitle', 'ResponsiblePartyInvestigatorTitle', 'oldNameTitle', 'ResponsiblePartyOldNameTitle']

startDate
['startDateStruct', 'StartDateStruct', 'StartDate', 'StartDateType']

endDate
['primaryCompletionDateStruct', 'PrimaryCompletionDateStruct', 'PrimaryCompletionDate', 'PrimaryCompletionDateType', 'completionDateS

In [11]:
## Manually written dict based on the keywords found from searching keys from example schema, required as the field names are different.
field_list = {'trialId':'nctId', 'title':'officialTitle', 'startDate':'startDateStruct', 'endDate':'completionDateStruct', 'phase':'Phase', 'principalInvestigator':['investigatorFullName',  'investigatorAffiliation'], 'locations':'locations\xa0⤷', 'eligibilityCriteria':'eligibilityCriteria'}


def xpath(a: str) -> str:
    '''function to modify XPath strings.'''
    if '/' in a:
        b = a.replace('/', '.')
        if 'Study' in b:
            return b.replace('.Study.', '')
    else:
        return a

    
def flatten_json(y, parent_key='', sep='.'):
    """
    Flattens a nested JSON object into a single level.
    Keys will be in the form 'parent.child.grandchild'.
    """
    items = []
    for k, v in y.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_json(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            for i, item in enumerate(v):
                items.extend(flatten_json({f"{k}[{i}]": item}, parent_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


In [19]:
from transformers import TFAutoModel
from transformers import BertTokenizer

In [None]:
# Example usage:
text = "The patient has diabetes and hypertension."
results = ner_pipeline(text)

# Output the results
print(results)