# Load Training data

In [2]:
import json

# Read the training set. The encoding is given explicitly so the result does not
# depend on the platform's locale default; the other file reads/writes in this
# notebook already use UTF-8.
with open(r'C:\SemEval -2024\training_data\train.json', 'r', encoding='utf-8') as json_file:
    train_data = json.load(json_file)

In [6]:
# Display the loaded object; per the output below it is a dict keyed by an id string,
# each value holding fields such as 'Type', 'Section_id', 'Statement' and 'Label'.
train_data

{'5bc844fc-e852-4270-bfaf-36ea9eface3d': {'Type': 'Comparison',
  'Section_id': 'Intervention',
  'Primary_id': 'NCT01928186',
  'Secondary_id': 'NCT00684983',
  'Statement': 'All the primary trial participants do not receive any oral capecitabine, oral lapatinib ditosylate or cixutumumab IV, in conrast all the secondary trial subjects receive these.',
  'Label': 'Contradiction'},
 '86b7cb3d-6186-4a04-9aa6-b174ab764eed': {'Type': 'Single',
  'Section_id': 'Eligibility',
  'Primary_id': 'NCT00662129',
  'Statement': 'Patients with Platelet count over 100,000/mm¬¨‚â•, ANC <  1,700/mm¬¨‚â• and Hemoglobin between 4 to 5 grams per deciliter are eligible for the primary trial.',
  'Label': 'Contradiction'},
 'dbed5471-c2fc-45b5-b26f-430c9fa37a37': {'Type': 'Comparison',
  'Section_id': 'Adverse Events',
  'Primary_id': 'NCT00093145',
  'Secondary_id': 'NCT00703326',
  'Statement': 'Heart-related adverse events were recorded in both the primary trial and the secondary trial.',
  'Label': 'Ent

### View all the statements in the training data

In [18]:
# Only the "Statement" field of each record is printed; the dictionary keys are not needed here
for record in train_data.values():
    print(record["Statement"])

All the primary trial participants do not receive any oral capecitabine, oral lapatinib ditosylate or cixutumumab IV, in conrast all the secondary trial subjects receive these.
Patients with Platelet count over 100,000/mm¬¨‚â•, ANC <  1,700/mm¬¨‚â• and Hemoglobin between 4 to 5 grams per deciliter are eligible for the primary trial.
Heart-related adverse events were recorded in both the primary trial and the secondary trial.
Adult Patients with histologic confirmation of invasive bilateral breast carcinoma (T1 N1 M1) are eligible for the primary trial.
Laser Therapy is in each cohort of the primary trial and the secondary trial, along with neoadjuvant chemotherapy.
Patients must have already participated in a specific clinical study to participate in the primary trial or the secondary trial.
Patients with Clinical stage II (T2 N1) invasive mammary carcinoma are not eligible for the primary trial.
the primary trial and the secondary trial have Hypnotherapy based interventions, the secon

### Extract each statement and save it to a separate text file

In [19]:
import os

# Directory where the per-statement text files are saved
output_dir = "C:/SemEval -2024/Statement_files/"

# open(..., 'w') does not create missing folders, so create the directory up front
# (no error if it already exists).
os.makedirs(output_dir, exist_ok=True)

# Save every statement as "<uuid>.txt", where <uuid> is the record's key in train_data
for uuid, content in train_data.items():
    file_path = os.path.join(output_dir, f"{uuid}.txt")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content["Statement"])

### Process text files with cTAKES

In [21]:
import subprocess
import os
import glob

class CTakesProcessor:
    """Run the cTAKES ``runClinicalPipeline`` launcher over a folder of files.

    The constructor only stores its arguments:
      ctakes_dir   -- prefix of the launcher path and working directory of the run
      input_dir    -- folder handed to the launcher with ``-i``
      output_dir   -- folder handed to the launcher with ``--xmiOut``
      pipeline_key -- value handed to the launcher with ``--key``
    """

    def __init__(self, ctakes_dir, input_dir, output_dir, pipeline_key):
        self.ctakes_dir = ctakes_dir
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.pipeline_key = pipeline_key

    def run_ctakes_pipeline(self):
        """Run the launcher through the shell, then rename the output files.

        Raises:
            subprocess.CalledProcessError: when the command exits with a non-zero
                status. The status used to be ignored, so a failed run went
                unnoticed and the rename step still executed.
        """
        # NOTE(review): the paths are interpolated unquoted into a shell command, so a
        # folder name containing spaces would be split into several arguments.
        command = fr'{self.ctakes_dir}\bin\runClinicalPipeline -i {self.input_dir}\ --xmiOut {self.output_dir}\ --key {self.pipeline_key}'
        # check=True stops here on failure instead of continuing with missing output
        subprocess.run(command, shell=True, cwd=self.ctakes_dir, check=True)
        self.rename_output_files()

    def rename_output_files(self):
        """Drop the ``.txt`` suffix from every ``*.txt`` path directly inside ``output_dir``."""
        for file in glob.glob(f"{self.output_dir}/*.txt"):
            # The glob only matches names ending in ".txt", so this removes exactly that suffix
            os.rename(file, os.path.splitext(file)[0])

# Usage
if __name__ == "__main__":
    ctakes_dir = r'C:/apache-ctakes-4.0.0.1/'
    # NOTE(review): the statement files are written to "C:/SemEval -2024/Statement_files/"
    # earlier in this notebook, and the XMI files are later read from below that folder,
    # while these paths point into the cTAKES install folder -- confirm which is intended.
    input_dir = r'C:/apache-ctakes-4.0.0.1/Statement_files'
    output_dir = r'C:/apache-ctakes-4.0.0.1/Statement_files/statement_cTAKES_output'

    # The --key value is treated as a secret: it is read from the environment rather than
    # stored in the notebook. A key that was committed earlier should be rotated.
    pipeline_key = os.environ.get("CTAKES_PIPELINE_KEY")
    if not pipeline_key:
        raise RuntimeError(
            "Set the CTAKES_PIPELINE_KEY environment variable to the key passed to cTAKES via --key"
        )

    ctakes_processor = CTakesProcessor(ctakes_dir, input_dir, output_dir, pipeline_key)
    ctakes_processor.run_ctakes_pipeline()

### Convert the cTAKES XMI (XML) output to JSON

In [24]:
import os
import xmltodict
import json

def convert_xml_to_json(xml_file_path, json_file_path):
    """Parse one XML file with xmltodict and store the result as JSON text.

    Args:
        xml_file_path: path of the XML file to read (decoded as UTF-8).
        json_file_path: path of the file to write (encoded as UTF-8); it is
            opened in 'w' mode, so existing content is replaced.
    """
    with open(xml_file_path, encoding='utf-8') as source:
        xml_text = source.read()

    # Serialise before opening the target so a parse/serialise error leaves no file behind
    serialized = json.dumps(xmltodict.parse(xml_text))

    with open(json_file_path, 'w', encoding='utf-8') as target:
        target.write(serialized)

def convert_all_xml_to_json(xml_folder_path, json_folder_path):
    """Convert every ``.xmi`` file found directly in one folder into a JSON file in another.

    Args:
        xml_folder_path: folder that is listed for names ending in ".xmi".
        json_folder_path: folder receiving the converted files; created when missing.

    Each output file is named after the input with its last extension removed
    and no new extension added.
    """
    # The destination has to exist before files can be written into it
    if not os.path.exists(json_folder_path):
        os.makedirs(json_folder_path)

    for entry in os.listdir(xml_folder_path):
        if not entry.endswith(".xmi"):
            continue
        stem, _extension = os.path.splitext(entry)
        convert_xml_to_json(
            os.path.join(xml_folder_path, entry),
            os.path.join(json_folder_path, stem),
        )

# usage: the JSON files are written to a sub-folder of the folder holding the XMI files
xml_folder_path = "C:/SemEval -2024/Statement_files/statement_cTAKES_output"
json_folder_path = f"{xml_folder_path}/statement_cTAKES_Json_output"

convert_all_xml_to_json(xml_folder_path=xml_folder_path, json_folder_path=json_folder_path)

# Parse a cTAKES JSON file and extract the clinical mentions

In [14]:
import json

file_path = r"C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\00dc0e37-1d0b-4f53-a037-86bf9799dae6.txt"

# Read one converted document back in; despite the .txt name the content is JSON
with open(file_path, encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

# Show the raw parsed structure
print(data)

{'xmi:XMI': {'@xmlns:util': 'http:///org/apache/ctakes/typesystem/type/util.ecore', '@xmlns:tcas': 'http:///uima/tcas.ecore', '@xmlns:xmi': 'http://www.omg.org/XMI', '@xmlns:cas': 'http:///uima/cas.ecore', '@xmlns:type10': 'http:///org/cleartk/syntax/constituent/type.ecore', '@xmlns:ne': 'http:///org/cleartk/type/ne.ecore', '@xmlns:textsem': 'http:///org/apache/ctakes/typesystem/type/textsem.ecore', '@xmlns:types2': 'http:///org/apache/ctakes/assertion/zoner/types.ecore', '@xmlns:type6': 'http:///org/apache/ctakes/smokingstatus/i2b2/type.ecore', '@xmlns:refsem': 'http:///org/apache/ctakes/typesystem/type/refsem.ecore', '@xmlns:type11': 'http:///org/cleartk/syntax/dependency/type.ecore', '@xmlns:type': 'http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore', '@xmlns:type14': 'http:///org/cleartk/util/type.ecore', '@xmlns:assertion': 'http:///org/apache/ctakes/typesystem/type/temporary/assertion.ecore', '@xmlns:type8': 'http:///org/cleartk/score/type.ecore', '@xmlns:syntax': 'htt

In [15]:
# final data structure: document id, statement text, and one empty bucket per
# mention type that is collected
document_parser = {
    "UUID": [],
    "statement": [],
    "clinical_mention": {
        mention_type: []  # a fresh list per key, never a shared one
        for mention_type in (
            "textsem:SignSymptomMention",
            "textsem:AnatomicalSiteMention",
            "textsem:DiseaseDisorderMention",
            "textsem:ProcedureMention",
            "textsem:MedicationMention",
            "textsem:LabMention",
        )
    },
}

In [16]:
# Content stored under the "xmi:XMI" key of the loaded document; an empty dict is
# used when that key is missing so the code below can still call .get()/.items().
xmi_data = data.get("xmi:XMI", {})

# Function to process UmlsConcept and extract cui and tui
def process_umls_concepts(umls_concept_data):
    """Index UMLS concepts by their ``@xmi:id``.

    Args:
        umls_concept_data: a single concept dict, a list of concept dicts, or an
            empty/absent value (the caller passes ``{}`` when the document has
            no concepts).

    Returns:
        dict mapping each ``@xmi:id`` to ``{"cui": ..., "tui": ...}``. Entries
        without an ``@xmi:id`` are skipped; a missing ``@cui``/``@tui`` is
        stored as "".
    """
    if isinstance(umls_concept_data, dict):
        # A lone concept arrives as a dict instead of a one-item list
        concepts = [umls_concept_data]
    elif isinstance(umls_concept_data, list):
        concepts = umls_concept_data
    else:
        concepts = []

    umls_concepts = {}
    for concept in concepts:
        xmi_id = concept.get("@xmi:id")
        if xmi_id is None:
            # Covers the empty-dict default, which used to raise KeyError
            continue
        umls_concepts[xmi_id] = {
            "cui": concept.get("@cui", ""),
            "tui": concept.get("@tui", ""),
        }
    return umls_concepts

# Extract UMLS Concepts data and process it
umls_concepts_data = data.get("xmi:XMI", {}).get("refsem:UmlsConcept", {})
umls_concepts = process_umls_concepts(umls_concepts_data)


def _as_list(value):
    """Return a list for ``value``: lists pass through, a lone dict is wrapped, anything else is empty."""
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    return []


def _nodes_within(nodes, begin, end):
    """Return the nodes whose ``@begin``/``@end`` span lies inside ``[begin, end]``."""
    return [
        node
        for node in _as_list(nodes)
        if begin <= int(node.get("@begin", "")) and end >= int(node.get("@end", ""))
    ]


def _mention_info(mention, xmi_data, umls_concepts):
    """Build the record for one mention, or return None when it has no ontology concept ids.

    The record holds the mention's offsets, confidence and polarity, the
    space-joined forms/lemmas of the dependency nodes and parts of speech of the
    word tokens it covers, and the cui/tui of its concept. When several concept
    ids are listed, the values of the last id win, as before.
    """
    ontology_ids = mention.get("@ontologyConceptArr", "")
    if not ontology_ids:
        # Mentions without a concept reference are not recorded
        return None

    mention_info = {
        "beginOffset": mention.get("@begin", ""),
        "endOffset": mention.get("@end", ""),
        "ontologyConceptArr": ontology_ids,
        "confidence": mention.get("@confidence", ""),
        "polarity": mention.get("@polarity", ""),
    }

    # Parse the span once instead of once per node comparison
    begin = int(mention.get("@begin", ""))
    end = int(mention.get("@end", ""))

    # Matching tokens and lemmas
    dependency_nodes = _nodes_within(xmi_data.get("syntax:ConllDependencyNode", []), begin, end)
    mention_info["token"] = " ".join(node.get("@form", "") for node in dependency_nodes)
    mention_info["lemma"] = " ".join(node.get("@lemma", "") for node in dependency_nodes)

    # Matching parts of speech
    word_tokens = _nodes_within(xmi_data.get("syntax:WordToken", []), begin, end)
    mention_info["partOfSpeech"] = " ".join(node.get("@partOfSpeech", "") for node in word_tokens)

    # CUI and TUI from the ontology concept array (space-separated ids)
    id_list = ontology_ids.split() if " " in ontology_ids else [ontology_ids]
    for ontology_id in id_list:
        umls_info = umls_concepts.get(ontology_id, {})
        mention_info["cui"] = umls_info.get("cui", "")
        mention_info["tui"] = umls_info.get("tui", "")
    return mention_info


for mention_type, mentions_list in xmi_data.items():
    if mention_type.startswith("structured:DocumentID"):
        UUID = mentions_list.get("@documentID")
        document_parser["UUID"].append(UUID)
    elif mention_type.startswith("cas:Sofa"):
        sofa = mentions_list.get("@sofaString")
        document_parser["statement"].append(sofa)
    elif mention_type.startswith("textsem:"):
        # Only the mention types that have a bucket in clinical_mention are kept
        if mention_type in document_parser['clinical_mention']:
            # A single mention is a dict, several are a list; both go through the same
            # code path (the list branch used to reference an undefined name).
            for mention in _as_list(mentions_list):
                mention_info = _mention_info(mention, xmi_data, umls_concepts)
                if mention_info is not None:
                    document_parser['clinical_mention'][mention_type].append(mention_info)

# print to check the data
document_parser_json = json.dumps(document_parser, indent=4)
print(document_parser_json)


{
    "UUID": [
        "00dc0e37-1d0b-4f53-a037-86bf9799dae6"
    ],
    "statement": [
        "Female patients with a womb cannot take part in either the secondary trial or the primary trial."
    ],
    "clinical_mention": {
        "textsem:SignSymptomMention": [],
        "textsem:AnatomicalSiteMention": [
            {
                "beginOffset": "23",
                "endOffset": "27",
                "ontologyConceptArr": "337",
                "confidence": "0.0",
                "polarity": "1",
                "token": "womb",
                "lemma": "womb",
                "partOfSpeech": "NN",
                "cui": "C0042149",
                "tui": "T023"
            }
        ],
        "textsem:DiseaseDisorderMention": [],
        "textsem:ProcedureMention": [
            {
                "beginOffset": "82",
                "endOffset": "89",
                "ontologyConceptArr": "296",
                "confidence": "0.0",
                "polarity": "1",
      

# Final code: preprocess every cTAKES JSON file

In [17]:
import json
import os

# Source directory containing the files to be parsed
source_dir = r"C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output"

# The parsed files are saved in a sub-folder of the source directory
destination_dir = source_dir + r"\statement_preprocessed"

# Create the destination folder the first time this runs
if not os.path.exists(destination_dir):
    os.makedirs(destination_dir)
    

def process_umls_concepts(umls_concept_data):
    """Index UMLS concepts by their ``@xmi:id``.

    Args:
        umls_concept_data: a single concept dict, a list of concept dicts, or an
            empty/absent value.

    Returns:
        dict mapping each ``@xmi:id`` to ``{"cui", "tui", "preferredText"}``.
        Entries without an ``@xmi:id`` are skipped; missing attributes become "".
    """
    if isinstance(umls_concept_data, dict):
        # A lone concept arrives as a dict instead of a one-item list
        concepts = [umls_concept_data]
    elif isinstance(umls_concept_data, list):
        concepts = umls_concept_data
    else:
        concepts = []

    umls_concepts = {}
    for concept in concepts:
        xmi_id = concept.get("@xmi:id", None)
        if xmi_id is not None:
            umls_concepts[xmi_id] = {
                "cui": concept.get("@cui", ""),
                "tui": concept.get("@tui", ""),
                "preferredText": concept.get("@preferredText", ""),
            }
    return umls_concepts


def _ensure_list(value):
    """Return a list for ``value``: lists pass through, a lone dict is wrapped, anything else is empty."""
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    return []


def _nodes_inside_span(nodes, begin, end):
    """Return the nodes whose ``@begin``/``@end`` span lies inside ``[begin, end]``."""
    return [
        node
        for node in _ensure_list(nodes)
        if begin <= int(node.get("@begin", "")) and end >= int(node.get("@end", ""))
    ]


def _build_mention_record(mention, xmi_data, umls_concepts):
    """Build the record for one mention, or return None when it has no ontology concept ids.

    When several concept ids are listed, the cui/tui/preferredText of the last
    id win, as before.
    """
    ontology_ids = mention.get("@ontologyConceptArr", "")
    if not ontology_ids:
        # Mentions without a concept reference are not recorded
        return None

    mention_info = {
        "beginOffset": mention.get("@begin", ""),
        "endOffset": mention.get("@end", ""),
        "ontologyConceptArr": ontology_ids,
        "confidence": mention.get("@confidence", ""),
        "polarity": mention.get("@polarity", ""),
    }

    # Parse the span once instead of once per node comparison
    begin = int(mention.get("@begin", ""))
    end = int(mention.get("@end", ""))

    # Matching tokens and lemmas
    dependency_nodes = _nodes_inside_span(xmi_data.get("syntax:ConllDependencyNode", []), begin, end)
    mention_info["token"] = " ".join(node.get("@form", "") for node in dependency_nodes)
    mention_info["lemma"] = " ".join(node.get("@lemma", "") for node in dependency_nodes)

    # Matching parts of speech
    word_tokens = _nodes_inside_span(xmi_data.get("syntax:WordToken", []), begin, end)
    mention_info["partOfSpeech"] = " ".join(node.get("@partOfSpeech", "") for node in word_tokens)

    # CUI, TUI and preferred text from the ontology concept array (space-separated ids)
    id_list = ontology_ids.split() if " " in ontology_ids else [ontology_ids]
    for ontology_id in id_list:
        umls_info = umls_concepts.get(ontology_id, {})
        mention_info["cui"] = umls_info.get("cui", "")
        mention_info["tui"] = umls_info.get("tui", "")
        mention_info["preferredText"] = umls_info.get("preferredText", "")
    return mention_info


def parse_ctakes_document(data):
    """Reduce one converted cTAKES document to its id, statement text and clinical mentions.

    Args:
        data: the parsed JSON document; its content is read from the "xmi:XMI" key.

    Returns:
        dict with "UUID", "statement" and "clinical_mention" (one list of mention
        records per collected mention type).
    """
    document_parser = {
        "UUID": [],
        "statement": [],
        "clinical_mention": {
            "textsem:SignSymptomMention": [],
            "textsem:AnatomicalSiteMention": [],
            "textsem:DiseaseDisorderMention": [],
            "textsem:ProcedureMention": [],
            "textsem:MedicationMention": [],
            "textsem:LabMention": [],
        },
    }
    xmi_data = data.get("xmi:XMI", {})
    umls_concepts = process_umls_concepts(xmi_data.get("refsem:UmlsConcept", {}))

    for mention_type, mentions_list in xmi_data.items():
        if mention_type.startswith("structured:DocumentID"):
            document_parser["UUID"].append(mentions_list.get("@documentID"))
        elif mention_type.startswith("cas:Sofa"):
            document_parser["statement"].append(mentions_list.get("@sofaString"))
        elif mention_type.startswith("textsem:"):
            # Only the mention types that have a bucket in clinical_mention are kept
            if mention_type in document_parser['clinical_mention']:
                # A single mention is a dict, several are a list; handle both the same way
                for mention in _ensure_list(mentions_list):
                    mention_info = _build_mention_record(mention, xmi_data, umls_concepts)
                    if mention_info is not None:
                        document_parser['clinical_mention'][mention_type].append(mention_info)
    return document_parser


# Iterate over each file in the source directory
for file_name in os.listdir(source_dir):
    file_path = os.path.join(source_dir, file_name)

    # Only regular files whose name ends in .txt are processed; this also skips sub-folders
    if not (os.path.isfile(file_path) and file_path.endswith(".txt")):
        continue

    # Load the JSON data from the file
    with open(file_path, 'r', encoding='utf-8') as json_file:
        print(file_path)
        data = json.load(json_file)

    document_parser = parse_ctakes_document(data)

    # Save the parsed data to the destination directory under the same file name
    output_file_path = os.path.join(destination_dir, file_name)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(document_parser, output_file, indent=4)

C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\00466f98-52b8-41f3-9bf1-2edaad950be9.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\0046e113-8ac5-4725-a285-e78b8c26f825.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\0057172f-d019-401b-a516-993a7b46a67b.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\006f6b4e-6245-4f09-9786-327bbed3d766.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\007de11b-4265-4695-b18e-e0d6909a347a.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\0093175a-38cb-4f63-b391-709ac48158b8.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\009d23bb-2179-4ce3-927d-4dedca6b32a8.txt
C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\00dc0e37-1d0b-4f53-a037-8

# Combine all JSON files into one

In [394]:
import json
import os

source_dir = r"C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\statement_preprocessed"  # Replace with your directory path
output_file = r'C:\SemEval -2024\Statement_files\statement_cTAKES_output\statement_cTAKES_Json_output\statement_preprocessed\combined_result\combined.json'  # Replace with desired output file path

# open(..., 'w') does not create missing folders, so make sure the folder of the
# output file exists before writing.
combined_dir = os.path.dirname(output_file)
if combined_dir:
    os.makedirs(combined_dir, exist_ok=True)

combined_data = []

# Go through the directory entries in sorted order so the combined file is reproducible
for file_name in sorted(os.listdir(source_dir)):
    file_path = os.path.join(source_dir, file_name)

    # Only regular files are read; sub-folders are skipped. There is no check on the
    # file extension, so every regular file in source_dir must contain JSON.
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
            combined_data.append(data)

# Write the combined data to the output file
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=4)
