In [None]:
%pip install lxml

In [None]:
import os
import re
import lxml.etree as etree
import re

def escape_ampersands(content):
    """Escape bare ``&`` characters so the text can be parsed as XML.

    An ampersand is left untouched when it already begins one of the five
    predefined XML entities (``&amp;``, ``&lt;``, ``&gt;``, ``&apos;``,
    ``&quot;``) or a numeric character reference, either decimal
    (``&#38;``) or hexadecimal (``&#x26;``). Every other ``&`` is rewritten
    to ``&amp;``.

    Args:
        content: Raw document text.

    Returns:
        The text with stray ampersands replaced by ``&amp;``.
    """
    # Negative lookahead: only ampersands NOT followed by a recognised
    # entity/reference body and a terminating ';' are escaped.
    return re.sub(
        r'&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9a-fA-F]+);)',
        '&amp;',
        content,
    )

def parse_xml(file_path):
    """Read an XML file and return the text found beneath its ``TEXT`` elements.

    The file is read as UTF-8, stray ampersands are escaped with
    ``escape_ampersands``, and the result is parsed with a recovering lxml
    parser so that malformed documents still yield whatever can be salvaged.
    Any diagnostics the parser recorded are printed.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        All text nodes under any ``TEXT`` element joined with single spaces,
        or ``None`` when the file cannot be read or no document could be
        parsed from it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read().strip()
            content = escape_ampersands(content)

        parser = etree.XMLParser(recover=True)
        tree = etree.fromstring(content.encode('utf-8'), parser=parser)
        if parser.error_log:
            print(f"Errors during parsing {file_path}:")
            for error in parser.error_log:
                print(error.message, "at line", error.line)

        # A recovering parser may hand back no root element at all instead of
        # raising; report that explicitly rather than letting the xpath call
        # fail with an unrelated AttributeError.
        if tree is None:
            print(f"No XML content could be recovered from {file_path}")
            return None

        texts = tree.xpath('//TEXT//text()')
        return ' '.join(texts)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError while parsing file {file_path}: {str(e)}")
        print(f"Content begins with: {content[:100]}")
        return None
    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller
        # carry on with the remaining files.
        print(f"General error with file {file_path}: {str(e)}")
        return None

def process_all_xml(root_directory):
    """Collect the extracted text of every ``.xml`` file under a directory tree.

    Args:
        root_directory: Directory to walk recursively.

    Returns:
        A list with one string per XML file for which ``parse_xml`` returned
        a non-empty result, in the order the files are visited.
    """
    xml_paths = (
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(root_directory)
        for name in names
        if name.endswith('.xml')
    )
    # parse_xml returns None on failure; empty strings are dropped as well.
    parsed = (parse_xml(path) for path in xml_paths)
    return [text for text in parsed if text]

# Directory tree that is scanned recursively for .xml files.
root_directory_1 = '/home/lokesh/ds_comparison/2006n2c2'
collected_texts = process_all_xml(root_directory=root_directory_1)


In [None]:
def clean_text(text):
    """Collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: Input string.

    Returns:
        The string with every run of whitespace replaced by one space and
        with no leading or trailing whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Whitespace-normalise every document extracted from the XML files.
cleaned_texts_n2c2 = list(map(clean_text, collected_texts))

In [None]:
# Concatenate all cleaned documents into one space-separated string.
n2c2_string = ' '.join(cleaned_texts_n2c2)

In [None]:
# Show a short preview only: echoing the whole string floods the output area
# and stores the full document text in the saved notebook.
n2c2_string[:500]

In [None]:
import pandas as pd

In [None]:
pip install openpyxl

In [None]:
# Load the Excel workbook into a DataFrame.
dfTrain = pd.read_excel('/home/lokesh/ds_comparison/n2c2_synthetic/1596_summary_gemini_final.xlsx')

In [None]:
# Pull the 'Annotated_Summaries' column out as a plain Python list.
n2c2_synthetic_list = dfTrain['Annotated_Summaries'].tolist()

In [None]:
# Show the first few entries only rather than dumping the whole column.
n2c2_synthetic_list[:5]

In [None]:
# Replace the two-character sequence backslash + 'n' with a space.
# NOTE(review): this targets literal "\n" text, not real newline characters --
# confirm that is how line breaks are stored in the spreadsheet.
# Non-string cells (e.g. NaN from blank rows) have no .replace and are skipped.
cleaned_strings = [
    s.replace('\\n', ' ')
    for s in n2c2_synthetic_list
    if isinstance(s, str)
]

In [None]:
# Show the first few entries only rather than dumping the whole list.
cleaned_strings[:5]

In [None]:
# Concatenate the spreadsheet summaries into one space-separated string.
synthetic_n2c2_string = ' '.join(cleaned_strings)

In [None]:
# XML-derived text first, then the spreadsheet summaries, separated by a newline.
combined_discharge_summary_n2c2 = n2c2_string + "\n" + synthetic_n2c2_string 

In [None]:
# Show a short preview only: echoing the whole string floods the output area
# and stores the full document text in the saved notebook.
combined_discharge_summary_n2c2[:500]

In [None]:
import os
import json
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Directory tree searched for .jsonl files (passed to extract_and_parse_jsonl below).
root_directory_2 = '/lockbox/sgpgi_ds'

In [None]:
def extract_and_parse_jsonl(root_directory):
    """Parse every line of every ``.jsonl`` file under a directory tree.

    Args:
        root_directory: Directory to walk recursively.

    Returns:
        A list holding the decoded JSON value of each non-blank line, in the
        order the files are visited and the lines appear. Lines that fail to
        decode, and files that cannot be read, are reported with ``print``
        and skipped.
    """
    parsed_json_data = []
    for subdir, _, files in os.walk(root_directory):
        for file in files:
            if not file.endswith('.jsonl'):
                continue
            file_path = os.path.join(subdir, file)
            try:
                # Explicit UTF-8 so decoding does not depend on the platform's
                # locale default (matches the other readers in this notebook).
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        # Blank lines are not records; skip them quietly
                        # instead of reporting a decode error for each one.
                        if not line.strip():
                            continue
                        try:
                            parsed_json_data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            print(f"Error parsing JSON in file '{file_path}': {e}")
            except Exception as e:
                # Best effort: report unreadable files and keep going.
                print(f"Error processing file '{file_path}': {e}")
    return parsed_json_data


In [None]:
# One parsed JSON value per successfully decoded line of every .jsonl file found.
texts = extract_and_parse_jsonl(root_directory_2)

In [None]:
# Show the first few records only rather than dumping every parsed line.
texts[:3]

In [None]:
# Keep only records that are JSON objects carrying a string 'text' field;
# other shapes would break the membership test, the lookup, or clean_text.
extracted_texts = [
    entry['text']
    for entry in texts
    if isinstance(entry, dict) and isinstance(entry.get('text'), str)
]

In [None]:
# Whitespace-normalise every extracted 'text' value.
cleaned_texts_sgpgi = list(map(clean_text, extracted_texts))

In [None]:
# Concatenate all cleaned records into one space-separated string.
sgpgi_string = ' '.join(cleaned_texts_sgpgi)

In [None]:
import os
import numpy as np
from nltk.tokenize import word_tokenize

def read_and_normalize_text_files(root_directory):
    """Load every ``.txt`` file under a directory tree as a single-line string.

    Args:
        root_directory: Directory to walk recursively.

    Returns:
        A list with one string per readable ``.txt`` file, each with its
        newline characters replaced by spaces. Files that cannot be read are
        reported with ``print`` and skipped.
    """
    collected = []
    for folder, _, names in os.walk(root_directory):
        for name in names:
            if not name.endswith('.txt'):
                continue
            path = os.path.join(folder, name)
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    collected.append(handle.read().replace('\n', ' '))
            except Exception as err:
                print(f"Error reading file '{path}': {err}")
    return collected

In [None]:
# Directory tree searched for .txt files (passed to read_and_normalize_text_files below).
candidate_datapath_llama3 = '/lockbox/llama3_20240509/llama3'

In [None]:
# One string per .txt file, with newlines already replaced by spaces.
syn_text_llama3 = read_and_normalize_text_files(candidate_datapath_llama3)

In [None]:
# Show the first few documents only rather than dumping the whole list.
syn_text_llama3[:3]

In [None]:
# Concatenate the .txt documents into one space-separated string.
string_syn_sgpgi = ' '.join(syn_text_llama3)

In [None]:
# .txt-file text first, then the JSONL-derived text, separated by a newline.
combined_discharge_summary_sgpgi = string_syn_sgpgi  + "\n" + sgpgi_string

In [None]:
# Show a short preview only: echoing the whole string floods the output area
# and stores the full document text in the saved notebook.
combined_discharge_summary_sgpgi[:500]

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, bigrams, trigrams, FreqDist
import string
# Tokenizer models and the stop-word list needed by word_tokenize / stopwords.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
def preprocess(text):
    """Lower-case, strip punctuation, tokenize, and drop English stop words.

    Args:
        text: Input string.

    Returns:
        The list of tokens produced by ``word_tokenize`` on the lower-cased,
        punctuation-free text, excluding NLTK's English stop words.
    """
    # Remove every character listed in string.punctuation before tokenizing.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_stripper)

    candidates = word_tokenize(normalized)

    english_stop_words = set(stopwords.words('english'))
    return [token for token in candidates if token not in english_stop_words]

In [None]:
# Run both combined corpora through the same preprocessing.
tokens_n2c2 = preprocess(text=combined_discharge_summary_n2c2)
tokens_sgpgi = preprocess(text=combined_discharge_summary_sgpgi)

In [None]:
def jaccard_distance(set1, set2):
    """Calculate the Jaccard distance between two sets.

    The distance is ``1 - |set1 & set2| / |set1 | set2|``: 0.0 for identical
    sets and 1.0 for sets with nothing in common.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The Jaccard distance as a float. Two empty sets are treated as
        identical and give 0.0.
    """
    union = set1.union(set2)
    # Both sets empty: the ratio is undefined (0/0); report "no distance"
    # instead of raising ZeroDivisionError.
    if not union:
        return 0.0
    intersection = set1.intersection(set2)
    return 1 - len(intersection) / len(union)

In [None]:
# Reduce each token list to its set of distinct tokens.
set_n2c2 = {*tokens_n2c2}
set_sgpgi = {*tokens_sgpgi}

In [None]:
# Vocabulary-level distance between the two corpora.
distance = jaccard_distance(set1=set_n2c2, set2=set_sgpgi)
print(f"Jaccard Distance: {distance}")

In [None]:
import sys
# Put this directory at the front of the module search path so it takes
# precedence when `bert_score` is imported in the next cell.
sys.path.insert(0, '/home/lokesh/ds_comparison/bert_score')

In [None]:
from bert_score import score

In [None]:
# NOTE(review): each corpus is passed as a single candidate/reference string,
# and these are the strings built before the spreadsheet / .txt text was
# appended (the Jaccard step uses the combined corpora) -- confirm both are
# intended, and check how bert_score treats inputs longer than the model's
# maximum sequence length.
P, R, F1 = score(
    [n2c2_string],
    [sgpgi_string],
    lang='en',
    model_type="dmis-lab/biobert-v1.1",
)
for label, value in (("Precision:", P), ("Recall:", R), ("F1 Score:", F1)):
    print(label, value)