In [None]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import os
import json
import datetime
import logging

In [None]:
# # # Count summaries
# directory_path = "/lockbox/llama3_20240509/llama3/"
# text_file_count = len([filename for filename in os.listdir(directory_path) if filename.endswith('.txt')])
# text_file_count

In [None]:
# Root directory scanned by the loading cell; each sub-folder is searched for *.jsonl files.
path_to_folders = '/lockbox/sgpgi_ds/'
# Output prefix for generated annotations and log files. Later cells build paths by plain
# string concatenation (path_to_summaries + model_name), so the trailing slash is required.
path_to_summaries = "prompt2_pii_detection/" 

In [None]:
def make_directory_if_not_exists(directory_path):
    """Create ``directory_path`` (with any missing parents) unless it already exists.

    Prints a single line reporting whether the directory was created or was
    already present. Returns nothing.
    """
    if os.path.exists(directory_path):
        print(f"Directory already exists at {directory_path}")
        return

    os.makedirs(directory_path)
    print(f"Directory created at {directory_path}")

In [None]:
import json

# def summary_to_xml(json_data):

#     text_element = ''
#     text = json_data["text"]
#     ner_entities = json_data["entities"]

#     chunks = []
#     start_index = 0
#     for entity in ner_entities:
#         end_index = text.find(entity["fake_entity"], start_index)
#         if end_index != -1:
#             chunks.append(text[start_index:end_index])
#             chunks.append(entity)
#             start_index = end_index + len(entity["fake_entity"])
#     chunks.append(text[start_index:])

#     template = '<ENTITY TYPE="{}">{}</ENTITY>'

#     # Build XML structure
#     for chunk in chunks:
#         if isinstance(chunk, dict):
#             text_element += template.format(chunk["label"], chunk["fake_entity"])
#         else:
#             text_element += chunk

#     return text_element

In [None]:
# Load the source discharge summaries.
#
# sgpgi_ds[k]         -> list with the "text" field of every record in one JSONL file
# folder_path_name[k] -> folder that JSONL file was read from
#
# The generation cells index both lists with the same k, so the two lists are
# appended together (once per JSONL file) and always have the same length.
sgpgi_ds = []
folder_path_name = []

# Sorted so the numbering of the output files is reproducible; os.listdir
# returns entries in arbitrary order.
for folder in sorted(os.listdir(path_to_folders)):
    folder_path = os.path.join(path_to_folders, folder)

    # Only numerically named directories are candidates. Checking first avoids
    # the ValueError int() raises on names such as ".ipynb_checkpoints".
    if not (os.path.isdir(folder_path) and folder.isdecimal()):
        continue
    if int(folder) not in range(80, 100):
        continue

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.jsonl'):
            continue

        file_path = os.path.join(folder_path, file_name)
        # One JSON object per line; every record must carry a "text" key.
        with open(file_path, 'r') as json_file:
            all_text = [json.loads(line)["text"] for line in json_file]

        folder_path_name.append(folder_path)
        sgpgi_ds.append(all_text)

In [None]:
# Number of summaries read from the first loaded JSONL file (IndexError if nothing was loaded).
len(sgpgi_ds[0])

In [None]:
# Folders that passed the 80-99 filter in the loading cell.
folder_path_name

In [None]:
# Instruction text shared by all three models.
# - It contains no str.format placeholder, so the discharge summary has to be supplied
#   separately (as its own chat turn or by concatenation after the final line).
# - NOTE(review): <INSERT_EXAMPLE_MEDICAL_REPORT> looks like an unfilled placeholder for a
#   worked example — confirm whether an example should be pasted in.
# - The evaluation code extracts <RECORD>...</RECORD> and <ENTITY TYPE="...">...</ENTITY>
#   from the model output.
PROMPT ="""You are an expert in annotating the entities given in the prompt.
Make sure to return annotated summaries without altering/rephrasing the text in the original discharge summary.
Your task is to annotate the given discharge summary with only the given list of annotations in the prompt.

List of annotations:
- Patient_Name
- Hospital_Name
- Staff_Name
- Doctor_Name
- Guardian_Name
- Age
- Patient_ID
- Patient_DOB
- Treatment_Date
- Phone_No
- City
- State
- Street
- Zip
- Country
- Other_Location
- Landline
- Email
- IP_Address
- Fax
- Ward_Location
- Insurance_Number
- Web_url
- Aadhar
- Driver_License
- Voter_ID
- PAN_Card

EXAMPLE:  

'annotated output' : 
<INSERT_EXAMPLE_MEDICAL_REPORT>

Instructions: 
It's very crucial to provide annotations for the discharge summary text with as much accuracy and detail as possible. 
Please always enclose the annotated output with <RECORD> </RECORD> tags.

Now, annotate the following discharge summary text: """

### meta-llama/Meta-Llama-3-8B-Instruct 

In [None]:
# Short model tag: names the output sub-directory and the log file used by the cells below.
model_name = 'llama3'
make_directory_if_not_exists(path_to_summaries + model_name)

In [None]:
# Configure logging to write to a per-model file.
# force=True removes any handlers already attached to the root logger first;
# without it basicConfig() does nothing when logging was configured earlier in
# the same kernel (for example when this cell is re-run or another model's
# section was executed before), and messages keep going to the old file.
logging.basicConfig(
    filename=f'{path_to_summaries}{model_name}_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

In [None]:
# Load the Llama-3 8B Instruct tokenizer and weights from the local model directory.
# NOTE(review): device_map="cuda" is meaningful for the model; on the tokenizer it is
# presumably ignored — confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained("/lockbox/models/Meta-Llama-3-8B-Instruct/",device_map="cuda")
model = AutoModelForCausalLM.from_pretrained("/lockbox/models/Meta-Llama-3-8B-Instruct/",device_map="cuda")

In [None]:
# Annotate every loaded summary with Llama-3 and write one text file per summary.

# Generation stops at the tokenizer's EOS token or at the "<|eot_id|>" token.
# These ids do not depend on the summary, so they are built once up front.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

for summary_number in range(len(folder_path_name)):
    # Last path component of the source folder; used in the output file name.
    folder_id = folder_path_name[summary_number].split('/')[-1]

    for i, summary in enumerate(sgpgi_ds[summary_number]):
        # Everything for one summary lives inside the try so that a single bad
        # record (tokenisation, generation or write failure) is logged and
        # skipped instead of aborting the whole run.
        try:
            messages = [
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": summary},
            ]

            input_ids = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(model.device)

            output = model.generate(
                input_ids,
                max_new_tokens=3000,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.9,
                top_p=0.9
            )
            # Drop the prompt tokens; keep only what the model generated.
            response = output[0][input_ids.shape[-1]:]
            generated_summary = tokenizer.decode(response, skip_special_tokens=True)

            file_name = f"llama3_pii_{folder_id}_{i}.txt"
            file_path = f"{path_to_summaries}{model_name}/{file_name}"

            with open(file_path, "w") as file:
                file.write(generated_summary)

            logging.info(f"Content saved to {file_path}")

        except Exception as e:
            # Include the summary index so a failure can be traced to one document.
            logging.error(f"File No.: {folder_path_name[summary_number]} Summary No.: {i} Error occurred: {e}. Moving on to the next iteration...")
            continue
print('Fin!')

### google/gemma-1.1-7b-it

In [None]:
# Short model tag: names the output sub-directory and the log file used by the cells below.
model_name = 'gemma'
make_directory_if_not_exists(path_to_summaries + model_name)

In [None]:
# Configure logging to write to a per-model file.
# force=True is required here: logging.basicConfig() is a no-op once the root
# logger already has a handler, so without it the messages of this section
# would keep going to whichever log file was configured first in the kernel.
logging.basicConfig(
    filename=f'{path_to_summaries}{model_name}_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

In [None]:
# Load the Gemma 1.1 7B instruction-tuned tokenizer and weights from the local model directory.
# Rebinding `tokenizer` / `model` replaces whatever an earlier section loaded under those names.
# NOTE(review): the previous model is presumably still resident on the GPU while this one
# loads — confirm there is enough memory or free it first.
tokenizer = AutoTokenizer.from_pretrained("/lockbox/models/gemma-1.1-7b-it/",device_map="cuda")
model = AutoModelForCausalLM.from_pretrained("/lockbox/models/gemma-1.1-7b-it/",device_map="cuda")

In [None]:
# Annotate every loaded summary with Gemma and write one text file per summary.
for summary_number in range(len(folder_path_name)):
    # Last path component of the source folder; used in the output file name.
    folder_id = folder_path_name[summary_number].split('/')[-1]

    for i, summary in enumerate(sgpgi_ds[summary_number]):
        try:
            # PROMPT has no "{}" placeholder, so str.format() would return the
            # bare instructions and the model would never see the document.
            # The summary is appended explicitly after the closing
            # "...discharge summary text: " line of PROMPT.
            chat = [
                {"role": "user", "content": PROMPT + summary},
            ]
            prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

            inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
            outputs = model.generate(
                input_ids=inputs,
                # Budget for generated tokens only; max_length would also count
                # the (long) prompt and could leave no room for the answer.
                max_new_tokens=3000,
                # temperature only takes effect when sampling is enabled.
                do_sample=True,
                temperature=0.9,
            )
            # Keep only the newly generated tokens. Splitting the decoded text
            # on the word "model" would cut the answer at any later occurrence
            # of that word.
            decoded_output = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)

            file_name = f"gemma_generated_ds{folder_id}_{i}.txt"
            file_path = f"{path_to_summaries}{model_name}/{file_name}"

            with open(file_path, "w") as file:
                file.write(decoded_output)

            logging.info(f"Content saved to {file_path}")

        except Exception as e:
            # Include the summary index so a failure can be traced to one document.
            logging.error(f"File No.: {folder_path_name[summary_number]} Summary No.: {i} Error occurred: {e}. Moving on to the next iteration...")
            continue

print('Fin!')

### mistralai/Mistral-7B-Instruct-v0.1

In [None]:
# Short model tag: names the output sub-directory and the log file used by the cells below.
model_name = 'mistral'
make_directory_if_not_exists(path_to_summaries + model_name)

In [None]:
# Configure logging to write to a per-model file.
# force=True is required here: logging.basicConfig() is a no-op once the root
# logger already has a handler, so without it the messages of this section
# would keep going to whichever log file was configured first in the kernel.
logging.basicConfig(
    filename=f'{path_to_summaries}{model_name}_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

In [None]:
# Load the Mistral 7B Instruct v0.1 weights and tokenizer from the local model directory.
# Rebinding `model` / `tokenizer` replaces whatever an earlier section loaded under those names.
# NOTE(review): the previous model is presumably still resident on the GPU while this one
# loads — confirm there is enough memory or free it first.
model = AutoModelForCausalLM.from_pretrained("/lockbox/models/Mistral-7B-Instruct-v0.1/",device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("/lockbox/models/Mistral-7B-Instruct-v0.1/",device_map="cuda")

In [None]:
# Annotate every loaded summary with Mistral and write one text file per summary.
for summary_number in range(len(folder_path_name)):
    # Last path component of the source folder; used in the output file name.
    folder_id = folder_path_name[summary_number].split('/')[-1]

    for i, summary in enumerate(sgpgi_ds[summary_number]):
        try:
            # PROMPT has no "{}" placeholder, so str.format() would return the
            # bare instructions and the model would never see the document.
            # The summary is appended explicitly after the closing
            # "...discharge summary text: " line of PROMPT.
            messages = [
                {"role": "user", "content": PROMPT + summary},
            ]
            encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
            model_inputs = encodeds.to(model.device)

            generated_ids = model.generate(model_inputs, max_new_tokens=3000, do_sample=True, temperature=0.9)

            # Keep only the newly generated tokens and drop special tokens.
            # Splitting the decoded text on "[/INST]" / "model" would cut the
            # answer at any occurrence of those strings inside it.
            new_tokens = generated_ids[0][model_inputs.shape[-1]:]
            decoded_output = tokenizer.decode(new_tokens, skip_special_tokens=True)

            file_name = f"mistral_generated_ds{folder_id}_{i}.txt"
            file_path = f"{path_to_summaries}{model_name}/{file_name}"

            with open(file_path, "w") as file:
                file.write(decoded_output)

            logging.info(f"Content saved to {file_path}")

        except Exception as e:
            # Include the summary index so a failure can be traced to one document.
            logging.error(f"File No.: {folder_path_name[summary_number]} Summary No.: {i} Error occurred: {e}. Moving on to the next iteration...")
            continue

print('Fin!')

### Analysis

In [None]:
import os
from bs4 import BeautifulSoup
import pandas as pd

def get_all_keys(rawText):
    """Return the TYPE attribute of every <ENTITY> found inside a <RECORD>.

    Args:
        rawText: Model output as a string; parsed with BeautifulSoup's 'xml' parser.

    Returns:
        A list of TYPE values in document order, duplicates included.
        <ENTITY> elements that carry no TYPE attribute are skipped.
    """
    soup = BeautifulSoup(rawText, 'xml')
    keys = []
    for record in soup.find_all('RECORD'):
        for phi in record.find_all('ENTITY'):
            # .get() returns None for a missing attribute, which replaces the
            # blanket try/except that was only guarding the attribute lookup.
            tag_type = phi.get('TYPE')
            if tag_type is not None:
                keys.append(tag_type)
    return keys
        

#folder_path = "/lockbox/summaries_pii/"
folder_path = "prompt2_pii_detection/"

# Collect every entity TYPE the models produced, across all model sub-folders.
all_tags = []
for folder in os.listdir(folder_path):
    if 'ipynb' in folder:
        continue

    # Use a separate name for the sub-folder. Reassigning folder_path here
    # would make every later iteration join onto the previous entry instead of
    # the root, producing paths that do not exist.
    model_dir = os.path.join(folder_path, folder)

    # Skip the per-model log files and any other stray non-directory entry.
    if 'log' in model_dir or not os.path.isdir(model_dir):
        continue

    for file_name in os.listdir(model_dir):
        if file_name.endswith(".txt") and 'log' not in file_name:
            file_path = os.path.join(model_dir, file_name)

            with open(file_path, 'r') as file:
                file_contents = file.read()

            all_tags.extend(get_all_keys(file_contents))


# Tag inventory of the gold annotations.
# NOTE(review): 'Gaurdian_Name' is spelled differently from the 'Guardian_Name' tag
# requested in PROMPT — confirm which spelling the gold data actually uses.
gold_tags = [
    'Patient_Name', 'Hospital_Name', 'Staff_Name', 'Doctor_Name', 'Age',
    'Gaurdian_Name', 'Gender', 'Patient_ID', 'Misc_Medical_ID', 'Aadhar',
    'Driver_License', 'Voter_ID', 'PAN_Card', 'Patient_DOB', 'Treatment_Date',
    'Treatment_Time', 'Phone_No', 'Landline', 'Email', 'IP_Address', 'Fax',
    'Doctor_Specialisation', 'Patient_Profession', 'City', 'Ward_Location',
    'Device_Number', 'Other_Info', 'State', 'Street', 'Zip', 'Country',
    'Other_Location', 'Other_Govt_ID', 'Insurance_Number', 'Web_url',
]


# Gold tags that no model output ever used.
set(gold_tags) - set(all_tags)
#set(all_tags)

# EVALUATION

In [None]:
#%pip install nervaluate
from bs4 import BeautifulSoup
import pandas as pd
from nervaluate import Evaluator
import json
import os
import re

def read_xml(text,file_path):
    """Extract predicted entities from a model output file and locate them in ``text``.

    The file is searched for ``<RECORD>...</RECORD>`` sections; inside them every
    ``<ENTITY TYPE="label">value</ENTITY>`` becomes one prediction.

    Args:
        text: The original (un-annotated) document the offsets refer to.
        file_path: Path of the model output file to read.

    Returns:
        A list of dicts with keys 'start', 'end' (both as strings), 'text' and
        'label', in the order the entities appear in the file.

        Offsets are found by searching ``text`` left to right, continuing after
        the previous match, so a value that occurs several times gets a
        different span for each occurrence. If a value is not found after the
        cursor the whole text is searched; if it is not in ``text`` at all
        both 'start' and 'end' are '-1' so the prediction cannot overlap any
        real span.
    """
    with open(file_path, 'r') as file:
        content = file.read()

    record = ''.join(re.findall(r'<RECORD>(.*?)<\/RECORD>', content, re.DOTALL))

    tags = []
    cursor = 0  # position in `text` just after the previous located entity
    for match in re.finditer(r'<ENTITY TYPE="(.*?)">(.*?)<\/ENTITY>', record, re.DOTALL):
        label = match.group(1)
        entity_text = match.group(2)

        start = text.find(entity_text, cursor)
        if start != -1:
            cursor = start + len(entity_text)
        else:
            # Entity emitted out of order: fall back to the first occurrence
            # anywhere in the text without moving the cursor.
            start = text.find(entity_text)

        end = start + len(entity_text) if start != -1 else -1

        tags.append({
            'start': str(start),
            'end': str(end),
            'text': entity_text,
            'label': label
        })
    return tags

def modelfilepath(model_str, line_str):
    """Look up the output file of ``model_str`` whose name contains ``line_str``.

    Scans ``prompt2_pii_detection/<model_str>`` and picks the first directory
    entry whose file name contains both ``model_str`` and ``line_str``.

    Returns:
        ``(True, path)`` for the first matching entry, otherwise ``(False, '')``.
    """
    directory = f'prompt2_pii_detection/{model_str}'
    hit = next(
        (name for name in os.listdir(directory)
         if model_str in name and line_str in name),
        None,
    )
    if hit is None:
        return False,''
    return True , os.path.join(directory, hit)
    
def get_annotations(model, data_root='/lockbox/sgpgi_ds/', folder_ids=range(80, 100)):
    """Build parallel lists of gold and predicted entities for one model.

    Walks ``data_root`` and reads every ``*.jsonl`` file that lives under a
    top-level folder whose name is one of ``folder_ids``. For each JSONL line
    (one document) it appends one entry to both returned lists, so index k of
    the two lists always refers to the same document.

    Args:
        model: Model tag passed to ``modelfilepath`` to locate the output file
            for ``<folder>_<line index>.txt``.
        data_root: Directory containing the numbered source folders.
        folder_ids: Iterable of folder numbers to include.

    Returns:
        ``(sgpgi_data, model_data)`` where ``sgpgi_data[k]`` is the list of gold
        entity dicts ('label', 'start', 'end', 'fake_entity') and
        ``model_data[k]`` is the list returned by ``read_xml`` for that
        document, or ``[]`` when the model produced no file for it.
    """
    wanted = {str(i) for i in folder_ids}
    sgpgi_data = []
    model_data = []

    for subdir, _dirs, files in os.walk(data_root):
        # First path component below data_root. Derived from the relative path
        # so it does not depend on how many components data_root itself has.
        top_folder = os.path.relpath(subdir, data_root).split(os.sep)[0]
        if top_folder not in wanted:
            continue

        for file_name in files:
            if not file_name.endswith(".jsonl"):
                continue

            with open(os.path.join(subdir, file_name), "r") as jsonl_file:
                for index, line in enumerate(jsonl_file):
                    data = json.loads(line.strip())
                    text = data.get("text")

                    exists, filepath = modelfilepath(model, f'{top_folder}_{index}.txt')
                    model_data.append(read_xml(text, filepath) if exists else [])

                    # A record without an "entities" key contributes an empty gold list.
                    sgpgi_data.append([
                        {'label': entity['label'],
                         'start': entity['start_offset'],
                         'end': entity['end_offset'],
                         'fake_entity': entity['fake_entity']}
                        for entity in (data.get("entities") or [])
                    ])
    return sgpgi_data,model_data

# Coarse categories scored by the evaluator. 'CONTACT' is the category
# map_labels() assigns to phone/e-mail/fax/URL entities, so it must be listed
# here; otherwise those entities carry a label that is not among the evaluated tags.
labels = ['DATE','LOCATION','HOSPITAL','ID','AGE','CONTACT','DOCTOR','PATIENT']

# Fine-grained tag -> coarse category.
# Both spellings of the guardian tag are accepted: 'Guardian_Name' is what
# PROMPT asks the models to emit, 'Gaurdian_Name' is the other spelling used in this notebook.
_COARSE_LABEL = {
    'Patient_Name': 'PATIENT', 'Gaurdian_Name': 'PATIENT', 'Guardian_Name': 'PATIENT',
    'Staff_Name': 'DOCTOR', 'Doctor_Name': 'DOCTOR',
    'Patient_DOB': 'DATE', 'Treatment_Date': 'DATE',  # 'Treatment_Time' deliberately unmapped
    'Hospital_Name': 'HOSPITAL', 'Ward_Location': 'HOSPITAL',
    'Age': 'AGE',
    'Other_Govt_ID': 'ID', 'Insurance_Number': 'ID', 'Patient_ID': 'ID',
    'Misc_Medical_ID': 'ID', 'Aadhar': 'ID', 'Driver_License': 'ID',
    'Voter_ID': 'ID', 'PAN_Card': 'ID', 'Device_Number': 'ID',
    'Phone_No': 'CONTACT', 'Landline': 'CONTACT', 'Email': 'CONTACT',
    'IP_Address': 'CONTACT', 'Fax': 'CONTACT', 'Web_url': 'CONTACT',
    'City': 'LOCATION', 'State': 'LOCATION', 'Street': 'LOCATION',
    'Zip': 'LOCATION', 'Country': 'LOCATION', 'Other_Location': 'LOCATION',
}

# Keys kept on every entity dict after normalisation.
_SPAN_KEYS = ('label', 'start', 'end')

def map_labels(annotations):
    """Normalise entity dicts in place for evaluation.

    Args:
        annotations: List of documents, each a list of entity dicts that have
            at least 'label', 'start' and 'end'.

    For every entity this
      * replaces a fine-grained label with its coarse category (labels without
        a mapping are left unchanged),
      * converts 'start' and 'end' to int,
      * removes all other keys, so that a gold and a predicted entity with the
        same label and span compare equal as dicts (gold entities carry
        'fake_entity', predictions carry 'text').

    Returns:
        None; ``annotations`` is modified in place. Calling it twice is harmless.
    """
    for item in annotations:
        for value in item:
            value['label'] = _COARSE_LABEL.get(value['label'], value['label'])
            value['start'] = int(value['start'])
            value['end'] = int(value['end'])
            for extra_key in [k for k in value if k not in _SPAN_KEYS]:
                del value[extra_key]

# Score each model's predictions against the gold annotations and store the
# overall and per-tag results as JSON under results/.
for key in ['llama3','gemma','mistral']:
    print(key)
    gold_annotations,pred_annotations = get_annotations(key)
    print(len(gold_annotations), len(pred_annotations))

    # Both sides go through the same in-place label normalisation.
    for annotation_set in (gold_annotations, pred_annotations):
        map_labels(annotation_set)

    evaluator = Evaluator(gold_annotations, pred_annotations, tags=labels)
    results, results_per_tag = evaluator.evaluate()

    if not os.path.exists('results/'):
        os.makedirs('results/')

    outputs = (
        (f'results/{key}_results.json', results),
        (f'results/{key}_results_per_tag.json', results_per_tag),
    )
    for out_path, payload in outputs:
        with open(out_path, 'w') as out_file:
            json.dump(payload, out_file, indent=4)
print('Fin!!!!!!')