# Installing Dependencies



In [24]:
!pip install transformers torch pandas datasets pandarallel

Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... done
  Created wheel for pandarallel: filename=pandarallel-1.6.5-py3-none-any.whl size=16674 sha256=c12ab5d5a640d874ed51ab6e017063e636dde47f8f20055c5d54689b58829f26
  Stored in directory: /root/.cache/pip/wheels/50/4f/1e/34e057bb868842209f1623f195b74fd7eda229308a7352d47f
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.5


# Model Initialization and Setup

In [25]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Initialize the tokenizer and model.
# Both are loaded from the same checkpoint name so that the tokenizer and the
# token-classification model are paired with each other.
model_name = 'bigbio/bigbio-mtl'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Data Loading and Preprocessing

In [28]:
import json

from google.colab import drive

# Mount Google Drive so the dataset file below is reachable from the runtime.
drive.mount('/content/gdrive')

file_path = '/content/gdrive/MyDrive/combined_dataset.json'

# Load data from combined_dataset.json
with open(file_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Collect the text of every annotated entry. Only 'note_text' is used downstream,
# so the entry keys are not needed and we iterate over the values directly.
notes_data = [
    {'note_text': entry['note_text']}
    for entry in data['annotated_entries'].values()
]

# Create a DataFrame with a single 'note_text' column (one row per entry).
df = pd.DataFrame(notes_data)

In [29]:
# Sanity check: number of rows in df.
print(len(df))

908


In [30]:
# Sanity check: column labels of df.
print(df.columns)

Index(['note_text'], dtype='object')


# NER Extraction and Filtering Setup

In [31]:
# TUIs to keep when narrowing down the semantic types table
relevant_tuis = {
    'T116', 'T200', 'T047', 'T060', 'T050', 'T033', 'T037', 'T074',
    'T191', 'T046', 'T121', 'T184', 'T061', 'T017', 'T031', 'T022',
    'T025', 'T028', 'T044', 'T040', 'T024'
}

# Read the pipe-delimited, header-less semantic types file; column names are supplied here
semantic_types_df = pd.read_csv(
    '/content/gdrive/MyDrive/semantic_types_file.txt',
    sep='|',
    header=None,
    names=['Abbreviation', 'TUI', 'Full_Name'],
)

# Keep only the rows whose TUI is one of the relevant ones
relevant_semantic_types_df = semantic_types_df.loc[
    semantic_types_df['TUI'].isin(relevant_tuis)
]

# Build the TUI -> Full_Name lookup from the filtered rows
tui_to_full_name = {
    tui: full_name
    for tui, full_name in zip(
        relevant_semantic_types_df['TUI'],
        relevant_semantic_types_df['Full_Name'],
    )
}

In [32]:
def extract_ner_tags(text, max_length=512):
    """
    Runs the token-classification model on a note and returns per-word labels.

    Parameters:
    text (str): Note text. It is split on whitespace into words before tokenization.
        Non-string values (e.g. None or NaN for a missing note) and blank strings
        yield an empty result.
    max_length (int): Maximum number of tokens passed to the model. Longer input is
        truncated, so words beyond the limit do not appear in the result.

    Returns:
    list of tuples: One (word, labels) tuple per word that produced at least one token.
        labels is a sorted list of the distinct labels predicted for that word's tokens;
        each label is the trailing code parsed from the model's label name, replaced by
        its full semantic type name when the code is a key of tui_to_full_name.
    """
    # Nothing to tag for missing or blank notes; return before touching the model.
    if not isinstance(text, str):
        return []
    words = text.split()
    if not words:
        return []

    # Tokenize the pre-split words. No padding is requested: only one sequence is
    # processed per call, so padding up to max_length would just add wasted computation.
    inputs = tokenizer(
        words,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        is_split_into_words=True
    )
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # Get the predicted class index for each token of the (single) sequence
    predicted_token_class_ids = torch.argmax(logits, dim=-1)[0].tolist()
    word_ids = inputs.word_ids()

    # Group the predicted labels by the word each token came from
    word_to_labels = {}
    for word_id, label_id in zip(word_ids, predicted_token_class_ids):
        if word_id is None:
            # Tokens that do not map back to an input word are skipped
            continue
        label = model.config.id2label[label_id]
        # Extract the TUI from the label: text after the last '-', without trailing ')'
        tui = label.split('-')[-1].strip(')')
        # Map the TUI to its full name; unknown codes are kept as-is
        full_name = tui_to_full_name.get(tui, tui)
        word_to_labels.setdefault(word_id, set()).add(full_name)

    # Sort the labels so the output does not depend on set iteration order
    # (key=str keeps the sort safe even if a mapped name is not a string).
    return [
        (words[word_id], sorted(labels, key=str))
        for word_id, labels in word_to_labels.items()
    ]


In [33]:
# Create a set of relevant full semantic type names (the values of tui_to_full_name),
# used for membership checks when filtering labels.
relevant_full_names = set(tui_to_full_name.values())

def filter_relevant_ner_tags(ner_tags):
    """
    Keeps only the (word, labels) pairs that carry at least one relevant semantic type.

    Parameters:
    ner_tags (list of tuples): Pairs of a word and the list of semantic type labels attached to it.

    Returns:
    list of tuples: The pairs, in input order, for which some label is not 'O' and is
        present in relevant_full_names. A retained pair keeps its complete label list.
    """
    return [
        (word, labels)
        for word, labels in ner_tags
        # any() stops at the first relevant label, so each word is kept at most once
        if any(label != 'O' and label in relevant_full_names for label in labels)
    ]

In [34]:
# Set the maximum column width to None to display all content without truncation.
# This is a global pandas display option, so it affects every DataFrame shown afterwards.
pd.set_option('display.max_colwidth', None)

# Parallel NER Extraction

In [35]:
from pandarallel import pandarallel

# Initialize pandarallel with a progress bar
pandarallel.initialize(progress_bar=True)

# Apply the functions in parallel:
# first run NER over every note, then filter each note's tags down to the relevant ones.
# The second step reads the column written by the first, so the order matters.
df['raw_ner_tags'] = df['note_text'].parallel_apply(extract_ner_tags)
df['relevant_ner_tags'] = df['raw_ner_tags'].parallel_apply(filter_relevant_ner_tags)

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=908), Label(value='0 / 908'))),))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=908), Label(value='0 / 908'))),))

In [36]:
# Display the first rows with both raw and relevant NER tags. Leaving the frame as the
# cell's last expression lets the notebook render it as a table instead of plain text.
df[['note_text', 'raw_ner_tags', 'relevant_ner_tags']].head(5)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [37]:
# Save the DataFrame to a CSV file.
# The path is relative, so the file is written to the current working directory.
output_file_csv = "ner_results.csv"
# index=False leaves the row index out of the written file.
df.to_csv(output_file_csv, index=False)

print(f"DataFrame saved as {output_file_csv}")

DataFrame saved as ner_results.csv
