In [None]:
!pip install flair --quiet
!pip install "flair[word-embeddings]"
!pip install biopython
!pip install spacy
!pip install nltk

In [None]:
import os
import ssl
import string
import pandas as pd
from tqdm import tqdm
import numpy as np
from typing import List, Tuple, Optional
import re
from Bio import Entrez
import spacy
import matplotlib.pyplot as plt
import ast

from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.model_selection import train_test_split

from flair.data import Sentence
from flair.embeddings import (
    WordEmbeddings,
    FlairEmbeddings,
    StackedEmbeddings,
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.datasets import ColumnCorpus

from gensim.models import Word2Vec
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [2]:
# Work around SSL certificate verification failures during downloads.
# The fallback is skipped when PYTHONHTTPSVERIFY is set to a non-empty value
# or when this Python build has no unverified-context factory.
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one for the whole process.
https_verify_requested = bool(os.environ.get('PYTHONHTTPSVERIFY', ''))
unverified_context_factory = getattr(ssl, '_create_unverified_context', None)
if unverified_context_factory and not https_verify_requested:
    ssl._create_default_https_context = ssl._create_unverified_context

# Helper functions

In [None]:
def get_evaluation_metrics(pred_drug: List[List[str]], true_drug: List[List[str]], pred_adr: List[List[str]], true_adr: List[List[str]]) -> pd.DataFrame:
    """
    Compute precision, recall, and F1 score for predicted drugs/ADRs.

    Entities are compared per sample by exact string equality, with duplicates
    inside a sample collapsed:
    - true positive:  predicted entity that is also in the true list
    - false positive: predicted entity that is not in the true list
    - false negative: true entity that was not predicted

    Args:
        pred_drug: Predicted drug names, one list per sample.
        true_drug: Gold drug names, one list per sample.
        pred_adr: Predicted ADR terms, one list per sample.
        true_adr: Gold ADR terms, one list per sample.

    Returns:
        DataFrame with one row per "drug", "adr" and "overall", and the columns
        "Metric", "Precision", "Recall" and "F1 Score". A metric whose
        denominator is zero is reported as 0.0 instead of raising.
    """
    def _tally(predicted, actual):
        # Per-sample (tp, fp, fn) counts using set semantics.
        predicted_set, actual_set = set(predicted), set(actual)
        return (
            len(predicted_set & actual_set),
            len(predicted_set - actual_set),
            len(actual_set - predicted_set),
        )

    def _safe_ratio(numerator, denominator):
        # Avoid ZeroDivisionError when nothing was predicted or nothing matched.
        return numerator / denominator if denominator else 0.0

    def _precision_recall_f1(tp, fp, fn):
        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        f1 = _safe_ratio(2 * precision * recall, precision + recall)
        return precision, recall, f1

    drug_tp = drug_fp = drug_fn = 0
    adr_tp = adr_fp = adr_fn = 0

    # A single zip keeps the four inputs aligned sample by sample.
    for sample_pred_drug, sample_true_drug, sample_pred_adr, sample_true_adr in zip(pred_drug, true_drug, pred_adr, true_adr):
        tp, fp, fn = _tally(sample_pred_drug, sample_true_drug)
        drug_tp, drug_fp, drug_fn = drug_tp + tp, drug_fp + fp, drug_fn + fn

        tp, fp, fn = _tally(sample_pred_adr, sample_true_adr)
        adr_tp, adr_fp, adr_fn = adr_tp + tp, adr_fp + fp, adr_fn + fn

    precision_drug, recall_drug, f1_drug = _precision_recall_f1(drug_tp, drug_fp, drug_fn)
    precision_adr, recall_adr, f1_adr = _precision_recall_f1(adr_tp, adr_fp, adr_fn)

    # Overall metrics pool the drug and ADR counts (micro-average)
    precision, recall, f1 = _precision_recall_f1(
        drug_tp + adr_tp, drug_fp + adr_fp, drug_fn + adr_fn
    )

    results = pd.DataFrame({"Metric": ["drug", "adr", "overall"],
                            "Precision": [precision_drug, precision_adr, precision],
                            "Recall": [recall_drug, recall_adr, recall],
                            "F1 Score": [f1_drug, f1_adr, f1]})

    return results

"""Sequence tag functions"""
def clean_string(text: str) -> str:
    """
    Normalise a string value:
    - trims leading/trailing whitespace
    - then trims leading/trailing ASCII punctuation
    - lowercases the result

    Non-string inputs are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    trimmed = text.strip().strip(string.punctuation)
    return trimmed.lower()

def _get_entity_spans(entity: str, text: str) -> Tuple[int, int]:
    """
    Returns start and end indexes of entity occurence in text.
    """
    literal_str = re.escape(entity)
    matched_results = re.finditer(literal_str, text, flags=re.IGNORECASE)
    
    return [m.span() for m in matched_results]

def sequence_tag_text(drugname: str, adr: str, text: str, drug_spans: Optional[List[Tuple[int, int]]] = None, adr_spans: Optional[List[Tuple[int, int]]] = None, detect_spans: bool = True) -> List[Tuple[str, str]]:
    """
    Sequence tag an input text with IOB tags for drugname and ADR entities.

    The following tags are generated:
    - B-DRUG: Beginning of Drug entity
    - I-DRUG: Inside Drug entity
    - B-ADR: Beginning of ADR entity
    - I-ADR: Inside ADR entity
    - O: Outside any entity

    Args:
        drugname: Drug entity string; only used when `detect_spans` is True.
        adr: ADR entity string; only used when `detect_spans` is True.
        text: Text to tokenize and tag.
        drug_spans: (start, end) character offsets of drug mentions; used when
            `detect_spans` is False. None is treated as no spans.
        adr_spans: (start, end) character offsets of ADR mentions; used when
            `detect_spans` is False. None is treated as no spans.
        detect_spans: When True, spans are found by searching `text` for
            `drugname` and `adr`, and any spans passed in are ignored.

    Returns:
        List of (token text, tag) pairs, one per token of `text`.
    """
    sentence = Sentence(text, use_tokenizer=True)

    # Initialize all tags as Others first
    tags = ['O'] * len(sentence)

    # Detect spans if not provided
    if detect_spans:
        drug_spans = _get_entity_spans(drugname, text)
        adr_spans = _get_entity_spans(adr, text)

    # Treat missing spans as "no entities" instead of failing on None
    if drug_spans is None:
        drug_spans = []
    if adr_spans is None:
        adr_spans = []

    # ADR spans are applied last, so they win where spans overlap
    for label, spans in (('DRUG', drug_spans), ('ADR', adr_spans)):
        for start, end in spans:
            for i, w in enumerate(sentence):
                w_start, w_end = w.start_position, w.end_position

                # Only tokens lying entirely inside the span are tagged
                if w_start >= start and w_end <= end:
                    # A token starting exactly at the span start begins the entity
                    tags[i] = f'B-{label}' if w_start == start else f'I-{label}'

    # Remove any I- tags that are preceded by O tags. Reading the already
    # cleaned tags[i - 1] means a whole run of orphan I- tags is cleared,
    # not just its first element.
    for i in range(1, len(tags)):
        if tags[i].startswith('I-') and tags[i - 1] == 'O':
            tags[i] = 'O'

    return list(zip([token.text for token in sentence], tags))

def save_sequence_tagged_data(sequence_tagged_pairs: List[List[Tuple[str, str]]], output_dir: str, output_filename: str) -> None:
    """
    Write sequence tagged sentences to a column-formatted text file.

    Every token is written on its own line as "<token> <tag>"; each sentence
    is followed by one blank line. The file at `output_dir/output_filename`
    is overwritten and encoded as UTF-8.
    """
    target_path = os.path.join(output_dir, output_filename)
    with open(target_path, 'w', encoding='utf-8') as handle:
        for tagged_sentence in sequence_tagged_pairs:
            handle.writelines(f"{token} {label}\n" for token, label in tagged_sentence)

            # Sentence separator
            handle.write("\n")
            
"""Cosine Similarity functions"""
def get_mean_word_vector(phrase: str, wordVector_model) -> Optional[np.ndarray]:
    """
    Represent a (possibly multi-word) phrase as the mean of its word vectors.

    Args:
        phrase: Phrase to embed; it is split on whitespace.
        wordVector_model: Object exposing a `wv` attribute that supports
            `word in wv` and `wv[word]`.

    Returns:
        Element-wise mean of the vectors of all words found in `wv`, or None
        when none of the words are in the vocabulary.
    """
    keyed_vectors = wordVector_model.wv

    # Out-of-vocabulary words are skipped, not treated as zero vectors
    vectors = [keyed_vectors[word] for word in phrase.split() if word in keyed_vectors]
    if not vectors:
        return None

    return np.mean(vectors, axis=0)

def get_pair_cosine_similarity(drug: str, effect: str, wordVector_model) -> float:
    """
    Cosine similarity between a drug term and an effect term.

    Each term is embedded as the mean of its word vectors, so multi-word terms
    are supported. Returns None when either term has no embedding.
    """
    term_vectors = [get_mean_word_vector(term, wordVector_model) for term in (drug, effect)]

    # Without both embeddings there is nothing to compare
    if any(vector is None for vector in term_vectors):
        return None

    drug_vector, effect_vector = term_vectors
    similarity_matrix = cos_sim([drug_vector], [effect_vector])
    return similarity_matrix[0][0]

def compute_cosine_similarities(drug_adr_df: pd.DataFrame, lstm_model, wordvec_model) -> pd.DataFrame:
    """
    Score every predicted drug/ADR pair of every row with cosine similarity.

    For each row, `lstm_model` tags the row's 'text'; the distinct DRUG and ADR
    entity texts are collected and every drug x ADR combination is scored with
    `wordvec_model`.

    Returns a DataFrame with the columns:
    - "drug": distinct predicted drug texts
    - "effect": distinct predicted ADR texts
    - "cosine_similarities": list of (drug, adr, score) tuples, or a single
      (None, None, None) tuple when either entity type is missing
    """
    records = []
    for _, record in tqdm(drug_adr_df.iterrows(), total=len(drug_adr_df)):
        tagged = Sentence(record['text'])
        lstm_model.predict(tagged)

        # Bucket predicted entity texts by label; other labels are ignored
        found = {'DRUG': [], 'ADR': []}
        for span in tagged.get_spans('ner'):
            label = span.get_label('ner').value
            if label in found:
                found[label].append(span.text)

        # De-duplicate
        unique_drugs = list(set(found['DRUG']))
        unique_adrs = list(set(found['ADR']))

        # Score the full cross product of drugs and ADRs
        if unique_drugs and unique_adrs:
            scored_pairs = [
                (drug, adr, get_pair_cosine_similarity(drug, adr, wordvec_model))
                for drug in unique_drugs
                for adr in unique_adrs
            ]
        else:
            scored_pairs = [(None, None, None)]

        records.append({"drug": unique_drugs, "effect": unique_adrs, "cosine_similarities": scored_pairs})

    return pd.DataFrame(records)

"""Dependency Parsing functions"""
def _identify_noun_phrases(sentence: str) -> List[str]:
    """
    Identify noun phrases in the input sentence.

    The sentence is tokenized and POS-tagged with NLTK, then chunked with a
    regular-expression grammar.

    Returns:
        One string per NP chunk (its words joined by single spaces), in the
        order the chunks are produced by the parser.
    """
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    # Pattern for noun phrases: optional determiner, adjectives, nouns
    NP_grammar = r'NP: {<DT>?<JJ>*<NN|NNS|NNP|NNPS>+}'

    chunk_parser = nltk.RegexpParser(NP_grammar)
    parsed = chunk_parser.parse(pos_tags)

    return [
        " ".join(word for word, _ in subtree.leaves())
        for subtree in parsed.subtrees()
        if subtree.label() == "NP"
    ]

def _cross_ref_database(noun_phrases: List[str], drug_database: List[str], adr_database: List[str]) -> Tuple[List[str], List[str]]:
    """
    Extract noun phrases that are present in drug and ADR databases.
    """
    # case insensitive
    drugs = [np.lower() for np in noun_phrases if any(np.lower() in db_drug for db_drug in drug_database)]
    adrs = [np.lower() for np in noun_phrases if any(np.lower() in db_adr for db_adr in adr_database)]

    return list(set(drugs)), list(set(adrs))


def identify_drug_adr_dp(sentence: str, drug_database: List[str], adr_database: List[str]) -> Tuple[List[str], List[str]]:
    """
    Extract drugs and ADRs using regex pattern matching on causal grammatical structures.

    The sentence is searched for "<words> <causal term> <words>". Noun phrases
    found on either side of a causal term are cross-referenced with the drug
    and ADR databases.

    Args:
        sentence: Raw input sentence.
        drug_database: Known drug names.
        adr_database: Known ADR terms.

    Returns:
        (drugs, adrs): noun phrases matched in the respective database; both
        lists are empty when no causal structure is found.
    """
    # List of candidate causal terms
    causal_terms = ['caused', 'causes', 'cause', 'causing',
                    'experience', 'experienced', 'experiences', 'experiencing',
                    'induced', 'induces', 'inducing', 'induce',
                    'led', 'leading', 'leads to',
                    'correlate', 'correlated', 'correlates', 'correlating',
                    'associated', 'associate', 'associates', 'associating',
                    'resulted', 'result', 'results', 'resulting',
                    'due to', 'because of']

    # regex pattern to match causal grammatical structures:
    # word run (NP1) + whitespace + causal term + whitespace + word run (NP2)
    causal_pattern = '|'.join([re.escape(c) for c in causal_terms])
    regex_pattern = r'(?P<NP1>\b\w+(?: \w+)*\b)\s+(?:' + causal_pattern + r')\s+(?P<NP2>\b\w+(?: \w+)*\b)'

    # Extract all NPs on both sides of every causal match
    nps = []
    for match in re.finditer(regex_pattern, sentence, re.IGNORECASE):
        nps.extend(_identify_noun_phrases(match.group('NP1')))
        nps.extend(_identify_noun_phrases(match.group('NP2')))

    # Nothing to cross-reference when no causal structure was found
    if len(nps) == 0:
        return [], []

    # Cross-reference with drug/ADR database to identify drug and ADR pairs
    drugs, adrs = _cross_ref_database(nps, drug_database, adr_database)

    return drugs, adrs


# Collate drug and ADR database
Obtain drug/ADR names from the FAERS and CVP databases covering the last 10 years. Also include the drug/ADR names found in the HuggingFace training data.

## FAERS data

In [45]:
# Concatenate FAERS data: every DRUG* file supplies (primaryid, drugname) and
# every REAC* file supplies (primaryid, drug_rec_act); files are '$'-delimited.
# NOTE(review): FAERS REAC files also carry a `pt` (preferred term) column for
# the reaction itself — confirm `drug_rec_act` is the intended ADR field.
faers_dir = "data/FAERS"
faers_drugname_files = [os.path.join(faers_dir, name) for name in os.listdir(faers_dir) if name.startswith("DRUG")]
faers_reaction_files = [os.path.join(faers_dir, name) for name in os.listdir(faers_dir) if name.startswith("REAC")]

faers_drugname_df = pd.concat([
    pd.read_csv(path, sep='$', usecols=['primaryid', 'drugname'])
    for path in tqdm(faers_drugname_files)
])

faers_reaction_df = pd.concat([
    pd.read_csv(path, sep='$', usecols=['primaryid', 'drug_rec_act'])
    for path in tqdm(faers_reaction_files)
])


100%|██████████| 43/43 [01:58<00:00,  2.75s/it]
100%|██████████| 43/43 [00:36<00:00,  1.19it/s]


In [46]:
def _normalise_faers_value(value):
    """Trim whitespace, drop trailing punctuation and lowercase strings; pass other values through."""
    if not isinstance(value, str):
        return value
    return value.strip().rstrip(string.punctuation).lower()

# Normalise every cell, then de-duplicate each table before joining
drugname_df = faers_drugname_df.map(_normalise_faers_value).drop_duplicates()
reaction_df = faers_reaction_df.map(_normalise_faers_value).drop_duplicates()

# Join drug names and ADRs on primaryid, keep only complete and distinct records
faers_df = (
    drugname_df
    .merge(reaction_df, on='primaryid', how='inner')
    .dropna(subset=['drugname', 'drug_rec_act'])
    .drop_duplicates()
)

# Write out extracted FAERS data
faers_df.to_csv("data/FAERS/FAERS_train_data.csv", index=False)

## CVP data

In [47]:
# Load the CVP drug and reaction tables ('$'-delimited, no header row).
# Only the report id and the drug name / reaction term columns are read.
cvp_dir = "data/CVP"
cvp_drug_file = os.path.join(cvp_dir, "report_drug.txt")
cvp_reaction_file = os.path.join(cvp_dir, "reactions.txt")

cvp_read_options = {"sep": '$', "header": None}
cvp_drug_df = pd.read_csv(cvp_drug_file, usecols=[1, 3], names=["report_id", "drugname"], **cvp_read_options)
cvp_reaction_df = pd.read_csv(cvp_reaction_file, usecols=[1, 5], names=["report_id", "adr"], **cvp_read_options)

In [48]:
# Join drug names and ADRs on report_id, normalise the text (trim + lowercase),
# then keep only complete, distinct records
cvp_df = (
    cvp_drug_df
    .merge(cvp_reaction_df, on='report_id', how='inner')
    .drop_duplicates()
    .map(lambda value: value.strip().lower() if isinstance(value, str) else value)
    .dropna(subset=['drugname', 'adr'])
    .drop_duplicates()
)

# Write out extracted CVP data
cvp_df.to_csv("data/CVP/CVP_train_data.csv", index=False)

## Obtain databases

In [None]:
# Gather database of drug names and ADR terms
huggingFace_train_data = pd.read_csv("data/HuggingFace/train_split.csv", usecols=['drug', 'effect'])
faers_train_data = pd.read_csv("data/FAERS/FAERS_train_data.csv", usecols=['drugname' ,'drug_rec_act'])
cvp_train_data = pd.read_csv("data/CVP/CVP_train_data.csv", usecols=['drugname', 'adr'])


def _collect_terms(*columns: pd.Series) -> set:
    """Union of the non-null, non-blank values of the given columns, as strings."""
    terms = set()
    for column in columns:
        # read_csv turns values such as "na"/"null" into NaN; skip them so the
        # literal term "nan" does not end up in the database
        terms.update(str(term) for term in column.dropna())
    return {term for term in terms if term.strip()}


drug_database = _collect_terms(huggingFace_train_data['drug'], faers_train_data['drugname'], cvp_train_data['drugname'])
adr_database = _collect_terms(huggingFace_train_data['effect'], faers_train_data['drug_rec_act'], cvp_train_data['adr'])

# Save drug and ADR databases as txt files (one term per line); sorted so the
# files are identical from run to run
with open("data/drug_database.txt", 'w', encoding='utf-8') as f:
    for drug in sorted(drug_database):
        f.write(f"{drug}\n")
with open("data/adr_database.txt", 'w', encoding='utf-8') as f:
    for adr in sorted(adr_database):
        f.write(f"{adr}\n")

# Sequence tag data for LSTM training
Sequence tag the HuggingFace data (train/validation/test splits) with IOB tags in preparation for LSTM training.

In [23]:
# Load HuggingFace data
hf_df = pd.read_parquet("hf://datasets/ade-benchmark-corpus/ade_corpus_v2/Ade_corpus_v2_drug_ade_relation/train-00000-of-00001.parquet")

# Clean strings in data (non-string cells are returned unchanged by clean_string)
# NOTE(review): clean_string also trims leading whitespace/punctuation from
# `text`; if any text starts with such characters, the character offsets in
# `indexes` no longer line up with the cleaned text — confirm.
for col in hf_df.select_dtypes(include='object').columns:
    hf_df[col] = hf_df[col].apply(clean_string)

# Shuffle data, and split into a 70/15/15 train/val/test split
train_df, temp_df = train_test_split(hf_df, test_size=0.3, random_state=45)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=45)

hf_data = [(train_df, "train"), (val_df, "val"), (test_df, "test")]

# Persist the splits: other cells read data/HuggingFace/train_split.csv and
# data/HuggingFace/test_split.csv, which nothing else in this notebook writes
hf_split_dir = "data/HuggingFace"
os.makedirs(hf_split_dir, exist_ok=True)
for split_df, split_name in hf_data:
    split_df.to_csv(os.path.join(hf_split_dir, f"{split_name}_split.csv"), index=False)

In [5]:
# Sequence tag HuggingFace data using provided start/end indexes.
# The files go to data/HuggingFace because that is the folder the ColumnCorpus
# in the training section reads them from.
seqtag_output_dir = 'data/HuggingFace'
os.makedirs(seqtag_output_dir, exist_ok=True)

for df, split in hf_data:
    sequence_tagged_data = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        indexes = row["indexes"]
        drug_spans = list(zip(indexes['drug']['start_char'], indexes['drug']['end_char']))
        adr_spans = list(zip(indexes['effect']['start_char'], indexes['effect']['end_char']))

        word_tag_pairs = sequence_tag_text(row['drug'], row['effect'], row['text'], drug_spans=drug_spans, adr_spans=adr_spans, detect_spans=False)
        sequence_tagged_data.append(word_tag_pairs)

    # Write out sequence tagged HuggingFace data
    save_sequence_tagged_data(sequence_tagged_data, output_dir=seqtag_output_dir, output_filename=f'huggingFace_seqtag_{split}_data.txt')

100%|██████████| 4774/4774 [00:08<00:00, 556.96it/s]
100%|██████████| 1023/1023 [00:01<00:00, 549.12it/s]
100%|██████████| 1024/1024 [00:01<00:00, 752.39it/s]


# Train & Evaluate base LSTM
Using sequence tagged data from HuggingFace, we train a LSTM model to predict drug and ADR pairs from input text. Validation and testing is done on sequence tagged validation and testing split from HuggingFace data.

## Training

In [71]:
# Label column to predict
label_type = "ner"

# Build the Flair corpus from the column-formatted files (token, NER tag)
columns = {0: "text", 1: "ner"}
corpus = ColumnCorpus(
    data_folder="data/HuggingFace",
    column_format=columns,
    train_file="huggingFace_seqtag_train_data.txt",
    test_file="huggingFace_seqtag_test_data.txt",
    dev_file="huggingFace_seqtag_val_data.txt",
)

# Label dictionary derived from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)

# Embedding stack: GloVe word vectors plus forward/backward Flair embeddings
embedding_types = [
    WordEmbeddings("glove"),
    FlairEmbeddings("news-forward"),
    FlairEmbeddings("news-backward"),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

2025-11-22 17:21:59,942 Reading data from data/HuggingFace
2025-11-22 17:21:59,961 Train: data/HuggingFace/huggingFace_seqtag_train_data.txt


2025-11-22 17:21:59,970 Dev: data/HuggingFace/huggingFace_seqtag_val_data.txt
2025-11-22 17:21:59,976 Test: data/HuggingFace/huggingFace_seqtag_test_data.txt
2025-11-22 17:22:09,820 Computing label dictionary. Progress:


0it [00:00, ?it/s]
4774it [00:00, 20235.15it/s]

2025-11-22 17:22:10,193 Dictionary created for label 'ner' with 2 values: ADR (seen 4856 times), DRUG (seen 4440 times)





Dictionary with 2 tags: ADR, DRUG


In [3]:
# Sequence tagger over the stacked embeddings, plus the trainer that fits it
# on the corpus
tagger_settings = {
    "hidden_size": 256,
    "embeddings": embeddings,
    "tag_dictionary": label_dict,
    "tag_type": label_type,
}
tagger = SequenceTagger(**tagger_settings)
trainer = ModelTrainer(tagger, corpus)

2025-11-18 22:37:59,886 SequenceTagger predicts: Dictionary with 9 tags: O, S-ADR, B-ADR, E-ADR, I-ADR, S-DRUG, B-DRUG, E-DRUG, I-DRUG


In [None]:
# Train the model; checkpoints and logs are written to taggers/<runname>
runname = "base-model"
output_dir = f"taggers/{runname}"
os.makedirs(output_dir, exist_ok=True)

training_options = {
    "learning_rate": 0.1,
    "mini_batch_size": 3,
    "save_model_each_k_epochs": 10,
    "monitor_test": True,
    "max_epochs": 10,
}
trainer.train(output_dir, **training_options)

## Testing

In [None]:
# Load trained lstm model
trained_lstm_model = SequenceTagger.load(
    'taggers/base-model/best-model.pt'
)

# Gold drug/ADR entities of the test split (one single-element list per row)
test_df = pd.read_csv("data/HuggingFace/test_split.csv")
actual_drugs = [[drug] for drug in test_df['drug']]
actual_adrs = [[effect] for effect in test_df['effect']]

# Run the LSTM tagger over every test sentence
test_results = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    sentence = Sentence(row['text'])
    trained_lstm_model.predict(sentence)

    # Extract predicted drug and ADR entities
    predicted_drugs = []
    predicted_adrs = []
    for entity in sentence.get_spans('ner'):
        label = entity.get_label('ner').value
        if label == 'DRUG':
            predicted_drugs.append(entity.text)
        elif label == 'ADR':
            predicted_adrs.append(entity.text)

    test_results.append({"drug": list(set(predicted_drugs)), "effect": list(set(predicted_adrs))})

# Make sure the results folder exists before writing into it
evaluation_dir = "data/evaluation_results"
os.makedirs(evaluation_dir, exist_ok=True)

test_results = pd.DataFrame(test_results)
test_results.to_csv("data/evaluation_results/test_split_LSTM_results.csv", index=False)

# Compute evaluation metrics of the base LSTM predictions
evaluation_metrics = get_evaluation_metrics(
    pred_drug=test_results['drug'].tolist(),
    true_drug=actual_drugs,
    pred_adr=test_results['effect'].tolist(),
    true_adr=actual_adrs
)
evaluation_metrics.to_csv("data/evaluation_results/eval_LSTM.csv", index=False)
print(evaluation_metrics)

# Improving false positives
Generate cosine similarity scores for the drug/ADR pairs identified by the LSTM. Pairs are discarded as false positives if their cosine similarity score is < 0.4.

## Build word vector model from training data
Tokenize text and use it to train a word2vec model.

In [None]:
# Tokenize the training text with spaCy (NER and parser are disabled)
train_df = pd.read_csv("data/HuggingFace/train_split.csv")
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# One list of lowercased, non-punctuation tokens per training sentence
collection = []
for line in tqdm(train_df["text"]):
    doc = nlp(line.lower())
    collection.append([token.text for token in doc if not token.is_punct])

In [None]:
# Train a word2vec model on the tokenized training sentences.
# sg=1 selects skip-gram; min_count=1 keeps every token in the vocabulary.
word2vec_settings = {
    "vector_size": 100,
    "window": 5,
    "min_count": 1,
    "sg": 1,
    "negative": 8,
}
wordVec_model = Word2Vec(sentences=collection, **word2vec_settings)

In [None]:
# Save word vectors
word_vectors = wordVec_model.wv
word_vectors.save("data/word2vec_vectors.wordvectors")

# Also save the full model: later cells restore it with
# Word2Vec.load("data/word2vec.model") and read its `.wv` attribute
wordVec_model.save("data/word2vec.model")

## Assess cosine similarity threshold
Assess the feasibility of using cosine similarities to identify true drug/ADR pairs using training data.

In [None]:
# Restore the trained tagger and the word2vec model, then score every
# LSTM-predicted drug/ADR pair of the training split
lstm_checkpoint_path = 'taggers/base-model/best-model.pt'
word2vec_model_path = "data/word2vec.model"

trained_lstm_model = SequenceTagger.load(lstm_checkpoint_path)
wordVec_model = Word2Vec.load(word2vec_model_path)
train_df = pd.read_csv("data/HuggingFace/train_split.csv")
train_cosine_similarities = compute_cosine_similarities(train_df, trained_lstm_model, wordVec_model)

# Save cosine similarities
train_cosine_similarities.to_csv("data/train_split_cosine_scores.csv", index=False)

In [None]:
# Plot cosine similarities distribution.
# Only real pairs with a score are kept: rows without a drug/ADR pair hold a
# (None, None, None) placeholder, and a score is None when either term has no
# word vector.
hist_data = [
    score
    for pairs in train_cosine_similarities['cosine_similarities']
    for drug, adr, score in pairs
    if drug and adr and score is not None
]

fig, ax = plt.subplots()
ax.hist(hist_data, bins=50)
ax.set_title('Cosine Similarities Distribution (Training Data)')
ax.set_xlabel('Cosine Similarity')
ax.set_ylabel('Frequency')

# Make sure the results folder exists before saving the figure
os.makedirs("data/evaluation_results", exist_ok=True)
fig.savefig("data/evaluation_results/cosine_hist_train_split.png")
plt.close(fig)

## Incorporate cosine similarity in LSTM predictions
Discard any drug/ADR predictions with cosine similarity < 0.4 as false positives.

In [None]:
# Restore the trained sequence tagger and the word2vec model from disk
lstm_checkpoint_path = 'taggers/base-model/best-model.pt'
word2vec_model_path = "data/word2vec.model"

trained_lstm_model = SequenceTagger.load(lstm_checkpoint_path)
wordVec_model = Word2Vec.load(word2vec_model_path)

In [None]:
# Implement cosine similarity for test data
test_df = pd.read_csv("data/HuggingFace/test_split.csv")

# Pairs scoring below this similarity are treated as false positives
cosine_threshold = 0.4

cosine_results = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    sentence = Sentence(row['text'])
    trained_lstm_model.predict(sentence)

    # Extract LSTM predicted drug and ADR entities
    predicted_drugs = []
    predicted_adrs = []
    for entity in sentence.get_spans('ner'):
        label = entity.get_label('ner').value
        if label == 'DRUG':
            predicted_drugs.append(entity.text)
        elif label == 'ADR':
            predicted_adrs.append(entity.text)

    # Pairs can only be scored when both entity types were predicted
    if len(predicted_drugs) > 0 and len(predicted_adrs) > 0:
        drugs = []
        adrs = []
        for drug in predicted_drugs:
            for adr in predicted_adrs:
                cosine_similarity = get_pair_cosine_similarity(drug, adr, wordVec_model)

                # Keep the pair when it is similar enough, or when no score
                # could be computed (a term without any word vector)
                if cosine_similarity is None or cosine_similarity >= cosine_threshold:
                    drugs.append(drug)
                    adrs.append(adr)

    # Keep LSTM predictions if no cosine similarity available
    else:
        drugs = predicted_drugs
        adrs = predicted_adrs

    cosine_results.append({"drug": list(set(drugs)), "effect": list(set(adrs))})

# Make sure the results folder exists before writing into it
os.makedirs("data/evaluation_results", exist_ok=True)

cosine_results = pd.DataFrame(cosine_results)
cosine_results.to_csv("data/evaluation_results/test_split_LSTM_cosine_results.csv", index=False)

In [None]:
# Evaluate the cosine-filtered predictions against the gold test annotations
test_df = pd.read_csv("data/HuggingFace/test_split.csv")
actual_drugs, actual_adrs = [], []
for gold_drug, gold_effect in tqdm(zip(test_df['drug'], test_df['effect']), total=len(test_df)):
    actual_drugs.append([gold_drug])
    actual_adrs.append([gold_effect])

# Predictions were stored as stringified lists; parse them back into lists
final_results = pd.read_csv("data/evaluation_results/test_split_LSTM_cosine_results.csv")
predicted_drugs = final_results['drug'].map(ast.literal_eval).tolist()
predicted_adrs = final_results['effect'].map(ast.literal_eval).tolist()

eval_metrics = get_evaluation_metrics(
    pred_drug=predicted_drugs,
    true_drug=actual_drugs,
    pred_adr=predicted_adrs,
    true_adr=actual_adrs,
)
eval_metrics.to_csv("data/evaluation_results/eval_cosine_corrected.csv", index=False)

print(eval_metrics)

# Improving true positives
Implement dependency parsing to extract drug/ADR pairs that the LSTM might have missed.

In [None]:
# Restore the trained sequence tagger and the word2vec model from disk
lstm_checkpoint_path = 'taggers/base-model/best-model.pt'
word2vec_model_path = "data/word2vec.model"

trained_lstm_model = SequenceTagger.load(lstm_checkpoint_path)
wordVec_model = Word2Vec.load(word2vec_model_path)

In [None]:
# Implement dependency parsing for test data
test_df = pd.read_csv("data/HuggingFace/test_split.csv")

# One term per line; drop the trailing newline and skip blank lines
with open("data/drug_database.txt", 'r', encoding='utf-8') as f:
    drug_database = [line.strip() for line in f if line.strip()]
with open("data/adr_database.txt", 'r', encoding='utf-8') as f:
    adr_database = [line.strip() for line in f if line.strip()]

results = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    sentence = Sentence(row['text'])
    trained_lstm_model.predict(sentence)

    # Extract LSTM predicted drug and ADR entities
    drugs = []
    adrs = []
    for entity in sentence.get_spans('ner'):
        label = entity.get_label('ner').value
        if label == 'DRUG':
            drugs.append(entity.text)
        elif label == 'ADR':
            adrs.append(entity.text)

    # Fall back to the pattern-based matches for whichever entity type the
    # LSTM did not predict
    dp_drugs, dp_adrs = identify_drug_adr_dp(row['text'], drug_database, adr_database)
    if len(drugs) == 0:
        drugs = dp_drugs
    if len(adrs) == 0:
        adrs = dp_adrs

    results.append({
        'drug': list(set(drugs)),
        'effect': list(set(adrs)),
    })

# Make sure the results folder exists before writing into it
os.makedirs("data/evaluation_results", exist_ok=True)

results = pd.DataFrame(results)
results.to_csv("data/evaluation_results/test_split_LSTM_and_DP_results.csv", index=False)

In [None]:
# Evaluate the LSTM + dependency parsing predictions against the gold test annotations
test_df = pd.read_csv("data/HuggingFace/test_split.csv")
actual_drugs, actual_adrs = [], []
for gold_drug, gold_effect in tqdm(zip(test_df['drug'], test_df['effect']), total=len(test_df)):
    actual_drugs.append([gold_drug])
    actual_adrs.append([gold_effect])

# Predictions were stored as stringified lists; parse them back into lists
final_results = pd.read_csv("data/evaluation_results/test_split_LSTM_and_DP_results.csv")
predicted_drugs = final_results['drug'].map(ast.literal_eval).tolist()
predicted_adrs = final_results['effect'].map(ast.literal_eval).tolist()

eval_metrics = get_evaluation_metrics(
    pred_drug=predicted_drugs,
    true_drug=actual_drugs,
    pred_adr=predicted_adrs,
    true_adr=actual_adrs,
)
eval_metrics.to_csv("data/evaluation_results/eval_DP_corrected.csv", index=False)
print(eval_metrics)

# Integrate improvements into LSTM model
Use LSTM to make base predictions. Discard predictions using cosine similarity, then use dependency parsing to supplement discarded/missed predictions.

In [None]:
# Restore the trained sequence tagger and the word2vec model from disk
lstm_checkpoint_path = 'taggers/base-model/best-model.pt'
word2vec_model_path = "data/word2vec.model"

trained_lstm_model = SequenceTagger.load(lstm_checkpoint_path)
wordVec_model = Word2Vec.load(word2vec_model_path)

In [None]:
# Load test dataset
test_df = pd.read_csv("data/HuggingFace/test_split.csv")

# One term per line; drop the trailing newline and skip blank lines
with open("data/drug_database.txt", 'r', encoding='utf-8') as f:
    drug_database = [line.strip() for line in f if line.strip()]
with open("data/adr_database.txt", 'r', encoding='utf-8') as f:
    adr_database = [line.strip() for line in f if line.strip()]

# Pairs scoring below this similarity are treated as false positives
cosine_threshold = 0.4

final_results = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    sentence = Sentence(row['text'])
    trained_lstm_model.predict(sentence)

    # Extract LSTM predicted drug and ADR entities
    predicted_drugs = []
    predicted_adrs = []
    for entity in sentence.get_spans('ner'):
        label = entity.get_label('ner').value
        if label == 'DRUG':
            predicted_drugs.append(entity.text)
        elif label == 'ADR':
            predicted_adrs.append(entity.text)

    dp_drugs, dp_adrs = identify_drug_adr_dp(row['text'], drug_database, adr_database)

    # If LSTM has predictions for both entity types, filter the pairs
    if len(predicted_drugs) > 0 and len(predicted_adrs) > 0:
        kept_drugs = []
        kept_adrs = []

        for drug in predicted_drugs:
            for adr in predicted_adrs:
                cosine_similarity = get_pair_cosine_similarity(drug, adr, wordVec_model)

                # Keep the pair when it is similar enough, or when no score
                # could be computed (a term without any word vector)
                if cosine_similarity is None or cosine_similarity >= cosine_threshold:
                    kept_drugs.append(drug)
                    kept_adrs.append(adr)

        # Priority: pairs that survived the cosine filter; else dependency
        # parsing matches; else the unfiltered LSTM predictions
        drugs = kept_drugs or dp_drugs or predicted_drugs
        adrs = kept_adrs or dp_adrs or predicted_adrs

    # If LSTM has no predictions or missing half of the pair, use dependency
    # parsing results for the missing entity type
    else:
        drugs = predicted_drugs or dp_drugs
        adrs = predicted_adrs or dp_adrs

    final_results.append({
        'drug': list(set(drugs)),
        'effect': list(set(adrs)),
    })

# Make sure the results folder exists before writing into it
os.makedirs("data/evaluation_results", exist_ok=True)

final_results = pd.DataFrame(final_results)
final_results.to_csv("data/evaluation_results/test_split_finalimprov_results.csv", index=False)

In [None]:
# Evaluate the integrated (LSTM + cosine + dependency parsing) predictions
# against the gold test annotations
test_df = pd.read_csv("data/HuggingFace/test_split.csv")
actual_drugs, actual_adrs = [], []
for gold_drug, gold_effect in tqdm(zip(test_df['drug'], test_df['effect']), total=len(test_df)):
    actual_drugs.append([gold_drug])
    actual_adrs.append([gold_effect])

# Predictions were stored as stringified lists; parse them back into lists
final_results = pd.read_csv("data/evaluation_results/test_split_finalimprov_results.csv")
predicted_drugs = final_results['drug'].map(ast.literal_eval).tolist()
predicted_adrs = final_results['effect'].map(ast.literal_eval).tolist()

eval_metrics = get_evaluation_metrics(
    pred_drug=predicted_drugs,
    true_drug=actual_drugs,
    pred_adr=predicted_adrs,
    true_adr=actual_adrs,
)
eval_metrics.to_csv("data/evaluation_results/eval_finalimprov.csv", index=False)

print(eval_metrics)