### Fundamentals of Natural Language Processing
# Negation and Uncertainty Detection using a Machine-Learning Based Approach

*Authors:*

> *Anna Blanco, Agustina Lazzati, Stanislav Bultaskii, Queralt Salvadó*

*Aims:*
> Rewrite for DL

In [3]:
# Import necessary libraries and functions
import json
import spacy
from collections import defaultdict
import re
import pandas as pd
import numpy as np
import pickle

In [6]:
with open("lstm_data.pkl", "rb") as f:
    data_dict = pickle.load(f)

lstm_train_data_neg_cue = data_dict["lstm_train_data_neg_cue"]
lstm_train_data_neg_scope = data_dict["lstm_train_data_neg_scope"]
lstm_train_data_unc_cue = data_dict["lstm_train_data_unc_cue"]
lstm_train_data_unc_scope = data_dict["lstm_train_data_unc_scope"]

lstm_test_data_neg_cue = data_dict["lstm_test_data_neg_cue"]
lstm_test_data_neg_scope = data_dict["lstm_test_data_neg_scope"]
lstm_test_data_unc_cue = data_dict["lstm_test_data_unc_cue"]
lstm_test_data_unc_scope = data_dict["lstm_test_data_unc_scope"]

print(lstm_train_data_neg_cue[2])

(['antecedents', 'alergia', 'a', 'penicilina', 'y', 'cloramfenicol', '.'], [0, 0, 0, 0, 0, 0, 0])


In [13]:
def merge_labels(cue_labels, scope_labels, cue_prefix="CUE", scope_prefix="SCOPE"):
    merged = []
    for cue, scope in zip(cue_labels, scope_labels):
        if cue != 0:
            merged.append(f"{cue_prefix}_{str(cue)}")
        elif scope != 0:
            merged.append(f"{scope_prefix}_{str(scope)}")
        else:
            merged.append(0)
    return merged

In [16]:
# Merge negation data
lstm_train_data_neg = [
    (tokens, merge_labels(cue_labels, scope_labels, cue_prefix="NEG", scope_prefix="NSCO"))
    for (tokens, cue_labels), (_, scope_labels) in zip(lstm_train_data_neg_cue, lstm_train_data_neg_scope)
]

lstm_test_data_neg = [
    (tokens, merge_labels(cue_labels, scope_labels, cue_prefix="NEG", scope_prefix="NSCO"))
    for (tokens, cue_labels), (_, scope_labels) in zip(lstm_test_data_neg_cue, lstm_test_data_neg_scope)
]

# Similarly for uncertainty
lstm_train_data_neg = [
    (tokens, merge_labels(cue_labels, scope_labels, cue_prefix="UNC", scope_prefix="UNSCO"))
    for (tokens, cue_labels), (_, scope_labels) in zip(lstm_train_data_neg_cue, lstm_train_data_neg_scope)
]

lstm_test_data_neg = [
    (tokens, merge_labels(cue_labels, scope_labels, cue_prefix="UNC", scope_prefix="UNSCO"))
    for (tokens, cue_labels), (_, scope_labels) in zip(lstm_test_data_neg_cue, lstm_test_data_neg_scope)
]

print(lstm_train_data_neg[3])

(['no', 'habitos', 'toxicos', '.'], ['UNC_1', 'UNSCO_1', 'UNSCO_1', 'UNSCO_1'])


In [7]:
!pip install fasttext

import fasttext

# Download the English fastText model
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# Unzip the downloaded file
!gunzip cc.en.300.bin.gz

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313503 sha256=42008a00ca87415babb8e3c1534421c708597df1140618865080425f7fd33cc0
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
# Load pretrained FastText model (English, 300-dimensional vectors)
fasttext_model = fasttext.load_model("cc.en.300.bin")

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class LSTMTokenDataset(Dataset):
    def __init__(self, data, ft_model, max_len=100):
        self.data = data
        self.ft = ft_model
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens, labels = self.data[idx]

        # Convert tokens to embeddings
        embeddings = [torch.tensor(self.ft.get_word_vector(token.lower())) for token in tokens]

        # Convert labels to tensor
        label_tensor = torch.tensor(labels, dtype=torch.long)

        return embeddings, label_tensor

def collate_batch(batch):
    embed_seqs, label_seqs = zip(*batch)

    embed_seqs_padded = pad_sequence([torch.stack(seq) for seq in embed_seqs], batch_first=True)
    label_seqs_padded = pad_sequence(label_seqs, batch_first=True, padding_value=-100)

    return embed_seqs_padded, label_seqs_padded

In [None]:
train_dataset_neg = LSTMTokenDataset(lstm_train_data_neg_cue, ft_model)
test_dataset_neg = LSTMTokenDataset(lstm_test_data_neg_cue, ft_model)

train_loader_neg = DataLoader(train_dataset_neg, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader_neg = DataLoader(test_dataset_neg, batch_size=32, shuffle=False, collate_fn=collate_batch)
