### Data Preprocessing

In [2]:
import json
import re
import requests
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
import string
from langdetect import detect
from spellchecker import SpellChecker


try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

try:
    nlp = spacy.load("es_core_news_sm")
except OSError:
    print("Downloading Spanish model for spaCy...")
    spacy.cli.download("es_core_news_sm")
    nlp = spacy.load("es_core_news_sm")

train_url = 'https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/Test/negacio_train_v2024.json'
test_url = 'https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/Test/negacio_test_v2024.json'

In [3]:
# Load spell checkers for Spanish and Catalan
spell_es = SpellChecker(language='es')  # Spanish
spell_ca = SpellChecker(language=None)  # Catalan (custom dictionary needed)

# URL of the raw file in the GitHub repository
file_url = "https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/main/catala.txt"

# Fetch the file content from GitHub
response = requests.get(file_url)

if response.status_code == 200:
    # If the request was successful, use the content
    catalan_words = response.text.splitlines()
    print(f"Loaded {len(catalan_words)} words from the dictionary.")
else:
    print(f"Failed to retrieve the file. HTTP Status code: {response.status_code}")

# Now you can load the words into the spell checker
spell_ca.word_frequency.load_words(catalan_words)

def correct_misspellings(text, spell_checker):
    def replace(match):
        word = match.group(0)
        corrected_word = spell_checker.correction(word)
        return corrected_word if corrected_word else word  # Keep original if no suggestion

    return re.sub(r'\b\w+\b', replace, text)

Loaded 1091836 words from the dictionary.


In [4]:
def load_data(url):
    response = requests.get(url)
    if response.status_code == 200: 
        try:
            data = json.loads(response.text)
            return data
        except json.JSONDecodeError:
            print(f"Failed to parse JSON data from {url}")
            print(f"First 500 characters of response: {response.text[:500]}")
            return None
    else:
        print(f"Failed to load data: {response.status_code}")
        return None

### Data loading and preprocessing functions

In [16]:
def load_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = json.loads(response.text)
            return data
        except json.JSONDecodeError:
            print(f"Failed to parse JSON data from {url}")
            print(f"First 500 characters of response: {response.text[:500]}")
            return None
    else:
        print(f"Failed to load data: {response.status_code}")
        return None

def get_text(sample):
    if isinstance(sample, dict) and 'data' in sample and isinstance(sample['data'], dict):
        return sample['data'].get('text', '')
    return ''

def preprocess_text(text):
    if not isinstance(text, str) or not text:
        return ""

    # Handle redacted entities
    text = re.sub(r'\*+', '[REDACTED]', text)

    # Replace common patterns for redacted information
    text = re.sub(r'n[ºo]\s*(historia|episodi|h\.c\.|h\.c)[:\s]*\S+', '[HC_NUM]', text, flags=re.IGNORECASE)
    text = re.sub(r'sexe:\s*\w+', '[GENDER]', text, flags=re.IGNORECASE)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Detect language
    try:
      language = detect(text)
    except Exception as e:
        print(f"Language detection failed: {e}")
        language = 'unknown'  # If detection fails, assume unknown language

    # Correct misspellings based on detected language
    if language == 'ca':  # If Catalan is detected
        print('Detected Language: Catalan')
        text = correct_misspellings(text, spell_ca)
    elif language == 'es':  # If Spanish is detected
        print('Detected Language: Spanish')
        text = correct_misspellings(text, spell_es)

    return text.strip()

"""""
def extract_features(text):
    if not text:
        return {
            'tokens': [],
            'lemmas': [],
            'pos': [],
            'is_punct': [],
            'is_stop': [],
            'spans': []
        }

    doc = nlp(text)

    features = {
        'tokens': [token.text for token in doc],
        'lemmas': [token.lemma_ for token in doc],
        'pos': [token.pos_ for token in doc],
        'is_punct': [token.is_punct for token in doc],
        'is_stop': [token.is_stop for token in doc],
        'spans': [(token.idx, token.idx + len(token.text)) for token in doc]
    }

    return features
"""""


# Example of how misspelling correction works
text = "El síndrome de down es una patología genetica."
print(preprocess_text(text))  #makes incorrect corrections. add dictionary for medical words


Detected Language: Spanish
El síndrome de don es una patología genética.


### Lists of negation and uncertainty cues in Spanish/Catalan

In [6]:
NEGATION_PREFIX_CUES = [
    "no", "sin", "ausencia de", "ausencia", "negación", "negativo", "negativa",
    "descarta", "descartado", "descartada", "descartar", "inexistente", "niega",
    "rechaza", "libre de", "excluye", "excluido", "excluida", "no hay", "no se",
    "no es", "no tiene", "nunca", "tampoco", "no presenta", "no muestra",
    "no evidencia", "ni", "jamás", "sense", "absència", "cap", "exempt", "exempta",
    "negatiu", "negativa", "nega", "descartat", "no hi ha", "res de", "no presenta",
    "no existeix", "mai", "nul", "nul·la", "lliure de", "no consta", "exclou",
    "gens de", "absents", "sense evidència de", "sense signes de", "no es detecta",
    "no s'observa", "no és compatible amb", "no s'aprecia", "excluir", "excloure",
    "denegar", "negar", "nada de", "ningún", "ninguna", "nunca", "ausentes",
    "falta de", "carencia de", "déficit de", "eliminado", "eliminada",
    "negado por", "descartándose", "normal", "normales", "dentro de límites normales",
    "sense alteracions", "sense canvis", "normale", "normales", "normal para"
]

NEGATION_POSTFIX_CUES = [
    "descartado", "descartada", "negado", "negada", "excluido", "excluida",
    "ausente", "inexistente", "descartat", "negat", "exclòs", "exclosa",
    "absent", "no detectado", "no detectada", "no apreciable", "no visualizado",
    "no visualizada", "no present", "no visible", "no evidenciable", "no identificable",
    "no identificado", "no identificada", "no hay", "no hi ha", "no existe",
    "no existeix", "no observado", "no observada", "no s'observa", "no mostrado",
    "no mostrada", "no demostrado", "no demostrada", "no apreciado", "no apreciada",
    "dentro de límites normales", "sin alteraciones", "sense alteracions"
]

UNCERTAINTY_CUES = [
    "posible", "probable", "quizás", "quizá", "tal vez", "posiblemente",
    "probablemente", "parece", "sugiere", "sugestivo", "compatible con",
    "podría", "puede", "puede ser", "pudiera", "sospecha", "sospechar",
    "sospechado", "sospechada", "se sospecha", "duda", "en duda", "incierto",
    "incierta", "inseguro", "insegura", "no claro", "no clara", "no descarta",
    "potser", "possiblement", "probablement", "sembla", "suggereix", "compatible amb",
    "podria", "pot", "pot ser", "sospita", "sospitar", "es sospita", "dubte",
    "incert", "incerta", "no clar", "no clara", "dubtós", "dubtosa", "equívoc",
    "equívoca", "a considerar", "a descartar", "no se puede excluir", "no es pot excloure",
    "suggestivo/a de", "indeterminado", "indeterminada", "por determinar", "per determinar",
    "por confirmar", "per confirmar", "a valorar", "en estudio", "en estudi",
    "pendiente", "pendent", "a evaluar", "a evaluer", "interrogante", "interrogant",
    "no concluyente", "no concluent", "eventual", "eventualment", "no definitivo",
    "no definitiu", "impresiona", "impresión de", "impressió de", "presuntivo",
    "presuntivo", "indicio", "indici", "sospecho", "potencial", "presumible",
    "presumiblement", "aparente", "aparentment", "orientativo", "orientatiu"
]

### Dataset analysis and evaluation functions

In [30]:
# Function to get words and their positions
def get_nextWord(text, End_word):
    i, word, start = 0, "", True
    while i < len(text):
        if text[i] not in (' ', *End_word):
            if start:
                pos = i
                start = False
            word += text[i]
        elif word:
            yield (pos, word)
            word, start = "", True
        i += 1
    yield (pos, word)

# Function to get start and end labels
def start_end_label(df, number_row):
    data = df["predictions"][number_row][0]["result"]
    text = df["data"][number_row]["text"]
    return [(entry["value"]["start"], entry["value"]["end"], text[entry["value"]["start"]:entry["value"]["end"]],
             len(text[entry["value"]["start"]:entry["value"]["end"]].rstrip().split()), entry["value"]["labels"][0]) for entry in data]


# Function to label text
def label_text(df, number_row, End_word):
    text = df["data"][number_row]["text"]
    labels = sorted(start_end_label(df, number_row), key=lambda x: x[0])
    if not labels:
        return []
    pos, words_count, text_annotations = 0, 0, []
    all_words = list(get_nextWord(text, End_word))
    for index, word in all_words:
        if pos < len(labels):
            start, end, _, num_words, label = labels[pos]
        text_annotations.append((word, label if start <= index <= end else 'other'))
        words_count += 1 if start <= index <= end else 0
        if words_count == num_words:
            pos += 1
            words_count = 0
    return text_annotations


# Function to obtain sentence delimiters
def end_word_obtantion():
    End_sentence = '!.?,;:'
    End_word = ''.join(set(string.punctuation) - set(End_sentence))
    return End_sentence, End_word

In [37]:
# Function to identify negation and uncertainty scope
# Function to get words and their positions
def get_next_word(text):
    for match in re.finditer(r'\b\w+\b', text):
        yield (match.start(), match.group(0))
        
# Function to identify negation and uncertainty scope
def identify_scope(text, negations_cues, uncertainty_cues):
    sentences = []
    current_sentence = []
    negation, uncertainty = False, False
    
    for pos, token in get_next_word(text):
        label = "other"
        if token in negations_cues:
            negation = True
            label = "NEG"
        elif token in uncertainty_cues:
            uncertainty = True
            label = "UNC"
        
        current_sentence.append((token, label))
        
        if token in '.!?':
            current_sentence = [(t[0], "NSCO" if negation else "USCO") if t[1] == "other" else t for t in current_sentence]
            sentences.extend(current_sentence)
            current_sentence = []
            negation, uncertainty = False, False

# Function to pad sentences
def padded_sentences(y_pred, y_true):
    return [[(p if i < len(p) else (' ', 'other')) for i in range(len(t))] for p, t in zip(y_pred, y_true)], [[(t if i < len(t) else (' ', 'other')) for i in range(len(p))] for p, t in zip(y_pred, y_true)]



In [41]:
# Main execution
if __name__ == "__main__":
    # Load data
    train_data = load_data(train_url)
    test_data = load_data(test_url)

    if not train_data or not test_data:
        print("Error loading training or test data")
    
    # Define negation and uncertainty cues
    negations_cues = {"no", "nunca", "jamás", "sin"}
    uncertainty_cues = {"quizás", "tal vez", "posiblemente"}

    # Process training data
    if isinstance(train_data, list):
        print("Processing training data...")
        for sample in train_data:
            if 'text' in sample:
                text = preprocess_text(sample["text"])
                print(f"Processed text: {text}")
                scope_result = identify_scope(text, negations_cues, uncertainty_cues)
                print(f"Scope result: {scope_result}")
            else:
                print("Sample doesn't contain text key")
    
    # Process test data
    if isinstance(test_data, list):
        print("Processing test data...")
        for sample in test_data:
            if 'text' in sample:
                text = preprocess_text(sample["text"])
                print(f"Processed text: {text}")
                scope_result = identify_scope(text, negations_cues, uncertainty_cues)
                print(f"Scope result: {scope_result}")
            else:
                print("Sample doesn't contain text key")


Processing training data...
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn't contain text key
Sample doesn