### Data Preprocessing

In [53]:
!pip install langdetect



In [54]:
import json
import re
import requests
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
import string
from langdetect import detect

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

try:
    nlp = spacy.load("es_core_news_sm")
except OSError:
    print("Downloading Spanish model for spaCy...")
    spacy.cli.download("es_core_news_sm")
    nlp = spacy.load("es_core_news_sm")

train_url = 'https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/Test/negacio_train_v2024.json'
test_url = 'https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/Test/negacio_test_v2024.json'

In [55]:
def load_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = json.loads(response.text)
            return data
        except json.JSONDecodeError:
            print(f"Failed to parse JSON data from {url}")
            print(f"First 500 characters of response: {response.text[:500]}")
            return None
    else:
        print(f"Failed to load data: {response.status_code}")
        return None

def get_text(sample):
    if isinstance(sample, dict) and 'data' in sample and isinstance(sample['data'], dict):
        return sample['data'].get('text', '')
    return ''

def preprocess_text(text):
    if not isinstance(text, str) or not text:
        return ""

    # Handle redacted entities
    text = re.sub(r'\*+', '[REDACTED]', text)

    # Replace common patterns for redacted information
    text = re.sub(r'n[ºo]\s*(historia|episodi|h\.c\.|h\.c)[:\s]*\S+', '[HC_NUM]', text, flags=re.IGNORECASE)
    text = re.sub(r'sexe:\s*\w+', '[GENDER]', text, flags=re.IGNORECASE)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Handle common misspellings (expand as needed)
    misspellings = {
        'sindrom': 'síndrome',
        'patologí': 'patología',
        'sintoma': 'síntoma',
    }
    for wrong, correct in misspellings.items():
        text = re.sub(fr'\b{wrong}\b', correct, text, flags=re.IGNORECASE)

    return text.strip()

def extract_features(text):
    if not text:
        return {
            'tokens': [],
            'lemmas': [],
            'pos': [],
            'is_punct': [],
            'is_stop': [],
            'spans': []
        }

    doc = nlp(text)

    features = {
        'tokens': [token.text for token in doc],
        'lemmas': [token.lemma_ for token in doc],
        'pos': [token.pos_ for token in doc],
        'is_punct': [token.is_punct for token in doc],
        'is_stop': [token.is_stop for token in doc],
        'spans': [(token.idx, token.idx + len(token.text)) for token in doc]
    }

    return features

In [56]:
def split_into_sentences(text):
    """
    Split text into sentences, one per line.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # Preprocess the text first
    processed_text = preprocess_text(text)

    doc = nlp(processed_text)

    # Extract sentences
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    return sentences

def process_data_to_sentences(data):
    """
    Process the entire dataset to extract sentences, one per line.
    """
    all_sentences = []

    for sample in data:
        text = get_text(sample)
        if text:
            sentences = split_into_sentences(text)
            all_sentences.extend(sentences)

    return all_sentences

In [57]:
NEGATION_PREFIX_CUES = [
    "no", "sin", "ausencia de", "ausencia", "negación", "negativo", "negativa",
    "descarta", "descartado", "descartada", "descartar", "inexistente", "niega",
    "rechaza", "libre de", "excluye", "excluido", "excluida", "no hay", "no se",
    "no es", "no tiene", "nunca", "tampoco", "no presenta", "no muestra",
    "no evidencia", "ni", "jamás", "sense", "absència", "cap", "exempt", "exempta",
    "negatiu", "negativa", "nega", "descartat", "no hi ha", "res de", "no presenta",
    "no existeix", "mai", "nul", "nul·la", "lliure de", "no consta", "exclou",
    "gens de", "absents", "sense evidència de", "sense signes de", "no es detecta",
    "no s'observa", "no és compatible amb", "no s'aprecia", "excluir", "excloure",
    "denegar", "negar", "nada de", "ningún", "ninguna", "nunca", "ausentes",
    "falta de", "carencia de", "déficit de", "eliminado", "eliminada",
    "negado por", "descartándose", "normal", "normales", "dentro de límites normales",
    "sense alteracions", "sense canvis", "normale", "normales", "normal para"
]

NEGATION_POSTFIX_CUES = [
    "descartado", "descartada", "negado", "negada", "excluido", "excluida",
    "ausente", "inexistente", "descartat", "negat", "exclòs", "exclosa",
    "absent", "no detectado", "no detectada", "no apreciable", "no visualizado",
    "no visualizada", "no present", "no visible", "no evidenciable", "no identificable",
    "no identificado", "no identificada", "no hay", "no hi ha", "no existe",
    "no existeix", "no observado", "no observada", "no s'observa", "no mostrado",
    "no mostrada", "no demostrado", "no demostrada", "no apreciado", "no apreciada",
    "dentro de límites normales", "sin alteraciones", "sense alteracions"
]

UNCERTAINTY_CUES = [
    "posible", "probable", "quizás", "quizá", "tal vez", "posiblemente",
    "probablemente", "parece", "sugiere", "sugestivo", "compatible con",
    "podría", "puede", "puede ser", "pudiera", "sospecha", "sospechar",
    "sospechado", "sospechada", "se sospecha", "duda", "en duda", "incierto",
    "incierta", "inseguro", "insegura", "no claro", "no clara", "no descarta",
    "potser", "possiblement", "probablement", "sembla", "suggereix", "compatible amb",
    "podria", "pot", "pot ser", "sospita", "sospitar", "es sospita", "dubte",
    "incert", "incerta", "no clar", "no clara", "dubtós", "dubtosa", "equívoc",
    "equívoca", "a considerar", "a descartar", "no se puede excluir", "no es pot excloure",
    "suggestivo/a de", "indeterminado", "indeterminada", "por determinar", "per determinar",
    "por confirmar", "per confirmar", "a valorar", "en estudio", "en estudi",
    "pendiente", "pendent", "a evaluar", "a evaluer", "interrogante", "interrogant",
    "no concluyente", "no concluent", "eventual", "eventualment", "no definitivo",
    "no definitiu", "impresiona", "impresión de", "impressió de", "presuntivo",
    "presuntivo", "indicio", "indici", "sospecho", "potencial", "presumible",
    "presumiblement", "aparente", "aparentment", "orientativo", "orientatiu"
]

Once we've uploaded the necessary data we will implement a tokenizer in order to be able to convert the sentences in our data to tokens:

In [58]:
def tokenize_spanish(text):
    """
    Tokenizer for Spanish text using regex patterns.
    Handles Spanish-specific contractions, abbreviations, and punctuation.
    """
    # Common Spanish contractions and abbreviations
    contractions = {
        r"\bdel\b": "de el",
        r"\bal\b": "a el",
    }

    # Apply contractions expansion
    for pattern, replacement in contractions.items():
        text = re.sub(pattern, replacement, text)

    # Tokenization pattern:
    pattern = r"""
        \w+'\w+|\w+-\w+|            # Words with apostrophes or hyphens
        [a-zA-ZáéíóúÁÉÍÓÚñÑüÜ]+|    # Spanish letters with diacritics
        \d+\.?\d*|                   # Numbers (including decimals)
        [^\w\s]|                     # Any punctuation
        \S                           # Any non-whitespace (fallback)
    """

    tokens = re.findall(pattern, text, re.VERBOSE)
    return [token for token in tokens if token.strip()]

def tokenize_catalan(text):
    """
    Tokenizer for Catalan text using regex patterns.
    Handles Catalan-specific contractions, abbreviations, and punctuation.
    """
    # Common Catalan contractions and abbreviations
    contractions = {
        r"\bdel\b": "de el",        # Shared with Spanish
        r"\bal\b": "a el",           # Shared with Spanish
        r"\bpel\b": "per el",        # Catalan specific
        r"\bvals\b": "va els",       # Catalan specific
        r"\bca\b": "casa",           # Common abbreviation
    }

    # Apply contractions expansion
    for pattern, replacement in contractions.items():
        text = re.sub(pattern, replacement, text)

    # Tokenization pattern:
    pattern = r"""
        \w+['·]\w+|\w+-\w+|         # Words with apostrophes, middle dots or hyphens
        [a-zA-ZàèéíïòóúüÀÈÉÍÏÒÓÚÜÇç]+|  # Catalan letters with diacritics
        \d+\.?\d*|                  # Numbers (including decimals)
        [^\w\s]|                    # Any punctuation
        \S                          # Any non-whitespace (fallback)
    """

    tokens = re.findall(pattern, text, re.VERBOSE)
    return [token for token in tokens if token.strip()]

def tokenize_sentences(sentences):
    tokenized_sentences = []

    for sentence in sentences:
        if not isinstance(sentence, str) or not sentence.strip():
            # Skip empty or non-string entries
            continue

        if len(sentence.split()) < 3:
            # Skip very short sentences that can't be reliably detected
            tokens = tokenize_spanish(sentence)  # fallback to Spanish
            tokenized_sentences.append(tokens)
            continue

        try:
            # Detect language for each individual sentence
            lang = detect(sentence)

            # Tokenize based on detected language
            if lang == 'es':
                tokens = tokenize_spanish(sentence)
            else:
                tokens = tokenize_catalan(sentence)

            tokenized_sentences.append(tokens)
        except Exception as e:
            # Fallback to Spanish tokenizer if detection fails
            tokens = tokenize_spanish(sentence)
            tokenized_sentences.append(tokens)

    return tokenized_sentences

In [59]:
def main():
    # Load data
    train_data = load_data(train_url)
    test_data = load_data(test_url)

    # Preprocess data into sentences
    train_sentences = process_data_to_sentences(train_data)

    # Convert each sentence into tokens
    tokenized_sentences = tokenize_sentences(train_sentences)

    # Print the first few tokenized sentences for verification
    for i, tokens in enumerate(tokenized_sentences[:5]):
        print(f"Sentence {i+1}: {tokens}")

main()

Sentence 1: ['[', 'HC', '_', 'NUM', ']', '[', 'REDACTED']
Sentence 2: [']', '[', 'REDACTED', ']', '[', 'REDACTED', ']', '[', 'HC', '_', 'NUM', ']', '[', 'GENDER', ']', 'data', 'de', 'naixement', ':', '16.05', '.', '1936', 'edat', ':', '82', 'anys', 'procedencia', 'cex', 'mateix', 'hosp', 'servei', 'urologia', 'data', "d'ingres", '24.07', '.', '2018', 'data', "d'alta", '25.07', '.', '2018', '08', ':', '54', ':', '04', 'ates', 'per', '[', 'REDACTED', ']', ',', '[', 'REDACTED', ']', ';', '[', 'REDACTED', ']', ',', '[', 'REDACTED', ']', 'informe', "d'alta", "d'hospitalitzacio", 'motiu', "d'ingres", 'paciente', 'que', 'ingresa', 'de', 'forma', 'programada', 'para', 'realizacion', 'de', 'uretrotomia', 'interna', '.']
Sentence 3: ['antecedents', 'alergia', 'a', 'penicilina', 'y', 'cloramfenicol', '.']
Sentence 4: ['no', 'habitos', 'toxicos', '.']
Sentence 5: ['antecedentes', 'medicos', ':', 'bloqueo', 'auriculoventricular', 'de', 'primer', 'grado', 'hipertension', 'arterial', '.']
