In [13]:
!pip install pyspellchecker
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=372082b0731875d8715660183b33ad78f8eb2ac0305b6b43fb715e04f94f8d31
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [14]:
import json
import re
import requests
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
import string
from langdetect import detect
from spellchecker import SpellChecker


try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

try:
    nlp = spacy.load("es_core_news_sm")
except OSError:
    print("Downloading Spanish model for spaCy...")
    spacy.cli.download("es_core_news_sm")
    nlp = spacy.load("es_core_news_sm")

train_url = 'https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/Test/negacio_train_v2024.json'
test_url = 'https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/Test/negacio_test_v2024.json'

The following code defines a function used to correct misspellings on the given text. It uses the library SpellChecker to provide a

In [23]:
# Load spell checkers for Spanish and Catalan
spell_es = SpellChecker(language='es')  # Spanish
spell_ca = SpellChecker(language=None)  # Catalan (custom dictionary needed)

# URL of the raw file in the GitHub repository
file_url = "https://raw.githubusercontent.com/AgustinaLazzati/NLP-Project/refs/heads/main/catala.txt"

# Fetch the file content from GitHub
response = requests.get(file_url)

if response.status_code == 200:
    # If the request was successful, use the content
    catalan_words = response.text.splitlines()
    print(f"Loaded {len(catalan_words)} words from the dictionary.")
else:
    print(f"Failed to retrieve the file. HTTP Status code: {response.status_code}")

# Now you can load the words into the spell checker
spell_ca.word_frequency.load_words(catalan_words)

def correct_misspellings(text, spell_checker):
    def replace(match):
        word = match.group(0)
        corrected_word = spell_checker.correction(word)
        return corrected_word if corrected_word else word  # Keep original if no suggestion

    return re.sub(r'\b\w+\b', replace, text)

Loaded 1091836 words from the dictionary.


Data loading and preprocessing functions

In [28]:
def load_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        try:
            data = json.loads(response.text)
            return data
        except json.JSONDecodeError:
            print(f"Failed to parse JSON data from {url}")
            print(f"First 500 characters of response: {response.text[:500]}")
            return None
    else:
        print(f"Failed to load data: {response.status_code}")
        return None

def get_text(sample):
    if isinstance(sample, dict) and 'data' in sample and isinstance(sample['data'], dict):
        return sample['data'].get('text', '')
    return ''

def preprocess_text(text):
    if not isinstance(text, str) or not text:
        return ""

    # Handle redacted entities
    text = re.sub(r'\*+', '[REDACTED]', text)

    # Replace common patterns for redacted information
    text = re.sub(r'n[ºo]\s*(historia|episodi|h\.c\.|h\.c)[:\s]*\S+', '[HC_NUM]', text, flags=re.IGNORECASE)
    text = re.sub(r'sexe:\s*\w+', '[GENDER]', text, flags=re.IGNORECASE)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Detect language
    try:
      language = detect(text)
    except Exception as e:
        print(f"Language detection failed: {e}")
        language = 'unknown'  # If detection fails, assume unknown language

    # Correct misspellings based on detected language
    if language == 'ca':  # If Catalan is detected
        print('Detected Language: Catalan')
        text = correct_misspellings(text, spell_ca)
    elif language == 'es':  # If Spanish is detected
        print('Detected Language: Spanish')
        text = correct_misspellings(text, spell_es)

    return text.strip()

def extract_features(text):
    if not text:
        return {
            'tokens': [],
            'lemmas': [],
            'pos': [],
            'is_punct': [],
            'is_stop': [],
            'spans': []
        }

    doc = nlp(text)

    features = {
        'tokens': [token.text for token in doc],
        'lemmas': [token.lemma_ for token in doc],
        'pos': [token.pos_ for token in doc],
        'is_punct': [token.is_punct for token in doc],
        'is_stop': [token.is_stop for token in doc],
        'spans': [(token.idx, token.idx + len(token.text)) for token in doc]
    }

    return features

# Example of how misspelling correction works
text = "El síndrome de down es una patologí genetica."
print(preprocess_text(text))

Detected Language: Spanish
El síndrome de don es una patología genética.


Lists of negation and uncertainty cues in Spanish/Catalan

In [5]:
NEGATION_PREFIX_CUES = [
    "no", "sin", "ausencia de", "ausencia", "negación", "negativo", "negativa",
    "descarta", "descartado", "descartada", "descartar", "inexistente", "niega",
    "rechaza", "libre de", "excluye", "excluido", "excluida", "no hay", "no se",
    "no es", "no tiene", "nunca", "tampoco", "no presenta", "no muestra",
    "no evidencia", "ni", "jamás", "sense", "absència", "cap", "exempt", "exempta",
    "negatiu", "negativa", "nega", "descartat", "no hi ha", "res de", "no presenta",
    "no existeix", "mai", "nul", "nul·la", "lliure de", "no consta", "exclou",
    "gens de", "absents", "sense evidència de", "sense signes de", "no es detecta",
    "no s'observa", "no és compatible amb", "no s'aprecia", "excluir", "excloure",
    "denegar", "negar", "nada de", "ningún", "ninguna", "nunca", "ausentes",
    "falta de", "carencia de", "déficit de", "eliminado", "eliminada",
    "negado por", "descartándose", "normal", "normales", "dentro de límites normales",
    "sense alteracions", "sense canvis", "normale", "normales", "normal para"
]

NEGATION_POSTFIX_CUES = [
    "descartado", "descartada", "negado", "negada", "excluido", "excluida",
    "ausente", "inexistente", "descartat", "negat", "exclòs", "exclosa",
    "absent", "no detectado", "no detectada", "no apreciable", "no visualizado",
    "no visualizada", "no present", "no visible", "no evidenciable", "no identificable",
    "no identificado", "no identificada", "no hay", "no hi ha", "no existe",
    "no existeix", "no observado", "no observada", "no s'observa", "no mostrado",
    "no mostrada", "no demostrado", "no demostrada", "no apreciado", "no apreciada",
    "dentro de límites normales", "sin alteraciones", "sense alteracions"
]

UNCERTAINTY_CUES = [
    "posible", "probable", "quizás", "quizá", "tal vez", "posiblemente",
    "probablemente", "parece", "sugiere", "sugestivo", "compatible con",
    "podría", "puede", "puede ser", "pudiera", "sospecha", "sospechar",
    "sospechado", "sospechada", "se sospecha", "duda", "en duda", "incierto",
    "incierta", "inseguro", "insegura", "no claro", "no clara", "no descarta",
    "potser", "possiblement", "probablement", "sembla", "suggereix", "compatible amb",
    "podria", "pot", "pot ser", "sospita", "sospitar", "es sospita", "dubte",
    "incert", "incerta", "no clar", "no clara", "dubtós", "dubtosa", "equívoc",
    "equívoca", "a considerar", "a descartar", "no se puede excluir", "no es pot excloure",
    "suggestivo/a de", "indeterminado", "indeterminada", "por determinar", "per determinar",
    "por confirmar", "per confirmar", "a valorar", "en estudio", "en estudi",
    "pendiente", "pendent", "a evaluar", "a evaluer", "interrogante", "interrogant",
    "no concluyente", "no concluent", "eventual", "eventualment", "no definitivo",
    "no definitiu", "impresiona", "impresión de", "impressió de", "presuntivo",
    "presuntivo", "indicio", "indici", "sospecho", "potencial", "presumible",
    "presumiblement", "aparente", "aparentment", "orientativo", "orientatiu"
]

NegationDetector class

In [6]:
class NegationDetector:
    def __init__(self):
        self.negation_prefix_patterns = self.compile_patterns(NEGATION_PREFIX_CUES)
        self.negation_postfix_patterns = self.compile_patterns(NEGATION_POSTFIX_CUES)
        self.uncertainty_patterns = self.compile_patterns(UNCERTAINTY_CUES)
        self.scope_window = 7  # Increased from 5 to capture larger scopes

    def compile_patterns(self, cue_list):
        patterns = []
        for cue in cue_list:
            if ' ' in cue:
                pattern = r'\b' + re.escape(cue) + r'\b'
            else:
                pattern = r'\b' + re.escape(cue) + r'\b'
            patterns.append(re.compile(pattern, re.IGNORECASE))
        return patterns

    def find_matches(self, text, patterns):
        matches = []
        for pattern in patterns:
            for match in pattern.finditer(text):
                matches.append({
                    'start': match.start(),
                    'end': match.end(),
                    'cue': text[match.start():match.end()]
                })
        return matches

    def detect_negation_cues(self, text):
        prefix_matches = self.find_matches(text, self.negation_prefix_patterns)
        postfix_matches = self.find_matches(text, self.negation_postfix_patterns)

        cues = []
        for match in prefix_matches:
            match['type'] = 'negation_prefix'
            cues.append(match)
        for match in postfix_matches:
            match['type'] = 'negation_postfix'
            cues.append(match)
        return cues

    def detect_uncertainty_cues(self, text):
        """Detect uncertainty cues in text"""
        matches = self.find_matches(text, self.uncertainty_patterns)
        cues = []
        for match in matches:
            match['type'] = 'uncertainty'
            cues.append(match)
        return cues

    def detect_scope(self, text, cues, features):
        if not text or not cues or not features['tokens']:
            return []

        scopes = []
        doc = nlp(text)
        tokens = [token for token in doc]

        for cue in cues:
            cue_start = cue['start']
            cue_token_idx = None
            for i, token in enumerate(tokens):
                if token.idx <= cue_start < token.idx + len(token.text):
                    cue_token_idx = i
                    break

            if cue_token_idx is None:
                continue

            if cue['type'] == 'negation_prefix':
                scope_start = cue['end']
                scope_end_idx = min(cue_token_idx + self.scope_window + 1, len(tokens))
                if scope_end_idx < len(tokens):
                    scope_end = tokens[scope_end_idx].idx
                else:
                    scope_end = len(text)

            elif cue['type'] == 'negation_postfix':
                scope_end = cue['start']
                scope_start_idx = max(0, cue_token_idx - self.scope_window)
                scope_start = tokens[scope_start_idx].idx

            elif cue['type'] == 'uncertainty':
                scope_start = cue['end']
                scope_end_idx = min(cue_token_idx + self.scope_window + 1, len(tokens))
                if scope_end_idx < len(tokens):
                    scope_end = tokens[scope_end_idx].idx
                else:
                    scope_end = len(text)

            for i, token in enumerate(tokens[cue_token_idx:min(cue_token_idx + self.scope_window + 1, len(tokens))]):
                if token.is_punct and token.text in ['.', ';', ':', '!', '?']:
                    if cue['type'] in ['negation_prefix', 'uncertainty']:
                        scope_end = token.idx
                        break

            if cue['type'] == 'negation_postfix':
                for i in range(cue_token_idx - 1, max(0, cue_token_idx - self.scope_window - 1), -1):
                    if tokens[i].is_punct and tokens[i].text in ['.', ';', ':', '!', '?']:
                        scope_start = tokens[i+1].idx if i+1 < len(tokens) else tokens[i].idx + len(tokens[i].text)
                        break

            scope_text = text[scope_start:scope_end].strip()
            if scope_text:
                scopes.append({
                    'cue': cue,
                    'scope_start': scope_start,
                    'scope_end': scope_end,
                    'scope_text': scope_text
                })

        return scopes

    def process_text(self, text):
        cleaned_text = preprocess_text(text)
        if not cleaned_text:
            return {
                'text': "",
                'negation_cues': [],
                'uncertainty_cues': [],
                'scopes': []
            }

        features = extract_features(cleaned_text)
        negation_cues = self.detect_negation_cues(cleaned_text)
        uncertainty_cues = self.detect_uncertainty_cues(cleaned_text)
        all_cues = negation_cues + uncertainty_cues
        scopes = self.detect_scope(cleaned_text, all_cues, features)

        return {
            'text': cleaned_text,
            'negation_cues': negation_cues,
            'uncertainty_cues': uncertainty_cues,
            'scopes': scopes
        }

Dataset analysis and evaluation functions

In [7]:
def analyze_dataset_statistics(data):
    if not data:
        return "No data available for analysis."

    result = {
        "num_samples": len(data),
        "has_text": 0,
        "sample_keys": set(),
        "avg_text_length": 0,
        "text_lengths": [],
        "languages": {"spanish": 0, "catalan": 0, "unknown": 0}
    }

    total_length = 0

    if data and isinstance(data[0], dict):
        result["sample_keys"] = set(data[0].keys())

    for sample in data:
        if isinstance(sample, dict):
            text = get_text(sample)
            if text:
                result["has_text"] += 1
                text_length = len(text)
                result["text_lengths"].append(text_length)
                total_length += text_length

                text_lower = text.lower()
                if any(word in text_lower for word in ["paciente", "presenta", "día", "hospital", "médico"]):
                    result["languages"]["spanish"] += 1
                elif any(word in text_lower for word in ["pacient", "presenta", "dia", "hospital", "metge"]):
                    result["languages"]["catalan"] += 1
                else:
                    result["languages"]["unknown"] += 1

    if result["has_text"] > 0:
        result["avg_text_length"] = total_length / result["has_text"]

    return result

def evaluate_model(predictions, gold_standard):
    true_pos_cues = 0
    false_pos_cues = 0
    false_neg_cues = 0
    true_pos_scopes = 0
    false_pos_scopes = 0
    false_neg_scopes = 0

    if true_pos_cues + false_pos_cues > 0:
        precision_cues = true_pos_cues / (true_pos_cues + false_pos_cues)
    else:
        precision_cues = 0

    if true_pos_cues + false_neg_cues > 0:
        recall_cues = true_pos_cues / (true_pos_cues + false_neg_cues)
    else:
        recall_cues = 0

    if precision_cues + recall_cues > 0:
        f1_cues = 2 * (precision_cues * recall_cues) / (precision_cues + recall_cues)
    else:
        f1_cues = 0

    if true_pos_scopes + false_pos_scopes > 0:
        precision_scopes = true_pos_scopes / (true_pos_scopes + false_pos_scopes)
    else:
        precision_scopes = 0

    if true_pos_scopes + false_neg_scopes > 0:
        recall_scopes = true_pos_scopes / (true_pos_scopes + false_neg_scopes)
    else:
        recall_scopes = 0

    if precision_scopes + recall_scopes > 0:
        f1_scopes = 2 * (precision_scopes * recall_scopes) / (precision_scopes + recall_scopes)
    else:
        f1_scopes = 0

    return {
        'cues': {
            'precision': precision_cues,
            'recall': recall_cues,
            'f1': f1_cues
        },
        'scopes': {
            'precision': precision_scopes,
            'recall': recall_scopes,
            'f1': f1_scopes
        }
    }

Main execution and testing

In [8]:
def main():
    print("Loading training data...")
    train_data = load_data(train_url)
    print("Loading test data...")
    test_data = load_data(test_url)

    if not train_data or not test_data:
        print("Failed to load data. Exiting.")
        return

    print("\nAnalyzing training dataset...")
    train_stats = analyze_dataset_statistics(train_data)
    print(f"Training dataset statistics: {train_stats}")

    print("\nAnalyzing test dataset...")
    test_stats = analyze_dataset_statistics(test_data)
    print(f"Test dataset statistics: {test_stats}")

    detector = NegationDetector()

    print("\nProcessing samples from training data...")

    all_cues = []
    all_scopes = []

    for i, sample in enumerate(train_data[:5]):
        text = get_text(sample)
        if text:
            print(f"\n--- Sample {i+1} ---")
            print(f"Original text (first 100 chars): {text[:100]}...")

            result = detector.process_text(text)
            print(f"Cleaned text (first 100 chars): {result['text'][:100]}...")

            n_cues = len(result['negation_cues']) + len(result['uncertainty_cues'])
            all_cues.extend(result['negation_cues'])
            all_cues.extend(result['uncertainty_cues'])
            all_scopes.extend(result['scopes'])

            print(f"Found {len(result['negation_cues'])} negation cues and {len(result['uncertainty_cues'])} uncertainty cues")

            if result['negation_cues']:
                print("Sample negation cues:")
                for cue in result['negation_cues'][:3]:
                    print(f"- {cue['cue']} ({cue['start']}:{cue['end']})")

            if result['uncertainty_cues']:
                print("Sample uncertainty cues:")
                for cue in result['uncertainty_cues'][:3]:
                    print(f"- {cue['cue']} ({cue['start']}:{cue['end']})")

            if result['scopes']:
                print("Sample detected scopes:")
                for scope in result['scopes'][:3]:
                    print(f"- Cue: {scope['cue']['cue']}, Scope: '{scope['scope_text']}'")

    print("\n--- Overall Statistics ---")
    print(f"Total cues found in samples: {len(all_cues)}")
    print(f"Total scopes found in samples: {len(all_scopes)}")

    if all_cues:
        cue_types = {}
        for cue in all_cues:
            cue_type = cue['type']
            if cue_type in cue_types:
                cue_types[cue_type] += 1
            else:
                cue_types[cue_type] = 1

        print("Cue type distribution:")
        for cue_type, count in cue_types.items():
            print(f"- {cue_type}: {count} ({count/len(all_cues)*100:.1f}%)")

    print("\nProcessing all test data...")
    results = []
    total_negation_cues = 0
    total_uncertainty_cues = 0
    total_scopes = 0

    for sample in test_data:
        text = get_text(sample)
        if text:
            result = detector.process_text(text)
            results.append(result)
            total_negation_cues += len(result['negation_cues'])
            total_uncertainty_cues += len(result['uncertainty_cues'])
            total_scopes += len(result['scopes'])

    print(f"Processed {len(results)} test samples.")
    print(f"Found {total_negation_cues} negation cues and {total_uncertainty_cues} uncertainty cues.")
    print(f"Detected {total_scopes} scopes in total.")

if __name__ == "__main__":
    main()

Loading training data...
Loading test data...

Analyzing training dataset...
Training dataset statistics: {'num_samples': 254, 'has_text': 254, 'sample_keys': {'data', 'predictions', 'annotations'}, 'avg_text_length': 5042.673228346457, 'text_lengths': [3866, 1401, 4271, 10349, 2958, 5888, 5886, 3647, 2822, 2099, 9312, 4467, 5663, 2387, 8348, 2725, 5443, 2535, 4847, 2713, 4257, 2956, 4626, 1749, 6102, 9401, 2965, 10551, 8686, 6128, 3220, 9682, 11741, 7088, 7698, 5720, 4668, 5344, 5962, 10647, 1597, 3762, 1797, 4248, 3771, 1679, 1856, 4554, 2415, 1491, 3764, 3950, 13715, 3882, 4009, 4539, 2553, 4259, 11501, 3343, 1989, 2071, 10209, 2258, 13481, 4241, 13566, 11883, 5190, 2257, 1802, 5597, 6898, 3444, 7826, 10773, 6031, 2408, 2588, 3444, 13037, 2575, 6411, 22202, 4075, 3655, 6762, 3621, 5090, 6705, 3309, 4714, 3637, 6754, 11352, 6741, 10928, 3329, 4192, 6227, 6505, 6060, 2229, 6667, 7181, 1664, 4754, 5162, 1454, 10466, 6166, 3922, 3213, 4887, 9417, 3757, 10755, 5804, 5797, 8930, 3444, 115