In [None]:
# %% [code]
# Install required packages (if not already installed)
# You may comment out this cell if you have already installed the dependencies.
!pip install datasets evaluate numpy pandas pydantic transformers accelerate seqeval peft


Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.met

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
ls /content/drive/MyDrive/TFG/ClinAIS_dataset

 clinais.dev.json
'clinais.test&background.blind.json'
 clinais.train.augmented.json
 clinais.train.augmented_v2.json
 clinais.train_dev_augmented_SAFE.json
 clinais.train_dev_combined_A100.json
 clinais.train_dev_combined.json
 clinais.train_dev_combined_ultimate.json
 clinais.train.json
 clinais.train.synonyms.boundaries.json
 clinais.train.synonyms.json
 clinais.train_translated_A100.json
 clinais.train.translated.json
 clinais.train_translated_SAFE.json
 clinais.train.translated_v2.json
 clinais.train.translated_v3.json
 nada
 train_syn_aug.json


In [None]:
!git clone https://github.com/Iker0610/Sec-Identification-in-Spanish-Clinical-Notes.git

Cloning into 'Sec-Identification-in-Spanish-Clinical-Notes'...
remote: Enumerating objects: 97, done.[K
remote: Counting objects: 100% (97/97), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 97 (delta 21), reused 81 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (97/97), 163.21 KiB | 2.12 MiB/s, done.
Resolving deltas: 100% (21/21), done.


In [None]:
import sys
sys.path.append("/content/drive/MyDrive/TFG/evaluation")
from metricas import score_predictions  # from /content/drive/MyDrive/TFG/evaluation/metricas.py


In [None]:
## clinais test (non optimized)

In [None]:
# ================================
# Complete ClinAIS Training and Testing Pipeline
# ================================

!git clone https://github.com/Iker0610/Sec-Identification-in-Spanish-Clinical-Notes.git
!pip install transformers datasets evaluate seqeval pydantic

import os
import sys
import json
import logging
from typing import List, Any
from pydantic import BaseModel
import numpy as np
import pandas as pd
from enum import Enum
import zipfile
from pathlib import Path

# NOTE: nothing is added to sys.path here; `score_predictions` (metricas.py) is imported in an earlier cell from the Drive evaluation folder
# Set up logging
logging.basicConfig(level=logging.INFO)

# -------------------------------
# 1) Dataset Models
# -------------------------------
class ClinicalSections(str, Enum):
    """Section labels used to annotate a clinical note.

    Members are also `str` instances, so they compare equal to (and serialise
    as) their string value.
    """
    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    DERIVED_FROM_TO = "DERIVED_FROM/TO"
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

    @classmethod
    def list(cls):
        """Return the string values of all members, in declaration order."""
        return [section.value for section in cls]

class SectionAnnotation(BaseModel):
    """One labelled section of a note: its text, its label and its offsets."""
    # Text of the section.
    segment: str
    # Section type.
    label: ClinicalSections
    # Offsets of the section within the note text.
    # NOTE(review): whether end_offset is inclusive or exclusive is not
    # consistent across this file — confirm against the dataset spec.
    start_offset: int
    end_offset: int

class SectionAnnotations(BaseModel):
    """Gold and predicted section annotations of a single note."""
    # Reference sections loaded from the dataset.
    gold: List[SectionAnnotation] = []
    # Sections produced by the model (filled in by the prediction code).
    prediction: List[SectionAnnotation] = []

class BoundaryAnnotation(BaseModel):
    """One span (word-like unit) of a note, optionally marking a section start."""
    # Text of the span.
    span: str
    # Label of the section that starts at this span, or None when the span
    # continues the current section.
    boundary: ClinicalSections | None
    # Offsets of the span within the note text.
    start_offset: int
    end_offset: int

class BoundaryAnnotations(BaseModel):
    """Gold and predicted boundary (span-level) annotations of a single note."""
    # Reference spans loaded from the dataset.
    gold: List[BoundaryAnnotation] = []
    # Spans with model-predicted boundaries (filled in by the prediction code).
    prediction: List[BoundaryAnnotation] = []

class Entry(BaseModel):
    """A clinical note together with its section and boundary annotations."""
    note_id: str
    note_text: str
    section_annotation: SectionAnnotations = SectionAnnotations()
    boundary_annotation: BoundaryAnnotations = BoundaryAnnotations()

    def toJson(self):
        """Serialise the entry to a JSON string, keeping non-ASCII characters."""
        def _as_dict(obj):
            # Nested models are not JSON-native; fall back to their attribute dict.
            return obj.__dict__

        return json.dumps(self, ensure_ascii=False, default=_as_dict)

class ClinAISDataset(BaseModel):
    """A collection of annotated notes keyed by id, plus optional scores."""
    annotated_entries: dict[str, Entry]
    scores: dict[str, Any] = {}

    def getEntry(self, idx: int) -> Entry:
        """Return the entry at position `idx` in insertion order."""
        ordered_keys = list(self.annotated_entries)
        return self.annotated_entries[ordered_keys[idx]]

    def toJson(self):
        """Serialise the whole dataset to a JSON string, keeping non-ASCII characters."""
        def _as_dict(obj):
            # Nested models are not JSON-native; fall back to their attribute dict.
            return obj.__dict__

        return json.dumps(self, ensure_ascii=False, default=_as_dict)

# -------------------------------
# 2) WordListSplitter & EntrySplitter
# -------------------------------
class WordListSplitter():
    """Recursively split a list of word spans into chunks of at most `max_size` items.

    Splits are placed after a span ending in one of the split characters
    (tried in order); when no character yields an acceptable position the list
    is cut in half.
    """

    def __init__(self, max_size: int, min_size: int, split_caracter_list=None):
        """
        Args:
            max_size: maximum number of spans allowed in a chunk.
            min_size: minimum distance of a split position from either end.
            split_caracter_list: characters tried, in order, as split markers.
                Defaults to ['.', ',', ';', ':'].
        """
        self.max_size = max_size
        self.min_size = min_size
        # Copy so instances never share (or alias the caller's) list.
        self.split_caracters = (
            ['.', ',', ';', ':'] if split_caracter_list is None else list(split_caracter_list)
        )

    def get_split_index(self, spans: List[str], caracter='.'):
        """Return the candidate split index closest to the mean candidate, or -1.

        A candidate is the index of a span ending in `caracter`; if the next
        span starts with a newline, that next index is used instead.
        """
        positions = []
        for idx, span in enumerate(spans):
            if span and span[-1] == caracter:
                if idx < len(spans) - 1 and spans[idx+1] and spans[idx+1][0] == '\n':
                    positions.append(idx+1)
                else:
                    positions.append(idx)
        if not positions:
            return -1
        avg = sum(positions) / len(positions)
        return min(positions, key=lambda x: abs(x - avg))

    def split(self, spans: List[str], split_caracter_index=0):
        """Split `spans` into a list of chunks, each with at most `max_size` items.

        A list with fewer than two items is returned as a single chunk even if
        it exceeds `max_size`, because it cannot be divided any further.
        """
        n_spans = len(spans)
        if n_spans <= self.max_size or n_spans < 2:
            return [spans]

        spos = self.get_split_index(spans, self.split_caracters[split_caracter_index])
        # spos >= n_spans - 1 would leave the right half empty and recurse on
        # the same list forever, so it is treated as an invalid position.
        invalid = (
            spos < self.min_size
            or spos > n_spans - self.min_size
            or spos >= n_spans - 1
        )
        if invalid:
            if split_caracter_index < len(self.split_caracters)-1:
                print("changing split caracter")
                return self.split(spans, split_caracter_index+1)
            print("split caracter choices exhausted. Dividing in half...")
            # Clamp so both halves are non-empty (only matters for 2-item lists).
            spos = min(n_spans // 2, n_spans - 2)

        left = spans[:spos+1]
        right = spans[spos+1:]
        return self.split(left, split_caracter_index) + self.split(right, split_caracter_index)

class EntrySplitter():
    """Recursively split a list of boundary annotations into chunks of at most `max_size` items.

    Splits are placed after an annotation whose span ends in one of the split
    characters (tried in order); when no character yields an acceptable
    position the list is cut in half.
    """

    def __init__(self, max_size: int, min_size: int, split_caracter_list=None):
        """
        Args:
            max_size: maximum number of annotations allowed in a chunk.
            min_size: minimum distance of a split position from either end.
            split_caracter_list: characters tried, in order, as split markers.
                Defaults to ['.', ',', ';', ':'].
        """
        self.max_size = max_size
        self.min_size = min_size
        # Copy so instances never share (or alias the caller's) list.
        self.split_caracters = (
            ['.', ',', ';', ':'] if split_caracter_list is None else list(split_caracter_list)
        )

    def get_split_index(self, b_annotations: List[BoundaryAnnotation], caracter='.'):
        """Return the candidate split index closest to the mean candidate, or -1.

        A candidate is the index of an annotation whose span ends in
        `caracter`; if the next span starts with a newline, that next index is
        used instead.
        """
        positions = []
        for idx, ba in enumerate(b_annotations):
            if ba.span and ba.span[-1] == caracter:
                if idx < len(b_annotations) - 1 and b_annotations[idx+1].span and b_annotations[idx+1].span[0] == '\n':
                    positions.append(idx+1)
                else:
                    positions.append(idx)
        if not positions:
            return -1
        avg = sum(positions) / len(positions)
        return min(positions, key=lambda x: abs(x - avg))

    def split(self, b_annotations: List[BoundaryAnnotation], split_caracter_index=0) -> List[List[BoundaryAnnotation]]:
        """Split `b_annotations` into chunks, each with at most `max_size` items.

        A list with fewer than two items is returned as a single chunk even if
        it exceeds `max_size`, because it cannot be divided any further.
        """
        n_items = len(b_annotations)
        if n_items <= self.max_size or n_items < 2:
            return [b_annotations]

        spos = self.get_split_index(b_annotations, self.split_caracters[split_caracter_index])
        # spos >= n_items - 1 would leave the right half empty and recurse on
        # the same list forever, so it is treated as an invalid position.
        invalid = (
            spos < self.min_size
            or spos > n_items - self.min_size
            or spos >= n_items - 1
        )
        if invalid:
            if split_caracter_index < len(self.split_caracters)-1:
                print("changing split caracter")
                return self.split(b_annotations, split_caracter_index+1)
            print("split caracter choices exhausted. Dividing in half...")
            # Clamp so both halves are non-empty (only matters for 2-item lists).
            spos = min(n_items // 2, n_items - 2)

        left = b_annotations[:spos+1]
        right = b_annotations[spos+1:]
        return self.split(left, split_caracter_index) + self.split(right, split_caracter_index)

# -------------------------------
# 3) Translation pipeline
# -------------------------------
import transformers
from transformers import pipeline

# Translation pipelines used for back-translation (Spanish -> English -> Spanish).
# They are created at cell level and used as globals by
# apply_translation_pipeline() and translate_entry().
pipe_en_es = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")
pipe_es_en = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

def apply_translation_pipeline(text_list: List[str]) -> List[str]:
    """Back-translate each text: Spanish -> English -> Spanish.

    Returns the round-tripped Spanish texts, in the same order as `text_list`.
    """
    english_texts = [item['translation_text'] for item in pipe_es_en(text_list)]
    return [item['translation_text'] for item in pipe_en_es(english_texts)]

# -------------------------------
# 4) Utility functions for predictions
# -------------------------------
def split_entry(b_annotations_list: List[List[BoundaryAnnotation]], tokenizer, max_length_preset=None) -> tuple[bool, List[List[BoundaryAnnotation]]]:
    """Run one splitting pass over chunks of boundary annotations.

    Each chunk whose joined text tokenizes to more than the length limit is
    split with an `EntrySplitter`; chunks within the limit are kept as they are.

    Args:
        b_annotations_list: chunks (lists of boundary annotations) to check.
        tokenizer: tokenizer providing `encode` and `model_max_length`.
        max_length_preset: token limit to use instead of `tokenizer.model_max_length`.

    Returns:
        `(rdy, chunks)`. `rdy` is False when at least one chunk was split in
        this pass, so the caller should call again with the returned chunks.
    """
    res = []
    rdy = True
    length_limit = tokenizer.model_max_length if max_length_preset is None else max_length_preset

    for b_annotations in b_annotations_list:
        joined_text = " ".join(ba.span for ba in b_annotations)
        tokenized_ids = tokenizer.encode(joined_text, truncation=False)

        if len(tokenized_ids) <= length_limit:
            res.append(b_annotations)
            continue

        if len(b_annotations) < 2:
            # A single annotation cannot be divided any further. Keep it and do
            # NOT clear `rdy`, otherwise callers looping on `rdy` never finish.
            print("Chunk with a single annotation exceeds the token limit; keeping it unsplit.")
            res.append(b_annotations)
            continue

        # Scale the number of annotations per chunk by the token overshoot;
        # never go below 1 so the splitter always has a reachable target.
        max_length = max(1, int((length_limit / len(tokenized_ids)) * len(b_annotations)))
        esplitter = EntrySplitter(max_size=max_length, min_size=3)
        res += esplitter.split(b_annotations)
        rdy = False

    return rdy, res

def get_text_splits(entry: Entry, ba_splits: List[List[BoundaryAnnotation]]) -> List[str]:
    """Return, for each chunk, the slice of the note text it covers.

    The slice runs from the chunk's first start offset up to and including the
    character at its last end offset.
    """
    note = entry.note_text
    return [
        note[chunk[0].start_offset : chunk[-1].end_offset+1]
        for chunk in ba_splits
    ]

def getBoudaryAnnotationsForRange(bas: List[BoundaryAnnotation], start: int, end: int) -> List[BoundaryAnnotation]:
    """Collect the annotations whose start offset lies in `[start, end)`.

    Scanning stops at the first annotation starting at or after `end`, so
    later annotations are never considered.
    """
    selected = []
    for annotation in bas:
        if annotation.start_offset >= end:
            break
        if annotation.start_offset >= start:
            selected.append(annotation)
    return selected

# -------------------------------
# 5) Data augmentation functions
# -------------------------------
def translate_entry(entry: Entry):
    """Back-translate every gold section of `entry`.

    Sections whose token count exceeds 70% of the es->en tokenizer's maximum
    length are first cut into smaller text chunks.

    Returns:
        `(section2indeces, translations)`: a dict mapping each section's
        position to the indices of its chunk(s) in `translations`, and the
        list of back-translated texts.
    """
    es_en_tokenizer = pipe_es_en.tokenizer
    token_budget = int(0.7 * es_en_tokenizer.model_max_length)
    section2indeces = {}
    texts_to_translate = []

    for section_pos, section in enumerate(entry.section_annotation.gold):
        # Index at which this section's first chunk will be stored.
        first_index = len(texts_to_translate)
        n_tokens = len(es_en_tokenizer(section.segment, truncation=False)['input_ids'])

        if n_tokens <= token_budget:
            texts_to_translate.append(section.segment)
            section2indeces[section_pos] = [first_index]
            continue

        chunks = [getBoudaryAnnotationsForRange(entry.boundary_annotation.gold,
                                                section.start_offset,
                                                section.end_offset)]
        done = False
        while not done:
            done, chunks = split_entry(chunks, es_en_tokenizer, token_budget)

        chunk_texts = get_text_splits(entry, chunks)
        texts_to_translate.extend(chunk_texts)
        section2indeces[section_pos] = list(range(first_index, first_index + len(chunk_texts)))

    return section2indeces, apply_translation_pipeline(texts_to_translate)

def translate_dataset_and_save(dataset_path, translated_dataset_path):
    """Back-translate a dataset entry by entry, checkpointing to disk.

    The function is resumable: entries already present in
    `translated_dataset_path` are skipped. Progress is written every 10th
    entry index, after any failed entry, and once more at the end.

    Args:
        dataset_path: path of the original dataset JSON.
        translated_dataset_path: path of the (possibly partial) translated
            dataset JSON; created if it does not exist.
    """
    from tqdm import tqdm

    def _save(dataset):
        # Single place that persists the (partial) translated dataset.
        with open(translated_dataset_path, 'w', encoding='utf-8') as f:
            f.write(dataset.toJson())

    if not os.path.isfile(translated_dataset_path):
        print("No existing translated data found; creating new file.")
        _save(ClinAISDataset(annotated_entries={}))

    print("Loading original dataset to translate...")
    with open(dataset_path, encoding='utf-8') as f:
        ds = ClinAISDataset(**json.load(f))

    print("Loading partial translated dataset if any...")
    with open(translated_dataset_path, encoding='utf-8') as f:
        translated_ds = ClinAISDataset(**json.load(f))

    print(f"Currently partial has {len(translated_ds.annotated_entries)} entries")

    print("Translating dataset:")
    keys_list = list(ds.annotated_entries.keys())
    # Wrap the list (not the enumerate object) so tqdm knows the total.
    for idx, key in enumerate(tqdm(keys_list)):
        if key in translated_ds.annotated_entries:
            continue

        entry = ds.annotated_entries[key]
        try:
            sec2idx, trans_texts = translate_entry(entry)
            new_entry = Entry(note_id=entry.note_id, note_text=" ".join(trans_texts))
            section_offset = 0

            for section_idx, chunk_ids in sec2idx.items():
                combined = " ".join(trans_texts[chunk_ids[0]: chunk_ids[-1]+1])
                label_ = entry.section_annotation.gold[section_idx].label

                new_entry.section_annotation.gold.append(SectionAnnotation(
                    segment=combined,
                    label=label_,
                    start_offset=section_offset,
                    end_offset=section_offset + len(combined)
                ))

                # Rebuild span-level annotations by splitting on single
                # spaces; only the first span carries the section label.
                sp_offset = section_offset
                for i_, sp_ in enumerate(combined.split(" ")):
                    new_entry.boundary_annotation.gold.append(BoundaryAnnotation(
                        span=sp_,
                        boundary=label_ if i_ == 0 else None,
                        start_offset=sp_offset,
                        end_offset=sp_offset + len(sp_)
                    ))
                    sp_offset += len(sp_) + 1  # +1 for the separating space
                section_offset += len(combined) + 1  # +1 for the section separator

            translated_ds.annotated_entries[new_entry.note_id] = new_entry

            if idx % 10 == 0:
                _save(translated_ds)

        except Exception as e:
            # Best effort: report the entry, persist what exists, move on.
            print("Error while translating entry:", key, e)
            _save(translated_ds)
            continue

    print("Saving final translated data to", translated_dataset_path)
    _save(translated_ds)
    print("Done translation.")

def create_augmented_dataset(train_set_path, translated_train_set_path, save_path):
    """Merge the original and back-translated training sets into one file.

    Translated entries get the suffix '_T' on both their dictionary key and
    their `note_id`. The result is written to `save_path`.
    """
    def _load(path):
        with open(path, encoding='utf-8') as handle:
            return ClinAISDataset(**json.load(handle))

    originals = _load(train_set_path)
    back_translated = _load(translated_train_set_path)

    merged = ClinAISDataset(annotated_entries={})
    merged.annotated_entries.update(originals.annotated_entries)

    for key, entry in back_translated.annotated_entries.items():
        entry.note_id += '_T'
        merged.annotated_entries[key + '_T'] = entry

    with open(save_path, 'w', encoding='utf-8') as handle:
        handle.write(merged.toJson())
    print("Augmented dataset saved at", save_path)

def combine_train_dev_datasets(train_path, dev_path, combined_path):
    """Write a dataset containing every train entry followed by every dev entry.

    When both files share a key, the dev entry replaces the train entry.
    """
    def _load(path):
        with open(path, encoding='utf-8') as handle:
            return ClinAISDataset(**json.load(handle))

    train_ds = _load(train_path)
    dev_ds = _load(dev_path)

    combined = ClinAISDataset(annotated_entries={})
    for source in (train_ds, dev_ds):
        combined.annotated_entries.update(source.annotated_entries)

    with open(combined_path, 'w', encoding='utf-8') as handle:
        handle.write(combined.toJson())
    print(f"Combined dataset saved at {combined_path}")
    print(f"Total entries: {len(combined.annotated_entries)}")

# -------------------------------
# 6) Data preparation pipeline
# -------------------------------
from datasets import Dataset, DatasetDict

def get_labelled_span_list(b_annotations: BoundaryAnnotations):
    """Pair every gold span with the label of the section it belongs to.

    A span's label is the most recent non-None boundary seen so far (None for
    spans that precede the first boundary).

    Returns:
        A list of `(span, label)` tuples in annotation order.
    """
    labelled = []
    active_label = None
    for annotation in b_annotations.gold:
        if annotation.boundary is not None:
            active_label = annotation.boundary
        labelled.append((annotation.span, active_label))
    return labelled

def create_dataset_object(data_set: ClinAISDataset, label2id: dict) -> Dataset:
    """Build a `Dataset` with one row per note: its spans and their label ids.

    The resulting columns are 'spans' and 'labels'.
    """
    rows = []
    for entry in data_set.annotated_entries.values():
        pairs = [
            (span, label2id[label])
            for span, label in get_labelled_span_list(entry.boundary_annotation)
        ]
        # Transpose [(span, id), ...] into (spans, ids).
        rows.append(list(zip(*pairs)))

    frame = pd.DataFrame(data=rows, columns=['spans','labels'])
    return Dataset.from_pandas(frame)

def get_reshaped_list(target_list, pattern_list):
    """Cut `target_list` into consecutive slices matching the lengths in `pattern_list`.

    The i-th returned slice has as many items as `pattern_list[i]`.
    """
    chunks = []
    cursor = 0
    for size in map(len, pattern_list):
        stop = cursor + size
        chunks.append(target_list[cursor:stop])
        cursor = stop
    return chunks

def split_entry_for_tokenizer(entry_data: dict, word_splitter: WordListSplitter):
    """Split an entry's spans with `word_splitter` and cut its labels to match.

    Returns a dict with keys 'spans' and 'labels', each a list of chunks.
    """
    span_chunks = word_splitter.split(entry_data['spans'])
    return {
        'spans': span_chunks,
        'labels': get_reshaped_list(entry_data['labels'], span_chunks),
    }

def tokenize_split_align(batch, tokenizer):
    """Tokenize a batch of pre-split examples and align word labels to tokens.

    Examples whose tokenization exceeds `tokenizer.model_max_length` are split
    into smaller word chunks, and the chunks are tokenized and aligned right
    away, so every returned row carries real `input_ids`.

    Args:
        batch: dict with 'spans' (list of word lists) and 'labels' (list of
            per-word label id lists).
        tokenizer: tokenizer called with `is_split_into_words=True`; its result
            must provide `word_ids(batch_index=...)`.

    Returns:
        A dict with the columns 'spans', 'labels', 'input_ids',
        'attention_mask' and 'aligned_labels'. It may hold more rows than the
        input batch.

    Raises:
        ValueError: if a single-word example still exceeds the length limit.
    """
    tok = tokenizer(batch['spans'], truncation=False, is_split_into_words=True)
    splitted = {'spans': [], 'labels': [], 'input_ids': [], 'attention_mask': [], 'aligned_labels': []}
    limit = tokenizer.model_max_length

    def _align(word_ids, word_labels):
        # Label only the first sub-token of each word; special tokens and
        # continuation sub-tokens get -100.
        aligned = []
        previous = None
        for w in word_ids:
            if w is None or w == previous:
                aligned.append(-100)
            else:
                aligned.append(word_labels[w])
            previous = w
        return aligned

    for i, input_ids in enumerate(tok['input_ids']):
        words = batch['spans'][i]
        word_labels = batch['labels'][i]

        if len(input_ids) <= limit:
            splitted['spans'].append(words)
            splitted['labels'].append(word_labels)
            splitted['input_ids'].append(input_ids)
            splitted['attention_mask'].append(tok['attention_mask'][i])
            splitted['aligned_labels'].append(_align(tok.word_ids(batch_index=i), word_labels))
            continue

        if len(words) < 2:
            raise ValueError(
                f"A single-word example tokenizes to {len(input_ids)} tokens, "
                f"above the model limit of {limit}; it cannot be split."
            )

        # Scale the words-per-chunk target by the token overshoot (at least 1).
        max_len = max(1, int((limit / len(input_ids)) * len(words)))
        spdata = split_entry_for_tokenizer({'spans': words, 'labels': word_labels},
                                           WordListSplitter(max_size=max_len, min_size=3))
        # Tokenize the pieces now (splitting again if one is still too long)
        # rather than emitting rows with empty token columns.
        pieces = tokenize_split_align({'spans': spdata['spans'], 'labels': spdata['labels']}, tokenizer)
        for column in splitted:
            splitted[column].extend(pieces[column])

    return splitted

def tokenize_dataset_dict(ds_dict: DatasetDict, tokenizer) -> DatasetDict:
    """Tokenize every split of `ds_dict`, repeating until all rows are usable.

    `tokenize_split_align` is mapped over the dataset. The pass is repeated
    while any row with spans has no token ids (a freshly split chunk) or has
    more token ids than `tokenizer.model_max_length`.

    Returns:
        A `DatasetDict` whose 'labels' column holds the token-aligned labels;
        the 'aligned_labels' and 'spans' columns are removed.
    """
    def processing_fn(batch):
        return tokenize_split_align(batch, tokenizer)

    def needs_another_pass(ds) -> bool:
        for split in ds:
            for spans, input_ids in zip(ds[split]['spans'], ds[split]['input_ids']):
                # Spans without token ids were split but never tokenized.
                if spans and not input_ids:
                    return True
                if len(input_ids) > tokenizer.model_max_length:
                    return True
        return False

    ds_tok = ds_dict.map(processing_fn, batched=True)
    while needs_another_pass(ds_tok):
        ds_tok = ds_tok.map(processing_fn, batched=True)

    def rename_func(row):
        # Replace the word-level 'labels' with the token-aligned ones.
        return {
            'input_ids': row['input_ids'],
            'labels': row['aligned_labels'],
            'attention_mask': row['attention_mask']
        }

    ds_tok = ds_tok.map(rename_func, remove_columns=['aligned_labels', 'spans'])
    return ds_tok

def execute_data_preparation_pipeline(train_data_path: str, tokenizer, label2id: dict) -> DatasetDict:
    """Load the training JSON and return a tokenized `DatasetDict`.

    The result has a single 'train' split; no validation split is created.
    """
    with open(train_data_path, encoding='utf-8') as handle:
        raw_payload = json.load(handle)

    train_split = create_dataset_object(ClinAISDataset(**raw_payload), label2id)
    return tokenize_dataset_dict(DatasetDict(train=train_split), tokenizer)

# -------------------------------
# 7) Post-processing for predictions
# -------------------------------
class PredictionSection(BaseModel):
    """One section predicted by the token-classification pipeline."""
    # Predicted section label.
    entity_group: ClinicalSections
    # Score reported for the section, if any.
    score: float | None
    # Text covered by the section.
    word: str
    # Start and end offsets of the section.
    start: int
    end: int

    class Config:
        # pydantic option: store enum fields as their plain values.
        use_enum_values = True

class Prediction(BaseModel):
    """The ordered list of sections predicted for one note."""
    sections: List[PredictionSection] = []

class PredictionPostProcessor:
    """Clean up a `Prediction` in place.

    Merges undersized sections into a neighbour, moves leading punctuation to
    the previous section and merges adjacent sections with the same label.
    """

    def __init__(self, prediction: Prediction, verbuous=True):
        """
        Args:
            prediction: the prediction to post-process (modified in place).
            verbuous: when True, print each merge/move that is performed.
        """
        self.prediction = prediction
        # Sections with fewer whitespace-separated words than this are merged.
        self.min_section_size = 3
        self.punctuation_marks = [',', '.', ';', ':', ')', ']', '}', '!', '?']
        self.verbuous = verbuous

    def get_section_size(self, sec: PredictionSection):
        """Return the number of whitespace-separated words in the section."""
        return len(sec.word.strip().split())

    def merge_undersize_sections(self):
        """Merge sections shorter than `min_section_size` into a neighbour.

        An undersized section is appended to the last full-size section before
        it. If there is none yet, it is prepended to the next section. In both
        cases the undersized section is removed. A lone undersized section is
        left untouched.
        """
        sections = self.prediction.sections
        erase = []
        prev = None
        for i, sec in enumerate(sections):
            if self.get_section_size(sec) >= self.min_section_size:
                prev = i
                continue

            if prev is not None:
                if self.verbuous:
                    print(f"Merging {i} into {prev}")
                sections[prev].word += sec.word
                sections[prev].end = sec.end
                erase.append(i)
            elif i < len(sections) - 1:
                if self.verbuous:
                    print(f"Merging {i} into {i+1}")
                sections[i+1].word = sec.word + sections[i+1].word
                sections[i+1].start = sec.start
                # The text now lives in the next section; drop this one so it
                # does not remain as a duplicated, overlapping section.
                erase.append(i)

        for idx in sorted(erase, reverse=True):
            del self.prediction.sections[idx]
        return self.prediction

    def reasign_leading_punctuation_marks(self):
        """Move one leading punctuation mark of each section to the previous one."""
        for i, sec in enumerate(self.prediction.sections):
            if sec.word and sec.word[0] in self.punctuation_marks:
                if i > 0:
                    if self.verbuous:
                        print(f"Moving punctuation {sec.word[0]} from {i} to {i-1}")
                    self.prediction.sections[i-1].word += sec.word[0]
                    self.prediction.sections[i-1].end += 1
                    sec.word = sec.word[1:]
                    sec.start += 1

    def merge_contiguous_equivalent_sections(self):
        """Merge each run of adjacent sections sharing a label into the first one."""
        erase = []
        last = None
        for i, sec in enumerate(self.prediction.sections):
            if last is None:
                last = sec
                continue
            if sec.entity_group == last.entity_group:
                last.word += sec.word
                last.end = sec.end
                erase.append(i)
            else:
                last = sec

        for idx in sorted(erase, reverse=True):
            del self.prediction.sections[idx]
        return self.prediction

    def do_all(self, verbuous=None):
        """Run all post-processing steps in order.

        Args:
            verbuous: overrides the verbosity chosen at construction time when
                given; None (the default) keeps the constructor's setting.
        """
        if verbuous is not None:
            self.verbuous = verbuous
        self.merge_undersize_sections()
        self.reasign_leading_punctuation_marks()
        self.merge_contiguous_equivalent_sections()

def process_entry(entry: Entry, model_pipe):
    """Predict sections for `entry` and store them on the entry in place.

    The note is cut into chunks that fit the model's token limit, each chunk
    is run through `model_pipe`, and the predicted offsets are shifted back to
    note coordinates. After post-processing, the result is written to
    `entry.section_annotation.prediction` and
    `entry.boundary_annotation.prediction`.

    Args:
        entry: the note to annotate; its gold boundary annotations drive the
            chunking.
        model_pipe: token-classification pipeline returning dicts accepted by
            `PredictionSection`, with a `tokenizer` attribute.
    """
    gold_bas = entry.boundary_annotation.gold
    if not gold_bas:
        # Nothing to chunk or predict on.
        entry.section_annotation.prediction = []
        entry.boundary_annotation.prediction = []
        return

    # Split the note into chunks that fit the tokenizer limit.
    ba_list = [gold_bas]
    rdy = False
    while not rdy:
        rdy, ba_list = split_entry(ba_list, model_pipe.tokenizer)

    # Predict per chunk. Pipeline offsets are relative to the chunk text, so
    # shift them by the chunk's real start offset; a running sum of chunk
    # lengths would drift when chunks are not exactly contiguous.
    pred = Prediction()
    for sub_ba, txt in zip(ba_list, get_text_splits(entry, ba_list)):
        chunk_start = sub_ba[0].start_offset
        for raw in model_pipe(txt):
            sec = PredictionSection(**raw)
            sec.start += chunk_start
            sec.end += chunk_start
            pred.sections.append(sec)

    # Post-process silently. The flag is passed explicitly because do_all()
    # sets the verbosity from its own argument.
    ppp = PredictionPostProcessor(pred, False)
    ppp.do_all(verbuous=False)

    # Copy the gold spans with every boundary cleared.
    b_pred = [
        BoundaryAnnotation(
            span=gold_ba.span,
            boundary=None,
            start_offset=gold_ba.start_offset,
            end_offset=gold_ba.end_offset,
        )
        for gold_ba in gold_bas
    ]

    # The first span starting inside a predicted section marks its boundary;
    # the remaining spans of that section carry none.
    for sec in pred.sections:
        subset = [ba for ba in b_pred if ba.start_offset >= sec.start and ba.start_offset <= sec.end]
        if subset:
            subset[0].boundary = sec.entity_group
            for other in subset[1:]:
                other.boundary = None

    entry.section_annotation.prediction = [
        SectionAnnotation(
            segment=s.word,
            label=s.entity_group,
            start_offset=s.start,
            end_offset=s.end
        )
        for s in pred.sections
    ]
    entry.boundary_annotation.prediction = b_pred

def create_predictions_file(dataset_path, save_predicted_path, model_pipeline):
    """Predict every entry of the dataset and write the annotated dataset to disk."""
    from tqdm import tqdm

    with open(dataset_path, encoding='utf-8') as source:
        ds = ClinAISDataset(**json.load(source))

    # process_entry stores the predictions on each entry in place.
    for current_entry in tqdm(ds.annotated_entries.values()):
        process_entry(current_entry, model_pipeline)

    with open(save_predicted_path, 'w', encoding='utf-8') as target:
        target.write(ds.toJson())
    print("Predictions saved to", save_predicted_path)

# -------------------------------
# 8) Training code
# -------------------------------
import evaluate
import seqeval
import numpy as np
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    Trainer,
    pipeline,
    EarlyStoppingCallback
)

def get_seqeval_metrics(id2label: dict):
    """Build a `compute_metrics` callback backed by the seqeval metric.

    Args:
        id2label: mapping from label id to label name.

    Returns:
        A function taking `(predictions, labels)` and returning a dict with
        'precision', 'recall', 'f1' and 'accuracy'.
    """
    seqeval_metric = evaluate.load("seqeval")

    def compute_metrics_fn(p):
        scores, gold_ids = p
        predicted_ids = np.argmax(scores, axis=2)

        final_preds, final_labels = [], []
        for pred_row, gold_row in zip(predicted_ids, gold_ids):
            # Positions labelled -100 are ignored.
            kept = [(p_, l_) for p_, l_ in zip(pred_row, gold_row) if l_ != -100]
            final_preds.append([id2label[p_] for p_, _ in kept])
            final_labels.append([id2label[l_] for _, l_ in kept])

        results = seqeval_metric.compute(predictions=final_preds, references=final_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"]
        }

    return compute_metrics_fn

def create_label_id_dictionaries(section_types: List[str]):
    """Return `(label2id, id2label)`, numbering the labels by list position."""
    id2label = dict(enumerate(section_types))
    label2id = {label: idx for idx, label in id2label.items()}
    return label2id, id2label

def build_trainer(base_model: str, train_data_path: str, out_dir: str, do_freeze: bool = False) -> Trainer:
    """Create a `Trainer` for section token classification on the training data.

    Args:
        base_model: name or path of the pretrained model and tokenizer.
        train_data_path: path of the training dataset JSON.
        out_dir: directory where checkpoints are written.
        do_freeze: when True, freeze the encoder and train only the classifier.

    Returns:
        A configured `Trainer` with no evaluation dataset.
    """
    label2id, id2label = create_label_id_dictionaries(ClinicalSections.list())
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Training data only (no validation split).
    ds = execute_data_preparation_pipeline(train_data_path, tokenizer, label2id)
    data_coll = DataCollatorForTokenClassification(tokenizer=tokenizer)

    model = AutoModelForTokenClassification.from_pretrained(base_model,
                                                           num_labels=len(label2id),
                                                           id2label=id2label,
                                                           label2id=label2id)

    if do_freeze:
        # `base_model` resolves to the architecture's encoder (e.g. `.roberta`
        # or `.longformer`), so freezing is not tied to Longformer checkpoints.
        for p in model.base_model.parameters():
            p.requires_grad = False
        for p in model.classifier.parameters():
            p.requires_grad = True

    # Final-training arguments: no evaluation strategy is configured.
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        num_train_epochs=4,  # Fewer epochs for final training
        weight_decay=0.01,
        warmup_ratio=0.1,
        save_strategy="epoch",
        logging_steps=50,
        report_to=[],
        save_total_limit=1,  # Only keep the last checkpoint
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        tokenizer=tokenizer,
        data_collator=data_coll,
        compute_metrics=get_seqeval_metrics(id2label),
    )

    return trainer

def test_model(finetuned_model_path: str, test_dataset_path: str, save_predictions_path: str, save_evaluated_path: str = None):
    """Predict the test dataset with a fine-tuned model and optionally score it.

    Args:
        finetuned_model_path: directory of the fine-tuned model and tokenizer.
        test_dataset_path: path of the dataset JSON to predict.
        save_predictions_path: where the predicted dataset JSON is written.
        save_evaluated_path: when given, predictions are scored with
            `score_predictions` and the result is written here.
    """
    print("Creating predictions for test dataset with pipeline...")
    loaded_model = AutoModelForTokenClassification.from_pretrained(finetuned_model_path)
    loaded_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)
    pipeline_model = pipeline(
        "token-classification",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        aggregation_strategy="simple",
    )

    create_predictions_file(test_dataset_path, save_predictions_path, pipeline_model)
    print("Predictions saved to", save_predictions_path)

    if not save_evaluated_path:
        return

    print("Evaluating predictions with official metricas script...")
    score_predictions(prediction_file=Path(save_predictions_path), output_result_file=Path(save_evaluated_path))
    print("Evaluation saved to", save_evaluated_path)

    # Report the "Weighted B2" value if the evaluation file has one.
    with open(save_evaluated_path, "r", encoding="utf-8") as f:
        results = json.load(f)
        if "Weighted B2" not in results:
            print("No Weighted B2 found in", save_evaluated_path)
        else:
            print("Weighted B2 =>", results["Weighted B2"])

def create_submission_zip(predictions_json_path: str, zip_output_path: str):
    """Create a ZIP file for CodaLab submission.

    The archive holds the predictions file alone, stored under its base name.
    """
    archive_member = os.path.basename(predictions_json_path)
    with zipfile.ZipFile(zip_output_path, 'w') as archive:
        archive.write(predictions_json_path, archive_member)
    print(f"Submission ZIP created at: {zip_output_path}")

# -------------------------------
# 9) Main execution pipeline
# -------------------------------

def main():
    """Run the full ClinAIS pipeline on Colab: data preparation, training, test prediction.

    Stages, in order:
      1. Mount Google Drive, create the output directory and check that the
         train, dev and test files exist (returns early with an error message
         when one is missing).
      2. Build each intermediate dataset unless its file already exists:
         translated train, augmented train, train+dev, final augmented set.
      3. Fine-tune ``PlanTL-GOB-ES/bsc-bio-ehr-es`` on the final augmented set
         and save model and tokenizer into the output directory.
      4. Predict on the blind test file and pack the predictions into a ZIP.

    Returns:
        None.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    base_drive_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"

    def in_dataset_dir(file_name):
        return os.path.join(base_drive_path, file_name)

    # Inputs
    original_train_path = in_dataset_dir("clinais.train.json")
    dev_data_path = in_dataset_dir("clinais.dev.json")
    test_data_path = in_dataset_dir("clinais.test&background.blind.json")

    # Intermediate artefacts
    trans_train_path = in_dataset_dir("clinais.train.translated_v3.json")
    aug_train_path = in_dataset_dir("clinais.train.augmented_v3.json")
    combined_train_dev_path = in_dataset_dir("clinais.train_dev_combined.json")
    combined_aug_path = in_dataset_dir("clinais.train_dev_augmented_final.json")

    # Model directory and the files written next to the model
    output_dir = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-final"
    predictions_json = os.path.join(output_dir, "test_predictions.json")
    evaluated_json = os.path.join(output_dir, "test_predictions_evaluated.json")  # not used below
    submission_zip = os.path.join(output_dir, "clinais_submission.zip")

    print("=== ClinAIS Training and Testing Pipeline ===")
    overview = (
        ("Train data", original_train_path),
        ("Dev data", dev_data_path),
        ("Test data", test_data_path),
        ("Output directory", output_dir),
    )
    for caption, location in overview:
        print(f"{caption}: {location}")

    # The output directory is created before the inputs are validated.
    os.makedirs(output_dir, exist_ok=True)

    # Abort on the first missing input file.
    required_inputs = (
        ("Training", original_train_path),
        ("Dev", dev_data_path),
        ("Test", test_data_path),
    )
    for kind, location in required_inputs:
        if not os.path.exists(location):
            print(f"ERROR: {kind} file not found at {location}")
            return

    # Each preparation step is (banner, artefact path, builder, message when reused).
    # The augmented-train artefact of step 2 is not read again inside this function;
    # training uses the final augmented set of step 4.
    preparation_steps = (
        (
            "\n=== Step 1: Translating training dataset ===",
            trans_train_path,
            lambda: translate_dataset_and_save(original_train_path, trans_train_path),
            "Translated training data already exists, skipping translation.",
        ),
        (
            "\n=== Step 2: Creating augmented training dataset ===",
            aug_train_path,
            lambda: create_augmented_dataset(original_train_path, trans_train_path, aug_train_path),
            "Augmented training data already exists, skipping augmentation.",
        ),
        (
            "\n=== Step 3: Combining train and dev datasets ===",
            combined_train_dev_path,
            lambda: combine_train_dev_datasets(original_train_path, dev_data_path, combined_train_dev_path),
            "Combined train+dev data already exists.",
        ),
        (
            "\n=== Step 4: Creating final augmented dataset ===",
            combined_aug_path,
            lambda: create_augmented_dataset(combined_train_dev_path, trans_train_path, combined_aug_path),
            "Final augmented dataset already exists.",
        ),
    )
    for banner, artefact_path, build_artefact, reuse_message in preparation_steps:
        print(banner)
        if os.path.exists(artefact_path):
            print(reuse_message)
        else:
            build_artefact()

    # Training on the final augmented dataset (train + dev + translated train).
    print("\n=== Step 5: Training the model ===")
    trainer = build_trainer(
        base_model="PlanTL-GOB-ES/bsc-bio-ehr-es",
        train_data_path=combined_aug_path,
        out_dir=output_dir,
        do_freeze=False,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model and tokenizer...")
    trainer.save_state()
    trainer.model.save_pretrained(output_dir)
    trainer.tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")

    # Prediction only: no evaluation path is passed for the blind test set.
    print("\n=== Step 6: Making predictions on test set ===")
    test_model(output_dir, test_data_path, predictions_json)

    print("\n=== Step 7: Creating submission ZIP ===")
    create_submission_zip(predictions_json, submission_zip)

    print("\n=== Pipeline completed successfully! ===")
    print(f"Final model: {output_dir}")
    print(f"Test predictions: {predictions_json}")
    print(f"Submission ZIP: {submission_zip}")

    # Reload the predictions to report how many entries they contain.
    with open(predictions_json, encoding='utf-8') as predictions_file:
        test_results = ClinAISDataset(**json.load(predictions_file))
    print(f"Test set entries processed: {len(test_results.annotated_entries)}")

    print("\nReady for submission to CodaLab!")

# Alternative function for development/validation testing
def test_on_dev_set():
    """Predict on the dev set with the saved "final" model and score the predictions.

    Predictions and their evaluation are written next to the model as
    ``dev_predictions.json`` and ``dev_predictions_evaluated.json``.

    NOTE(review): ``main()`` trains that model on a dataset built from train+dev,
    so scores obtained here are not a held-out estimate.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    output_dir = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-final"
    base_drive_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"

    dev_data_path = os.path.join(base_drive_path, "clinais.dev.json")
    dev_predictions_json, dev_evaluated_json = (
        os.path.join(output_dir, file_name)
        for file_name in ("dev_predictions.json", "dev_predictions_evaluated.json")
    )

    print("=== Testing model on dev set ===")
    test_model(
        output_dir,
        dev_data_path,
        dev_predictions_json,
        dev_evaluated_json,
    )

    print("Dev set evaluation completed!")

# Quick training function without augmentation (for faster testing)
def quick_train():
    """Train on train+dev only (no translation or augmentation) for fast iteration.

    The combined dataset is rebuilt on every call and written to
    ``clinais.train_dev_combined_quick.json``; the resulting model and tokenizer
    are saved under the ``...-clinais-quick`` model directory.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    output_dir = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-quick"
    base_drive_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"
    original_train_path, dev_data_path, combined_train_dev_path = (
        os.path.join(base_drive_path, file_name)
        for file_name in (
            "clinais.train.json",
            "clinais.dev.json",
            "clinais.train_dev_combined_quick.json",
        )
    )

    # Merge the two gold datasets into a single training file.
    combine_train_dev_datasets(original_train_path, dev_data_path, combined_train_dev_path)

    # Fine-tune the base model on the merged file.
    trainer = build_trainer(
        base_model="PlanTL-GOB-ES/bsc-bio-ehr-es",
        train_data_path=combined_train_dev_path,
        out_dir=output_dir,
        do_freeze=False,
    )
    trainer.train()

    # Persist trainer state, weights and tokenizer.
    trainer.save_state()
    trainer.model.save_pretrained(output_dir)
    trainer.tokenizer.save_pretrained(output_dir)

    print(f"Quick model saved to {output_dir}")

# -------------------------------
# 10) Execution
# -------------------------------

# Entry point of this cell: when the cell is executed the guard is true (the
# recorded output starts the pipeline), so the full pipeline runs immediately.
if __name__ == "__main__":
   # Run the main pipeline (data preparation, training, test prediction, submission ZIP)
   main()

   # Optional dev-set check, disabled by default; enable by uncommenting.
   # test_on_dev_set()

# -------------------------------
# 11) Additional utility functions
# -------------------------------

def check_dataset_stats(dataset_path):
    """Print entry count and gold-section label distribution for one dataset file.

    Args:
        dataset_path: Path to a JSON file loadable as ``ClinAISDataset``.

    Returns:
        None. A dataset without gold sections prints a total of 0 and an
        empty distribution.
    """
    with open(dataset_path, encoding='utf-8') as dataset_file:
        ds = ClinAISDataset(**json.load(dataset_file))

    print(f"Dataset: {dataset_path}")
    print(f"Total entries: {len(ds.annotated_entries)}")

    # Tally gold sections per label across all entries.
    section_counts = {}
    for entry in ds.annotated_entries.values():
        for section in entry.section_annotation.gold:
            section_counts[section.label] = section_counts.get(section.label, 0) + 1
    total_sections = sum(section_counts.values())

    print(f"Total sections: {total_sections}")
    print("Section distribution:")
    for label in sorted(section_counts):
        count = section_counts[label]
        percentage = (count / total_sections) * 100
        print(f"  {label}: {count} ({percentage:.1f}%)")

def validate_submission_format(predictions_json_path):
    """Check that a predictions file parses as ``ClinAISDataset`` and report gaps.

    Prints a warning for every entry that has no boundary predictions, or whose
    number of predicted boundaries differs from the number of gold boundaries.
    Any exception (unreadable file, schema mismatch, ...) is reported as an
    error message instead of being raised.

    Returns:
        None.
    """
    try:
        with open(predictions_json_path, encoding='utf-8') as predictions_file:
            ds = ClinAISDataset(**json.load(predictions_file))

        print(f"Validation successful for {predictions_json_path}")
        print(f"Entries: {len(ds.annotated_entries)}")

        for entry_id, entry in ds.annotated_entries.items():
            boundaries = entry.boundary_annotation
            if not boundaries.prediction:
                print(f"WARNING: Entry {entry_id} has no boundary predictions")
                continue
            # Predictions exist: they should line up one-to-one with the gold spans.
            if len(boundaries.prediction) != len(boundaries.gold):
                print(f"WARNING: Entry {entry_id} has mismatched boundary counts")

        print("Validation completed.")

    except Exception as e:
        print(f"ERROR: Validation failed for {predictions_json_path}: {e}")

# Example usage for checking dataset statistics
def check_all_datasets():
    """Mount Drive and print statistics for the train, dev and test dataset files.

    Files that do not exist are skipped silently; each reported dataset is
    followed by a separator line.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    base_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"
    dataset_files = (
        "clinais.train.json",
        "clinais.dev.json",
        "clinais.test&background.blind.json",
    )

    for file_name in dataset_files:
        path = os.path.join(base_path, file_name)
        if not os.path.exists(path):
            continue
        check_dataset_stats(path)
        print("-" * 50)

# Print dataset statistics for the train, dev and test files every time this
# cell runs (comment the call out to skip it).
check_all_datasets()

fatal: destination path 'Sec-Identification-in-Spanish-Clinical-Notes' already exists and is not an empty directory.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Device set to use cuda:0


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
=== ClinAIS Training and Testing Pipeline ===
Train data: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.train.json
Dev data: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.dev.json
Test data: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.test&background.blind.json
Output directory: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-final

=== Step 1: Translating training dataset ===
No existing translated data found; creating new file.
Loading original dataset to translate...
Loading partial translated dataset if any...
Currently partial has 0 entries
Translating dataset:


10it [01:39,  6.67s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
87it [15:12, 13.81s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
510it [1:26:19, 10.16s/it]


KeyboardInterrupt: 

In [None]:
# ================================
# Optimized ClinAIS Training Pipeline with Better Error Handling (no augmentation)
# ================================

In [None]:

!git clone https://github.com/Iker0610/Sec-Identification-in-Spanish-Clinical-Notes.git
!pip install transformers datasets evaluate seqeval pydantic sacremoses

import os
import sys
import json
import logging
from typing import List, Any
from pydantic import BaseModel
import numpy as np
import pandas as pd
from enum import Enum
import zipfile
from pathlib import Path
import torch

# Set up logging: INFO and above on the root logger
logging.basicConfig(level=logging.INFO)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` does not appear to be referenced by the functions
# defined in this cell (they query torch.cuda.is_available() themselves);
# confirm whether later cells rely on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# -------------------------------
# 1) Dataset Models (unchanged)
# -------------------------------
class ClinicalSections(str, Enum):
    """Section labels used by the ClinAIS annotations.

    Members are also ``str`` instances, so they compare equal to their value.
    Note that ``DERIVED_FROM_TO`` has the value ``"DERIVED_FROM/TO"``.
    """

    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    DERIVED_FROM_TO = "DERIVED_FROM/TO"
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

    @classmethod
    def list(cls):
        """Return the values of all members, in declaration order."""
        return [member.value for member in cls]

class SectionAnnotation(BaseModel):
    """One labelled section of a clinical note."""
    segment: str  # text of the section
    label: ClinicalSections  # section type
    # Position of the segment; presumably character offsets into the note text
    # (TODO confirm against the dataset specification).
    start_offset: int
    end_offset: int

class SectionAnnotations(BaseModel):
    """Gold and predicted section annotations of a note; both default to empty lists."""
    gold: List[SectionAnnotation] = []  # reference annotations
    prediction: List[SectionAnnotation] = []  # model output

class BoundaryAnnotation(BaseModel):
    """One span (word) of a note with an optional section-start marker."""
    span: str  # text of the span
    # Label of the section that starts at this span, or None when no section
    # starts here. The field has no default, so it must be passed explicitly.
    boundary: ClinicalSections | None
    # Position of the span; presumably character offsets into the note text
    # (TODO confirm against the dataset specification).
    start_offset: int
    end_offset: int

class BoundaryAnnotations(BaseModel):
    """Gold and predicted boundary annotations of a note; both default to empty lists."""
    gold: List[BoundaryAnnotation] = []  # reference annotations
    prediction: List[BoundaryAnnotation] = []  # model output

class Entry(BaseModel):
    """A single clinical note together with its section and boundary annotations."""
    note_id: str
    note_text: str
    section_annotation: SectionAnnotations = SectionAnnotations()
    boundary_annotation: BoundaryAnnotations = BoundaryAnnotations()

    def toJson(self):
        """Serialise this entry to a JSON string, keeping non-ASCII characters as-is.

        Objects the JSON encoder cannot handle natively are replaced by their
        ``__dict__``.
        """
        def attributes_of(obj):
            return obj.__dict__

        return json.dumps(self, default=attributes_of, ensure_ascii=False)

class ClinAISDataset(BaseModel):
    """A ClinAIS dataset: entries keyed by string id, plus an optional scores mapping."""
    annotated_entries: dict[str, Entry]
    scores: dict[str, Any] = {}

    def getEntry(self, idx: int) -> Entry:
        """Return the entry at position ``idx`` in the insertion order of the mapping."""
        ordered_keys = list(self.annotated_entries)
        return self.annotated_entries[ordered_keys[idx]]

    def toJson(self):
        """Serialise the dataset to a JSON string, keeping non-ASCII characters as-is.

        Objects the JSON encoder cannot handle natively are replaced by their
        ``__dict__``.
        """
        def attributes_of(obj):
            return obj.__dict__

        return json.dumps(self, default=attributes_of, ensure_ascii=False)

# -------------------------------
# 2) Optimized Translation Pipeline
# -------------------------------
import transformers
from transformers import pipeline

def initialize_translation_pipelines():
    """Create the Spanish->English and English->Spanish translation pipelines.

    Both pipelines use the first CUDA device when available (CPU otherwise)
    and a batch size of 8.

    Returns:
        ``(pipe_es_en, pipe_en_es)`` on success, ``(None, None)`` when either
        pipeline cannot be created (the error is printed, not raised).
    """
    try:
        print("Initializing translation pipelines...")
        translators = []
        for model_name in ("Helsinki-NLP/opus-mt-es-en", "Helsinki-NLP/opus-mt-en-es"):
            translators.append(
                pipeline(
                    "translation",
                    model=model_name,
                    device=0 if torch.cuda.is_available() else -1,
                    batch_size=8,
                )
            )
        print("Translation pipelines initialized successfully!")
        pipe_es_en, pipe_en_es = translators
        return pipe_es_en, pipe_en_es
    except Exception as e:
        print(f"Error initializing translation pipelines: {e}")
        return None, None

def apply_translation_pipeline_batched(text_list: List[str], pipe_es_en, pipe_en_es, max_length=400) -> List[str]:
    """Back-translate texts (ES -> EN -> ES), chunking texts that are too long.

    A text with more than ``max_length`` whitespace-separated words is cut into
    chunks of at most ``max_length`` words; the translated chunks are joined
    back with single spaces, so the output has one item per input text.

    Args:
        text_list: Texts to back-translate.
        pipe_es_en: Callable taking a list of strings and returning a list of
            dicts with a ``'translation_text'`` key (Spanish -> English).
        pipe_en_es: Same contract, English -> Spanish.
        max_length: Maximum number of words per chunk.

    Returns:
        The back-translated texts, or ``text_list`` itself when a pipeline is
        missing or any error occurs during translation.
    """
    if not pipe_es_en or not pipe_en_es:
        print("Translation pipelines not available, returning original texts")
        return text_list

    try:
        # Phase 1: flatten the inputs into translation-sized chunks.
        chunks = []
        for text in text_list:
            tokens = text.split()
            if len(tokens) > max_length:
                chunks.extend(
                    ' '.join(tokens[start:start + max_length])
                    for start in range(0, len(tokens), max_length)
                )
            else:
                chunks.append(text)

        # Phase 2: round trip through English.
        print(f"Translating {len(chunks)} text chunks ES->EN...")
        english = pipe_es_en(chunks)

        print(f"Translating back EN->ES...")
        spanish = pipe_en_es([item['translation_text'] for item in english])
        back_translated = [item['translation_text'] for item in spanish]

        # Same number of items in and out: the results map one-to-one.
        if len(back_translated) == len(text_list):
            return back_translated

        # Phase 3: stitch the chunks of each split text back together.
        stitched = []
        cursor = 0
        for source_text in text_list:
            word_count = len(source_text.split())
            if word_count > max_length:
                chunk_count = len(range(0, word_count, max_length))
                stitched.append(' '.join(back_translated[cursor:cursor + chunk_count]))
                cursor += chunk_count
            else:
                stitched.append(back_translated[cursor])
                cursor += 1
        return stitched

    except Exception as e:
        print(f"Error in translation: {e}")
        print("Returning original texts without translation")
        return text_list

# -------------------------------
# 3) Simplified Splitter Classes
# -------------------------------
class WordListSplitter():
    """Recursively split a list of text spans into chunks of at most ``max_size`` spans.

    A split point is looked for at spans ending in one of the separator
    characters, tried in the order given. When no separator yields an
    acceptable split point, the list is cut near its middle.
    """

    def __init__(self, max_size: int, min_size: int, split_caracter_list=['.', ',', ';', ':']):
        """
        Args:
            max_size: Largest number of spans a returned chunk may contain.
            min_size: Split indices closer than this to either end are rejected.
            split_caracter_list: Separator characters in order of preference.
        """
        self.max_size = max_size
        self.min_size = min_size
        # Copy so that neither the shared default list nor a caller-owned list
        # is aliased by the instance.
        self.split_caracters = list(split_caracter_list)

    def get_split_index(self, spans: List[str], caracter='.'):
        """Return the candidate split index closest to the mean candidate index.

        A candidate is the index of a span ending in ``caracter``; when the
        following span starts with a newline, that following span's index is
        used instead. Returns -1 when there is no candidate.
        """
        positions = []
        for idx, span in enumerate(spans):
            if span and span[-1] == caracter:
                if idx < len(spans) - 1 and spans[idx+1] and spans[idx+1][0] == '\n':
                    positions.append(idx+1)
                else:
                    positions.append(idx)
        if not positions:
            return -1
        avg = sum(positions) / len(positions)
        return min(positions, key=lambda x: abs(x - avg))

    def split(self, spans: List[str], split_caracter_index=0):
        """Split ``spans`` into a list of chunks, each at most ``max_size`` long.

        Args:
            spans: The spans to split.
            split_caracter_index: Index of the separator to try first; used by
                the recursion to move on to the next separator.

        Returns:
            A list of lists of spans whose concatenation equals ``spans``.
        """
        total = len(spans)
        # A list of fewer than two spans cannot be divided any further.
        if total <= self.max_size or total < 2:
            return [spans]

        spos = -1
        if split_caracter_index < len(self.split_caracters):
            spos = self.get_split_index(spans, self.split_caracters[split_caracter_index])

        too_close_to_edge = spos < self.min_size or spos > total - self.min_size
        # A split at the last span (or a "not found" -1) would leave one half
        # empty and the other identical to the input, i.e. endless recursion.
        makes_no_progress = spos < 0 or spos >= total - 1
        if too_close_to_edge or makes_no_progress:
            if split_caracter_index < len(self.split_caracters) - 1:
                return self.split(spans, split_caracter_index + 1)
            # Last resort: cut near the middle, always leaving a non-empty right half.
            spos = min(total // 2, total - 2)

        left = spans[:spos+1]
        right = spans[spos+1:]
        return self.split(left, split_caracter_index) + self.split(right, split_caracter_index)

# -------------------------------
# 4) Simplified Translation Functions
# -------------------------------
def _save_dataset_atomically(dataset, path):
    """Write ``dataset.toJson()`` to ``path`` through a temporary file and a rename.

    The rename replaces ``path`` in one step, so an interrupted write never
    leaves a truncated file at ``path``.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(dataset.toJson())
    os.replace(tmp_path, path)


def translate_dataset_simple(dataset_path, translated_dataset_path, pipe_es_en, pipe_en_es):
    """Back-translate the gold sections of a dataset and rebuild its annotations.

    For every entry, each gold section is back-translated with
    ``apply_translation_pipeline_batched``. The new note text is the translated
    sections joined by single spaces; section and per-word boundary annotations
    are regenerated with offsets into that new text. The first word of each
    section carries the section label as its boundary.

    Progress is checkpointed to ``translated_dataset_path + ".partial"`` and the
    final file is only written once every entry has been processed, so an
    interrupted run is resumed instead of being mistaken for a finished one.

    Args:
        dataset_path: JSON file of the source dataset.
        translated_dataset_path: Output file. When it already exists it is
            loaded and returned without translating anything.
        pipe_es_en: Spanish -> English translation pipeline.
        pipe_en_es: English -> Spanish translation pipeline.

    Returns:
        The translated ``ClinAISDataset``. Entries of a batch that fails are
        copied over untranslated.
    """
    if os.path.exists(translated_dataset_path):
        print("Translated dataset already exists, loading...")
        with open(translated_dataset_path, encoding='utf-8') as f:
            return ClinAISDataset(**json.load(f))

    print("Loading original dataset...")
    with open(dataset_path, encoding='utf-8') as f:
        ds = ClinAISDataset(**json.load(f))

    # Resume from a checkpoint left behind by an interrupted run, if any.
    checkpoint_path = translated_dataset_path + ".partial"
    translated_ds = ClinAISDataset(annotated_entries={})
    if os.path.exists(checkpoint_path):
        try:
            with open(checkpoint_path, encoding='utf-8') as f:
                translated_ds = ClinAISDataset(**json.load(f))
            print(f"Resuming from checkpoint: {len(translated_ds.annotated_entries)} entries already done")
        except (OSError, ValueError) as e:
            # Unreadable or invalid checkpoint: start over rather than fail.
            print(f"Ignoring unusable checkpoint {checkpoint_path}: {e}")
            translated_ds = ClinAISDataset(annotated_entries={})

    print(f"Translating {len(ds.annotated_entries)} entries...")

    batch_size = 10        # entries translated per pipeline call
    checkpoint_every = 5   # batches between two checkpoint writes
    entries = [
        (key, entry)
        for key, entry in ds.annotated_entries.items()
        if key not in translated_ds.annotated_entries
    ]

    from tqdm import tqdm
    batch_starts = range(0, len(entries), batch_size)
    for batch_number, batch_start in enumerate(tqdm(batch_starts, desc="Translating batches")):
        batch = entries[batch_start:batch_start + batch_size]

        # All gold sections of the batch, flattened in entry order.
        batch_texts = [
            section.segment
            for _, entry in batch
            for section in entry.section_annotation.gold
        ]

        try:
            translated_texts = apply_translation_pipeline_batched(batch_texts, pipe_es_en, pipe_en_es)

            text_idx = 0
            for key, entry in batch:
                # Pick this entry's translated sections out of the flat result.
                section_texts = []
                for section in entry.section_annotation.gold:
                    if text_idx < len(translated_texts):
                        section_texts.append(translated_texts[text_idx])
                    else:
                        section_texts.append(section.segment)  # fallback: keep the original
                    text_idx += 1

                sections = []
                boundaries = []
                section_offset = 0
                for section, translated_text in zip(entry.section_annotation.gold, section_texts):
                    sections.append(SectionAnnotation(
                        segment=translated_text,
                        label=section.label,
                        start_offset=section_offset,
                        end_offset=section_offset + len(translated_text)
                    ))

                    # Locate every word inside the section text so offsets stay
                    # correct even with repeated or leading whitespace.
                    cursor = 0
                    for word_number, word in enumerate(translated_text.split()):
                        word_start = translated_text.index(word, cursor)
                        cursor = word_start + len(word)
                        boundaries.append(BoundaryAnnotation(
                            span=word,
                            boundary=section.label if word_number == 0 else None,
                            start_offset=section_offset + word_start,
                            end_offset=section_offset + cursor
                        ))

                    # +1 for the space that joins consecutive sections in note_text.
                    section_offset += len(translated_text) + 1

                translated_ds.annotated_entries[key] = Entry(
                    note_id=entry.note_id,
                    note_text=" ".join(section_texts),
                    section_annotation=SectionAnnotations(gold=sections),
                    boundary_annotation=BoundaryAnnotations(gold=boundaries),
                )

        except Exception as e:
            print(f"Error in batch {batch_number}: {e}")
            # Add original entries as fallback
            for key, entry in batch:
                translated_ds.annotated_entries[key] = entry

        # Save progress every `checkpoint_every` batches.
        if (batch_number + 1) % checkpoint_every == 0:
            _save_dataset_atomically(translated_ds, checkpoint_path)

    # Final save: only now does the output path come into existence.
    _save_dataset_atomically(translated_ds, translated_dataset_path)
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    print(f"Translation complete. Saved to {translated_dataset_path}")
    return translated_ds

def combine_train_dev_datasets(train_path, dev_path, combined_path):
    """Merge the train and dev datasets into one file and return the merged dataset.

    Entries are keyed by id: train entries come first, then dev entries. A dev
    entry whose key also exists in train replaces the train entry.

    Args:
        train_path: JSON file of the training dataset.
        dev_path: JSON file of the development dataset.
        combined_path: Output JSON file (overwritten).

    Returns:
        The merged ``ClinAISDataset``.
    """
    print("Combining train and dev datasets...")

    loaded = []
    for source_path in (train_path, dev_path):
        with open(source_path, encoding='utf-8') as source_file:
            loaded.append(ClinAISDataset(**json.load(source_file)))
    train_ds, dev_ds = loaded

    combined = ClinAISDataset(annotated_entries={})
    combined.annotated_entries.update(train_ds.annotated_entries)
    combined.annotated_entries.update(dev_ds.annotated_entries)

    with open(combined_path, 'w', encoding='utf-8') as combined_file:
        combined_file.write(combined.toJson())

    print(f"Combined dataset saved: {len(combined.annotated_entries)} entries")
    return combined

def create_augmented_dataset(original_path, translated_path, augmented_path):
    """Write a dataset holding the original entries plus the translated ones.

    Translated entries are added under ``<key>_T`` and their ``note_id`` gets
    the same ``_T`` suffix, so they do not collide with the originals.

    Args:
        original_path: JSON file of the original dataset.
        translated_path: JSON file of the translated dataset.
        augmented_path: Output JSON file (overwritten).

    Returns:
        The augmented ``ClinAISDataset``.
    """
    print("Creating augmented dataset...")

    with open(original_path, encoding='utf-8') as original_file:
        original_ds = ClinAISDataset(**json.load(original_file))

    with open(translated_path, encoding='utf-8') as translated_file:
        translated_ds = ClinAISDataset(**json.load(translated_file))

    augmented = ClinAISDataset(annotated_entries={})

    # Originals keep their keys.
    augmented.annotated_entries.update(original_ds.annotated_entries)

    # Translations are re-keyed and re-labelled with the '_T' suffix.
    suffix = '_T'
    for key, entry in translated_ds.annotated_entries.items():
        entry.note_id += suffix
        augmented.annotated_entries[key + suffix] = entry

    with open(augmented_path, 'w', encoding='utf-8') as augmented_file:
        augmented_file.write(augmented.toJson())

    print(f"Augmented dataset saved: {len(augmented.annotated_entries)} entries")
    return augmented

# -------------------------------
# 5) Quick Training Functions (Simplified)
# -------------------------------
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    Trainer,
    pipeline
)
import evaluate

def create_label_mappings():
    """Build the label <-> id mappings from ``ClinicalSections.list()``.

    Returns:
        ``(label2id, id2label)``: ids are the positions of the labels in the
        list, starting at 0.
    """
    label2id = {}
    id2label = {}
    for idx, label in enumerate(ClinicalSections.list()):
        label2id[label] = idx
        id2label[idx] = label
    return label2id, id2label

def prepare_dataset_simple(dataset_path, tokenizer, label2id):
    """Turn a ClinAIS dataset file into a tokenized dataset for token classification.

    Every gold boundary span becomes one word. A word is labelled with the most
    recent section boundary seen in its note; words before the first boundary
    are labelled ``PRESENT_ILLNESS``. After tokenization only the first
    sub-token of each word keeps the label; special tokens and the remaining
    sub-tokens get -100 (the conventional ignore index, presumably skipped by
    the loss). Notes are truncated to 512 tokens.

    Args:
        dataset_path: JSON file loadable as ``ClinAISDataset``.
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            result must provide ``word_ids()``.
        label2id: Mapping from section label to integer id.

    Returns:
        A ``DatasetDict`` with a single ``'train'`` split. Notes whose
        tokenization raises are reported and left out.
    """
    print(f"Preparing dataset from {dataset_path}")

    with open(dataset_path, encoding='utf-8') as dataset_file:
        ds = ClinAISDataset(**json.load(dataset_file))

    # Phase 1: per-note word lists and word-level label ids.
    all_spans = []
    all_labels = []
    for entry in ds.annotated_entries.values():
        note_spans = []
        note_labels = []
        active_label = None
        for annotation in entry.boundary_annotation.gold:
            note_spans.append(annotation.span)
            if annotation.boundary is not None:
                active_label = annotation.boundary
            note_labels.append(label2id[active_label or ClinicalSections.PRESENT_ILLNESS])
        all_spans.append(note_spans)
        all_labels.append(note_labels)

    def tokenize_and_align(spans, labels):
        """Tokenize one note and project its word labels onto the sub-tokens."""
        encoding = tokenizer(spans, truncation=True, is_split_into_words=True, max_length=512)

        token_labels = []
        last_word = None
        for word_id in encoding.word_ids():
            if word_id is None or word_id == last_word:
                token_labels.append(-100)
            else:
                token_labels.append(labels[word_id])
            last_word = word_id

        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'labels': token_labels
        }

    # Phase 2: tokenize note by note, skipping the ones that fail.
    processed_data = []
    for note_spans, note_labels in zip(all_spans, all_labels):
        try:
            processed_data.append(tokenize_and_align(note_spans, note_labels))
        except Exception as e:
            print(f"Error processing example: {e}")

    # Phase 3: wrap the records as a single-split dataset.
    records = pd.DataFrame(processed_data)
    return DatasetDict({'train': Dataset.from_pandas(records)})

def train_model_simple(dataset_path, output_dir, base_model="PlanTL-GOB-ES/bsc-bio-ehr-es"):
    """Fine-tune ``base_model`` for section labelling and save it to ``output_dir``.

    Args:
        dataset_path: Dataset file passed to ``prepare_dataset_simple``.
        output_dir: Directory for checkpoints, the final model and the tokenizer.
        base_model: Pretrained model id or path for both tokenizer and model.

    Returns:
        The ``Trainer`` after training has finished.
    """
    print("Starting model training...")

    label2id, id2label = create_label_mappings()

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForTokenClassification.from_pretrained(
        base_model,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    dataset = prepare_dataset_simple(dataset_path, tokenizer, label2id)

    # Batch size 4 with 4 accumulation steps, i.e. 16 examples per optimizer
    # step and device. Only the most recent per-epoch checkpoint is kept.
    hyperparameters = dict(
        output_dir=output_dir,
        learning_rate=3e-5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_ratio=0.1,
        save_strategy="epoch",
        logging_steps=100,
        report_to=[],
        dataloader_pin_memory=False,
        save_total_limit=1,
    )
    training_args = TrainingArguments(**hyperparameters)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    )

    print("Training started...")
    trainer.train()

    # Persist the fine-tuned weights and the tokenizer side by side.
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    print(f"Model saved to {output_dir}")
    return trainer

# -------------------------------
# 6) Main Execution (Simplified)
# -------------------------------
def main_simplified():
    """Train on train+dev without translation or augmentation.

    Mounts Google Drive, builds the combined train+dev file when it is not
    there yet, and fine-tunes the default base model on it.

    Returns:
        The directory the model was saved to.
    """
    from google.colab import drive
    drive.mount('/content/drive')

    base_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"
    output_dir = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"

    train_path, dev_path, test_path, combined_path = (
        os.path.join(base_path, file_name)
        for file_name in (
            "clinais.train.json",
            "clinais.dev.json",
            "clinais.test&background.blind.json",  # test file: not used in this function
            "clinais.train_dev_combined.json",
        )
    )

    print("=== Simplified ClinAIS Pipeline ===")

    # Reuse the combined file when a previous run already produced it.
    combined_exists = os.path.exists(combined_path)
    if not combined_exists:
        combine_train_dev_datasets(train_path, dev_path, combined_path)

    # Translation is skipped here because of its long processing time.
    os.makedirs(output_dir, exist_ok=True)
    train_model_simple(combined_path, output_dir)

    print("=== Training completed! ===")
    print(f"Model saved to: {output_dir}")

    return output_dir

# Run the simplified pipeline (train+dev only, no translation or augmentation)
# when this cell is executed. The returned model directory is kept in the
# module-level `output_dir`, presumably for use by later cells.
if __name__ == "__main__":
    output_dir = main_simplified()

fatal: destination path 'Sec-Identification-in-Spanish-Clinical-Notes' already exists and is not an empty directory.
Using device: cuda
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
=== Simplified ClinAIS Pipeline ===
Combining train and dev datasets...
Combined dataset saved: 908 entries
Starting model training...


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/bsc-bio-ehr-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataset from /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.train_dev_combined.json


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  trainer = Trainer(


Training started...


Step,Training Loss
100,0.8569


Model saved to /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple
=== Training completed! ===
Model saved to: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple


In [None]:
# ================================
# Testing the Trained Model
# ================================

import os
import json
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from pathlib import Path
import zipfile



# -------------------------------
# Prediction Processing Functions
# -------------------------------

class PredictionSection:
    """Mutable record describing one predicted section span.

    Attributes:
        entity_group: Predicted label of the span.
        score: Score reported for the span.
        word: Text attributed to the span.
        start: Start offset of the span.
        end: End offset of the span.
    """

    def __init__(self, entity_group, score, word, start, end):
        # Label and score first, then the covered text and its offsets.
        self.entity_group, self.score = entity_group, score
        self.word = word
        self.start, self.end = start, end

class PredictionPostProcessor:
    """Clean up a list of predicted sections in place.

    Two passes are applied by :meth:`process`:

    1. sections shorter than ``min_section_size`` whitespace-separated words
       are folded into a neighbouring section;
    2. consecutive sections carrying the same label are fused into one.

    The section objects must expose mutable ``entity_group``, ``word``,
    ``start`` and ``end`` attributes.
    """

    def __init__(self, prediction_sections, verbose=False):
        self.sections = prediction_sections
        # Sections with fewer words than this are considered too small to stand alone.
        self.min_section_size = 3
        self.punctuation_marks = [',', '.', ';', ':', ')', ']', '}', '!', '?']
        self.verbose = verbose

    def get_section_size(self, sec):
        """Return the number of whitespace-separated words in ``sec.word``."""
        return len(sec.word.strip().split())

    def merge_undersize_sections(self):
        """Fold undersized sections into a neighbour and drop them from the list.

        An undersized section is appended to the closest preceding section that
        was large enough. If no such section exists yet (the undersized section
        is at the start), it is prepended to the following section instead. A
        lone undersized section with no neighbour to absorb it is kept.
        """
        erase = []
        prev = None  # index of the last section that met the minimum size
        for i, sec in enumerate(self.sections):
            if self.get_section_size(sec) >= self.min_section_size:
                prev = i
                continue

            if prev is not None:
                if self.verbose:
                    print(f"Merging {i} into {prev}")
                self.sections[prev].word += sec.word
                self.sections[prev].end = sec.end
                erase.append(i)
            elif i < len(self.sections) - 1:
                if self.verbose:
                    print(f"Merging {i} into {i+1}")
                self.sections[i+1].word = sec.word + self.sections[i+1].word
                self.sections[i+1].start = sec.start
                # The section has been absorbed by its successor, so it must be
                # removed; otherwise its text and span would appear twice.
                erase.append(i)

        # Delete from the back so earlier indices stay valid.
        for idx in sorted(erase, reverse=True):
            del self.sections[idx]

    def merge_contiguous_equivalent_sections(self):
        """Fuse runs of consecutive sections that share the same label."""
        erase = []
        last = None  # first section of the current same-label run
        for i, sec in enumerate(self.sections):
            if last is None:
                last = sec
                continue
            if sec.entity_group == last.entity_group:
                last.word += sec.word
                last.end = sec.end
                erase.append(i)
            else:
                last = sec

        for idx in sorted(erase, reverse=True):
            del self.sections[idx]

    def process(self):
        """Run both merge passes and return the (mutated) section list."""
        self.merge_undersize_sections()
        self.merge_contiguous_equivalent_sections()
        return self.sections

def process_entry_predictions(entry, model_pipeline):
    """Fill ``entry``'s section and boundary predictions from the model output.

    The note text is run through ``model_pipeline``; the returned groups are
    post-processed with ``PredictionPostProcessor`` and written to
    ``entry.section_annotation.prediction``. The boundary predictions are
    rebuilt from the gold spans with no label, and then, for every
    post-processed section, the first still-unlabelled span whose start offset
    lies inside the section receives the section's label.

    If the pipeline call raises, the whole note is treated as a single
    ``PRESENT_ILLNESS`` section.
    """
    note_text = entry.note_text

    try:
        raw_predictions = model_pipeline(note_text)
    except Exception as e:
        print(f"Error making predictions: {e}")
        # Fallback: one section spanning the entire note.
        raw_predictions = [{
            'entity_group': 'PRESENT_ILLNESS',
            'score': 0.5,
            'word': note_text,
            'start': 0,
            'end': len(note_text)
        }]

    # Wrap the raw dicts so the post-processor can mutate them.
    candidates = [
        PredictionSection(
            entity_group=item['entity_group'],
            score=item.get('score', 0.0),
            word=item['word'],
            start=item['start'],
            end=item['end'],
        )
        for item in raw_predictions
    ]
    final_sections = PredictionPostProcessor(candidates, verbose=False).process()

    entry.section_annotation.prediction = [
        SectionAnnotation(
            segment=sec.word,
            label=sec.entity_group,
            start_offset=sec.start,
            end_offset=sec.end,
        )
        for sec in final_sections
    ]

    # Same spans as the gold annotation, every boundary initially unset.
    entry.boundary_annotation.prediction = [
        BoundaryAnnotation(
            span=gold_ba.span,
            boundary=None,
            start_offset=gold_ba.start_offset,
            end_offset=gold_ba.end_offset,
        )
        for gold_ba in entry.boundary_annotation.gold
    ]

    # Each section labels only the first free span that starts inside it.
    for sec in final_sections:
        opener = next(
            (
                ba for ba in entry.boundary_annotation.prediction
                if sec.start <= ba.start_offset < sec.end and ba.boundary is None
            ),
            None,
        )
        if opener is not None:
            opener.boundary = sec.entity_group

def test_model_on_dataset(model_path, dataset_path, output_path):
    """Test the trained model on a dataset"""

    print(f"Testing model from {model_path}")
    print(f"Dataset: {dataset_path}")

    # Load the trained model
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForTokenClassification.from_pretrained(model_path)

        # Create pipeline
        pipe = pipeline(
            "token-classification",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
            device=0 if torch.cuda.is_available() else -1
        )

        print("Model loaded successfully!")

    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    # Load dataset
    print("Loading dataset...")
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = ClinAISDataset(**json.load(f))

    print(f"Dataset loaded: {len(dataset.annotated_entries)} entries")

    # Process each entry
    print("Making predictions...")
    from tqdm import tqdm

    for entry_id, entry in tqdm(dataset.annotated_entries.items()):
        try:
            process_entry_predictions(entry, pipe)
        except Exception as e:
            print(f"Error processing entry {entry_id}: {e}")
            # Create fallback predictions
            entry.section_annotation.prediction = []
            entry.boundary_annotation.prediction = []
            for gold_ba in entry.boundary_annotation.gold:
                pred_ba = BoundaryAnnotation(
                    span=gold_ba.span,
                    boundary=None,
                    start_offset=gold_ba.start_offset,
                    end_offset=gold_ba.end_offset
                )
                entry.boundary_annotation.prediction.append(pred_ba)

    # Save predictions
    print(f"Saving predictions to {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(dataset.toJson())

    print("Predictions saved successfully!")
    return dataset

def evaluate_predictions(predictions_path, output_eval_path):
    """Score a predictions file with ``score_predictions`` and print the result.

    Args:
        predictions_path: Predictions JSON passed to the scorer.
        output_eval_path: File the scorer writes its results to; it is read
            back and returned.

    Returns:
        The parsed results, or ``None`` if anything in the process raised.
    """
    print(f"Evaluating predictions from {predictions_path}")

    try:
        score_predictions(
            prediction_file=Path(predictions_path),
            output_result_file=Path(output_eval_path)
        )

        # Read back what the scorer wrote.
        with open(output_eval_path, 'r', encoding='utf-8') as results_file:
            results = json.load(results_file)

        print("\n=== EVALUATION RESULTS ===")
        for metric in results:
            print(f"{metric}: {results[metric]}")

        headline = "Weighted B2"
        if headline in results:
            print(f"\n🎯 FINAL SCORE - Weighted B2: {results[headline]:.4f}")

        return results

    except Exception as e:
        print(f"Error during evaluation: {e}")
        return None

def create_submission_zip(predictions_json_path, zip_output_path):
    """Pack the predictions JSON into a compressed ZIP for CodaLab.

    The file is stored under its base name (no directories) and compressed
    with DEFLATE, matching ``create_official_submission``.

    Args:
        predictions_json_path: Predictions file to archive.
        zip_output_path: Destination of the ZIP archive.

    Returns:
        ``zip_output_path``.

    Raises:
        FileNotFoundError: If the predictions file does not exist. This is
            checked before the archive is opened so no empty ZIP is left behind.
    """
    print(f"Creating submission ZIP: {zip_output_path}")

    if not os.path.isfile(predictions_json_path):
        raise FileNotFoundError(f"Predictions file not found: {predictions_json_path}")

    with zipfile.ZipFile(zip_output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(predictions_json_path, os.path.basename(predictions_json_path))

    print(f"Submission ZIP created: {zip_output_path}")
    return zip_output_path

# -------------------------------
# Main Testing Functions
# -------------------------------

def test_on_dev_set():
    """Predict on the development set and score the result.

    Returns:
        The evaluation results, or ``None`` when prediction or evaluation
        failed.
    """
    # Paths
    model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"
    dev_data_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.dev.json"

    # Output paths
    dev_predictions_path = os.path.join(model_path, "dev_predictions.json")
    dev_evaluation_path = os.path.join(model_path, "dev_evaluation.json")

    print("=== Testing Model on Development Set ===")

    # test_model_on_dataset returns None when the model cannot be loaded; in
    # that case no fresh predictions were written, so evaluating would either
    # fail or silently score a stale file from a previous run.
    dataset = test_model_on_dataset(model_path, dev_data_path, dev_predictions_path)
    if dataset is None:
        print("❌ Prediction failed; skipping evaluation.")
        return None

    return evaluate_predictions(dev_predictions_path, dev_evaluation_path)

def test_on_test_set():
    """Predict on the blind test set and package the result for submission.

    Returns:
        ``(test_predictions_path, submission_zip_path)``.

    Raises:
        RuntimeError: If prediction failed, so that a missing or stale
            predictions file is never packaged as a submission.
    """
    # Paths
    model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"
    test_data_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.test&background.blind.json"

    # Output paths
    test_predictions_path = os.path.join(model_path, "test_predictions.json")
    submission_zip_path = os.path.join(model_path, "clinais_submission.zip")

    print("=== Testing Model on Test Set ===")

    # test_model_on_dataset returns None when the model cannot be loaded.
    dataset = test_model_on_dataset(model_path, test_data_path, test_predictions_path)
    if dataset is None:
        raise RuntimeError(
            "Test-set prediction failed (model could not be loaded); "
            "refusing to package a missing or stale predictions file."
        )

    # Create submission ZIP
    create_submission_zip(test_predictions_path, submission_zip_path)

    print(f"\n🎉 Test predictions ready!")
    print(f"Predictions: {test_predictions_path}")
    print(f"Submission ZIP: {submission_zip_path}")

    return test_predictions_path, submission_zip_path

def quick_validation():
    """Smoke-test the saved model on one short sentence.

    Returns:
        ``True`` if the model loads and produces a prediction, ``False`` if
        any step raises.
    """
    model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"
    sample_text = "Paciente de 65 años con dolor abdominal. Se realizó ecografía. El tratamiento fue efectivo."

    print("=== Quick Model Validation ===")

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
        loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)

        classifier = pipeline(
            "token-classification",
            model=loaded_model,
            tokenizer=loaded_tokenizer,
            aggregation_strategy="simple",
        )
        groups = classifier(sample_text)

        print("✅ Model working! Sample prediction:")
        for group in groups:
            print(f"  {group['entity_group']}: '{group['word']}'")
    except Exception as e:
        print(f"❌ Model validation failed: {e}")
        return False

    return True

# -------------------------------
# Run Tests
# -------------------------------

print("🚀 Starting Model Testing...")

# Step 1: make sure the model loads and predicts before doing anything costly.
print("\n--- Step 1: Quick Validation ---")
if quick_validation():
    print("✅ Model validation passed!")

    # Step 2: development set (predictions + evaluation).
    print("\n--- Step 2: Testing on Development Set ---")
    dev_results = test_on_dev_set()

    if not dev_results:
        print("❌ Development set testing failed.")
    else:
        print("✅ Development set testing completed!")

        # Step 3: blind test set (predictions + submission archive).
        print("\n--- Step 3: Testing on Test Set ---")
        test_predictions_path, submission_zip_path = test_on_test_set()

        print("\n🎉 ALL TESTING COMPLETED!")
        print(f"📁 Your submission file: {submission_zip_path}")
        print("📤 Ready to upload to CodaLab!")
else:
    print("Model validation failed. Please check the model.")

🚀 Starting Model Testing...

--- Step 1: Quick Validation ---
=== Quick Model Validation ===


Device set to use cuda:0


✅ Model working! Sample prediction:
  PRESENT_ILLNESS: ' Paciente de 65 años con dolor abdominal.'
  EXPLORATION: ' Se realizó ecografía'
  TREATMENT: '. El tratamiento fue efectivo.'
✅ Model validation passed!

--- Step 2: Testing on Development Set ---
=== Testing Model on Development Set ===
Testing model from /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple
Dataset: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.dev.json


Device set to use cuda:0


Model loaded successfully!
Loading dataset...
Dataset loaded: 127 entries
Making predictions...


100%|██████████| 127/127 [00:09<00:00, 12.75it/s]


Saving predictions to /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_predictions.json
Predictions saved successfully!
Evaluating predictions from /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_predictions.json
Loading predictions from /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_predictions.json.
Loading references from the prediction file.
Evaluating all predictions


100%|██████████| 127/127 [00:02<00:00, 49.62it/s] 



=== EVALUATION RESULTS ===
Weighted B2: 0.7755863836634077
Scores per file: {'S0004-06142005000200009-3': {'B2': 0.8478647537494018, 'Statistics': {'matches': ['PRESENT_ILLNESS', 'EXPLORATION'], 'additions': [], 'deletions': ['EXPLORATION'], 'substitutions': [], 'transpositions': [{'start_offset': 7, 'end_offset': 18, 'boundary': 'PAST_MEDICAL_HISTORY'}], 'count_edits': 0.7266967713770675, 'weighted_transpositions': 0.223350345914925}}, 'S0004-06142005001000015-1': {'B2': 0.8952574126822433, 'Statistics': {'matches': ['PRESENT_ILLNESS', 'PAST_MEDICAL_HISTORY', 'PRESENT_ILLNESS', 'EXPLORATION', 'TREATMENT', 'EVOLUTION', 'TREATMENT', 'EVOLUTION'], 'additions': ['EXPLORATION', 'EVOLUTION'], 'deletions': [], 'substitutions': [], 'transpositions': [], 'count_edits': 1.0474258731775667, 'weighted_transpositions': 0}}, 'S0004-06142006000100012-1': {'B2': 0.8690717658528042, 'Statistics': {'matches': ['PRESENT_ILLNESS', 'PAST_MEDICAL_HISTORY', 'PRESENT_ILLNESS', 'EXPLORATION', 'TREATMENT', 'E

Device set to use cuda:0


Model loaded successfully!
Loading dataset...
Dataset loaded: 2843 entries
Making predictions...


100%|██████████| 2843/2843 [01:55<00:00, 24.71it/s]


Saving predictions to /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/test_predictions.json
Predictions saved successfully!
Creating submission ZIP: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/clinais_submission.zip
Submission ZIP created: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/clinais_submission.zip

🎉 Test predictions ready!
Predictions: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/test_predictions.json
Submission ZIP: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/clinais_submission.zip

🎉 ALL TESTING COMPLETED!
📁 Your submission file: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/clinais_submission.zip
📤 Ready to upload to CodaLab!


In [None]:
# ================================
# Official ClinAIS Testing Script
# Following Official Evaluation Requirements
# ================================

import os
import json
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from pathlib import Path
import zipfile
import sys
from tqdm import tqdm


# Load your pydantic models (from the starting kit structure)
from typing import List, Any
from pydantic import BaseModel
from enum import Enum

class ClinicalSections(str, Enum):
    """Section labels used by the boundary and section annotations.

    Members are also ``str`` instances, so each one compares equal to its
    plain-string value.
    """
    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    DERIVED_FROM_TO = "DERIVED_FROM/TO"  # value contains a slash, unlike the member name
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

class BoundaryAnnotation(BaseModel):
    """One span of a note plus the section label assigned to it, if any."""
    span: str  # presumably the text of the span; verify against the dataset files
    boundary: ClinicalSections | None  # section label, or None when the span carries no boundary
    start_offset: int  # start position of the span; compared against pipeline 'start'/'end' offsets in this file
    end_offset: int  # end position of the span

class BoundaryAnnotations(BaseModel):
    """Gold and predicted boundary annotations of one note."""
    gold: List[BoundaryAnnotation] = []  # reference spans; the prediction code copies span/offsets from these
    prediction: List[BoundaryAnnotation] = []  # filled in by the prediction functions

class SectionAnnotation(BaseModel):
    """One labelled section of a note."""
    segment: str  # text of the section
    label: ClinicalSections  # section label (required, unlike BoundaryAnnotation.boundary)
    start_offset: int  # start position of the section
    end_offset: int  # end position of the section

class SectionAnnotations(BaseModel):
    """Gold and predicted section annotations of one note."""
    gold: List[SectionAnnotation] = []  # reference sections
    prediction: List[SectionAnnotation] = []  # predicted sections

class Entry(BaseModel):
    """A single clinical note with its section and boundary annotations."""
    note_id: str
    note_text: str
    section_annotation: SectionAnnotations = SectionAnnotations()
    boundary_annotation: BoundaryAnnotations = BoundaryAnnotations()

    def toJson(self):
        """Return this entry as a JSON string (non-ASCII characters kept as-is)."""
        def attributes_of(obj):
            # Anything json cannot encode natively is replaced by its attribute dict.
            return obj.__dict__

        return json.dumps(self, default=attributes_of, ensure_ascii=False)

class ClinAISDataset(BaseModel):
    """Container for all entries of a dataset file, keyed by string id."""
    annotated_entries: dict[str, Entry]
    scores: dict[str, Any] = {}

    def toJson(self):
        """Return the whole dataset as a JSON string (non-ASCII characters kept as-is)."""
        def attributes_of(obj):
            # Anything json cannot encode natively is replaced by its attribute dict.
            return obj.__dict__

        return json.dumps(self, default=attributes_of, ensure_ascii=False)

# -------------------------------
# Official Prediction Method
# -------------------------------

def _reset_boundary_predictions(entry):
    """Replace ``entry``'s boundary predictions with unlabelled copies of the gold spans."""
    entry.boundary_annotation.prediction = []
    for gold_boundary in entry.boundary_annotation.gold:
        entry.boundary_annotation.prediction.append(
            BoundaryAnnotation(
                span=gold_boundary.span,
                boundary=None,
                start_offset=gold_boundary.start_offset,
                end_offset=gold_boundary.end_offset,
            )
        )


def predict_boundaries_official(model_path: str, dataset_path: str, output_path: str):
    """Fill only ``boundary_annotation.prediction`` for every entry of a dataset.

    The predicted spans are copies of the gold spans (same text and offsets);
    only their ``boundary`` label comes from the model.

    Args:
        model_path: Directory holding the trained model and tokenizer.
        dataset_path: JSON file to load into a ``ClinAISDataset``.
        output_path: File that receives the dataset (with predictions) as JSON.

    Returns:
        ``True`` once the predictions are written, ``False`` if the model
        could not be loaded.
    """
    print(f"🎯 Making Official Predictions")
    print(f"Model: {model_path}")
    print(f"Dataset: {dataset_path}")

    print("Loading model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForTokenClassification.from_pretrained(model_path)

        pipe = pipeline(
            "token-classification",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
            device=0 if torch.cuda.is_available() else -1
        )
        print("✅ Model loaded successfully!")

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return False

    print("Loading dataset...")
    with open(dataset_path, 'r', encoding='utf-8') as dataset_file:
        dataset = ClinAISDataset(**json.load(dataset_file))

    print(f"📄 Dataset loaded: {len(dataset.annotated_entries)} entries")

    print("🔮 Making predictions...")

    def first_covering(predictions, offset):
        """Return the first prediction whose [start, end) range contains ``offset``."""
        return next((p for p in predictions if p['start'] <= offset < p['end']), None)

    for entry_id, entry in tqdm(dataset.annotated_entries.items(), desc="Processing entries"):
        try:
            full_text = entry.note_text

            # A failing pipeline call is treated like an empty prediction list.
            try:
                raw_predictions = pipe(full_text)
            except Exception as e:
                print(f"⚠️  Prediction error for {entry_id}: {e}")
                raw_predictions = []

            _reset_boundary_predictions(entry)
            boundaries = entry.boundary_annotation.prediction

            if not raw_predictions:
                # Fallback: the note opens with PRESENT_ILLNESS.
                if boundaries:
                    boundaries[0].boundary = ClinicalSections.PRESENT_ILLNESS
                continue

            # Pass 1: each predicted group labels the first still-unlabelled
            # span that starts inside it.
            for pred in raw_predictions:
                group_start, group_end = pred['start'], pred['end']
                opener = next(
                    (
                        b for b in boundaries
                        if group_start <= b.start_offset < group_end and b.boundary is None
                    ),
                    None,
                )
                if opener is not None:
                    opener.boundary = pred['entity_group']

            # Pass 2: an unlabelled span (other than the very first one) gets a
            # label when it and its predecessor are covered by groups with
            # different labels.
            for i, boundary in enumerate(boundaries):
                if boundary.boundary is not None or i == 0:
                    continue

                here = first_covering(raw_predictions, boundary.start_offset)
                if here is None:
                    continue

                before = first_covering(raw_predictions, boundaries[i - 1].start_offset)
                if before is not None and before['entity_group'] != here['entity_group']:
                    boundary.boundary = here['entity_group']

        except Exception as e:
            print(f"❌ Error processing entry {entry_id}: {e}")
            # Fallback: keep the span structure but leave every boundary unset.
            _reset_boundary_predictions(entry)

    print(f"💾 Saving predictions to {output_path}")
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(dataset.toJson())

    print("✅ Predictions saved!")
    return True

def evaluate_with_official_script(predictions_path: str, results_path: str):
    """Score a predictions file with ``score_predictions`` and print a summary.

    Args:
        predictions_path: Predictions JSON passed to the scorer.
        results_path: File the scorer writes its results to; it is read back
            and returned.

    Returns:
        The parsed results, or ``None`` if anything in the process raised.
    """
    print(f"📊 Evaluating with official script...")
    print(f"Predictions: {predictions_path}")
    print(f"Results: {results_path}")

    try:
        score_predictions(
            prediction_file=Path(predictions_path),
            output_result_file=Path(results_path)
        )

        # Read back what the scorer wrote.
        with open(results_path, 'r', encoding='utf-8') as results_file:
            results = json.load(results_file)

        print("\n🎯 === OFFICIAL EVALUATION RESULTS ===")
        for metric, value in results.items():
            shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
            print(f"{metric}: {shown}")

        if "Weighted B2" in results:
            score = results["Weighted B2"]
            print(f"\n🏆 FINAL SCORE: {score:.4f}")

            # First threshold the score exceeds decides the remark.
            remarks = (
                (0.8, "🌟 Excellent performance!"),
                (0.6, "👍 Good performance!"),
                (0.4, "📈 Decent performance, room for improvement"),
            )
            for threshold, remark in remarks:
                if score > threshold:
                    print(remark)
                    break
            else:
                print("📉 Needs improvement")

        return results

    except Exception as e:
        print(f"❌ Evaluation error: {e}")
        return None

def create_official_submission(predictions_path: str, submission_zip_path: str):
    """Write a DEFLATE-compressed ZIP holding the predictions file.

    The file is stored under its base name. The archive size is printed in MB.

    Returns:
        ``submission_zip_path``.
    """
    print(f"📦 Creating submission ZIP...")

    with zipfile.ZipFile(submission_zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(predictions_path, arcname=os.path.basename(predictions_path))

    size_in_mb = os.path.getsize(submission_zip_path) / (1024 * 1024)
    print(f"✅ Submission created: {submission_zip_path}")
    print(f"📏 File size: {size_in_mb:.2f} MB")

    return submission_zip_path

# -------------------------------
# Main Testing Functions
# -------------------------------

def test_dev_set_official():
    """Predict on the development set and score the result.

    Returns:
        The evaluation results, or ``None`` if prediction or evaluation failed.
    """
    print("=== 🧪 TESTING ON DEVELOPMENT SET ===")

    model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"
    dev_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.dev.json"

    # Both outputs are written next to the model.
    dev_predictions = os.path.join(model_path, "dev_predictions_official.json")
    dev_results = os.path.join(model_path, "dev_evaluation_official.json")

    # Step 1: predictions. Without them there is nothing to evaluate.
    if not predict_boundaries_official(model_path, dev_path, dev_predictions):
        print("❌ Prediction failed")
        return None

    # Step 2: evaluation.
    return evaluate_with_official_script(dev_predictions, dev_results)

def test_full_test_set_official():
    """Predict on the blind test+background file and build the submission ZIP.

    Returns:
        ``(predictions_path, zip_path)``, or ``(None, None)`` if prediction
        failed.
    """
    print("=== 🎯 TESTING ON FULL TEST SET ===")
    print("(This includes both test and background cases)")

    model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"
    test_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.test&background.blind.json"

    # Both outputs are written next to the model.
    test_predictions = os.path.join(model_path, "test_predictions_official.json")
    submission_zip = os.path.join(model_path, "clinais_official_submission.zip")

    # Step 1: predictions.
    print("🔮 Making predictions on full test set...")
    if not predict_boundaries_official(model_path, test_path, test_predictions):
        print("❌ Prediction failed")
        return None, None

    # Step 2: archive.
    zip_path = create_official_submission(test_predictions, submission_zip)

    # Re-read the saved file to report how many entries it holds.
    print("\n🔍 Validating submission...")
    with open(test_predictions, 'r', encoding='utf-8') as saved:
        entry_count = len(json.load(saved).get('annotated_entries', {}))
        print(f"✅ Submission contains {entry_count} entries")

    return test_predictions, zip_path

def quick_model_check(model_path: str = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"):
    """Smoke-test a saved model on one short sentence.

    Args:
        model_path: Directory holding the trained model and tokenizer.
            Defaults to the model trained earlier in this notebook, so
            existing zero-argument calls keep working; callers that pass a
            path explicitly are supported as well.

    Returns:
        ``True`` if the model loads and produces a prediction, ``False`` if
        any step raises.
    """
    print("=== 🔧 QUICK MODEL CHECK ===")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForTokenClassification.from_pretrained(model_path)

        pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

        test_text = "Paciente con dolor abdominal. Exploración física normal. Tratamiento con analgésicos."

        result = pipe(test_text)

        print("✅ Model working! Sample prediction:")
        for i, pred in enumerate(result[:5], start=1):  # show at most the first 5 groups
            print(f"  {i}. {pred['entity_group']}: '{pred['word']}' (score: {pred['score']:.3f})")

        return True

    except Exception as e:
        print(f"❌ Model check failed: {e}")
        return False

# -------------------------------
# RUN THE TESTS
# -------------------------------

print("🚀 STARTING OFFICIAL CLINAIS TESTING")
print("="*50)

# Step 1: make sure the model loads and predicts before doing anything costly.
print("\n--- Step 1: Model Validation ---")
if quick_model_check():
    print("✅ Model validation passed!")

    # Step 2: development set (predictions + evaluation).
    print("\n--- Step 2: Development Set Testing ---")
    dev_results = test_dev_set_official()

    if not dev_results:
        print("❌ Development testing failed")
    else:
        print("✅ Development testing completed!")

        # Step 3: blind test set (predictions + submission archive).
        print("\n--- Step 3: Final Test Set Prediction ---")
        test_pred_path, submission_zip = test_full_test_set_official()

        if not submission_zip:
            print("❌ Final submission creation failed")
        else:
            print("\n🎉 === ALL TESTING COMPLETED! ===")
            print(f"📁 Submission file: {submission_zip}")
            print(f"📤 Ready for CodaLab upload!")
            print(f"💡 This file contains predictions for ALL {2713 + 130} cases")
            print("   (130 test cases + 2713 background cases)")

            # The dev score is the only available estimate for the blind set.
            if dev_results and "Weighted B2" in dev_results:
                print(f"🎯 Expected performance: ~{dev_results['Weighted B2']:.4f}")
else:
    print("❌ Model validation failed. Cannot proceed.")

print("\n" + "="*50)
print("🏁 Testing complete!")

🚀 STARTING OFFICIAL CLINAIS TESTING

--- Step 1: Model Validation ---
=== 🔧 QUICK MODEL CHECK ===


Device set to use cuda:0


✅ Model working! Sample prediction:
  1. PRESENT_ILLNESS: ' Paciente con dolor abdominal' (score: 0.668)
  2. EXPLORATION: '. Exploración física normal.' (score: 0.724)
  3. TREATMENT: ' Tratamiento con analgésicos' (score: 0.411)
  4. EXPLORATION: '.' (score: 0.317)
✅ Model validation passed!

--- Step 2: Development Set Testing ---
=== 🧪 TESTING ON DEVELOPMENT SET ===
🎯 Making Official Predictions
Model: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple
Dataset: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.dev.json
Loading model...


Device set to use cuda:0


✅ Model loaded successfully!
Loading dataset...
📄 Dataset loaded: 127 entries
🔮 Making predictions...


Processing entries: 100%|██████████| 127/127 [00:06<00:00, 20.52it/s]


💾 Saving predictions to /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_predictions_official.json
✅ Predictions saved!
📊 Evaluating with official script...
Predictions: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_predictions_official.json
Results: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_evaluation_official.json
Loading predictions from /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/dev_predictions_official.json.
Loading references from the prediction file.
Evaluating all predictions


100%|██████████| 127/127 [00:00<00:00, 133.03it/s]



🎯 === OFFICIAL EVALUATION RESULTS ===
Weighted B2: 0.7225
Scores per file: {'S0004-06142005000200009-3': {'B2': 0.8478647537494018, 'Statistics': {'matches': ['PRESENT_ILLNESS', 'EXPLORATION'], 'additions': [], 'deletions': ['EXPLORATION'], 'substitutions': [], 'transpositions': [{'start_offset': 7, 'end_offset': 18, 'boundary': 'PAST_MEDICAL_HISTORY'}], 'count_edits': 0.7266967713770675, 'weighted_transpositions': 0.223350345914925}}, 'S0004-06142005001000015-1': {'B2': 0.8952574126822433, 'Statistics': {'matches': ['PRESENT_ILLNESS', 'PAST_MEDICAL_HISTORY', 'PRESENT_ILLNESS', 'EXPLORATION', 'TREATMENT', 'EVOLUTION', 'TREATMENT', 'EVOLUTION'], 'additions': ['EXPLORATION', 'EVOLUTION'], 'deletions': [], 'substitutions': [], 'transpositions': [], 'count_edits': 1.0474258731775667, 'weighted_transpositions': 0}}, 'S0004-06142006000100012-1': {'B2': 0.5016732127310712, 'Statistics': {'matches': ['PRESENT_ILLNESS', 'PAST_MEDICAL_HISTORY', 'PRESENT_ILLNESS', 'EXPLORATION', 'TREATMENT', 'EV

Device set to use cuda:0


✅ Model loaded successfully!
Loading dataset...
📄 Dataset loaded: 2843 entries
🔮 Making predictions...


Processing entries: 100%|██████████| 2843/2843 [02:28<00:00, 19.18it/s]


💾 Saving predictions to /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/test_predictions_official.json
✅ Predictions saved!
📦 Creating submission ZIP...
✅ Submission created: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/clinais_official_submission.zip
📏 File size: 20.89 MB

🔍 Validating submission...
✅ Submission contains 2843 entries

🎉 === ALL TESTING COMPLETED! ===
📁 Submission file: /content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple/clinais_official_submission.zip
📤 Ready for CodaLab upload!
💡 This file contains predictions for ALL 2843 cases
   (130 test cases + 2713 background cases)
🎯 Expected performance: ~0.7225

🏁 Testing complete!


In [None]:
## FINAL TESTING (NO)

In [None]:
# ================================
# Complete ClinAIS Testing Strategy
# ================================

import os
import json
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from pathlib import Path
import zipfile
import sys
from tqdm import tqdm


# NOTE: this cell relies on names defined in earlier cells: the data model classes
# (ClinicalSections, BoundaryAnnotation, ClinAISDataset, ...) and score_predictions.
# Run those cells first.

def run_complete_testing():
    """Run the full ClinAIS workflow: smoke-test the model, score it on dev, build the submission.

    Steps:
        1. ``quick_model_check`` verifies that the saved model loads and runs.
        2. Dev-set predictions are scored with ``evaluate_with_official_script``;
           the "Weighted B2" entry is reported as the expected test performance.
        3. Predictions for the blind test+background file are written and zipped
           with ``create_official_submission``.

    A failed dev *evaluation* is not fatal: the submission is still produced and
    the summary reports the development score as unavailable.

    Returns:
        The path of the submission ZIP on success, or ``None`` when the model
        check, the dev prediction, or the test prediction fails.
    """

    model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-ner-clinais-simple"

    # Dataset paths
    dev_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.dev.json"
    test_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.test&background.blind.json"

    print("🎯 COMPLETE CLINAIS TESTING STRATEGY")
    print("="*50)

    # STEP 1: Quick model validation
    print("\n📋 Step 1: Model Validation")
    if not quick_model_check(model_path):
        print("❌ Model failed validation")
        return None

    # STEP 2: Test on dev set (for performance estimation)
    print("\n📊 Step 2: Development Set Testing (Performance Check)")
    print("Purpose: Estimate how well your model will perform")

    dev_predictions = os.path.join(model_path, "dev_predictions.json")
    dev_results = os.path.join(model_path, "dev_evaluation.json")

    # Stays None when the evaluation does not yield a "Weighted B2" value, so the
    # final summary can be printed without hitting an unbound variable.
    dev_score = None

    # Predict on dev
    if not predict_boundaries_official(model_path, dev_path, dev_predictions):
        print("❌ Dev prediction failed")
        return None

    # Evaluate dev
    results = evaluate_with_official_script(dev_predictions, dev_results)
    if results and "Weighted B2" in results:
        dev_score = results["Weighted B2"]
        print(f"🎯 DEV SET SCORE: {dev_score:.4f}")
        print("📈 This is your expected performance on the test set")
    else:
        print("⚠️  Dev evaluation failed")

    # STEP 3: Final test set prediction (for submission)
    print("\n🎯 Step 3: Final Test Set Prediction (Submission)")
    print("Purpose: Create submission file for CodaLab")

    # Check test set size
    with open(test_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)
    test_entries = len(test_data['annotated_entries'])
    print(f"📄 Test set contains: {test_entries} entries")
    print("   (130 test cases + 2,713 background cases)")

    # Predict on test set
    test_predictions = os.path.join(model_path, "final_test_predictions.json")
    submission_zip = os.path.join(model_path, "clinais_final_submission.zip")

    print("🔮 Making final predictions...")
    if not predict_boundaries_official(model_path, test_path, test_predictions):
        print("❌ Final test prediction failed")
        return None

    # Create submission ZIP
    create_official_submission(test_predictions, submission_zip)

    print("\n🎉 TESTING COMPLETED!")
    print("="*50)
    if dev_score is not None:
        print(f"✅ Development score: {dev_score:.4f}")
    else:
        print("⚠️  Development score: unavailable (dev evaluation failed)")
    print(f"📁 Submission file: {submission_zip}")
    print("📤 Ready for CodaLab!")
    print(f"💡 File contains {test_entries} predictions")

    # Final instructions
    print("\n📋 NEXT STEPS:")
    print("1. Upload the ZIP file to CodaLab")
    print("2. You have 5 submissions maximum for test phase")
    print("3. 5 submissions per day limit")
    if dev_score is not None:
        print(f"4. Expected score: ~{dev_score:.4f} (based on dev set)")
    else:
        print("4. Expected score: unavailable (dev evaluation failed)")

    return submission_zip

def quick_model_check(model_path):
    """Smoke-test the saved model by tagging one short Spanish sentence.

    Args:
        model_path: Directory (or identifier) passed to ``from_pretrained``.

    Returns:
        True when tokenizer, model and pipeline load and a prediction call
        succeeds; False (after printing the error) on any exception.
    """
    sample_note = "Paciente con dolor abdominal. Exploración normal."

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
        loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)
        tagger = pipeline(
            "token-classification",
            model=loaded_model,
            tokenizer=loaded_tokenizer,
            aggregation_strategy="simple",
        )
        detected = tagger(sample_note)
        print(f"✅ Model works! Found {len(detected)} predictions")
    except Exception as err:
        print(f"❌ Model error: {err}")
        return False

    return True

def _blank_boundary_predictions(gold_boundaries):
    """Return one prediction per gold token with the same span/offsets and no boundary label."""
    return [
        BoundaryAnnotation(
            span=gold_boundary.span,
            boundary=None,  # Start with no boundary
            start_offset=gold_boundary.start_offset,
            end_offset=gold_boundary.end_offset
        )
        for gold_boundary in gold_boundaries
    ]


def predict_boundaries_official(model_path, dataset_path, output_path):
    """Predict section boundaries for every entry of a dataset file and save the result.

    For each entry the token-classification pipeline is run on ``note_text``.
    The prediction list mirrors the entry's gold token list (same spans and
    offsets). For each predicted entity group, in order of its start offset,
    the first still-unlabelled token whose ``start_offset`` falls inside the
    group's ``[start, end)`` range receives the group's label. If the first
    token ends up unlabelled it is set to ``ClinicalSections.PRESENT_ILLNESS``.
    When processing an entry raises, that entry gets an all-``None`` prediction
    list and processing continues with the next entry.

    Args:
        model_path: Directory passed to ``from_pretrained`` for tokenizer and model.
        dataset_path: JSON file, either an object with an ``annotated_entries``
            key or the entries mapping itself.
        output_path: File the dataset (including predictions) is written to.

    Returns:
        False if the model cannot be loaded, True once predictions are saved.
        Errors while reading the dataset or writing the output are not caught.
    """

    print(f"🔮 Making predictions...")
    print(f"Input: {dataset_path}")
    print(f"Output: {output_path}")

    # Load model
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForTokenClassification.from_pretrained(model_path)
        pipe = pipeline("token-classification", model=model, tokenizer=tokenizer,
                       aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)
    except Exception as e:
        print(f"❌ Model loading error: {e}")
        return False

    # Load dataset
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Convert to our format if needed
    if 'annotated_entries' in data:
        dataset = ClinAISDataset(**data)
    else:
        dataset = ClinAISDataset(annotated_entries=data)

    print(f"📄 Processing {len(dataset.annotated_entries)} entries...")

    # Process each entry
    for entry_id, entry in tqdm(dataset.annotated_entries.items()):
        try:
            # Get predictions from model
            full_text = entry.note_text
            raw_predictions = pipe(full_text)

            # Initialize boundary predictions (copy structure from gold)
            entry.boundary_annotation.prediction = _blank_boundary_predictions(
                entry.boundary_annotation.gold
            )

            # Map model predictions to boundaries
            if raw_predictions:
                # Sort predictions by start position
                sorted_preds = sorted(raw_predictions, key=lambda x: x['start'])

                for pred in sorted_preds:
                    section_label = pred['entity_group']
                    pred_start = pred['start']
                    pred_end = pred['end']

                    # Find first boundary token in this prediction span
                    for boundary in entry.boundary_annotation.prediction:
                        if (boundary.start_offset >= pred_start and
                            boundary.start_offset < pred_end and
                            boundary.boundary is None):
                            boundary.boundary = section_label
                            break

            # Ensure at least first token has a boundary
            if entry.boundary_annotation.prediction and entry.boundary_annotation.prediction[0].boundary is None:
                entry.boundary_annotation.prediction[0].boundary = ClinicalSections.PRESENT_ILLNESS

        except Exception as e:
            print(f"⚠️  Error processing {entry_id}: {e}")
            # Create empty predictions but maintain structure
            entry.boundary_annotation.prediction = _blank_boundary_predictions(
                entry.boundary_annotation.gold
            )

    # Save predictions
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(dataset.toJson())

    print("✅ Predictions saved!")
    return True

def evaluate_with_official_script(predictions_path, results_path):
    """Score a predictions file with ``score_predictions`` and echo the metrics.

    Args:
        predictions_path: JSON file holding the predictions to score.
        results_path: File ``score_predictions`` writes its results to; it is
            read back and returned.

    Returns:
        The parsed results mapping, or None (after printing the error) if
        scoring or reading the results raises.
    """
    try:
        score_predictions(Path(predictions_path), Path(results_path))

        with open(results_path, 'r', encoding='utf-8') as handle:
            metrics = json.load(handle)

        print("\n📊 EVALUATION RESULTS:")
        for name, score in metrics.items():
            # Floats get four decimals; everything else is shown as-is.
            rendered = f"{score:.4f}" if isinstance(score, float) else f"{score}"
            print(f"  {name}: {rendered}")

        return metrics
    except Exception as err:
        print(f"❌ Evaluation error: {err}")
        return None

def create_official_submission(predictions_path, zip_path):
    """Pack the predictions file into a deflate-compressed ZIP and report its size.

    The file is stored under its base name only (no directories inside the archive).

    Args:
        predictions_path: File to add to the archive.
        zip_path: Destination archive; overwritten if it already exists.
    """
    member_name = os.path.basename(predictions_path)

    with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(predictions_path, arcname=member_name)

    bytes_per_mb = 1024 * 1024
    size_mb = os.stat(zip_path).st_size / bytes_per_mb
    print(f"📦 Submission created: {zip_path} ({size_mb:.1f} MB)")

# Run the complete testing.
# The value returned by run_complete_testing() (the submission ZIP path, or
# None on failure) is not kept here.
if __name__ == "__main__":
    run_complete_testing()

ImportError: cannot import name 'score_predictions' from 'evaluate' (/usr/local/lib/python3.11/dist-packages/evaluate/__init__.py)

In [None]:
## Final version with augmented data

In [None]:
# ================================
# A100 SAFE MODE - Fixed CUDA Issues
# ================================

import os
import sys
import json
import logging
from typing import List, Any
from pydantic import BaseModel
import numpy as np
import pandas as pd
from enum import Enum
import zipfile
from pathlib import Path
import torch
from tqdm import tqdm
import gc

# CUDA error debugging switches.
# NOTE(review): these are set after `import torch` above; confirm they still take
# effect at that point (and whether TORCH_USE_CUDA_DSA has any runtime effect on a
# prebuilt torch wheel).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Set up logging (INFO level on the root logger)
logging.basicConfig(level=logging.INFO)

# Pick the device once for the whole cell: CUDA when available, otherwise CPU.
# The original intent noted here was to avoid device-side asserts on the A100.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# Conservative GPU settings, applied only when CUDA is present
if torch.cuda.is_available():
    # Report the GPU name and the total memory of device 0 in decimal GB (bytes / 1e9).
    print(f"🔥 GPU: {torch.cuda.get_device_name()}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Trade speed for stability: no cuDNN autotuning and no TF32 kernels
    torch.backends.cudnn.benchmark = False  # Disable for stability
    torch.backends.cuda.matmul.allow_tf32 = False  # Disable TF32 for now
    torch.backends.cudnn.allow_tf32 = False

# Memory cleanup helper, used after each translation batch
def safe_cuda_cleanup():
    """Release cached CUDA memory (when CUDA is available) and run Python's garbage collector."""
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        # Same order as before: synchronize first, then empty the cache.
        for release_step in (torch.cuda.synchronize, torch.cuda.empty_cache):
            release_step()
    gc.collect()

# Clean up once before anything else in this cell allocates memory.
safe_cuda_cleanup()

# Data model classes (ClinicalSections, annotation models, Entry, ClinAISDataset)
class ClinicalSections(str, Enum):
    """Section labels used for ClinAIS annotations.

    Members are also ``str`` instances, so they compare equal to their raw
    label text.
    """

    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    # The only member whose value differs from its name (the label contains a slash).
    DERIVED_FROM_TO = "DERIVED_FROM/TO"
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

    @classmethod
    def list(cls):
        """Return every label value as a plain list, in declaration order."""
        return [member.value for member in cls]

class SectionAnnotation(BaseModel):
    """One labelled section of a clinical note."""
    # Text of the section.
    segment: str
    # Section type.
    label: ClinicalSections
    # safe_translate_dataset fills these with character positions of the segment
    # inside note_text, with end_offset = start_offset + len(segment).
    # NOTE(review): presumably loaded datasets follow the same convention — confirm.
    start_offset: int
    end_offset: int

class SectionAnnotations(BaseModel):
    """Section-level annotations of one note: reference ("gold") and predicted."""
    # Both lists default to empty.
    gold: List[SectionAnnotation] = []
    prediction: List[SectionAnnotation] = []

class BoundaryAnnotation(BaseModel):
    """One token of a note, optionally marked as the start of a section."""
    # Token text.
    span: str
    # Label of the section that starts at this token, or None when no section
    # starts here (safe_translate_dataset labels only the first word of a section).
    # No default is declared, so the field must be passed explicitly.
    boundary: ClinicalSections | None
    # safe_translate_dataset fills these with character positions of the token.
    # NOTE(review): presumably offsets into note_text for loaded data too — confirm.
    start_offset: int
    end_offset: int

class BoundaryAnnotations(BaseModel):
    """Token-level boundary annotations of one note: reference ("gold") and predicted."""
    # Both lists default to empty.
    gold: List[BoundaryAnnotation] = []
    prediction: List[BoundaryAnnotation] = []

class Entry(BaseModel):
    """A single clinical note together with its section and boundary annotations."""

    note_id: str
    note_text: str
    section_annotation: SectionAnnotations = SectionAnnotations()
    boundary_annotation: BoundaryAnnotations = BoundaryAnnotations()

    def toJson(self):
        """Serialize the entry to a JSON string, keeping non-ASCII characters unescaped."""
        def attributes_of(obj):
            # json.dumps calls this for objects it cannot encode natively.
            return obj.__dict__

        return json.dumps(self, default=attributes_of, ensure_ascii=False)

class ClinAISDataset(BaseModel):
    """A collection of entries keyed by string, plus an optional scores mapping."""

    annotated_entries: dict[str, Entry]
    scores: dict[str, Any] = {}

    def toJson(self):
        """Serialize the whole dataset to a JSON string, keeping non-ASCII characters unescaped."""
        def attributes_of(obj):
            # json.dumps calls this for objects it cannot encode natively.
            return obj.__dict__

        return json.dumps(self, default=attributes_of, ensure_ascii=False)

# -------------------------------
# SAFE A100 Translation (Reduced settings)
# -------------------------------
from transformers import pipeline
import transformers

transformers.logging.set_verbosity_error()

def setup_safe_translation_pipelines():
    """Create the ES->EN and EN->ES translation pipelines used for back-translation.

    The GPU (device 0) configuration is tried first; if building either
    pipeline raises, both are rebuilt on CPU with a smaller batch size.
    Errors raised by the CPU fallback itself are not caught.

    Returns:
        Tuple ``(pipe_es_en, pipe_en_es)``.
    """
    print("🛡️ Setting up SAFE translation pipelines...")

    checkpoints = ("Helsinki-NLP/opus-mt-es-en", "Helsinki-NLP/opus-mt-en-es")

    def build_pair(**settings):
        # Built in the same order as the returned tuple: ES->EN first, then EN->ES.
        es_to_en, en_to_es = (
            pipeline("translation", model=checkpoint, **settings)
            for checkpoint in checkpoints
        )
        return es_to_en, en_to_es

    try:
        # SAFER settings to avoid CUDA errors: batch of 16 (reduced from 64), FP32
        gpu_pair = build_pair(
            device=0,
            batch_size=16,
            max_length=512,
            framework="pt",
            torch_dtype=torch.float32,
        )
        print("✅ SAFE translation pipelines ready!")
        return gpu_pair

    except Exception as e:
        print(f"❌ Translation pipeline error: {e}")
        print("🔄 Trying CPU fallback...")

        # CPU fallback
        cpu_pair = build_pair(device=-1, batch_size=8, max_length=512)
        print("✅ CPU translation pipelines ready!")
        return cpu_pair

def _translate_in_batches(segments, translator, batch_size, error_label):
    """Run ``translator`` over ``segments`` in slices of ``batch_size``; output has the same length and order.

    A batch whose translation raises (or returns the wrong number of items) is
    passed through untranslated. Memory cleanup runs after every batch and is
    best-effort: a cleanup failure is reported but never changes the results.
    """
    translated = []
    for start in range(0, len(segments), batch_size):
        batch = segments[start:start + batch_size]
        batch_no = start // batch_size
        try:
            outputs = translator(
                batch,
                max_length=512,
                truncation=True,
                do_sample=False
            )
            batch_out = [item['translation_text'] for item in outputs]
            if len(batch_out) != len(batch):
                raise ValueError(f"expected {len(batch)} translations, got {len(batch_out)}")
        except Exception as e:
            print(f"⚠️  {error_label} {batch_no} error: {e}")
            # Fallback: keep the input texts for this batch
            batch_out = list(batch)

        # Exactly one output per input, appended exactly once per batch.
        translated.extend(batch_out)

        # Kept outside the translation try-block on purpose: if cleanup raised
        # inside it, the batch would be appended a second time by the fallback
        # and every later segment would be misaligned.
        try:
            safe_cuda_cleanup()
        except Exception as e:
            print(f"⚠️  Cleanup after {error_label.lower()} {batch_no} failed: {e}")

    return translated


def safe_translate_texts(texts: List[str], pipe_es_en, pipe_en_es,
                         max_words_per_chunk: int = 200, batch_size: int = 8) -> List[str]:
    """Back-translate texts (ES -> EN -> ES) with per-batch error handling.

    Texts longer than ``max_words_per_chunk`` whitespace-separated words are
    split into chunks of that many words (original whitespace inside such
    texts is collapsed to single spaces), translated chunk by chunk and
    re-joined with single spaces. Batches that fail to translate are passed
    through unchanged, so the result may mix translated and untranslated text.

    Args:
        texts: Segments to back-translate.
        pipe_es_en: Callable translating a list of Spanish strings; must return
            one dict with a ``'translation_text'`` key per input.
        pipe_en_es: Same contract, English to Spanish.
        max_words_per_chunk: Word count above which a text is chunked.
        batch_size: Number of segments sent to a pipeline per call.

    Returns:
        A list with exactly ``len(texts)`` strings, in the same order. If an
        unexpected error occurs the original ``texts`` are returned.

    Raises:
        ValueError: If ``max_words_per_chunk`` or ``batch_size`` is not positive.
    """
    if max_words_per_chunk <= 0 or batch_size <= 0:
        raise ValueError("max_words_per_chunk and batch_size must be positive")

    if not texts:
        return []

    print(f"🛡️ SAFE translating {len(texts)} segments...")

    # Simple chunking for very long texts
    processed_texts = []
    chunk_mappings = []  # chunk_mappings[k] = index in `texts` that processed_texts[k] belongs to

    for i, text in enumerate(texts):
        words = text.split()
        if len(words) > max_words_per_chunk:
            # Split into smaller chunks
            chunks = [' '.join(words[j:j + max_words_per_chunk])
                      for j in range(0, len(words), max_words_per_chunk)]
            processed_texts.extend(chunks)
            chunk_mappings.extend([i] * len(chunks))
        else:
            processed_texts.append(text)
            chunk_mappings.append(i)

    try:
        # SAFE ES -> EN with smaller batches
        print(f"  🌐 ES->EN: {len(processed_texts)} segments...")
        en_texts = _translate_in_batches(processed_texts, pipe_es_en, batch_size, "Batch")

        # SAFE EN -> ES
        print(f"  🌐 EN->ES: {len(en_texts)} segments...")
        final_texts = _translate_in_batches(en_texts, pipe_en_es, batch_size, "Back-translation batch")

        # Reconstruct original structure. Without chunking the mapping is the
        # identity, so this simply copies final_texts.
        result = [''] * len(texts)
        for chunk_text, orig_idx in zip(final_texts, chunk_mappings):
            if result[orig_idx]:
                result[orig_idx] += ' ' + chunk_text
            else:
                result[orig_idx] = chunk_text

        print("✅ SAFE translation completed!")
        return result

    except Exception as e:
        print(f"⚠️  Translation error: {e}")
        print("🔄 Returning original texts...")
        return texts

def safe_translate_dataset(dataset_path: str, output_path: str, pipe_es_en, pipe_en_es):
    """Build a back-translated copy of a ClinAIS dataset, saving progress after every batch.

    For each entry, the gold section segments are back-translated with
    ``safe_translate_texts``. The new note text is the translated segments
    joined by single spaces; section and token (boundary) annotations are
    rebuilt against that text, with the first token of each section carrying
    the section label. New entries are stored under ``<key>_SAFE`` with
    ``note_id`` suffixed by ``_SAFE``. If a batch of entries fails, the
    entries of that batch that were not stored yet are kept unchanged under
    ``<key>_ORIG``.

    Args:
        dataset_path: JSON file loadable as ``ClinAISDataset``.
        output_path: File the (partial) result is written to after each batch.
        pipe_es_en: ES->EN translation callable handed to ``safe_translate_texts``.
        pipe_en_es: EN->ES translation callable handed to ``safe_translate_texts``.

    Returns:
        The resulting ``ClinAISDataset``.
    """
    import re  # local import: only needed here, for whitespace-aware token offsets

    print(f"🛡️ SAFE dataset translation...")

    with open(dataset_path, 'r', encoding='utf-8') as f:
        original_ds = ClinAISDataset(**json.load(f))

    print(f"📄 Processing {len(original_ds.annotated_entries)} entries...")

    translated_ds = ClinAISDataset(annotated_entries={})

    # MUCH smaller batches to avoid CUDA errors
    entries = list(original_ds.annotated_entries.items())
    batch_size = 10  # Very conservative

    for i in tqdm(range(0, len(entries), batch_size), desc="🛡️ SAFE translation"):
        batch_entries = entries[i:i + batch_size]

        # Flatten the sections of all entries in the batch into one list so they
        # are translated together; the counts allow slicing them back per entry.
        all_sections = []
        entry_section_counts = []

        for key, entry in batch_entries:
            sections = [s.segment for s in entry.section_annotation.gold]
            all_sections.extend(sections)
            entry_section_counts.append(len(sections))

        try:
            translated_sections = safe_translate_texts(all_sections, pipe_es_en, pipe_en_es)

            section_idx = 0
            for (key, entry), num_sections in zip(batch_entries, entry_section_counts):
                entry_translated = translated_sections[section_idx:section_idx + num_sections]
                section_idx += num_sections

                new_entry = Entry(
                    note_id=entry.note_id + "_SAFE",
                    note_text=" ".join(entry_translated)
                )

                char_offset = 0  # position of the current segment inside note_text
                for orig_section, trans_text in zip(entry.section_annotation.gold, entry_translated):
                    section_ann = SectionAnnotation(
                        segment=trans_text,
                        label=orig_section.label,
                        start_offset=char_offset,
                        end_offset=char_offset + len(trans_text)
                    )
                    new_entry.section_annotation.gold.append(section_ann)

                    # Locate each token at its real position in the segment.
                    # Counting "len(word) + 1" per token is only right for
                    # single-space separated text and drifts when a segment
                    # has leading, repeated or newline whitespace (e.g. when
                    # a batch fell back to the untranslated original text).
                    for j, token in enumerate(re.finditer(r'\S+', trans_text)):
                        boundary_ann = BoundaryAnnotation(
                            span=token.group(),
                            boundary=orig_section.label if j == 0 else None,
                            start_offset=char_offset + token.start(),
                            end_offset=char_offset + token.end()
                        )
                        new_entry.boundary_annotation.gold.append(boundary_ann)

                    # +1 for the single space that joins segments in note_text
                    char_offset += len(trans_text) + 1

                translated_ds.annotated_entries[key + "_SAFE"] = new_entry

        except Exception as e:
            print(f"⚠️  Batch error: {e}")
            # Add original entries as fallback, but only for entries of this
            # batch that were not already stored in translated form.
            for key, entry in batch_entries:
                if key + "_SAFE" not in translated_ds.annotated_entries:
                    translated_ds.annotated_entries[key + "_ORIG"] = entry

        # Save progress every batch
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(translated_ds.toJson())
        print(f"💾 Progress: {len(translated_ds.annotated_entries)} entries")

        # Aggressive cleanup
        safe_cuda_cleanup()

    print(f"🎉 SAFE translation complete: {len(translated_ds.annotated_entries)} entries")
    return translated_ds

# -------------------------------
# Continue with training and main pipeline...
# Just replace the aggressive A100 functions with these safe versions
# and use smaller batch sizes throughout
# -------------------------------

def safe_a100_pipeline():
    """Run the conservative ("SAFE") data-preparation pipeline on Colab.

    Mounts Google Drive, ensures the model output directory exists, and
    back-translates the training set unless a translated file is already
    present. Training itself is not part of this function.

    Returns:
        The model output directory path (a string).
    """
    from google.colab import drive
    drive.mount('/content/drive')

    base_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"
    train_path = os.path.join(base_path, "clinais.train.json")
    # dev_path and augmented_path are not used below; they are kept for the
    # combine/augment steps that are not implemented yet.
    dev_path = os.path.join(base_path, "clinais.dev.json")

    # Use existing combined if available
    combined_path = os.path.join(base_path, "clinais.train_dev_combined.json")
    translated_path = os.path.join(base_path, "clinais.train_translated_SAFE.json")
    augmented_path = os.path.join(base_path, "clinais.train_dev_augmented_SAFE.json")

    output_dir = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-clinais-SAFE"
    os.makedirs(output_dir, exist_ok=True)

    print("🛡️ === SAFE A100 PIPELINE ===")
    print("🔧 Conservative settings to avoid CUDA errors")

    # Step 1: Use existing combined or create
    if os.path.exists(combined_path):
        print("✅ Using existing combined dataset")
    else:
        print("Creating combined dataset...")
        # TODO: combining train and dev is not implemented here; nothing is created.

    # Step 2: SAFE translation
    print("\n--- SAFE Translation ---")
    if not os.path.exists(translated_path):
        pipe_es_en, pipe_en_es = setup_safe_translation_pipelines()
        safe_translate_dataset(train_path, translated_path, pipe_es_en, pipe_en_es)

        # Cleanup: drop the pipelines before freeing cached memory
        del pipe_es_en, pipe_en_es
        safe_cuda_cleanup()
    else:
        print("✅ Using existing translated dataset")

    print("🎉 SAFE pipeline completed!")
    return output_dir

# Run SAFE version.
# Note: safe_a100_pipeline() returns the output directory path, so `safe_model`
# holds a path string rather than a model object.
if __name__ == "__main__":
    print("🛡️ RUNNING SAFE A100 VERSION")
    safe_model = safe_a100_pipeline()

🚀 Using device: cuda
🔥 GPU: NVIDIA A100-SXM4-40GB
💾 GPU Memory: 42.5 GB
🛡️ RUNNING SAFE A100 VERSION
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🛡️ === SAFE A100 PIPELINE ===
🔧 Conservative settings to avoid CUDA errors
✅ Using existing combined dataset

--- SAFE Translation ---
🛡️ Setting up SAFE translation pipelines...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



✅ SAFE translation pipelines ready!
🛡️ SAFE dataset translation...
📄 Processing 781 entries...


🛡️ SAFE translation:   0%|          | 0/79 [00:00<?, ?it/s]

🛡️ SAFE translating 76 segments...
  🌐 ES->EN: 76 segments...
  🌐 EN->ES: 76 segments...
💾 Progress: 10 entries


🛡️ SAFE translation:   1%|▏         | 1/79 [02:23<3:06:13, 143.26s/it]

🛡️ SAFE translating 79 segments...
  🌐 ES->EN: 80 segments...
  🌐 EN->ES: 80 segments...
✅ SAFE translation completed!
💾 Progress: 20 entries


🛡️ SAFE translation:   3%|▎         | 2/79 [05:25<3:33:18, 166.21s/it]

🛡️ SAFE translating 72 segments...
  🌐 ES->EN: 73 segments...
  🌐 EN->ES: 73 segments...
✅ SAFE translation completed!
💾 Progress: 30 entries


🛡️ SAFE translation:   4%|▍         | 3/79 [07:53<3:20:00, 157.90s/it]

🛡️ SAFE translating 86 segments...
  🌐 ES->EN: 88 segments...
  🌐 EN->ES: 88 segments...
✅ SAFE translation completed!
💾 Progress: 40 entries


🛡️ SAFE translation:   5%|▌         | 4/79 [11:13<3:38:06, 174.49s/it]

🛡️ SAFE translating 125 segments...
  🌐 ES->EN: 127 segments...
  🌐 EN->ES: 127 segments...
✅ SAFE translation completed!
💾 Progress: 50 entries


🛡️ SAFE translation:   6%|▋         | 5/79 [15:29<4:11:22, 203.81s/it]

🛡️ SAFE translating 103 segments...
  🌐 ES->EN: 104 segments...
  🌐 EN->ES: 104 segments...
✅ SAFE translation completed!
💾 Progress: 60 entries


🛡️ SAFE translation:   8%|▊         | 6/79 [19:26<4:21:55, 215.28s/it]

🛡️ SAFE translating 87 segments...
  🌐 ES->EN: 88 segments...
  🌐 EN->ES: 88 segments...
✅ SAFE translation completed!
💾 Progress: 70 entries


🛡️ SAFE translation:   9%|▉         | 7/79 [22:27<4:04:37, 203.85s/it]

🛡️ SAFE translating 104 segments...
  🌐 ES->EN: 106 segments...
  🌐 EN->ES: 106 segments...
✅ SAFE translation completed!
💾 Progress: 80 entries


🛡️ SAFE translation:  10%|█         | 8/79 [26:35<4:18:09, 218.16s/it]

🛡️ SAFE translating 110 segments...
  🌐 ES->EN: 113 segments...
  🌐 EN->ES: 113 segments...
✅ SAFE translation completed!
💾 Progress: 90 entries


🛡️ SAFE translation:  11%|█▏        | 9/79 [30:18<4:16:00, 219.44s/it]

🛡️ SAFE translating 79 segments...
  🌐 ES->EN: 79 segments...
  🌐 EN->ES: 79 segments...
💾 Progress: 100 entries


🛡️ SAFE translation:  13%|█▎        | 10/79 [32:56<3:50:29, 200.43s/it]

🛡️ SAFE translating 74 segments...
  🌐 ES->EN: 75 segments...
  🌐 EN->ES: 75 segments...
✅ SAFE translation completed!
💾 Progress: 110 entries


🛡️ SAFE translation:  14%|█▍        | 11/79 [34:48<3:16:44, 173.60s/it]

🛡️ SAFE translating 101 segments...
  🌐 ES->EN: 101 segments...
  🌐 EN->ES: 101 segments...
💾 Progress: 120 entries


🛡️ SAFE translation:  15%|█▌        | 12/79 [37:54<3:18:03, 177.37s/it]

🛡️ SAFE translating 79 segments...
  🌐 ES->EN: 79 segments...
  🌐 EN->ES: 79 segments...
💾 Progress: 130 entries


🛡️ SAFE translation:  16%|█▋        | 13/79 [40:03<2:58:53, 162.62s/it]

🛡️ SAFE translating 113 segments...
  🌐 ES->EN: 117 segments...
  🌐 EN->ES: 117 segments...
✅ SAFE translation completed!
💾 Progress: 140 entries


🛡️ SAFE translation:  18%|█▊        | 14/79 [43:11<3:04:19, 170.15s/it]

🛡️ SAFE translating 80 segments...
  🌐 ES->EN: 82 segments...
  🌐 EN->ES: 82 segments...
✅ SAFE translation completed!
💾 Progress: 150 entries


🛡️ SAFE translation:  19%|█▉        | 15/79 [46:12<3:05:05, 173.53s/it]

🛡️ SAFE translating 97 segments...
  🌐 ES->EN: 100 segments...
  🌐 EN->ES: 100 segments...
✅ SAFE translation completed!
💾 Progress: 160 entries


🛡️ SAFE translation:  20%|██        | 16/79 [48:56<2:59:12, 170.67s/it]

🛡️ SAFE translating 86 segments...
  🌐 ES->EN: 94 segments...
  🌐 EN->ES: 94 segments...
✅ SAFE translation completed!
💾 Progress: 170 entries


🛡️ SAFE translation:  22%|██▏       | 17/79 [52:53<3:16:55, 190.58s/it]

🛡️ SAFE translating 103 segments...
  🌐 ES->EN: 105 segments...
  🌐 EN->ES: 105 segments...
✅ SAFE translation completed!
💾 Progress: 180 entries


🛡️ SAFE translation:  23%|██▎       | 18/79 [55:12<2:57:58, 175.05s/it]

🛡️ SAFE translating 86 segments...
  🌐 ES->EN: 89 segments...
  🌐 EN->ES: 89 segments...
✅ SAFE translation completed!
💾 Progress: 190 entries


🛡️ SAFE translation:  24%|██▍       | 19/79 [57:49<2:49:42, 169.71s/it]

🛡️ SAFE translating 76 segments...
  🌐 ES->EN: 79 segments...
  🌐 EN->ES: 79 segments...
✅ SAFE translation completed!
💾 Progress: 200 entries


🛡️ SAFE translation:  25%|██▌       | 20/79 [1:00:51<2:50:26, 173.33s/it]

🛡️ SAFE translating 78 segments...
  🌐 ES->EN: 78 segments...
  🌐 EN->ES: 78 segments...
💾 Progress: 210 entries


🛡️ SAFE translation:  27%|██▋       | 21/79 [1:03:54<2:50:17, 176.17s/it]

🛡️ SAFE translating 116 segments...
  🌐 ES->EN: 118 segments...
  🌐 EN->ES: 118 segments...
✅ SAFE translation completed!
💾 Progress: 220 entries


🛡️ SAFE translation:  28%|██▊       | 22/79 [1:07:56<3:06:22, 196.18s/it]

🛡️ SAFE translating 68 segments...
  🌐 ES->EN: 68 segments...
  🌐 EN->ES: 68 segments...
💾 Progress: 230 entries


🛡️ SAFE translation:  29%|██▉       | 23/79 [1:09:51<2:40:11, 171.63s/it]

🛡️ SAFE translating 52 segments...
  🌐 ES->EN: 55 segments...
  🌐 EN->ES: 55 segments...
✅ SAFE translation completed!
💾 Progress: 240 entries


🛡️ SAFE translation:  30%|███       | 24/79 [1:12:13<2:29:12, 162.77s/it]

🛡️ SAFE translating 66 segments...
  🌐 ES->EN: 68 segments...
  🌐 EN->ES: 68 segments...
✅ SAFE translation completed!
💾 Progress: 250 entries


🛡️ SAFE translation:  32%|███▏      | 25/79 [1:14:39<2:22:04, 157.87s/it]

🛡️ SAFE translating 74 segments...
  🌐 ES->EN: 75 segments...
  🌐 EN->ES: 75 segments...
✅ SAFE translation completed!
💾 Progress: 260 entries


🛡️ SAFE translation:  33%|███▎      | 26/79 [1:17:23<2:21:04, 159.71s/it]

🛡️ SAFE translating 73 segments...
  🌐 ES->EN: 73 segments...
  🌐 EN->ES: 73 segments...
💾 Progress: 270 entries


🛡️ SAFE translation:  34%|███▍      | 27/79 [1:20:00<2:17:44, 158.93s/it]

🛡️ SAFE translating 64 segments...
  🌐 ES->EN: 68 segments...
  🌐 EN->ES: 68 segments...
✅ SAFE translation completed!
💾 Progress: 280 entries


🛡️ SAFE translation:  35%|███▌      | 28/79 [1:22:30<2:12:47, 156.22s/it]

🛡️ SAFE translating 47 segments...
  🌐 ES->EN: 47 segments...
  🌐 EN->ES: 47 segments...
💾 Progress: 290 entries


🛡️ SAFE translation:  37%|███▋      | 29/79 [1:23:58<1:53:09, 135.80s/it]

🛡️ SAFE translating 105 segments...
  🌐 ES->EN: 110 segments...
  🌐 EN->ES: 110 segments...
✅ SAFE translation completed!
💾 Progress: 300 entries


🛡️ SAFE translation:  38%|███▊      | 30/79 [1:27:37<2:11:09, 160.60s/it]

🛡️ SAFE translating 79 segments...
  🌐 ES->EN: 81 segments...
  🌐 EN->ES: 81 segments...
✅ SAFE translation completed!
💾 Progress: 310 entries


🛡️ SAFE translation:  39%|███▉      | 31/79 [1:30:35<2:12:39, 165.82s/it]

🛡️ SAFE translating 98 segments...
  🌐 ES->EN: 102 segments...
  🌐 EN->ES: 102 segments...
✅ SAFE translation completed!
💾 Progress: 320 entries


🛡️ SAFE translation:  41%|████      | 32/79 [1:34:56<2:32:15, 194.38s/it]

🛡️ SAFE translating 110 segments...
  🌐 ES->EN: 115 segments...
  🌐 EN->ES: 115 segments...
✅ SAFE translation completed!
💾 Progress: 330 entries


🛡️ SAFE translation:  42%|████▏     | 33/79 [1:38:55<2:39:19, 207.81s/it]

🛡️ SAFE translating 120 segments...
  🌐 ES->EN: 123 segments...
  🌐 EN->ES: 123 segments...
✅ SAFE translation completed!
💾 Progress: 340 entries


🛡️ SAFE translation:  43%|████▎     | 34/79 [1:42:52<2:42:20, 216.46s/it]

🛡️ SAFE translating 142 segments...
  🌐 ES->EN: 144 segments...
  🌐 EN->ES: 144 segments...
✅ SAFE translation completed!
💾 Progress: 350 entries


🛡️ SAFE translation:  44%|████▍     | 35/79 [1:46:58<2:45:14, 225.34s/it]

🛡️ SAFE translating 131 segments...
  🌐 ES->EN: 133 segments...
  🌐 EN->ES: 133 segments...
✅ SAFE translation completed!
💾 Progress: 360 entries


🛡️ SAFE translation:  46%|████▌     | 36/79 [1:51:12<2:47:37, 233.90s/it]

🛡️ SAFE translating 120 segments...
  🌐 ES->EN: 120 segments...
  🌐 EN->ES: 120 segments...
💾 Progress: 370 entries


🛡️ SAFE translation:  47%|████▋     | 37/79 [1:54:30<2:36:11, 223.13s/it]

🛡️ SAFE translating 119 segments...
  🌐 ES->EN: 122 segments...
  🌐 EN->ES: 122 segments...
✅ SAFE translation completed!
💾 Progress: 380 entries


🛡️ SAFE translation:  48%|████▊     | 38/79 [1:58:59<2:41:55, 236.96s/it]

🛡️ SAFE translating 113 segments...
  🌐 ES->EN: 116 segments...
  🌐 EN->ES: 116 segments...
✅ SAFE translation completed!
💾 Progress: 390 entries


🛡️ SAFE translation:  49%|████▉     | 39/79 [2:02:22<2:31:09, 226.74s/it]

🛡️ SAFE translating 103 segments...
  🌐 ES->EN: 105 segments...
  🌐 EN->ES: 105 segments...
✅ SAFE translation completed!
💾 Progress: 400 entries


🛡️ SAFE translation:  51%|█████     | 40/79 [2:05:39<2:21:32, 217.76s/it]

🛡️ SAFE translating 65 segments...
  🌐 ES->EN: 65 segments...
  🌐 EN->ES: 65 segments...
💾 Progress: 410 entries


🛡️ SAFE translation:  52%|█████▏    | 41/79 [2:08:00<2:03:29, 194.98s/it]

🛡️ SAFE translating 62 segments...
  🌐 ES->EN: 69 segments...
  🌐 EN->ES: 69 segments...
✅ SAFE translation completed!
💾 Progress: 420 entries


🛡️ SAFE translation:  53%|█████▎    | 42/79 [2:10:53<1:56:05, 188.25s/it]

🛡️ SAFE translating 74 segments...
  🌐 ES->EN: 74 segments...
  🌐 EN->ES: 74 segments...
💾 Progress: 430 entries


🛡️ SAFE translation:  54%|█████▍    | 43/79 [2:13:18<1:45:09, 175.26s/it]

🛡️ SAFE translating 63 segments...
  🌐 ES->EN: 64 segments...
  🌐 EN->ES: 64 segments...
✅ SAFE translation completed!
💾 Progress: 440 entries


🛡️ SAFE translation:  56%|█████▌    | 44/79 [2:15:20<1:32:54, 159.26s/it]

🛡️ SAFE translating 76 segments...
  🌐 ES->EN: 77 segments...
  🌐 EN->ES: 77 segments...
✅ SAFE translation completed!
💾 Progress: 450 entries


🛡️ SAFE translation:  57%|█████▋    | 45/79 [2:18:04<1:31:07, 160.80s/it]

🛡️ SAFE translating 94 segments...
  🌐 ES->EN: 98 segments...
  🌐 EN->ES: 98 segments...
✅ SAFE translation completed!
💾 Progress: 460 entries


🛡️ SAFE translation:  58%|█████▊    | 46/79 [2:21:29<1:35:39, 173.93s/it]

🛡️ SAFE translating 66 segments...
  🌐 ES->EN: 69 segments...
  🌐 EN->ES: 69 segments...
✅ SAFE translation completed!
💾 Progress: 470 entries


🛡️ SAFE translation:  59%|█████▉    | 47/79 [2:24:17<1:31:55, 172.36s/it]

🛡️ SAFE translating 76 segments...
  🌐 ES->EN: 78 segments...
  🌐 EN->ES: 78 segments...
✅ SAFE translation completed!
💾 Progress: 480 entries


🛡️ SAFE translation:  61%|██████    | 48/79 [2:27:15<1:29:54, 174.02s/it]

🛡️ SAFE translating 66 segments...
  🌐 ES->EN: 67 segments...
  🌐 EN->ES: 67 segments...
✅ SAFE translation completed!
💾 Progress: 490 entries


🛡️ SAFE translation:  62%|██████▏   | 49/79 [2:29:45<1:23:21, 166.71s/it]

🛡️ SAFE translating 60 segments...
  🌐 ES->EN: 61 segments...
  🌐 EN->ES: 61 segments...
✅ SAFE translation completed!
💾 Progress: 500 entries


🛡️ SAFE translation:  63%|██████▎   | 50/79 [2:31:49<1:14:25, 153.98s/it]

🛡️ SAFE translating 63 segments...
  🌐 ES->EN: 65 segments...
  🌐 EN->ES: 65 segments...
✅ SAFE translation completed!
💾 Progress: 510 entries


🛡️ SAFE translation:  65%|██████▍   | 51/79 [2:34:14<1:10:31, 151.14s/it]

🛡️ SAFE translating 70 segments...
  🌐 ES->EN: 71 segments...
  🌐 EN->ES: 71 segments...
✅ SAFE translation completed!
💾 Progress: 520 entries


🛡️ SAFE translation:  66%|██████▌   | 52/79 [2:36:40<1:07:19, 149.62s/it]

🛡️ SAFE translating 61 segments...
  🌐 ES->EN: 62 segments...
  🌐 EN->ES: 62 segments...
✅ SAFE translation completed!
💾 Progress: 530 entries


🛡️ SAFE translation:  67%|██████▋   | 53/79 [2:39:16<1:05:42, 151.65s/it]

🛡️ SAFE translating 68 segments...
  🌐 ES->EN: 70 segments...
  🌐 EN->ES: 70 segments...
✅ SAFE translation completed!
💾 Progress: 540 entries


🛡️ SAFE translation:  68%|██████▊   | 54/79 [2:41:57<1:04:19, 154.37s/it]

🛡️ SAFE translating 73 segments...
  🌐 ES->EN: 75 segments...
  🌐 EN->ES: 75 segments...
✅ SAFE translation completed!
💾 Progress: 550 entries


🛡️ SAFE translation:  70%|██████▉   | 55/79 [2:43:42<55:51, 139.63s/it]  

🛡️ SAFE translating 84 segments...
  🌐 ES->EN: 84 segments...
  🌐 EN->ES: 84 segments...
💾 Progress: 560 entries


🛡️ SAFE translation:  71%|███████   | 56/79 [2:46:52<59:18, 154.70s/it]

🛡️ SAFE translating 68 segments...
  🌐 ES->EN: 70 segments...
  🌐 EN->ES: 70 segments...
✅ SAFE translation completed!
💾 Progress: 570 entries


🛡️ SAFE translation:  72%|███████▏  | 57/79 [2:49:18<55:44, 152.03s/it]

🛡️ SAFE translating 57 segments...
  🌐 ES->EN: 57 segments...
  🌐 EN->ES: 57 segments...
💾 Progress: 580 entries


🛡️ SAFE translation:  73%|███████▎  | 58/79 [2:51:08<48:48, 139.45s/it]

🛡️ SAFE translating 77 segments...
  🌐 ES->EN: 79 segments...
  🌐 EN->ES: 79 segments...
✅ SAFE translation completed!
💾 Progress: 590 entries


🛡️ SAFE translation:  75%|███████▍  | 59/79 [2:53:41<47:48, 143.44s/it]

🛡️ SAFE translating 73 segments...
  🌐 ES->EN: 75 segments...
  🌐 EN->ES: 75 segments...
✅ SAFE translation completed!
💾 Progress: 600 entries


🛡️ SAFE translation:  76%|███████▌  | 60/79 [2:56:44<49:14, 155.49s/it]

🛡️ SAFE translating 75 segments...
  🌐 ES->EN: 84 segments...
  🌐 EN->ES: 84 segments...
✅ SAFE translation completed!
💾 Progress: 610 entries


🛡️ SAFE translation:  77%|███████▋  | 61/79 [2:59:43<48:42, 162.36s/it]

🛡️ SAFE translating 120 segments...
  🌐 ES->EN: 121 segments...
  🌐 EN->ES: 121 segments...
✅ SAFE translation completed!
💾 Progress: 620 entries


🛡️ SAFE translation:  78%|███████▊  | 62/79 [3:03:27<51:13, 180.81s/it]

🛡️ SAFE translating 102 segments...
  🌐 ES->EN: 112 segments...
  🌐 EN->ES: 112 segments...
✅ SAFE translation completed!
💾 Progress: 630 entries


🛡️ SAFE translation:  80%|███████▉  | 63/79 [3:06:55<50:26, 189.16s/it]

🛡️ SAFE translating 65 segments...
  🌐 ES->EN: 65 segments...
  🌐 EN->ES: 65 segments...
💾 Progress: 640 entries


🛡️ SAFE translation:  81%|████████  | 64/79 [3:08:13<38:57, 155.86s/it]

🛡️ SAFE translating 73 segments...
  🌐 ES->EN: 77 segments...
  🌐 EN->ES: 77 segments...
✅ SAFE translation completed!
💾 Progress: 650 entries


🛡️ SAFE translation:  82%|████████▏ | 65/79 [3:10:44<35:59, 154.23s/it]

🛡️ SAFE translating 44 segments...
  🌐 ES->EN: 58 segments...
  🌐 EN->ES: 58 segments...
✅ SAFE translation completed!
💾 Progress: 660 entries


🛡️ SAFE translation:  84%|████████▎ | 66/79 [3:13:10<32:55, 151.93s/it]

🛡️ SAFE translating 42 segments...
  🌐 ES->EN: 52 segments...
  🌐 EN->ES: 52 segments...
✅ SAFE translation completed!
💾 Progress: 670 entries


🛡️ SAFE translation:  85%|████████▍ | 67/79 [3:15:24<29:15, 146.28s/it]

🛡️ SAFE translating 59 segments...
  🌐 ES->EN: 61 segments...
  🌐 EN->ES: 61 segments...
✅ SAFE translation completed!
💾 Progress: 680 entries


🛡️ SAFE translation:  86%|████████▌ | 68/79 [3:18:02<27:29, 149.92s/it]

🛡️ SAFE translating 98 segments...
  🌐 ES->EN: 102 segments...
  🌐 EN->ES: 102 segments...
✅ SAFE translation completed!
💾 Progress: 690 entries


🛡️ SAFE translation:  87%|████████▋ | 69/79 [3:21:27<27:43, 166.35s/it]

🛡️ SAFE translating 88 segments...
  🌐 ES->EN: 103 segments...
  🌐 EN->ES: 103 segments...
✅ SAFE translation completed!
💾 Progress: 700 entries


🛡️ SAFE translation:  89%|████████▊ | 70/79 [3:25:11<27:32, 183.66s/it]

🛡️ SAFE translating 86 segments...
  🌐 ES->EN: 88 segments...
  🌐 EN->ES: 88 segments...
✅ SAFE translation completed!
💾 Progress: 710 entries


🛡️ SAFE translation:  90%|████████▉ | 71/79 [3:28:14<24:29, 183.72s/it]

🛡️ SAFE translating 72 segments...
  🌐 ES->EN: 75 segments...
  🌐 EN->ES: 75 segments...
✅ SAFE translation completed!
💾 Progress: 720 entries


🛡️ SAFE translation:  91%|█████████ | 72/79 [3:31:34<21:59, 188.56s/it]

🛡️ SAFE translating 74 segments...
  🌐 ES->EN: 75 segments...
  🌐 EN->ES: 75 segments...
✅ SAFE translation completed!
💾 Progress: 730 entries


🛡️ SAFE translation:  92%|█████████▏| 73/79 [3:34:16<18:02, 180.35s/it]

🛡️ SAFE translating 70 segments...
  🌐 ES->EN: 71 segments...
  🌐 EN->ES: 71 segments...
✅ SAFE translation completed!
💾 Progress: 740 entries


🛡️ SAFE translation:  94%|█████████▎| 74/79 [3:36:55<14:30, 174.16s/it]

🛡️ SAFE translating 73 segments...
  🌐 ES->EN: 74 segments...
  🌐 EN->ES: 74 segments...
✅ SAFE translation completed!
💾 Progress: 750 entries


🛡️ SAFE translation:  95%|█████████▍| 75/79 [3:39:12<10:51, 162.84s/it]

🛡️ SAFE translating 58 segments...
  🌐 ES->EN: 62 segments...
  🌐 EN->ES: 62 segments...
✅ SAFE translation completed!
💾 Progress: 760 entries


🛡️ SAFE translation:  96%|█████████▌| 76/79 [3:41:56<08:09, 163.13s/it]

🛡️ SAFE translating 70 segments...
  🌐 ES->EN: 77 segments...
  🌐 EN->ES: 77 segments...
✅ SAFE translation completed!
💾 Progress: 770 entries


🛡️ SAFE translation:  97%|█████████▋| 77/79 [3:45:01<05:39, 169.77s/it]

🛡️ SAFE translating 91 segments...
  🌐 ES->EN: 95 segments...
  🌐 EN->ES: 95 segments...
✅ SAFE translation completed!
💾 Progress: 780 entries


🛡️ SAFE translation:  99%|█████████▊| 78/79 [3:48:46<03:06, 186.52s/it]

🛡️ SAFE translating 26 segments...
  🌐 ES->EN: 27 segments...
  🌐 EN->ES: 27 segments...
✅ SAFE translation completed!


🛡️ SAFE translation: 100%|██████████| 79/79 [3:49:41<00:00, 174.45s/it]

💾 Progress: 781 entries
🎉 SAFE translation complete: 781 entries





🎉 SAFE pipeline completed!


In [None]:
import os

# Check whether the SAFE model was saved completely.
model_path = "/content/drive/MyDrive/TFG/TRAINS/models/bsc-bio-ehr-es-clinais-SAFE"

print("🔍 Checking SAFE A100 model...")
if os.path.exists(model_path):
    print("✅ SAFE model directory exists!")

    # Each requirement lists interchangeable file names: recent transformers
    # releases write the weights as "model.safetensors" rather than
    # "pytorch_model.bin", so either one satisfies the weights requirement.
    files_to_check = [
        ("config.json",),
        ("pytorch_model.bin", "model.safetensors"),
        ("tokenizer.json",),
        ("tokenizer_config.json",),
    ]

    missing_files = []
    for candidates in files_to_check:
        present = [name for name in candidates if os.path.exists(os.path.join(model_path, name))]
        if present:
            file = present[0]
            size_mb = os.path.getsize(os.path.join(model_path, file)) / (1024*1024)
            print(f"  ✅ {file} ({size_mb:.1f} MB)")
        else:
            print(f"  ❌ {' / '.join(candidates)} - MISSING")
            missing_files.append(candidates[0])

    # Check dataset files
    dataset_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.train_dev_augmented_SAFE.json"
    if os.path.exists(dataset_path):
        size_mb = os.path.getsize(dataset_path) / (1024*1024)
        print(f"  ✅ Augmented dataset ({size_mb:.1f} MB)")

    # Only claim readiness when every required model file was actually found;
    # an existing-but-empty directory must not be reported as usable.
    if missing_files:
        print(f"\n⚠️ SAFE model is incomplete ({len(missing_files)} required file(s) missing) - NOT ready for testing")
        print("🔧 Training may have been interrupted")
    else:
        print("\n🚀 Ready to run the testing script!")

else:
    print("❌ SAFE model directory not found")
    print("🔧 Training may have been interrupted")


🔍 Checking SAFE A100 model...
✅ SAFE model directory exists!
  ❌ config.json - MISSING
  ❌ pytorch_model.bin - MISSING
  ❌ tokenizer.json - MISSING
  ❌ tokenizer_config.json - MISSING

🚀 Ready to run the testing script!


In [None]:
# ===================================================================
# ROBUST CLINAIS FINAL SUBMISSION PIPELINE (WORKING & COMPATIBLE)
#
# This version uses the training strategy from your working script
# to guarantee compatibility and remove the error.
# It trains for a fixed number of epochs without intermediate evaluation.
# ===================================================================
import os
import json
import logging
import gc
import math
import numpy as np
from enum import Enum
from typing import List, Dict, Any
from pydantic import BaseModel, Field
from pathlib import Path
import zipfile

import torch
from tqdm.auto import tqdm

# --- Basic Setup ---
# Root logger at INFO; the two chattier libraries are capped at WARNING.
logging.basicConfig(level=logging.INFO)
logging.getLogger("pydantic").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)


# --- Device Setup ---
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"🚀 Using device: {device}")
if device.type == "cuda":
    print(f"🔥 GPU: {torch.cuda.get_device_name()}")
    torch.backends.cudnn.benchmark = True


def safe_cuda_cleanup():
    """Release cached GPU memory (when CUDA is present), then run the garbage collector."""
    cuda = torch.cuda
    if cuda.is_available():
        # Synchronize first, then empty the CUDA cache.
        cuda.synchronize()
        cuda.empty_cache()
    gc.collect()


# ================================
# 1. DATASET MODELS (PYDANTIC)
# ================================
class ClinicalSections(str, Enum):
    """Section labels used by the ClinAIS annotations.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    DERIVED_FROM_TO = "DERIVED_FROM/TO"  # the value contains a slash, unlike the member name
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

    @classmethod
    def list(cls):
        """Return the string value of every member, in declaration order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

class SectionAnnotation(BaseModel):
    """A labelled section of a note: its text, its section label and its offsets."""
    segment: str  # text of the section
    label: ClinicalSections  # which clinical section the segment belongs to
    # Character offsets into the note text. Predictions written by this notebook
    # use an exclusive end (start + len(segment)); NOTE(review): confirm that the
    # gold annotations follow the same convention.
    start_offset: int
    end_offset: int

class SectionAnnotations(BaseModel):
    """Gold and predicted section annotations of one note; both lists default to empty."""
    gold: List[SectionAnnotation] = Field(default_factory=list)
    prediction: List[SectionAnnotation] = Field(default_factory=list)

class BoundaryAnnotation(BaseModel):
    """One span of a note, optionally marking the start of a section."""
    span: str  # text of the span; used as one pre-split token when building training data
    # Section that begins at this span, or None when the span opens no new section.
    boundary: ClinicalSections | None
    # Offsets of the span; presumably character positions in the note text — TODO confirm.
    start_offset: int
    end_offset: int

class BoundaryAnnotations(BaseModel):
    """Gold and predicted boundary annotations of one note; both lists default to empty."""
    gold: List[BoundaryAnnotation] = Field(default_factory=list)
    prediction: List[BoundaryAnnotation] = Field(default_factory=list)

class Entry(BaseModel):
    """A single clinical note together with its section and boundary annotations."""
    note_id: str
    note_text: str  # full text of the note
    # Both annotation containers default to empty gold/prediction lists.
    section_annotation: SectionAnnotations = Field(default_factory=SectionAnnotations)
    boundary_annotation: BoundaryAnnotations = Field(default_factory=BoundaryAnnotations)

class ClinAISDataset(BaseModel):
    """Top-level ClinAIS container: entries keyed by string, plus an optional scores dict.

    The two JSON helpers work with either pydantic major version by probing
    for the v2 method names first and falling back to the v1 API.
    """

    annotated_entries: Dict[str, Entry]
    scores: Dict[str, Any] = Field(default_factory=dict)

    def to_json(self):
        """Serialize the dataset to a JSON string indented by two spaces."""
        dump_v2 = getattr(self, 'model_dump_json', None)
        if dump_v2 is None:
            # pydantic v1: go through a plain dict and keep non-ASCII text readable.
            return json.dumps(self.dict(), indent=2, ensure_ascii=False)
        return dump_v2(indent=2)

    @classmethod
    def from_json(cls, json_str: str):
        """Parse and validate a dataset from a JSON string."""
        validate_v2 = getattr(cls, 'model_validate_json', None)
        if validate_v2 is None:
            # pydantic v1 entry point
            return cls.parse_raw(json_str)
        return validate_v2(json_str)


# ================================
# 2. DATA PREPARATION
# ================================
def prepare_training_data(dataset_path: str, tokenizer, label2id: Dict, max_length: int = 512):
    """Build the tokenized training split from a ClinAIS JSON file.

    Every gold boundary span of a note becomes one pre-split token. Its label
    is the most recently opened section, or PRESENT_ILLNESS while no boundary
    has been seen yet. Notes without gold spans are skipped.

    Args:
        dataset_path: Path of the ClinAIS JSON file to read.
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            output must provide ``word_ids``.
        label2id: Mapping from section label to integer class id.
        max_length: Truncation length passed to the tokenizer; sub-tokens past
            it are dropped.

    Returns:
        A ``DatasetDict`` with a single ``'train'`` split.
    """
    from datasets import Dataset, DatasetDict

    print(f"📊 Preparing training data from: {dataset_path}")
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        ds = ClinAISDataset.from_json(handle.read())

    fallback_section = ClinicalSections.PRESENT_ILLNESS
    examples = []
    for note in tqdm(ds.annotated_entries.values(), desc="Processing entries"):
        gold_spans = note.boundary_annotation.gold
        if not gold_spans:
            continue
        words, tags = [], []
        open_section = None
        for span in gold_spans:
            # A boundary marker switches the running section before the span is labelled.
            if span.boundary:
                open_section = span.boundary
            words.append(span.span)
            tags.append(label2id[open_section or fallback_section])
        examples.append({'tokens': words, 'labels': tags})

    print(f"✅ Data processed: {len(examples)} training examples.")

    def tokenize_and_align(batch):
        """Tokenize a batch and label only the first sub-token of each word."""
        encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True, max_length=max_length)
        batch_label_ids = []
        for row, word_labels in enumerate(batch['labels']):
            row_label_ids = []
            last_word = None
            for word in encoded.word_ids(batch_index=row):
                # Special tokens (word is None) and continuation sub-tokens get
                # -100 so they are ignored by the loss.
                starts_new_word = word is not None and word != last_word
                row_label_ids.append(word_labels[word] if starts_new_word else -100)
                last_word = word
            batch_label_ids.append(row_label_ids)
        encoded['labels'] = batch_label_ids
        return encoded

    train_split = Dataset.from_list(examples).map(tokenize_and_align, batched=True)
    return DatasetDict({'train': train_split})


# ================================
# 3. TRAINING
# ================================
def train_model(train_data_path: str, model_output_dir: str):
    """Fine-tune the base token-classification model and save it.

    Training runs for a fixed number of epochs with no evaluation set. The
    final model and tokenizer are written to ``model_output_dir``, and GPU
    memory is released afterwards.

    Args:
        train_data_path: ClinAIS JSON file used for training.
        model_output_dir: Directory receiving the checkpoints, the final model
            and the tokenizer.
    """
    import inspect

    from transformers import (
        AutoModelForTokenClassification, AutoTokenizer, TrainingArguments,
        DataCollatorForTokenClassification, Trainer
    )

    print("\n--- Starting Model Training ---")
    base_model = "PlanTL-GOB-ES/bsc-bio-ehr-es"

    label_list = ClinicalSections.list()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForTokenClassification.from_pretrained(base_model, num_labels=len(label_list), id2label=id2label, label2id=label2id)

    dataset = prepare_training_data(train_data_path, tokenizer, label2id)

    # No evaluation strategy is configured: training runs a fixed number of
    # epochs and a checkpoint is written at the end of each one.
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        learning_rate=3e-5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # effective batch of 16 sequences per device
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_ratio=0.1,
        save_strategy="epoch",  # Save at the end of each epoch
        logging_steps=100,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        report_to="none",
    )

    data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=data_collator,
    )
    # Newer transformers releases deprecate Trainer's `tokenizer` argument in
    # favour of `processing_class`. Use whichever keyword the installed version
    # accepts, so the call neither warns on new versions nor fails on old ones.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_kwarg] = tokenizer

    trainer = Trainer(**trainer_kwargs)

    print("🏋️‍♂️ Starting training...")
    trainer.train()
    print("✅ Training finished.")

    trainer.save_model(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)
    print(f"💾 Final model saved to: {model_output_dir}")
    safe_cuda_cleanup()


# ================================
# 4. PREDICTION
# ================================
def generate_predictions(model_path: str, test_data_path: str, output_json_path: str):
    """Run the trained model over the test set and write the predictions as JSON.

    For each entry, the aggregated pipeline groups become the predicted
    section annotations. Predicted boundaries are then derived on top of the
    entry's existing gold spans: a span gets a boundary label when the section
    predicted at its first character differs from the section currently open.

    Args:
        model_path: Directory holding the fine-tuned model and tokenizer.
        test_data_path: ClinAIS JSON file to predict on.
        output_json_path: Destination of the JSON file with the predictions.
    """
    from transformers import pipeline

    print("\n--- Generating Final Predictions ---")
    print(f"🧪 Using test data: {test_data_path}")

    ner_pipe = pipeline("token-classification", model=model_path, tokenizer=model_path, device=0 if device.type == 'cuda' else -1, aggregation_strategy="simple")

    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_ds = ClinAISDataset.from_json(f.read())

    for entry in tqdm(test_ds.annotated_entries.values(), desc="Predicting on test set"):
        note_text = entry.note_text
        entry.section_annotation.prediction = []
        char_to_label = {}
        for pred in ner_pipe(note_text):
            # Use the character offsets reported by the pipeline instead of
            # searching the text for the decoded `word`. The decoded string can
            # differ from the source text, and a search can hit an earlier
            # occurrence. NOTE(review): the pipeline is expected to report
            # offsets here (the previous code already relied on `start`).
            start, end = pred['start'], pred['end']

            # Trim surrounding whitespace, as the previous `.strip()` did, while
            # keeping the offsets in sync with the trimmed segment.
            while start < end and note_text[start].isspace():
                start += 1
            while end > start and note_text[end - 1].isspace():
                end -= 1
            if start >= end:
                continue

            entry.section_annotation.prediction.append(SectionAnnotation(
                segment=note_text[start:end], label=pred['entity_group'],
                start_offset=start, end_offset=end
            ))
            for i in range(start, end):
                char_to_label[i] = pred['entity_group']

        current_section = None
        entry.boundary_annotation.prediction = [] # Ensure it's reset
        for gold_ba in entry.boundary_annotation.gold:
            # The section predicted at the span's first character decides its label.
            pred_boundary = char_to_label.get(gold_ba.start_offset)
            boundary_label = None
            if pred_boundary and pred_boundary != current_section:
                boundary_label = pred_boundary
                current_section = pred_boundary
            entry.boundary_annotation.prediction.append(BoundaryAnnotation(
                span=gold_ba.span, boundary=boundary_label,
                start_offset=gold_ba.start_offset, end_offset=gold_ba.end_offset
            ))

    with open(output_json_path, 'w', encoding='utf-8') as f:
        f.write(test_ds.to_json())
    print(f"✅ Predictions saved to: {output_json_path}")


# ================================
# 5. MAIN EXECUTION
# ================================
def main():
    """Train the model, predict on the test set and package the submission ZIP."""
    # Use Google Drive when running in Colab, the working directory otherwise.
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        drive_root = "/content/drive/MyDrive/"
    except ImportError:
        print("Not in Google Colab. Using current directory.")
        drive_root = "./"

    data_dir = os.path.join(drive_root, "TFG/ClinAIS_dataset")
    model_dir = os.path.join(drive_root, "TFG/TRAINS/models/clinais-final-working")
    for directory in (data_dir, model_dir):
        os.makedirs(directory, exist_ok=True)

    # Inputs: the augmented train+dev file and the blind test file.
    train_file = os.path.join(data_dir, "clinais.train_dev_augmented_SAFE.json")
    test_file = os.path.join(data_dir, "clinais.test&background.blind.json")
    # Outputs: both are written next to the model.
    predictions_json = os.path.join(model_dir, "predictions.json")
    submission_zip = os.path.join(model_dir, "submission.zip")

    print(f"📁 Data Directory: {data_dir}")
    print(f"🎯 Model Output Directory: {model_dir}")

    # Fail fast, before any training starts, if an input file is absent.
    for kind, required_path in (("Training", train_file), ("Test", test_file)):
        if not os.path.exists(required_path):
            raise FileNotFoundError(f"CRITICAL: {kind} file not found at {required_path}.")

    # --- Run Pipeline ---
    train_model(train_data_path=train_file, model_output_dir=model_dir)
    generate_predictions(model_path=model_dir, test_data_path=test_file, output_json_path=predictions_json)

    print("\n--- Creating Submission ZIP ---")
    with zipfile.ZipFile(submission_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
        # Store the predictions file at the archive root under its bare file name.
        archive.write(predictions_json, arcname=os.path.basename(predictions_json))
    print(f"📦 Submission ZIP created at: {submission_zip}")

    print("\n🎉🎉🎉 FINAL SUBMISSION PIPELINE COMPLETED SUCCESSFULLY! 🎉🎉🎉")
    print(f"You can now download and submit the file: {submission_zip}")

if __name__ == "__main__":
    main()

🚀 Using device: cuda
🔥 GPU: NVIDIA A100-SXM4-40GB
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📁 Data Directory: /content/drive/MyDrive/TFG/ClinAIS_dataset
🎯 Model Output Directory: /content/drive/MyDrive/TFG/TRAINS/models/clinais-final-working

--- Starting Model Training ---


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/bsc-bio-ehr-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 Preparing training data from: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.train_dev_augmented_SAFE.json


Processing entries:   0%|          | 0/1689 [00:00<?, ?it/s]

✅ Data processed: 1689 training examples.


Map:   0%|          | 0/1689 [00:00<?, ? examples/s]

  trainer = Trainer(


🏋️‍♂️ Starting training...


Step,Training Loss
100,1.3231
200,0.8007
300,0.6577


✅ Training finished.
💾 Final model saved to: /content/drive/MyDrive/TFG/TRAINS/models/clinais-final-working

--- Generating Final Predictions ---
🧪 Using test data: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.test&background.blind.json


Device set to use cuda:0


Predicting on test set:   0%|          | 0/2843 [00:00<?, ?it/s]

✅ Predictions saved to: /content/drive/MyDrive/TFG/TRAINS/models/clinais-final-working/predictions.json

--- Creating Submission ZIP ---
📦 Submission ZIP created at: /content/drive/MyDrive/TFG/TRAINS/models/clinais-final-working/submission.zip

🎉🎉🎉 FINAL SUBMISSION PIPELINE COMPLETED SUCCESSFULLY! 🎉🎉🎉
You can now download and submit the file: /content/drive/MyDrive/TFG/TRAINS/models/clinais-final-working/submission.zip


In [None]:
# ================================
# COMPREHENSIVE PROJECT STATISTICS
# ================================
import json
import os
import sys
import zipfile
from datetime import datetime
from pathlib import Path

import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification

def format_size(size_bytes):
    """Convert a byte count to a human readable string such as ``"1.5 KB"``.

    Units are powers of 1024, from B up to TB. Anything of 1024 TB or more is
    still expressed in TB, so the function never runs off the end of the unit
    table.

    Args:
        size_bytes: Non-negative number of bytes.

    Returns:
        ``"0 B"`` for zero, otherwise the value rounded to two decimals
        followed by its unit.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0 B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")

    size_names = ["B", "KB", "MB", "GB", "TB"]
    value = float(size_bytes)
    unit = 0
    # Repeated division by 1024 is exact in binary floating point, so the unit
    # is chosen without the rounding hazards of a floating-point logarithm.
    # The index is clamped to the largest unit in the table.
    while value >= 1024 and unit < len(size_names) - 1:
        value /= 1024
        unit += 1
    return f"{round(value, 2)} {size_names[unit]}"

def get_directory_size(path):
    """Return the size in bytes of a file, or of every file below a directory.

    A path that does not exist counts as 0. Files whose size cannot be read
    while walking a directory are skipped.
    """
    if not os.path.exists(path):
        return 0
    if os.path.isfile(path):
        return os.path.getsize(path)

    def _size_or_zero(file_path):
        # Unreadable entries contribute nothing rather than aborting the walk.
        try:
            return os.path.getsize(file_path)
        except (OSError, IOError):
            return 0

    return sum(
        _size_or_zero(os.path.join(root, name))
        for root, _subdirs, names in os.walk(path)
        for name in names
    )

def analyze_json_dataset(file_path):
    """Summarize a ClinAIS JSON dataset file.

    Returns:
        ``None`` when the file does not exist, ``{'error': <message>}`` when
        loading or analysing it fails, otherwise a dict with the entry count,
        file size, text-length statistics, section/boundary totals and
        averages, and the number of gold sections per label.
    """
    if not os.path.exists(file_path):
        return None

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        ds = ClinAISDataset(**payload)

        entries = list(ds.annotated_entries.values())
        entry_count = len(ds.annotated_entries)
        size_bytes = os.path.getsize(file_path)

        # Per-note measurements, gathered in one pass each.
        text_lengths = [len(entry.note_text) for entry in entries]
        total_sections = sum(len(entry.section_annotation.gold) for entry in entries)
        total_boundaries = sum(len(entry.boundary_annotation.gold) for entry in entries)

        # Tally gold sections per label, starting every known label at zero.
        section_counts = dict.fromkeys(ClinicalSections.list(), 0)
        for entry in entries:
            for section in entry.section_annotation.gold:
                if section.label in section_counts:
                    section_counts[section.label] += 1

        return {
            'entries': entry_count,
            'file_size': format_size(size_bytes),
            'file_size_bytes': size_bytes,
            'avg_text_length': sum(text_lengths) / len(text_lengths) if text_lengths else 0,
            'min_text_length': min(text_lengths) if text_lengths else 0,
            'max_text_length': max(text_lengths) if text_lengths else 0,
            'total_sections': total_sections,
            'avg_sections_per_entry': total_sections / entry_count if entry_count else 0,
            'total_boundaries': total_boundaries,
            'avg_boundaries_per_entry': total_boundaries / entry_count if entry_count else 0,
            'section_distribution': section_counts,
        }

    except Exception as e:
        # Best effort: report the failure to the caller instead of raising.
        return {'error': str(e)}

def analyze_model_directory(model_path):
    """Collect size and content statistics for a trained model directory.

    Args:
        model_path: Directory expected to contain the saved model.

    Returns:
        ``None`` when ``model_path`` does not exist, otherwise a dict with:

        - ``exists``: always ``True``.
        - ``total_size`` / ``total_size_bytes``: size of everything below the path.
        - ``files``: one entry per expected file name, with ``exists`` and,
          when present, ``size`` / ``size_bytes``.
        - ``additional_files`` (only when other entries exist): sizes of the
          remaining files and sub-directories directly under the path.
        - ``model_config`` (only when ``config.json`` holds a JSON object):
          selected architecture fields.
        - ``training_info`` (only when ``training_info.json`` is readable):
          its parsed content.
    """
    if not os.path.exists(model_path):
        return None

    # Walk the tree once and reuse the figure for both size fields.
    total_bytes = get_directory_size(model_path)
    stats = {
        'exists': True,
        'total_size': format_size(total_bytes),
        'total_size_bytes': total_bytes,
        'files': {}
    }

    # Expected model files
    expected_files = [
        'config.json',
        'pytorch_model.bin',
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'vocab.txt',
        'training_args.bin',
        'training_info.json'
    ]

    for filename in expected_files:
        filepath = os.path.join(model_path, filename)
        if os.path.exists(filepath):
            file_bytes = os.path.getsize(filepath)
            stats['files'][filename] = {
                'exists': True,
                'size': format_size(file_bytes),
                'size_bytes': file_bytes
            }
        else:
            stats['files'][filename] = {'exists': False}

    # Check for additional files. Listing only makes sense for a directory;
    # a plain file at model_path simply has no additional entries.
    if os.path.isdir(model_path):
        additional_files = [f for f in os.listdir(model_path) if f not in expected_files]
        if additional_files:
            stats['additional_files'] = {}
            for filename in additional_files:
                filepath = os.path.join(model_path, filename)
                if os.path.isfile(filepath):
                    file_bytes = os.path.getsize(filepath)
                    stats['additional_files'][filename] = {
                        'size': format_size(file_bytes),
                        'size_bytes': file_bytes
                    }
                elif os.path.isdir(filepath):
                    dir_bytes = get_directory_size(filepath)
                    stats['additional_files'][filename] = {
                        'type': 'directory',
                        'size': format_size(dir_bytes),
                        'size_bytes': dir_bytes
                    }

    # Try to load model config. Only I/O and parse failures are tolerated
    # (json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses);
    # anything else, including KeyboardInterrupt, propagates.
    config_path = os.path.join(model_path, 'config.json')
    if os.path.exists(config_path):
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except (OSError, ValueError):
            config = None
        if isinstance(config, dict):
            stats['model_config'] = {
                'model_type': config.get('model_type'),
                'num_labels': config.get('num_labels'),
                'hidden_size': config.get('hidden_size'),
                'num_attention_heads': config.get('num_attention_heads'),
                'num_hidden_layers': config.get('num_hidden_layers'),
                'vocab_size': config.get('vocab_size')
            }

    # Try to load training info, with the same tolerance as above.
    training_info_path = os.path.join(model_path, 'training_info.json')
    if os.path.exists(training_info_path):
        try:
            with open(training_info_path, 'r', encoding='utf-8') as f:
                stats['training_info'] = json.load(f)
        except (OSError, ValueError):
            pass

    return stats

def show_comprehensive_stats():
    """Print a full status report for the project to stdout.

    The report has eight parts: system/GPU information, size of the main
    directories, per-dataset statistics (plus a comparison table), contents of
    the model directory, prediction/evaluation artefacts, a storage summary,
    the list of clinical sections and a submission readiness checklist.

    Nothing is returned. The function relies on helpers defined elsewhere in
    the notebook (``analyze_json_dataset``, ``analyze_model_directory``,
    ``format_size``, ``get_directory_size``) and on the hard-coded Drive paths
    below.

    Failures while inspecting an optional artefact (prediction JSON, evaluation
    report, submission ZIP) are reported as a warning line instead of being
    silently discarded, and they no longer swallow ``KeyboardInterrupt``.
    """
    print("🔍" + "="*80)
    print("🔍 COMPREHENSIVE PROJECT STATISTICS")
    print("🔍" + "="*80)
    print(f"📅 Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Define all paths
    base_path = "/content/drive/MyDrive/TFG/ClinAIS_dataset"
    model_output_dir = "/content/drive/MyDrive/TFG/TRAINS/models/clinais-final-augmented"

    datasets = {
        "Train": os.path.join(base_path, "clinais.train.json"),
        "Dev": os.path.join(base_path, "clinais.dev.json"),
        "Test": os.path.join(base_path, "clinais.test&background.blind.json"),
        "Combined": os.path.join(base_path, "clinais.train_dev_combined.json"),
        "Translated": os.path.join(base_path, "clinais.train_translated_SAFE.json"),
        "Augmented": os.path.join(base_path, "clinais.train_dev_augmented_SAFE.json")
    }

    # 1. SYSTEM INFORMATION
    print("\n🖥️  === SYSTEM INFORMATION ===")
    print(f"🐍 Python Version: {sys.version.split()[0]}")
    print(f"🔥 PyTorch Version: {torch.__version__}")
    print(f"🤖 Transformers Version: {transformers.__version__}")
    print(f"💻 Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")

    if torch.cuda.is_available():
        print(f"🚀 GPU: {torch.cuda.get_device_name()}")
        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        print(f"🔋 GPU Memory Available: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()) / 1e9:.1f} GB")

    # 2. DIRECTORY STRUCTURE
    print("\n📁 === DIRECTORY STRUCTURE ===")
    directories_to_check = {
        "Dataset Base": base_path,
        "Model Output": model_output_dir,
        "Drive Root": "/content/drive/MyDrive/TFG",
        "TRAINS": "/content/drive/MyDrive/TFG/TRAINS",
        "Models": "/content/drive/MyDrive/TFG/TRAINS/models"
    }

    for name, path in directories_to_check.items():
        if os.path.exists(path):
            size = get_directory_size(path)
            # Count files and sub-directories in a single traversal; walking a
            # large Drive tree once per counter is needlessly slow.
            file_count = 0
            dir_count = 0
            for _root, dirs, files in os.walk(path):
                file_count += len(files)
                dir_count += len(dirs)
            print(f"✅ {name}: {format_size(size)} ({file_count} files, {dir_count} dirs)")
        else:
            print(f"❌ {name}: Not found")

    # 3. DATASET ANALYSIS
    print("\n📊 === DATASET ANALYSIS ===")
    dataset_stats = {}
    total_entries = 0

    for name, path in datasets.items():
        print(f"\n📋 {name} Dataset:")
        stats = analyze_json_dataset(path)
        # A falsy result is reported as a missing file; a dict carrying an
        # 'error' key is reported as an analysis failure.
        if stats:
            if 'error' in stats:
                print(f"  ❌ Error: {stats['error']}")
            else:
                dataset_stats[name] = stats
                total_entries += stats['entries']
                print(f"  📄 Entries: {stats['entries']:,}")
                print(f"  💾 Size: {stats['file_size']}")
                print(f"  📏 Avg text length: {stats['avg_text_length']:.0f} chars")
                print(f"  📑 Avg sections/entry: {stats['avg_sections_per_entry']:.1f}")
                print(f"  🔗 Avg boundaries/entry: {stats['avg_boundaries_per_entry']:.1f}")

                # Show section distribution
                print(f"  🏷️  Section Distribution:")
                for section, count in stats['section_distribution'].items():
                    percentage = (count / stats['total_sections']) * 100 if stats['total_sections'] > 0 else 0
                    print(f"    {section}: {count} ({percentage:.1f}%)")
        else:
            print(f"  ❌ File not found")

    # Dataset comparison (only meaningful with at least two analysed datasets)
    if len(dataset_stats) > 1:
        print(f"\n📈 === DATASET COMPARISON ===")
        comparison_data = []
        for name, stats in dataset_stats.items():
            comparison_data.append({
                'Dataset': name,
                'Entries': f"{stats['entries']:,}",
                'Size': stats['file_size'],
                'Avg Text Len': f"{stats['avg_text_length']:.0f}",
                'Avg Sections': f"{stats['avg_sections_per_entry']:.1f}",
                'Avg Boundaries': f"{stats['avg_boundaries_per_entry']:.1f}"
            })

        df = pd.DataFrame(comparison_data)
        print(df.to_string(index=False))

    # 4. MODEL ANALYSIS
    print(f"\n🤖 === MODEL ANALYSIS ===")
    model_stats = analyze_model_directory(model_output_dir)

    if model_stats and model_stats['exists']:
        print(f"✅ Model exists: {model_stats['total_size']}")

        print(f"\n📄 Model Files:")
        for filename, info in model_stats['files'].items():
            if info['exists']:
                print(f"  ✅ {filename}: {info['size']}")
            else:
                print(f"  ❌ {filename}: Missing")

        if 'additional_files' in model_stats:
            print(f"\n📄 Additional Files:")
            for filename, info in model_stats['additional_files'].items():
                if 'type' in info and info['type'] == 'directory':
                    print(f"  📁 {filename}: {info['size']}")
                else:
                    print(f"  📄 {filename}: {info['size']}")

        if 'model_config' in model_stats:
            print(f"\n⚙️  Model Configuration:")
            config = model_stats['model_config']
            for key, value in config.items():
                if value is not None:
                    print(f"  {key}: {value}")

        if 'training_info' in model_stats:
            print(f"\n🏋️‍♂️ Training Information:")
            training = model_stats['training_info']
            for key, value in training.items():
                if key == 'labels':
                    # Only the number of labels is shown, not the full list.
                    print(f"  {key}: {len(value)} labels")
                elif isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
                else:
                    print(f"  {key}: {value}")
    else:
        print(f"❌ Model not found at {model_output_dir}")

    # 5. PREDICTION FILES
    print(f"\n🔮 === PREDICTION FILES ===")
    prediction_files = {
        "Dev Predictions": os.path.join(model_output_dir, "dev_predictions.json"),
        "Test Predictions": os.path.join(model_output_dir, "test_predictions.json"),
        "Dev Eval Report": os.path.join(model_output_dir, "dev_evaluation_report.json"),
        "Submission ZIP": os.path.join(model_output_dir, "clinais_final_submission.zip")
    }

    for name, path in prediction_files.items():
        if os.path.exists(path):
            size = format_size(os.path.getsize(path))
            print(f"✅ {name}: {size}")

            # Special analysis for prediction files
            if name.endswith("Predictions") and path.endswith('.json'):
                try:
                    pred_stats = analyze_json_dataset(path)
                    if pred_stats and 'entries' in pred_stats:
                        print(f"   📊 Entries: {pred_stats['entries']:,}")
                except Exception as exc:
                    # Best effort: the report continues, but the reason is shown.
                    print(f"   ⚠️ Could not analyse {name}: {exc}")

            # Special analysis for evaluation report
            elif name == "Dev Eval Report":
                try:
                    with open(path, 'r') as f:
                        eval_data = json.load(f)
                    b2_score = eval_data.get('b2_score', 0)
                    print(f"   🎯 B2 Score: {b2_score:.4f} ({b2_score*100:.2f}%)")
                    print(f"   ❌ Boundary Errors: {eval_data.get('total_boundary_errors', 'N/A')}")
                except Exception as exc:
                    print(f"   ⚠️ Could not read evaluation report: {exc}")

            # Special analysis for ZIP file
            elif name == "Submission ZIP":
                try:
                    with zipfile.ZipFile(path, 'r') as zipf:
                        files_in_zip = zipf.namelist()
                        print(f"   📦 Files in ZIP: {files_in_zip}")
                        # Check ZIP contents
                        for zip_file in files_in_zip:
                            info = zipf.getinfo(zip_file)
                            print(f"   📄 {zip_file}: {format_size(info.file_size)}")
                except Exception as exc:
                    print(f"   ⚠️ Could not inspect submission ZIP: {exc}")
        else:
            print(f"❌ {name}: Not found")

    # 6. STORAGE SUMMARY
    print(f"\n💾 === STORAGE SUMMARY ===")
    total_dataset_size = sum(stats['file_size_bytes'] for stats in dataset_stats.values())
    model_size = model_stats['total_size_bytes'] if model_stats and model_stats['exists'] else 0

    print(f"📊 Total Dataset Size: {format_size(total_dataset_size)}")
    print(f"🤖 Model Size: {format_size(model_size)}")
    # "Project size" here means analysed datasets + model directory only.
    print(f"📁 Total Project Size: {format_size(total_dataset_size + model_size)}")
    print(f"📈 Total Entries Across All Datasets: {total_entries:,}")

    # 7. CLINICAL SECTIONS OVERVIEW
    print(f"\n🏥 === CLINICAL SECTIONS OVERVIEW ===")
    print(f"📋 Available Sections:")
    for i, section in enumerate(ClinicalSections.list(), 1):
        print(f"  {i}. {section}")

    # 8. READINESS CHECK
    print(f"\n✅ === SUBMISSION READINESS CHECK ===")
    readiness_items = [
        ("Train Dataset", datasets["Train"]),
        ("Dev Dataset", datasets["Dev"]),
        ("Test Dataset", datasets["Test"]),
        ("Augmented Dataset", datasets["Augmented"]),
        ("Trained Model", model_output_dir),
        ("Test Predictions", os.path.join(model_output_dir, "test_predictions.json")),
        ("Submission ZIP", os.path.join(model_output_dir, "clinais_final_submission.zip"))
    ]

    # Every item must exist on disk for the project to count as ready.
    all_ready = True
    for item_name, item_path in readiness_items:
        if os.path.exists(item_path):
            print(f"✅ {item_name}")
        else:
            print(f"❌ {item_name}")
            all_ready = False

    if all_ready:
        print(f"\n🎉 === PROJECT STATUS: READY FOR SUBMISSION! ===")
    else:
        print(f"\n⚠️ === PROJECT STATUS: INCOMPLETE ===")
        print("Some required files are missing. Please complete the pipeline.")

    print(f"\n🔍" + "="*80)
    print(f"🔍 END OF COMPREHENSIVE ANALYSIS")
    print(f"🔍" + "="*80)

# Run the comprehensive analysis: prints the whole project report to the cell output.
show_comprehensive_stats()

🔍 COMPREHENSIVE PROJECT STATISTICS
📅 Analysis Date: 2025-06-19 09:20:04

🖥️  === SYSTEM INFORMATION ===
🐍 Python Version: 3.11.13
🔥 PyTorch Version: 2.6.0+cu124
🤖 Transformers Version: 4.52.4
💻 Device: cuda
🚀 GPU: NVIDIA A100-SXM4-40GB
💾 GPU Memory: 42.5 GB
🔋 GPU Memory Available: 38.9 GB

📁 === DIRECTORY STRUCTURE ===
✅ Dataset Base: 644.74 MB (18 files, 0 dirs)
✅ Model Output: 3.45 GB (39 files, 2 dirs)
✅ Drive Root: 213.28 GB (2185 files, 228 dirs)
✅ TRAINS: 212.65 GB (2066 files, 208 dirs)
✅ Models: 173.82 GB (1668 files, 173 dirs)

📊 === DATASET ANALYSIS ===

📋 Train Dataset:
  📄 Entries: 781
  💾 Size: 39.37 MB
  📏 Avg text length: 2286 chars
  📑 Avg sections/entry: 8.3
  🔗 Avg boundaries/entry: 349.1
  🏷️  Section Distribution:
    PRESENT_ILLNESS: 1258 (19.4%)
    DERIVED_FROM/TO: 180 (2.8%)
    PAST_MEDICAL_HISTORY: 692 (10.7%)
    FAMILY_HISTORY: 84 (1.3%)
    EXPLORATION: 1533 (23.7%)
    TREATMENT: 1471 (22.7%)
    EVOLUTION: 1258 (19.4%)

📋 Dev Dataset:
  📄 Entries: 127
  💾

In [None]:
# APPROACH (MEJORES RESULTADOS)

In [None]:
# This command updates the necessary libraries. Run this cell first.
!pip install --upgrade transformers datasets evaluate accelerate



In [None]:
# ===================================================================
# FINAL SUBMISSION PIPELINE (EXPERIMENT E8 - POST-PROCESSING FIX)
#
# This definitive version fixes the pydantic.ValidationError by
# correctly handling the output of the prediction pipeline before
# running the baseline post-processor.
# ===================================================================
import os
import json
import logging
import gc
import math
import numpy as np
from enum import Enum
from typing import List, Dict, Any
from pydantic import BaseModel, Field
from pathlib import Path
import zipfile

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

from tqdm.auto import tqdm

from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForTokenClassification, AutoTokenizer,
    DataCollatorForTokenClassification, get_cosine_schedule_with_warmup
)
import evaluate
from sklearn.utils.class_weight import compute_class_weight


# --- Basic Setup ---
# Root logger at INFO; the transformers and pydantic loggers are raised to
# WARNING so their informational messages do not flood the cell output.
logging.basicConfig(level=logging.INFO)
logging.getLogger("transformers").setLevel(logging.WARNING)
logging.getLogger("pydantic").setLevel(logging.WARNING)


# --- Device Setup ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is a module-level global read by the training/prediction functions.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")
if torch.cuda.is_available():
    print(f"🔥 GPU: {torch.cuda.get_device_name()}")
    # Turn on cuDNN benchmark mode (auto-tuning of cuDNN algorithms).
    torch.backends.cudnn.benchmark = True


def safe_cuda_cleanup():
    """Free unreachable Python objects, then hand cached GPU memory back.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA caching allocator is
    emptied; in the opposite order their blocks would stay cached until the
    next call. On a machine without CUDA only the garbage collection runs.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Let pending GPU work finish before releasing the cached blocks.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()


# ================================
# 1. DATASET MODELS (PYDANTIC)
# ================================
class ClinicalSections(str, Enum):
    """Closed set of section labels a clinical note can be split into.

    Members are also ``str`` instances, so they compare and hash like their
    string value.
    """
    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    DERIVED_FROM_TO = "DERIVED_FROM/TO"
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

    @classmethod
    def list(cls):
        """Return the label values as plain strings, in declaration order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

class SectionAnnotation(BaseModel):
    # One labelled section of a note: its text, its section label and the
    # start/end offsets of the span.
    segment: str
    label: ClinicalSections
    start_offset: int
    end_offset: int
class SectionAnnotations(BaseModel):
    # Section-level annotations of one note; both lists default to empty.
    gold: List[SectionAnnotation] = Field(default_factory=list)
    prediction: List[SectionAnnotation] = Field(default_factory=list)
class BoundaryAnnotation(BaseModel):
    # One span of the note with its offsets. `boundary` holds a section label
    # when a section starts at this span and is None otherwise.
    span: str
    boundary: ClinicalSections | None
    start_offset: int
    end_offset: int
class BoundaryAnnotations(BaseModel):
    # Boundary-level annotations of one note; both lists default to empty.
    gold: List[BoundaryAnnotation] = Field(default_factory=list)
    prediction: List[BoundaryAnnotation] = Field(default_factory=list)
class Entry(BaseModel):
    # A single clinical note with its section- and boundary-level annotations.
    # Both annotation containers default to empty ones when absent.
    note_id: str
    note_text: str
    section_annotation: SectionAnnotations = Field(default_factory=SectionAnnotations)
    boundary_annotation: BoundaryAnnotations = Field(default_factory=BoundaryAnnotations)
class ClinAISDataset(BaseModel):
    # Whole dataset file: entries keyed by string, plus a free-form scores dict.
    annotated_entries: Dict[str, Entry]
    scores: Dict[str, Any] = Field(default_factory=dict)

    def to_json(self):
        """Serialise the dataset to an indented JSON string.

        Uses ``model_dump_json`` when the installed pydantic provides it and
        falls back to ``dict()`` + ``json.dumps`` otherwise.
        """
        if not hasattr(self, 'model_dump_json'):
            return json.dumps(self.dict(), indent=2, ensure_ascii=False)
        return self.model_dump_json(indent=2)

    @classmethod
    def from_json(cls, json_str: str):
        """Build a dataset from a JSON string.

        Uses ``model_validate_json`` when available and ``parse_raw`` otherwise.
        """
        if not hasattr(cls, 'model_validate_json'):
            return cls.parse_raw(json_str)
        return cls.model_validate_json(json_str)


# ================================
# 2. DATA PREPARATION
# ================================
def prepare_training_data(dataset_path: str, tokenizer, label2id: Dict, max_length: int = 512, split_ratio: float = 0.9):
    """Load a ClinAIS JSON file and turn it into tokenised train/validation sets.

    Each note becomes one example whose words are the gold boundary spans.
    Every word gets the id of the section opened by the most recent boundary;
    words that precede the first boundary are labelled PRESENT_ILLNESS.
    Examples are shuffled with a fixed NumPy seed (42) and split by
    ``split_ratio``. After tokenisation only the first sub-token of each word
    keeps its label; special tokens and continuation pieces get -100.

    Args:
        dataset_path: Path of the dataset JSON file.
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            output must provide ``word_ids``.
        label2id: Mapping from section label to integer id.
        max_length: Truncation length passed to the tokenizer.
        split_ratio: Fraction of examples assigned to the training split.

    Returns:
        DatasetDict with ``'train'`` and ``'validation'`` splits.
    """
    print(f"📊 Preparing training data from: {dataset_path}")
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        corpus = ClinAISDataset.from_json(handle.read())

    fallback_section = ClinicalSections.PRESENT_ILLNESS
    examples = []
    for entry in tqdm(corpus.annotated_entries.values(), desc="Processing entries"):
        words, word_labels = [], []
        active_section = None
        for annotation in entry.boundary_annotation.gold:
            words.append(annotation.span)
            if annotation.boundary:
                active_section = annotation.boundary
            word_labels.append(label2id[active_section or fallback_section])
        if words:
            examples.append({'tokens': words, 'labels': word_labels})

    # Deterministic shuffle; note that this reseeds NumPy's global generator.
    np.random.seed(42)
    np.random.shuffle(examples)
    cut = int(len(examples) * split_ratio)
    train_examples = examples[:cut]
    val_examples = examples[cut:]
    print(f"✅ Data split: {len(train_examples)} training, {len(val_examples)} validation examples.")

    def align_to_subtokens(word_ids, word_labels):
        """Label the first sub-token of every word and mask everything else with -100."""
        aligned = []
        last_word = None
        for word_id in word_ids:
            starts_new_word = word_id is not None and word_id != last_word
            aligned.append(word_labels[word_id] if starts_new_word else -100)
            last_word = word_id
        return aligned

    def encode_batch(batch):
        """Tokenise a batch of word lists and attach the aligned label ids."""
        encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True, max_length=max_length)
        encoded['labels'] = [
            align_to_subtokens(encoded.word_ids(batch_index=row), word_labels)
            for row, word_labels in enumerate(batch['labels'])
        ]
        return encoded

    splits = {}
    for split_name, split_examples in (('train', train_examples), ('validation', val_examples)):
        splits[split_name] = Dataset.from_list(split_examples).map(
            encode_batch, batched=True, remove_columns=['tokens']
        )
    return DatasetDict(splits)

def compute_class_weights(dataset_path: str, label2id: Dict):
    """Compute 'balanced' class weights for the token-classification loss.

    The weights are derived from the per-word label distribution: every gold
    span counts once, labelled with the section opened by the most recent
    boundary (PRESENT_ILLNESS before the first boundary). This is the same
    labelling rule the training data uses, so the weights describe the labels
    the loss is actually applied to. Previously only the section-start spans
    were counted, which measures how often a section occurs rather than how
    many words carry its label.

    Args:
        dataset_path: Path of the dataset JSON file.
        label2id: Mapping from section label to integer id.

    Returns:
        1-D tensor on the global ``device`` with one weight per id in
        ``label2id``. Classes that never occur keep weight 1.0; if the dataset
        contains no spans at all, every weight is 1.0.
    """
    print("⚖️ Computing class weights for minority classes...")
    with open(dataset_path, 'r', encoding='utf-8') as f:
        ds = ClinAISDataset.from_json(f.read())

    all_labels = []
    for entry in ds.annotated_entries.values():
        current_section = None
        for ba in entry.boundary_annotation.gold:
            if ba.boundary:
                current_section = ba.boundary
            all_labels.append(label2id[current_section or ClinicalSections.PRESENT_ILLNESS])

    weight_tensor = torch.ones(len(label2id)).to(device)
    if all_labels:
        # compute_class_weight needs at least one sample, hence the guard.
        unique_labels = np.unique(all_labels)
        weights = compute_class_weight('balanced', classes=unique_labels, y=np.asarray(all_labels))
        for label, weight in zip(unique_labels, weights):
            weight_tensor[int(label)] = float(weight)
    print("✅ Class weights computed.")
    return weight_tensor


# ================================
# 3. TRAINING (E8: ADVANCED HPs)
# ================================
def train_model_e8(train_data_path: str, model_output_dir: str):
    """Manual PyTorch training loop with E8's advanced hyperparameters.

    Fine-tunes the base encoder for per-token section classification using a
    class-weighted, label-smoothed cross-entropy loss, a cosine learning-rate
    schedule without warm-up, mixed-precision training with gradient scaling
    and gradient-norm clipping. After each epoch the model is scored with the
    seqeval metric on the validation split; the model and tokenizer are saved
    to ``model_output_dir`` whenever the overall F1 improves, and training
    stops after ``PATIENCE`` consecutive epochs without improvement.

    Args:
        train_data_path: Dataset JSON used for both the train and validation
            split and for the class weights.
        model_output_dir: Directory that receives the best checkpoint.

    Returns:
        None. The best checkpoint on disk is the result.
    """
    print("\n--- Starting E8 Training (Advanced HPs) ---")

    # --- E8 Config ---
    base_model = "PlanTL-GOB-ES/bsc-bio-ehr-es"
    label_list = ClinicalSections.list()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    # Hyperparameters
    EPOCHS = 10
    PATIENCE = 2
    LEARNING_RATE = 2e-5
    TRAIN_BATCH_SIZE = 8
    EVAL_BATCH_SIZE = 16
    MAX_GRAD_NORM = 1.0
    LABEL_SMOOTHING = 0.1

    # --- Model, Tokenizer, Data ---
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForTokenClassification.from_pretrained(base_model, num_labels=len(label_list), id2label=id2label, label2id=label2id).to(device)
    dataset = prepare_training_data(train_data_path, tokenizer, label2id)
    data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    train_dataloader = DataLoader(dataset["train"], shuffle=True, collate_fn=data_collator, batch_size=TRAIN_BATCH_SIZE)
    eval_dataloader = DataLoader(dataset["validation"], collate_fn=data_collator, batch_size=EVAL_BATCH_SIZE)
    class_weights = compute_class_weights(train_data_path, label2id)
    # Positions labelled -100 are skipped by CrossEntropyLoss (its default ignore_index).
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights, label_smoothing=LABEL_SMOOTHING)

    # --- Optimizer, Scheduler, Scaler (for FP16) ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    num_training_steps = len(train_dataloader) * EPOCHS
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    scaler = GradScaler()

    # --- Training Loop ---
    print("🏋️‍♂️ Starting manual training loop for E8...")
    best_f1 = 0.0
    epochs_no_improve = 0
    metric = evaluate.load("seqeval")

    for epoch in range(EPOCHS):
        print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader, desc="Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            with autocast():
                outputs = model(**batch)
                # The model's own loss is ignored in favour of the weighted, smoothed one.
                loss = loss_fct(outputs.logits.view(-1, model.config.num_labels), batch["labels"].view(-1))
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale at this point.
            # Unscale them first so MAX_GRAD_NORM is compared against the true
            # gradient norm; scaler.step() knows they are already unscaled.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"  Avg Training Loss: {avg_train_loss:.4f}")

        # --- Evaluation Step ---
        model.eval()
        all_preds, all_labels = [], []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            predicted_ids = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            all_preds.extend(predicted_ids.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        # Keep only positions that carry a real label (-100 marks ignored tokens).
        true_predictions = [[id2label[p] for (p, l) in zip(pred, lab) if l != -100] for pred, lab in zip(all_preds, all_labels)]
        true_labels = [[id2label[l] for (p, l) in zip(pred, lab) if l != -100] for pred, lab in zip(all_preds, all_labels)]
        results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
        current_f1 = results["overall_f1"]
        print(f"  Validation F1: {current_f1:.4f} | Best F1: {best_f1:.4f}")

        # --- Save Best Model & Early Stopping ---
        if current_f1 > best_f1:
            best_f1 = current_f1
            epochs_no_improve = 0
            print(f"  🎉 New best model found! Saving to {model_output_dir}")
            model.save_pretrained(model_output_dir)
            tokenizer.save_pretrained(model_output_dir)
        else:
            epochs_no_improve += 1
            print(f"  F1 did not improve. Early stopping counter: {epochs_no_improve}/{PATIENCE}")

        if epochs_no_improve >= PATIENCE:
            print(f"🛑 Early stopping triggered after {epoch + 1} epochs.")
            break

    print("✅ Training finished.")
    safe_cuda_cleanup()


# ===================================================================
# 4. PREDICTION (E8: BASELINE POST-PROCESSING - CORRECTED)
# ===================================================================

# Helper class for predictions before they are processed
class TempPrediction:
    """Mutable holder for one aggregated prediction before post-processing.

    The canonical fields are ``entity_group``, ``score``, ``word``, ``start``
    and ``end`` (the constructor arguments). ``label``, ``segment``,
    ``start_offset`` and ``end_offset`` are aliases named like the fields of
    SectionAnnotation. They are properties rather than copies made at
    construction time, so they stay correct after the post-processor rewrites
    ``word``/``start``/``end`` while merging sections.
    """

    def __init__(self, entity_group, score, word, start, end):
        self.entity_group = entity_group
        self.score = score
        self.word = word
        self.start = start
        self.end = end

    @property
    def label(self):
        """Alias of ``entity_group``."""
        return self.entity_group

    @label.setter
    def label(self, value):
        self.entity_group = value

    @property
    def segment(self):
        """Alias of ``word``."""
        return self.word

    @segment.setter
    def segment(self, value):
        self.word = value

    @property
    def start_offset(self):
        """Alias of ``start``."""
        return self.start

    @start_offset.setter
    def start_offset(self, value):
        self.start = value

    @property
    def end_offset(self):
        """Alias of ``end``."""
        return self.end

    @end_offset.setter
    def end_offset(self, value):
        self.end = value

# Baseline Post-Processor Class (from your original working script)
class PredictionPostProcessor:
    """Clean up a list of predicted sections, in place.

    Works on objects exposing mutable ``entity_group``, ``word``, ``start``
    and ``end`` attributes, assumed to be ordered by position in the note.
    Only those four attributes are read or updated.
    """

    def __init__(self, prediction_sections):
        self.sections = prediction_sections
        # Sections with fewer whitespace-separated words than this are absorbed.
        self.min_section_size = 3

    def get_section_size(self, sec):
        """Return the number of whitespace-separated words in the section text."""
        return len(sec.word.strip().split())

    def merge_undersize_sections(self):
        """Absorb every section shorter than ``min_section_size`` into a neighbour.

        A short section is merged into the previous section when there is one
        (whatever its size), otherwise into the next one. The absorbing section
        keeps its own label. A short section with no neighbour is left as is.
        """
        i = 0
        while i < len(self.sections):
            if self.get_section_size(self.sections[i]) < self.min_section_size:
                # Prefer the previous section whenever one exists.
                if i > 0:
                    self.sections[i-1].word += self.sections[i].word
                    self.sections[i-1].end = self.sections[i].end
                    del self.sections[i]
                # First section of the list: fold it into the next one. `i` is
                # not advanced, so the merged section is checked again.
                elif i < len(self.sections) - 1:
                    self.sections[i+1].word = self.sections[i].word + self.sections[i+1].word
                    self.sections[i+1].start = self.sections[i].start
                    del self.sections[i]
                else:
                    i += 1 # Cannot merge, move on
            else:
                i += 1

    def merge_contiguous_equivalent_sections(self):
        """Collapse runs of adjacent sections that share the same label."""
        if not self.sections: return
        merged = [self.sections[0]]
        for i in range(1, len(self.sections)):
            if self.sections[i].entity_group == merged[-1].entity_group:
                merged[-1].word += self.sections[i].word
                merged[-1].end = self.sections[i].end
            else:
                merged.append(self.sections[i])
        self.sections = merged

    def process(self):
        """Run the full clean-up and return the resulting list of sections."""
        self.merge_contiguous_equivalent_sections()
        self.merge_undersize_sections()
        # Absorbing a short section can leave its two neighbours adjacent with
        # the same label (A, b, A -> A, A); merge again so the output never
        # contains two consecutive sections of the same type.
        self.merge_contiguous_equivalent_sections()
        return self.sections

def generate_predictions_e8(model_path: str, test_data_path: str, output_json_path: str):
    """Generates predictions using the Baseline Post-Processing logic for E8.

    For every note in the test file the fine-tuned model is run through a
    token-classification pipeline (``aggregation_strategy="simple"``), the
    aggregated groups are cleaned with PredictionPostProcessor and the result
    is stored on the entry as both section predictions and boundary
    predictions. The annotated dataset is then written to ``output_json_path``.

    Boundary predictions reuse the spans of the gold boundary list: the first
    span starting inside a predicted section receives that section's label and
    all other spans keep ``boundary=None``.

    Args:
        model_path: Directory holding the saved model and tokenizer.
        test_data_path: Dataset JSON to annotate.
        output_json_path: Destination of the annotated dataset JSON.
    """
    from transformers import pipeline
    print(f"\n--- E8: Generating Predictions with Baseline Post-Processing (FIXED) ---")
    # NOTE(review): no `stride` is passed, so the pipeline presumably truncates
    # each note to the tokenizer's maximum length and the tail of long notes
    # gets no prediction. Confirm against the installed transformers version.
    ner_pipe = pipeline("token-classification", model=model_path, tokenizer=model_path, device=0 if device.type == 'cuda' else -1, aggregation_strategy="simple")
    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_ds = ClinAISDataset.from_json(f.read())

    for entry in tqdm(test_ds.annotated_entries.values(), desc="Predicting on test set"):
        # 1. Get raw predictions; fall back to a single section spanning the
        #    whole note when the pipeline returns nothing.
        raw_predictions = ner_pipe(entry.note_text)
        if not raw_predictions:
            raw_predictions = [{'entity_group': 'PRESENT_ILLNESS', 'score': 0.5, 'word': entry.note_text, 'start': 0, 'end': len(entry.note_text)}]

        # 2. Convert raw dicts to temporary objects before processing
        temp_sections = [TempPrediction(**pred) for pred in raw_predictions]

        # 3. Run baseline post-processor on these temporary objects
        processor = PredictionPostProcessor(temp_sections)
        processed_sections = processor.process()

        # 4. Create final Pydantic Section Annotations from the *processed* sections
        entry.section_annotation.prediction = []
        for section in processed_sections:
            entry.section_annotation.prediction.append(SectionAnnotation(
                segment=section.word, label=ClinicalSections(section.entity_group),
                start_offset=section.start, end_offset=section.end
            ))

        # 5. Create Boundary Annotations based on the final processed sections
        entry.boundary_annotation.prediction = []
        for gold_ba in entry.boundary_annotation.gold:
            entry.boundary_annotation.prediction.append(BoundaryAnnotation(
                span=gold_ba.span, boundary=None, start_offset=gold_ba.start_offset, end_offset=gold_ba.end_offset
            ))

        for section in processed_sections:
            # Read start/end/entity_group: these are the fields the
            # post-processor updates when it merges sections. Other copies of
            # the offsets taken before post-processing may be out of date.
            for ba in entry.boundary_annotation.prediction:
                if section.start <= ba.start_offset < section.end:
                    ba.boundary = ClinicalSections(section.entity_group)
                    break  # only the first span inside the section opens it

    with open(output_json_path, 'w', encoding='utf-8') as f:
        f.write(test_ds.to_json())
    print(f"✅ E8 Predictions saved to: {output_json_path}")


# ================================
# 5. MAIN EXECUTION
# ================================
def main():
    """Run the complete E8 pipeline: train, predict on the test file, build the ZIP.

    On Google Colab the data and model directories live on the mounted Drive;
    elsewhere they are resolved relative to the current directory. Raises
    FileNotFoundError when the training or the test file is missing.
    """
    # Storage root: Drive when running on Colab, working directory otherwise.
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        drive_root = "/content/drive/MyDrive/"
    except (ImportError, ModuleNotFoundError):
        print("Not in Google Colab. Using current directory.")
        drive_root = "./"

    data_dir = os.path.join(drive_root, "TFG/ClinAIS_dataset")
    model_dir = os.path.join(drive_root, "TFG/TRAINS/models/clinais-E8-final")
    for folder in (data_dir, model_dir):
        os.makedirs(folder, exist_ok=True)

    train_file = os.path.join(data_dir, "clinais.train_dev_augmented_SAFE.json")
    test_file = os.path.join(data_dir, "clinais.test&background.blind.json")
    predictions_json = os.path.join(model_dir, "predictions.json")
    submission_zip = os.path.join(model_dir, "submission.zip")

    print(f"📁 Data Directory: {data_dir}")
    print(f"🎯 Model Output Directory: {model_dir}")

    # Fail early, before any training, if an input file is missing.
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"CRITICAL: Training file not found at {train_file}.")
    if not os.path.exists(test_file):
        raise FileNotFoundError(f"CRITICAL: Test file not found at {test_file}.")

    # --- Run E8 Pipeline ---
    train_model_e8(train_data_path=train_file, model_output_dir=model_dir)
    generate_predictions_e8(model_path=model_dir, test_data_path=test_file, output_json_path=predictions_json)

    print("\n--- Creating Submission ZIP ---")
    # The predictions file is stored at the archive root under its base name.
    with zipfile.ZipFile(submission_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.write(predictions_json, arcname=os.path.basename(predictions_json))
    print(f"📦 Submission ZIP created at: {submission_zip}")

    print("\n🎉🎉🎉 EXPERIMENT E8 PIPELINE COMPLETED SUCCESSFULLY! 🎉🎉🎉")
    print(f"You can now download and submit the file: {submission_zip}")

if __name__ == "__main__":
    main()

🚀 Using device: cuda
🔥 GPU: NVIDIA A100-SXM4-40GB
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📁 Data Directory: /content/drive/MyDrive/TFG/ClinAIS_dataset
🎯 Model Output Directory: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final

--- Starting E8 Training (Advanced HPs) ---


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/bsc-bio-ehr-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 Preparing training data from: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.train_dev_augmented_SAFE.json


Processing entries:   0%|          | 0/1689 [00:00<?, ?it/s]

✅ Data split: 1520 training, 169 validation examples.


Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/169 [00:00<?, ? examples/s]

⚖️ Computing class weights for minority classes...
✅ Class weights computed.
🏋️‍♂️ Starting manual training loop for E8...


  scaler = GradScaler()



--- Epoch 1/10 ---


Training:   0%|          | 0/190 [00:00<?, ?it/s]

  with autocast():


  Avg Training Loss: 2.1954


Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]



  Validation F1: 0.6045 | Best F1: 0.0000
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final

--- Epoch 2/10 ---


Training:   0%|          | 0/190 [00:00<?, ?it/s]

  Avg Training Loss: 1.8695


Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

  Validation F1: 0.7185 | Best F1: 0.6045
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final

--- Epoch 3/10 ---


Training:   0%|          | 0/190 [00:00<?, ?it/s]

  Avg Training Loss: 1.7172


Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

  Validation F1: 0.7594 | Best F1: 0.7185
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final

--- Epoch 4/10 ---


Training:   0%|          | 0/190 [00:00<?, ?it/s]

  Avg Training Loss: 1.6178


Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

  Validation F1: 0.7740 | Best F1: 0.7594
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final

--- Epoch 5/10 ---


Training:   0%|          | 0/190 [00:00<?, ?it/s]

  Avg Training Loss: 1.5546


Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

  Validation F1: 0.7736 | Best F1: 0.7740
  F1 did not improve. Early stopping counter: 1/2

--- Epoch 6/10 ---


Training:   0%|          | 0/190 [00:00<?, ?it/s]

  Avg Training Loss: 1.5055


Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

  Validation F1: 0.7725 | Best F1: 0.7740
  F1 did not improve. Early stopping counter: 2/2
🛑 Early stopping triggered after 6 epochs.
✅ Training finished.

--- E8: Generating Predictions with Baseline Post-Processing (FIXED) ---


Device set to use cuda:0


Predicting on test set:   0%|          | 0/2843 [00:00<?, ?it/s]

✅ E8 Predictions saved to: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final/predictions.json

--- Creating Submission ZIP ---
📦 Submission ZIP created at: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final/submission.zip

🎉🎉🎉 EXPERIMENT E8 PIPELINE COMPLETED SUCCESSFULLY! 🎉🎉🎉
You can now download and submit the file: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E8-final/submission.zip


In [None]:
## NUEVO DATASET CON LOS SINÓNIMOS

In [None]:
# ===================================================================
# FINAL SUBMISSION PIPELINE (EXPERIMENT E12 - CT-EBM-SP AUGMENTATION)
# FIXED VERSION - Corrected path resolution for extracted files
# ===================================================================
import os
import json
import logging
import gc
import numpy as np
import random
import re
from collections import defaultdict
from urllib.request import urlretrieve
from enum import Enum
from typing import List, Dict, Any
from pydantic import BaseModel, Field
import zipfile

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

from tqdm.auto import tqdm

from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForTokenClassification, AutoTokenizer,
    DataCollatorForTokenClassification, get_cosine_schedule_with_warmup
)
import evaluate
from sklearn.utils.class_weight import compute_class_weight


# --- Basic Setup ---
# Root logger at INFO; the two chatty third-party loggers are raised to WARNING.
logging.basicConfig(level=logging.INFO)
logging.getLogger("pydantic").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)


# --- Device Setup ---
# `device` is a module-level global read by the training and prediction code below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🚀 Using device: {device}")
if device.type == "cuda":
    torch.backends.cudnn.benchmark = True
    print(f"🔥 GPU: {torch.cuda.get_device_name()}")


def safe_cuda_cleanup():
    """Release Python garbage and, when CUDA is available, cached GPU memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before ``torch.cuda.empty_cache()`` is called;
    ``empty_cache`` can only hand back blocks that are no longer occupied.

    Returns:
        None
    """
    gc.collect()
    if torch.cuda.is_available():
        # Wait for pending kernels so their buffers are actually idle.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()


# ================================
# 1. DATASET MODELS (PYDANTIC)
# ================================
class ClinicalSections(str, Enum):
    """Section labels used by the dataset models; each member is also a plain ``str``."""

    PRESENT_ILLNESS = "PRESENT_ILLNESS"
    DERIVED_FROM_TO = "DERIVED_FROM/TO"
    PAST_MEDICAL_HISTORY = "PAST_MEDICAL_HISTORY"
    FAMILY_HISTORY = "FAMILY_HISTORY"
    EXPLORATION = "EXPLORATION"
    TREATMENT = "TREATMENT"
    EVOLUTION = "EVOLUTION"

    @classmethod
    def list(cls):
        """Return the string values of all members, in declaration order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

class SectionAnnotation(BaseModel):
    # One labelled section: its text, its section label and its start/end offsets.
    # Predictions written by this notebook use character offsets into the note text.
    segment: str
    label: ClinicalSections
    start_offset: int
    end_offset: int
class SectionAnnotations(BaseModel):
    # Gold and predicted section lists for one note; both default to empty lists.
    gold: List[SectionAnnotation] = Field(default_factory=list)
    prediction: List[SectionAnnotation] = Field(default_factory=list)
class BoundaryAnnotation(BaseModel):
    # One span of a note. `boundary` holds a section label when a section starts
    # at this span and None otherwise (that is how the code below reads/writes it).
    span: str
    boundary: ClinicalSections | None
    start_offset: int
    end_offset: int
class BoundaryAnnotations(BaseModel):
    # Gold and predicted boundary lists for one note; both default to empty lists.
    gold: List[BoundaryAnnotation] = Field(default_factory=list)
    prediction: List[BoundaryAnnotation] = Field(default_factory=list)
class Entry(BaseModel):
    # A single note: identifier, raw text, and its section/boundary annotations
    # (each annotation container defaults to an empty instance).
    note_id: str
    note_text: str
    section_annotation: SectionAnnotations = Field(default_factory=SectionAnnotations)
    boundary_annotation: BoundaryAnnotations = Field(default_factory=BoundaryAnnotations)
class ClinAISDataset(BaseModel):
    # Whole dataset file: entries keyed by string, plus a free-form `scores` dict.
    annotated_entries: Dict[str, Entry]
    scores: Dict[str, Any] = Field(default_factory=dict)

    def to_json(self):
        """Serialize the dataset to an indented JSON string.

        Uses ``model_dump_json`` when the installed pydantic provides it,
        otherwise falls back to ``json.dumps`` over ``self.dict()``.
        """
        if not hasattr(self, 'model_dump_json'):
            return json.dumps(self.dict(), indent=2, ensure_ascii=False)
        return self.model_dump_json(indent=2)

    @classmethod
    def from_json(cls, json_str: str):
        """Build a dataset from a JSON string.

        Uses ``model_validate_json`` when available, otherwise ``parse_raw``.
        """
        if not hasattr(cls, 'model_validate_json'):
            return cls.parse_raw(json_str)
        return cls.model_validate_json(json_str)


# ================================
# 2. CT-EBM-SP SYNONYM EXTRACTION & DATA AUGMENTATION (FIXED PATHS)
# ================================

def download_ct_ebm_dataset():
    """Download (if not already present) and extract the CT-EBM-SP archive.

    The archive is stored as ``CT-EBM-SP.zip`` in the current working directory
    and extracted into that same directory.

    The download is written to a temporary ``.part`` file and renamed only once
    it has completed, so an interrupted download is never mistaken for a valid
    archive on the next run. If the archive on disk turns out to be corrupt it
    is deleted so that the next call downloads it again.

    Returns:
        ``"."`` (the extraction directory) on success, ``None`` on failure.
    """
    zip_path = "CT-EBM-SP.zip"

    print(f"🔍 DEBUG: Current working directory: {os.getcwd()}")
    print(f"🔍 DEBUG: Checking for zip file at: {zip_path}")
    print(f"🔍 DEBUG: Zip file exists: {os.path.exists(zip_path)}")

    if not os.path.exists(zip_path):
        print("📥 Downloading CT-EBM-SP dataset...")
        partial_path = zip_path + ".part"
        try:
            urlretrieve("https://www.lllf.uam.es/ESP/nlpdata/wp2/CT-EBM-SP.zip", partial_path)
            # Publish the archive only after the transfer finished.
            os.replace(partial_path, zip_path)
            print(f"✅ Download completed. File size: {os.path.getsize(zip_path)} bytes")
        except Exception as e:
            print(f"❌ Download failed: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)
            return None

    print("📂 Extracting dataset...")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
        print("✅ Dataset extraction completed!")

        # The archive's contents are extracted straight into the current
        # directory, so that is the base path callers must use.
        return "."

    except zipfile.BadZipFile as e:
        print(f"❌ Extraction failed: {e}")
        # A corrupt archive would otherwise be reused forever; drop it.
        try:
            os.remove(zip_path)
        except OSError:
            pass
        return None
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return None

def _parse_annotator_note(line):
    """Extract ``(entity_id, umls_code)`` from one ``AnnotatorNotes`` line.

    Three layouts are recognised:
      * ``#1<TAB>AnnotatorNotes T3<TAB>C0033487``
      * ``#1<TAB>AnnotatorNotes T3 C0033487``
      * ``#1 AnnotatorNotes T3 C0033487`` (no tabs)

    Returns ``None`` when the line matches none of them.
    """
    if '\t' in line:
        parts = line.split('\t')
        if len(parts) >= 3:
            if 'AnnotatorNotes' not in parts[1]:
                return None
            head_tokens = parts[1].split()
            entity_id = head_tokens[-1] if head_tokens else ""
            return entity_id, parts[2].strip()
        # '\t' in line guarantees at least two parts, so this is the 2-column case.
        tokens = parts[1].split()
        if len(tokens) >= 3 and 'AnnotatorNotes' in tokens[0]:
            return tokens[1], tokens[2]
        return None

    tokens = line.split()
    if len(tokens) >= 4 and 'AnnotatorNotes' in tokens[1]:
        return tokens[2], tokens[3]
    return None


def parse_brat_file(ann_file_path):
    """Parse one BRAT ``.ann`` file into a ``{entity text: UMLS code}`` mapping.

    Entity lines (starting with ``T``) supply the entity text; ``AnnotatorNotes``
    lines (starting with ``#``) supply the code attached to an entity id. Only
    codes starting with ``C`` and entity texts longer than two characters are
    kept. Entity text is lower-cased and stripped before being used as a key.

    Args:
        ann_file_path: path of the ``.ann`` file.

    Returns:
        dict mapping lower-cased entity text to its UMLS code. Unreadable files
        yield whatever was collected before the error (usually ``{}``); the
        error is logged as a warning instead of being silently dropped.
    """
    entities = {}
    umls_mappings = {}

    try:
        with open(ann_file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = [raw.strip() for raw in f]

        # First pass: collect entity definitions (id -> surface text).
        for line in lines:
            if line.startswith('T') and '\t' in line:
                parts = line.split('\t')
                if len(parts) >= 3:
                    entities[parts[0]] = parts[2].strip()

        # Second pass: attach UMLS codes to the entities they annotate.
        for line in lines:
            if not (line.startswith('#') and 'AnnotatorNotes' in line):
                continue
            parsed = _parse_annotator_note(line)
            if parsed is None:
                # Malformed note: skip it rather than reusing values left over
                # from a previous line.
                continue
            entity_id, umls_code = parsed

            if entity_id in entities and umls_code and umls_code.startswith('C'):
                entity_text = entities[entity_id].lower().strip()
                if entity_text and len(entity_text) > 2:  # Filter very short terms
                    umls_mappings[entity_text] = umls_code

    except Exception as exc:
        # Best effort: keep going with what was parsed, but leave a trace.
        logging.getLogger(__name__).warning("Could not parse BRAT file %s: %s", ann_file_path, exc)

    return umls_mappings

def extract_ct_ebm_synonyms():
    """Build a synonym dictionary from the CT-EBM-SP annotations.

    Terms that share a UMLS code (as returned by ``parse_brat_file``) are
    treated as synonyms of each other. Only the ``dev`` and ``test`` splits and
    their ``abstracts`` / ``eudract`` sub-directories are read.

    Files, concepts and terms are processed in sorted order so that the
    resulting dictionary (key order and synonym order) is identical across
    runs; set iteration order and ``os.listdir`` order are otherwise arbitrary,
    which would make the random augmentation irreproducible even with a seed.

    Returns:
        dict mapping each term to the list of its other surface forms; empty
        if the corpus could not be downloaded/extracted.
    """
    print("🔍 Extracting medical synonyms from CT-EBM-SP...")

    # Download and extract dataset
    base_path = download_ct_ebm_dataset()
    if not base_path:
        print("❌ Failed to download/extract CT-EBM-SP dataset")
        return {}

    umls_to_terms = defaultdict(set)
    total_files = 0
    processed_files = 0
    total_mappings = 0

    # Process dev and test directories (they're in the current directory after extraction)
    for split in ['dev', 'test']:
        split_path = os.path.join(base_path, split)
        print(f"\n🔍 DEBUG: Checking split directory: {split_path}")
        print(f"🔍 DEBUG: Split directory exists: {os.path.exists(split_path)}")

        if not os.path.exists(split_path):
            print(f"⚠️ Split directory {split} not found")
            continue

        for subdir in ['abstracts', 'eudract']:
            subdir_path = os.path.join(split_path, subdir)
            print(f"🔍 DEBUG: Checking subdirectory: {subdir_path}")
            print(f"🔍 DEBUG: Subdirectory exists: {os.path.exists(subdir_path)}")

            if not os.path.exists(subdir_path):
                print(f"⚠️ Subdirectory {subdir} not found in {split}")
                continue

            # List files in subdirectory
            try:
                files = os.listdir(subdir_path)
                # Sorted: os.listdir order is filesystem dependent.
                ann_files = sorted(f for f in files if f.endswith('.ann'))
                total_files += len(ann_files)

                print(f"✅ Found {len(files)} total files, {len(ann_files)} .ann files in {split}/{subdir}")

                for ann_file in tqdm(ann_files, desc=f"Processing {split}/{subdir}", leave=False):
                    ann_path = os.path.join(subdir_path, ann_file)
                    umls_mappings = parse_brat_file(ann_path)

                    processed_files += 1
                    total_mappings += len(umls_mappings)

                    # Group terms by UMLS concept
                    for term, umls_code in umls_mappings.items():
                        umls_to_terms[umls_code].add(term)

            except Exception as e:
                print(f"❌ Error processing {subdir_path}: {e}")

    print(f"\n📊 EXTRACTION SUMMARY:")
    print(f"  Total files found: {total_files}")
    print(f"  Files processed: {processed_files}")
    print(f"  Total UMLS mappings: {total_mappings}")
    print(f"  Unique UMLS concepts: {len(umls_to_terms)}")

    # Build synonym dictionary: term -> list of synonyms
    synonym_dict = {}

    for umls_code in sorted(umls_to_terms):
        terms_list = sorted(umls_to_terms[umls_code])
        if len(terms_list) > 1:  # Only concepts with multiple surface forms
            for term in terms_list:
                synonyms = [t for t in terms_list if t != term]
                if synonyms:
                    # A term seen under several codes keeps the synonyms of the
                    # last code in sorted order (deterministic overwrite).
                    synonym_dict[term] = synonyms

    print(f"✅ Extracted {len(synonym_dict)} terms with synonyms")

    # Show examples
    if synonym_dict:
        print("\n🔤 Synonym examples:")
        examples = list(synonym_dict.items())[:5]
        for term, synonyms in examples:
            print(f"  '{term}' → {synonyms[:3]}")
    else:
        print("❌ No synonyms extracted!")

    return synonym_dict

def augment_text_with_synonyms(text, synonym_dict, augmentation_prob=0.3):
    """Randomly replace known terms in ``text`` with one of their synonyms.

    Args:
        text: input string.
        synonym_dict: mapping ``term -> list of synonyms`` (terms are matched
            against the lower-cased text).
        augmentation_prob: probability of replacing each term that occurs in
            the text.

    Returns:
        ``text`` unchanged when ``text`` or ``synonym_dict`` is empty;
        otherwise the **lower-cased** text with at most one occurrence of each
        selected term replaced. Callers detect a change by comparing against
        ``text.lower()``.
    """
    if not synonym_dict or not text:
        return text

    # Convert to lowercase for matching
    text_lower = text.lower()

    # Sort terms by length (longest first) to avoid partial matches
    sorted_terms = sorted(synonym_dict.keys(), key=len, reverse=True)

    for term in sorted_terms:
        if term in text_lower and random.random() < augmentation_prob:
            synonyms = synonym_dict[term]
            if synonyms:
                replacement = random.choice(synonyms)
                # Use word boundaries for precise replacement
                pattern = r'\b' + re.escape(term) + r'\b'
                # Pass the synonym through a function so it is inserted
                # literally; as a plain string, backslashes in it would be
                # interpreted as regex replacement escapes.
                text_lower = re.sub(pattern, lambda _match, _repl=replacement: _repl, text_lower, count=1)

    return text_lower


# ================================
# 3. DATA PREPARATION
# ================================
def prepare_training_data(dataset_path: str, tokenizer, label2id: Dict, max_length: int = 512, split_ratio: float = 0.9,
                          use_synonym_augmentation: bool = True, augmentation_prob: float = 0.3,
                          example_augmentation_prob: float = 0.7, seed: int = 42):
    """Load a ClinAIS JSON file and build tokenized train/validation datasets.

    Every gold boundary span becomes one "word"; its label is the most recent
    section boundary seen in the note (``PRESENT_ILLNESS`` before the first
    boundary).

    The train/validation split is made on the ORIGINAL notes first and synonym
    augmentation is then applied to the training portion only. Augmenting
    before splitting would place near-duplicates of training notes in the
    validation set and inflate the validation score.

    Args:
        dataset_path: path of the dataset JSON.
        tokenizer: tokenizer called with ``is_split_into_words=True``; must
            expose ``word_ids`` on its output.
        label2id: mapping from section label to integer id.
        max_length: truncation length passed to the tokenizer.
        split_ratio: fraction of the original notes used for training.
        use_synonym_augmentation: whether to build the synonym dictionary and
            add augmented copies of training notes.
        augmentation_prob: per-term replacement probability forwarded to
            ``augment_text_with_synonyms``.
        example_augmentation_prob: probability that a training note gets an
            augmented copy.
        seed: seed applied to both ``random`` and ``numpy.random`` so the split
            and the augmentation are reproducible.

    Returns:
        ``DatasetDict`` with ``train`` and ``validation`` splits.
    """
    print(f"📊 Preparing training data from: {dataset_path}")
    with open(dataset_path, 'r', encoding='utf-8') as f:
        ds = ClinAISDataset.from_json(f.read())

    # Seed both generators up front: the augmentation uses `random`, the
    # shuffle uses `numpy.random`.
    random.seed(seed)
    np.random.seed(seed)

    # Initialize synonym dictionary if augmentation is enabled
    synonym_dict = {}
    if use_synonym_augmentation:
        try:
            print("🔧 Setting up medical synonym augmentation...")
            synonym_dict = extract_ct_ebm_synonyms()
            print(f"✅ Ready for augmentation with {len(synonym_dict)} synonym mappings")
        except Exception as e:
            print(f"⚠️ Synonym extraction failed: {e}. Continuing without augmentation.")

    original_examples = []
    for entry in tqdm(ds.annotated_entries.values(), desc="Processing entries"):
        tokens, labels, current_section = [], [], None
        for ba in entry.boundary_annotation.gold:
            tokens.append(ba.span)
            if ba.boundary:
                current_section = ba.boundary
            label_name = current_section or ClinicalSections.PRESENT_ILLNESS
            labels.append(label2id[label_name])

        if tokens:
            original_examples.append({'tokens': tokens, 'labels': labels})
    original_count = len(original_examples)

    # Split BEFORE augmenting so validation only ever contains untouched notes.
    np.random.shuffle(original_examples)
    split_idx = int(len(original_examples) * split_ratio)
    train_examples, val_examples = original_examples[:split_idx], original_examples[split_idx:]

    augmented_examples = []
    if synonym_dict:
        for example in train_examples:
            if random.random() >= example_augmentation_prob:
                continue
            augmented_tokens = [
                augment_text_with_synonyms(token, synonym_dict, augmentation_prob)
                for token in example['tokens']
            ]
            # The augmenter returns lower-cased text, so compare against the
            # lower-cased original to see whether anything was really replaced.
            changed = any(new != old.lower() for new, old in zip(augmented_tokens, example['tokens']))
            # Only add if augmentation actually changed something
            if changed:
                augmented_examples.append({'tokens': augmented_tokens, 'labels': list(example['labels'])})
    augmented_count = len(augmented_examples)

    train_examples = train_examples + augmented_examples
    examples_total = len(train_examples) + len(val_examples)

    print(f"📈 Dataset Statistics:")
    print(f"  Original examples: {original_count}")
    print(f"  Augmented examples: {augmented_count}")
    print(f"  Total examples: {examples_total}")
    print(f"  Training: {len(train_examples)}, Validation: {len(val_examples)}")

    if augmented_count > 0:
        augmentation_ratio = (augmented_count / original_count) * 100
        print(f"  🔄 Augmentation ratio: {augmentation_ratio:.1f}%")

    def tokenize_and_align(batch):
        """Tokenize pre-split words and label only the first sub-token of each word."""
        tokenized = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True, max_length=max_length)
        aligned_labels = []
        for i, labels in enumerate(batch['labels']):
            word_ids = tokenized.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    # Special tokens are ignored by the loss.
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(labels[word_idx])
                else:
                    # Continuation sub-tokens are ignored by the loss.
                    label_ids.append(-100)
                previous_word_idx = word_idx
            aligned_labels.append(label_ids)
        tokenized['labels'] = aligned_labels
        return tokenized

    train_ds = Dataset.from_list(train_examples).map(tokenize_and_align, batched=True, remove_columns=['tokens'])
    val_ds = Dataset.from_list(val_examples).map(tokenize_and_align, batched=True, remove_columns=['tokens'])
    return DatasetDict({'train': train_ds, 'validation': val_ds})

def compute_class_weights(dataset_path: str, label2id: Dict):
    """Compute per-class loss weights from the gold boundary labels of a dataset file.

    Labels present in the data get scikit-learn's ``'balanced'`` weight; labels
    that never occur keep a weight of 1. The tensor is placed on the module-level
    ``device``.

    NOTE(review): only spans that carry a boundary (section starts) are counted,
    while training labels every token with its section — confirm this mismatch
    is intended.
    """
    print("⚖️ Computing class weights for minority classes...")
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        corpus = ClinAISDataset.from_json(handle.read())

    boundary_label_ids = []
    for entry in corpus.annotated_entries.values():
        for ba in entry.boundary_annotation.gold:
            if ba.boundary is not None:
                boundary_label_ids.append(label2id[ba.boundary])

    present_ids = np.unique(boundary_label_ids)
    balanced = compute_class_weight('balanced', classes=present_ids, y=boundary_label_ids)

    weight_tensor = torch.ones(len(label2id)).to(device)
    for class_id, class_weight in zip(present_ids, balanced):
        weight_tensor[class_id] = class_weight

    print("✅ Class weights computed.")
    return weight_tensor


# ================================
# 4. TRAINING (E12: ADVANCED HPs + AUGMENTATION)
# ================================
def train_model_e12(train_data_path: str, model_output_dir: str):
    """Fine-tune the token classifier with a manual mixed-precision training loop.

    Trains for up to ``EPOCHS`` epochs with class-weighted, label-smoothed
    cross-entropy, evaluates with seqeval after each epoch, saves the model and
    tokenizer to ``model_output_dir`` whenever validation F1 improves, and stops
    early after ``PATIENCE`` epochs without improvement.

    Args:
        train_data_path: dataset JSON used for training/validation and for the
            class weights.
        model_output_dir: directory that receives the best checkpoint.

    Returns:
        None
    """
    print("\n--- Starting E12 Training (Advanced HPs + CT-EBM-SP Augmentation) ---")

    # --- E12 Config ---
    base_model = "PlanTL-GOB-ES/bsc-bio-ehr-es"
    label_list = ClinicalSections.list()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    # Hyperparameters
    EPOCHS = 10
    PATIENCE = 2
    LEARNING_RATE = 2e-5
    TRAIN_BATCH_SIZE = 8
    EVAL_BATCH_SIZE = 16
    MAX_GRAD_NORM = 1.0
    LABEL_SMOOTHING = 0.1

    # --- Model, Tokenizer, Data ---
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForTokenClassification.from_pretrained(base_model, num_labels=len(label_list), id2label=id2label, label2id=label2id).to(device)
    dataset = prepare_training_data(train_data_path, tokenizer, label2id, use_synonym_augmentation=True)
    data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)
    train_dataloader = DataLoader(dataset["train"], shuffle=True, collate_fn=data_collator, batch_size=TRAIN_BATCH_SIZE)
    eval_dataloader = DataLoader(dataset["validation"], collate_fn=data_collator, batch_size=EVAL_BATCH_SIZE)
    class_weights = compute_class_weights(train_data_path, label2id)
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights, label_smoothing=LABEL_SMOOTHING)

    # --- Optimizer, Scheduler, Scaler (for FP16) ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    num_training_steps = len(train_dataloader) * EPOCHS
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    # Mixed precision only makes sense on CUDA; on CPU both helpers become no-ops.
    use_amp = device.type == "cuda"
    scaler = GradScaler(enabled=use_amp)

    # --- Training Loop ---
    print("🏋️‍♂️ Starting manual training loop for E12...")
    best_f1 = 0.0
    epochs_no_improve = 0
    metric = evaluate.load("seqeval")

    for epoch in range(EPOCHS):
        print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader, desc="Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(**batch)
                # The model's own loss is ignored; the weighted, smoothed loss is used instead.
                loss = loss_fct(outputs.logits.view(-1, model.config.num_labels), batch["labels"].view(-1))
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale at this point.
            # They must be unscaled before clipping, otherwise MAX_GRAD_NORM is
            # compared against scaled norms and the clip threshold is meaningless.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"  Avg Training Loss: {avg_train_loss:.4f}")

        # --- Evaluation Step ---
        model.eval()
        all_preds, all_labels = [], []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            predicted_ids = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            all_preds.extend(predicted_ids.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        # Keep only positions that carry a real label (-100 marks ignored tokens).
        true_predictions = [[id2label[p] for (p, l) in zip(pred, lab) if l != -100] for pred, lab in zip(all_preds, all_labels)]
        true_labels = [[id2label[l] for (p, l) in zip(pred, lab) if l != -100] for pred, lab in zip(all_preds, all_labels)]
        results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
        current_f1 = results["overall_f1"]
        print(f"  Validation F1: {current_f1:.4f} | Best F1: {best_f1:.4f}")

        # --- Save Best Model & Early Stopping ---
        if current_f1 > best_f1:
            best_f1 = current_f1
            epochs_no_improve = 0
            print(f"  🎉 New best model found! Saving to {model_output_dir}")
            model.save_pretrained(model_output_dir)
            tokenizer.save_pretrained(model_output_dir)
        else:
            epochs_no_improve += 1
            print(f"  F1 did not improve. Early stopping counter: {epochs_no_improve}/{PATIENCE}")

        if epochs_no_improve >= PATIENCE:
            print(f"🛑 Early stopping triggered after {epoch + 1} epochs.")
            break

    print("✅ Training finished.")
    safe_cuda_cleanup()


# ===================================================================
# 5. PREDICTION (E12: BASELINE POST-PROCESSING - CORRECTED)
# ===================================================================

# Helper class for predictions before they are processed
class TempPrediction:
    """Mutable wrapper around one aggregated pipeline prediction.

    ``entity_group``, ``score``, ``word``, ``start`` and ``end`` are stored as
    given. ``label``, ``segment``, ``start_offset`` and ``end_offset`` are
    aliases that mirror the ``SectionAnnotation`` field names. They are
    properties rather than copies so that they always reflect the current
    values after the post-processor has merged sections by mutating ``word``,
    ``start`` and ``end``.
    """

    def __init__(self, entity_group, score, word, start, end):
        self.entity_group = entity_group
        self.score = score
        self.word = word
        self.start = start
        self.end = end

    @property
    def label(self):
        """Alias of ``entity_group``."""
        return self.entity_group

    @label.setter
    def label(self, value):
        self.entity_group = value

    @property
    def segment(self):
        """Alias of ``word``."""
        return self.word

    @segment.setter
    def segment(self, value):
        self.word = value

    @property
    def start_offset(self):
        """Alias of ``start``."""
        return self.start

    @start_offset.setter
    def start_offset(self, value):
        self.start = value

    @property
    def end_offset(self):
        """Alias of ``end``."""
        return self.end

    @end_offset.setter
    def end_offset(self, value):
        self.end = value

# Baseline Post-Processor Class (from your original working script)
class PredictionPostProcessor:
    """Clean up a list of predicted sections.

    Sections are objects exposing mutable ``entity_group``, ``word``, ``start``
    and ``end`` attributes. Processing merges neighbouring sections that share
    a label and absorbs sections shorter than ``min_section_size`` words into a
    neighbour.
    """

    def __init__(self, prediction_sections):
        self.sections = prediction_sections
        self.min_section_size = 3

    def get_section_size(self, sec):
        """Return the number of whitespace-separated words in the section text."""
        return len(sec.word.strip().split())

    @staticmethod
    def _join_words(left, right):
        """Concatenate two section texts, adding a space if none separates them."""
        if not left or not right or left[-1].isspace() or right[0].isspace():
            return left + right
        return left + " " + right

    def merge_undersize_sections(self):
        """Absorb every undersized section into a neighbour (previous one preferred)."""
        i = 0
        while i < len(self.sections):
            if self.get_section_size(self.sections[i]) < self.min_section_size:
                # Merge into the previous section whenever there is one
                # (its own size is not checked).
                if i > 0:
                    self.sections[i-1].word = self._join_words(self.sections[i-1].word, self.sections[i].word)
                    self.sections[i-1].end = self.sections[i].end
                    del self.sections[i]
                # Otherwise (first section) merge into the next one if it exists.
                elif i < len(self.sections) - 1:
                    self.sections[i+1].word = self._join_words(self.sections[i].word, self.sections[i+1].word)
                    self.sections[i+1].start = self.sections[i].start
                    del self.sections[i]
                else:
                    i += 1 # Cannot merge, move on
            else:
                i += 1

    def merge_contiguous_equivalent_sections(self):
        """Collapse runs of consecutive sections that carry the same label."""
        if not self.sections: return
        merged = [self.sections[0]]
        for i in range(1, len(self.sections)):
            if self.sections[i].entity_group == merged[-1].entity_group:
                merged[-1].word = self._join_words(merged[-1].word, self.sections[i].word)
                merged[-1].end = self.sections[i].end
            else:
                merged.append(self.sections[i])
        self.sections = merged

    def process(self):
        """Run all clean-up passes and return the resulting section list."""
        self.merge_contiguous_equivalent_sections()
        self.merge_undersize_sections()
        # Absorbing a short section can leave two same-label sections side by
        # side (A, tiny B, A -> A, A); merge again so each label run is one section.
        self.merge_contiguous_equivalent_sections()
        return self.sections

def generate_predictions_e12(model_path: str, test_data_path: str, output_json_path: str):
    """Predict sections for every note of a dataset file and write the result as JSON.

    For each entry the token-classification pipeline is run on ``note_text``,
    the raw groups are cleaned by ``PredictionPostProcessor``, and the entry's
    predicted section and boundary annotations are filled in. Each processed
    section marks the first gold span whose start offset falls inside it as a
    boundary with that section's label.

    Args:
        model_path: directory holding the fine-tuned model and tokenizer.
        test_data_path: dataset JSON to predict on.
        output_json_path: destination of the annotated dataset JSON.

    NOTE(review): notes longer than the model's maximum input length are
    presumably truncated by the pipeline, leaving their tail unlabelled —
    confirm, and consider the pipeline's ``stride`` option.
    """
    from transformers import pipeline
    print(f"\n--- E12: Generating Predictions with Baseline Post-Processing (FIXED) ---")
    ner_pipe = pipeline("token-classification", model=model_path, tokenizer=model_path, device=0 if device.type == 'cuda' else -1, aggregation_strategy="simple")
    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_ds = ClinAISDataset.from_json(f.read())

    for entry in tqdm(test_ds.annotated_entries.values(), desc="Predicting on test set"):
        # 1. Get raw predictions; fall back to a single section covering the note.
        raw_predictions = ner_pipe(entry.note_text)
        if not raw_predictions:
            raw_predictions = [{'entity_group': 'PRESENT_ILLNESS', 'score': 0.5, 'word': entry.note_text, 'start': 0, 'end': len(entry.note_text)}]

        # 2. Convert raw dicts to mutable objects and run the baseline post-processor.
        temp_sections = [TempPrediction(**pred) for pred in raw_predictions]
        processed_sections = PredictionPostProcessor(temp_sections).process()

        # 3. Create final section annotations from the *processed* sections.
        #    Read start/end/entity_group (the attributes the post-processor
        #    updates) and take the text from the note itself, so the segment
        #    always matches the reported offsets.
        entry.section_annotation.prediction = [
            SectionAnnotation(
                segment=entry.note_text[section.start:section.end],
                label=ClinicalSections(section.entity_group),
                start_offset=section.start, end_offset=section.end
            )
            for section in processed_sections
        ]

        # 4. Start from the gold spans with every boundary cleared.
        entry.boundary_annotation.prediction = [
            BoundaryAnnotation(
                span=gold_ba.span, boundary=None, start_offset=gold_ba.start_offset, end_offset=gold_ba.end_offset
            )
            for gold_ba in entry.boundary_annotation.gold
        ]

        # 5. Mark the first span inside each processed section as its boundary.
        for section in processed_sections:
            for ba in entry.boundary_annotation.prediction:
                if section.start <= ba.start_offset < section.end:
                    ba.boundary = ClinicalSections(section.entity_group)
                    break

    with open(output_json_path, 'w', encoding='utf-8') as f:
        f.write(test_ds.to_json())
    print(f"✅ E12 Predictions saved to: {output_json_path}")


# ================================
# 6. MAIN EXECUTION
# ================================
def main():
    """Run the E12 pipeline end to end: train, predict, and zip the predictions.

    Uses Google Drive as the base directory when running in Colab and the
    current directory otherwise. Raises ``FileNotFoundError`` if the training
    or test file is missing.
    """
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        base_dir = "/content/drive/MyDrive/"
    except ImportError:  # also covers ModuleNotFoundError, its subclass
        print("Not in Google Colab. Using current directory.")
        base_dir = "./"

    data_dir = os.path.join(base_dir, "TFG/ClinAIS_dataset")
    model_dir = os.path.join(base_dir, "TFG/TRAINS/models/clinais-E12-final")
    for directory in (data_dir, model_dir):
        os.makedirs(directory, exist_ok=True)

    train_file = os.path.join(data_dir, "clinais.train_dev_augmented_SAFE.json")
    test_file = os.path.join(data_dir, "clinais.test&background.blind.json")
    predictions_json = os.path.join(model_dir, "predictions.json")
    submission_zip = os.path.join(model_dir, "submission.zip")

    print(f"📁 Data Directory: {data_dir}")
    print(f"🎯 Model Output Directory: {model_dir}")

    # Fail early, before any training time is spent.
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"CRITICAL: Training file not found at {train_file}.")
    if not os.path.exists(test_file):
        raise FileNotFoundError(f"CRITICAL: Test file not found at {test_file}.")

    # --- Run E12 Pipeline ---
    train_model_e12(train_data_path=train_file, model_output_dir=model_dir)
    generate_predictions_e12(model_path=model_dir, test_data_path=test_file, output_json_path=predictions_json)

    print("\n--- Creating Submission ZIP ---")
    with zipfile.ZipFile(submission_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
        # Store the predictions under their bare file name.
        archive.write(predictions_json, os.path.basename(predictions_json))
    print(f"📦 Submission ZIP created at: {submission_zip}")

    print("\n🎉🎉🎉 EXPERIMENT E12 PIPELINE COMPLETED SUCCESSFULLY! 🎉🎉🎉")
    print(f"You can now download and submit the file: {submission_zip}")

if __name__ == "__main__":
    main()

🚀 Using device: cuda
🔥 GPU: NVIDIA A100-SXM4-40GB
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📁 Data Directory: /content/drive/MyDrive/TFG/ClinAIS_dataset
🎯 Model Output Directory: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Starting E12 Training (Advanced HPs + CT-EBM-SP Augmentation) ---


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/bsc-bio-ehr-es and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 Preparing training data from: /content/drive/MyDrive/TFG/ClinAIS_dataset/clinais.train_dev_augmented_SAFE.json
🔧 Setting up medical synonym augmentation...
🔍 Extracting medical synonyms from CT-EBM-SP...
🔍 DEBUG: Current working directory: /content
🔍 DEBUG: Checking for zip file at: CT-EBM-SP.zip
🔍 DEBUG: Zip file exists: True
📂 Extracting dataset...
✅ Dataset extraction completed!

🔍 DEBUG: Checking split directory: ./dev
🔍 DEBUG: Split directory exists: True
🔍 DEBUG: Checking subdirectory: ./dev/abstracts
🔍 DEBUG: Subdirectory exists: True
✅ Found 200 total files, 100 .ann files in dev/abstracts


Processing dev/abstracts:   0%|          | 0/100 [00:00<?, ?it/s]

🔍 DEBUG: Checking subdirectory: ./dev/eudract
🔍 DEBUG: Subdirectory exists: True
✅ Found 280 total files, 140 .ann files in dev/eudract


Processing dev/eudract:   0%|          | 0/140 [00:00<?, ?it/s]


🔍 DEBUG: Checking split directory: ./test
🔍 DEBUG: Split directory exists: True
🔍 DEBUG: Checking subdirectory: ./test/abstracts
🔍 DEBUG: Subdirectory exists: True
✅ Found 200 total files, 100 .ann files in test/abstracts


Processing test/abstracts:   0%|          | 0/100 [00:00<?, ?it/s]

🔍 DEBUG: Checking subdirectory: ./test/eudract
🔍 DEBUG: Subdirectory exists: True
✅ Found 280 total files, 140 .ann files in test/eudract


Processing test/eudract:   0%|          | 0/140 [00:00<?, ?it/s]


📊 EXTRACTION SUMMARY:
  Total files found: 480
  Files processed: 480
  Total UMLS mappings: 8624
  Unique UMLS concepts: 2788
✅ Extracted 1500 terms with synonyms

🔤 Synonym examples:
  'analgésico' → ['analgésicos']
  'analgésicos' → ['analgésico']
  'arteriales' → ['arterias', 'arterial', 'arteria']
  'arterias' → ['arteriales', 'arterial', 'arteria']
  'arterial' → ['arteriales', 'arterias', 'arteria']
✅ Ready for augmentation with 1500 synonym mappings


Processing entries:   0%|          | 0/1689 [00:00<?, ?it/s]

📈 Dataset Statistics:
  Original examples: 1689
  Augmented examples: 1124
  Total examples: 2813
  Training: 2531, Validation: 282
  🔄 Augmentation ratio: 66.5%


Map:   0%|          | 0/2531 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

⚖️ Computing class weights for minority classes...
✅ Class weights computed.
🏋️‍♂️ Starting manual training loop for E12...


  scaler = GradScaler()



--- Epoch 1/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  with autocast():


  Avg Training Loss: 2.0608


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]



  Validation F1: 0.7263 | Best F1: 0.0000
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 2/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.6422


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.7717 | Best F1: 0.7263
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 3/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.4912


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.7910 | Best F1: 0.7717
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 4/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.3891


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.7993 | Best F1: 0.7910
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 5/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.3210


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.8064 | Best F1: 0.7993
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 6/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.2632


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.8234 | Best F1: 0.8064
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 7/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.2216


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.8311 | Best F1: 0.8234
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 8/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.1842


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.8371 | Best F1: 0.8311
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 9/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.1859


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.8401 | Best F1: 0.8371
  🎉 New best model found! Saving to /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final

--- Epoch 10/10 ---


Training:   0%|          | 0/317 [00:00<?, ?it/s]

  Avg Training Loss: 1.1741


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

  Validation F1: 0.8399 | Best F1: 0.8401
  F1 did not improve. Early stopping counter: 1/2
✅ Training finished.

--- E12: Generating Predictions with Baseline Post-Processing (FIXED) ---


Device set to use cuda:0


Predicting on test set:   0%|          | 0/2843 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ E12 Predictions saved to: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final/predictions.json

--- Creating Submission ZIP ---
📦 Submission ZIP created at: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final/submission.zip

🎉🎉🎉 EXPERIMENT E12 PIPELINE COMPLETED SUCCESSFULLY! 🎉🎉🎉
You can now download and submit the file: /content/drive/MyDrive/TFG/TRAINS/models/clinais-E12-final/submission.zip


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def save_confusion_matrix(true_labels, true_predictions, label_list, epoch):
    """Plot and save a confusion matrix for one evaluation pass.

    Args:
        true_labels: list of per-sequence lists of reference label names.
        true_predictions: list of per-sequence lists of predicted label names.
        label_list: all label names, in the order they should appear on the axes.
        epoch: zero-based epoch index (shown and saved as ``epoch + 1``).

    Returns:
        The path of the saved PNG, or ``None`` when there is nothing to plot.
    """
    # Flatten the per-sequence label lists for the confusion matrix.
    flat_preds = [label for sublist in true_predictions for label in sublist]
    flat_labels = [label for sublist in true_labels for label in sublist]
    if not flat_labels:
        print("⚠️ Confusion matrix skipped: no labels to plot.")
        return None

    # Keep only labels from label_list that actually occur.
    present = set(flat_labels) | set(flat_preds)
    valid_labels = [label for label in label_list if label in present]

    # Compute the matrix.
    cm = confusion_matrix(flat_labels, flat_preds, labels=valid_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=valid_labels)

    # Draw on an explicit axes: disp.plot() would otherwise open its own
    # figure, ignoring the requested size and leaving an empty figure behind.
    fig, ax = plt.subplots(figsize=(10, 8))
    disp.plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation=45)
    ax.set_title(f"Confusion Matrix - Epoch {epoch+1}")
    fig.tight_layout()
    output_path = f"confusion_matrix_epoch_{epoch+1}.png"
    fig.savefig(output_path)
    plt.close(fig)
    print(f"📊 Confusion matrix saved: {output_path}")
    return output_path


# The inputs are produced inside the training loop; they only exist at notebook
# scope if they were exported there. Report that clearly instead of failing
# with a NameError on a fresh kernel.
_missing_inputs = [name for name in ("true_predictions", "true_labels", "label_list", "epoch") if name not in globals()]
if _missing_inputs:
    print(f"⚠️ Confusion matrix skipped: missing notebook variables {_missing_inputs}")
else:
    save_confusion_matrix(true_labels, true_predictions, label_list, epoch)
