In [1]:
import logging
from typing import Dict, List, Tuple
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure logging once for the notebook: INFO level, with timestamp, level and message on each line.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Logger used by the classes and functions defined in the cells below.
logger = logging.getLogger(__name__)

In [3]:
class RoleAnnotation:
    """One annotated entity mention plus the document text it refers to.

    Holds the document id, the entity string, the mention's start/end
    character offsets into ``text``, and the main and fine-grained role labels.
    """

    def __init__(self, document_id: str, entity: str, start_offset: int,
                 end_offset: int, main_role: str, fine_role: str, text: str):
        self.document_id, self.entity = document_id, entity
        self.start_offset, self.end_offset = start_offset, end_offset
        self.main_role, self.fine_role = main_role, fine_role
        self.text = text

    def get_context(self, window_size: int = 100) -> str:
        """Return ``text`` from ``window_size`` characters before the start
        offset to ``window_size`` characters after the end offset, clamped to
        the bounds of ``text``."""
        left = self.start_offset - window_size
        if left < 0:
            left = 0

        right = self.end_offset + window_size
        text_length = len(self.text)
        if right > text_length:
            right = text_length

        return self.text[left:right]

In [4]:
class MultilingualRoleDataset:
    """Loads role annotations and their raw documents for several languages.

    For each language ``<lang>`` the loader reads
    ``<base_path>/<lang>/subtask-1-annotations.txt`` (tab-separated) and the
    documents under ``<base_path>/<lang>/raw-documents/``. After
    :meth:`load_data`, ``annotations`` holds one ``RoleAnnotation`` per
    annotation line, and ``role_to_idx`` / ``fine_role_to_idx`` map each label
    to an index assigned in sorted label order.
    """

    def __init__(self, base_path: str, languages: List[str]):
        logger.info("Initializing MultilingualRoleDataset")
        self.base_path = Path(base_path)
        self.languages = languages
        self.annotations: List[RoleAnnotation] = []
        self.role_to_idx: Dict[str, int] = {}
        self.fine_role_to_idx: Dict[str, int] = {}

    def load_raw_document(self, language: str, doc_id: str) -> str:
        """Return the UTF-8 text of ``<base_path>/<language>/raw-documents/<doc_id>``."""
        logger.info(f"Loading raw document for language: {language}, doc_id: {doc_id}")
        doc_path = self.base_path / language / "raw-documents" / doc_id
        with open(doc_path, 'r', encoding='utf-8') as f:
            return f.read()

    def parse_annotation_line(self, line: str) -> Tuple[str, str, int, int, str, str]:
        """Split one tab-separated annotation line into its first six fields.

        Returns ``(doc_id, entity, start_offset, end_offset, main_role,
        fine_role)``; the two offsets are converted to ``int``. Any columns
        after the sixth are ignored.
        """
        parts = line.strip().split('\t')
        doc_id = parts[0]
        entity = parts[1]
        start_offset = int(parts[2])
        end_offset = int(parts[3])
        main_role = parts[4]
        fine_role = parts[5]
        return doc_id, entity, start_offset, end_offset, main_role, fine_role

    def _read_annotation_rows(self, annot_path) -> List[Tuple[str, str, int, int, str, str]]:
        """Parse every non-blank line of ``annot_path`` with :meth:`parse_annotation_line`."""
        rows = []
        with open(annot_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank or whitespace-only lines carry no annotation; parsing
                # them would fail with an IndexError on the missing fields.
                if not line.strip():
                    continue
                rows.append(self.parse_annotation_line(line))
        return rows

    def load_data(self):
        """Populate ``role_to_idx``, ``fine_role_to_idx`` and ``annotations``.

        Each annotation file is parsed once. Calling this method again rebuilds
        the state from scratch instead of appending duplicate annotations.
        """
        logger.info("Loading annotation data")
        # Reset so a repeated call is idempotent.
        self.annotations = []
        main_roles = set()
        fine_roles = set()
        rows_by_language = {}

        for lang in self.languages:
            annot_path = self.base_path / lang / "subtask-1-annotations.txt"
            logger.info(f"Reading annotations from {annot_path}")
            rows = self._read_annotation_rows(annot_path)
            rows_by_language[lang] = rows
            for _, _, _, _, main_role, fine_role in rows:
                main_roles.add(main_role)
                fine_roles.add(fine_role)

        # Sorting makes the label -> index assignment deterministic across runs.
        self.role_to_idx = {role: idx for idx, role in enumerate(sorted(main_roles))}
        self.fine_role_to_idx = {role: idx for idx, role in enumerate(sorted(fine_roles))}
        logger.info("Finished processing roles")

        for lang in self.languages:
            logger.info(f"Loading annotations for language: {lang}")
            # Cache per language so each document is read once even when its
            # annotation lines are not contiguous in the file.
            documents: Dict[str, str] = {}

            for doc_id, entity, start, end, main_role, fine_role in rows_by_language[lang]:
                if doc_id not in documents:
                    documents[doc_id] = self.load_raw_document(lang, doc_id)

                annotation = RoleAnnotation(
                    document_id=doc_id,
                    entity=entity,
                    start_offset=start,
                    end_offset=end,
                    main_role=main_role,
                    fine_role=fine_role,
                    text=documents[doc_id]
                )
                self.annotations.append(annotation)

        logger.info(f"Total annotations loaded: {len(self.annotations)}")

In [5]:
class RoleClassificationDataset(Dataset):
    """Torch ``Dataset`` view over the annotations of a ``MultilingualRoleDataset``.

    Each item is the tokenized text ``"<context> [ENT] <entity> [/ENT]"``
    together with the index of the annotation's main role and fine role.
    """

    def __init__(self, data: MultilingualRoleDataset, tokenizer: AutoTokenizer,
                 max_length: int = 512, context_window: int = 100):
        logger.info("Initializing RoleClassificationDataset")
        self.data, self.tokenizer = data, tokenizer
        self.max_length, self.context_window = max_length, context_window

    def __len__(self):
        """Number of annotations exposed by the wrapped data object."""
        return len(self.data.annotations)

    def __getitem__(self, idx):
        """Tokenize annotation ``idx`` and return its tensors and label indices."""
        ann = self.data.annotations[idx]
        surrounding = ann.get_context(self.context_window)
        # The entity is appended after the context, wrapped in marker tokens.
        marked_text = f"{surrounding} [ENT] {ann.entity} [/ENT]"

        tokens = self.tokenizer(
            marked_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        item = {}
        # flatten() drops the leading dimension added by return_tensors='pt'.
        item['input_ids'] = tokens['input_ids'].flatten()
        item['attention_mask'] = tokens['attention_mask'].flatten()
        item['main_role'] = torch.tensor(self.data.role_to_idx[ann.main_role])
        item['fine_role'] = torch.tensor(self.data.fine_role_to_idx[ann.fine_role])
        return item

In [6]:
class MainRoleClassifier(nn.Module):
    """Pretrained encoder followed by a small feed-forward head for main roles.

    The head takes the encoder's hidden state at sequence position 0, applies
    dropout, a Linear-ReLU-Dropout-Linear stack that halves the width, and a
    final linear layer producing ``num_labels`` logits.
    """

    def __init__(self, model_name='xlm-roberta-base', num_labels=3):
        logger.info("Initializing MainRoleClassifier")
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        width = self.bert.config.hidden_size
        half_width = width // 2

        # Built as a list first; positions 0..3 inside the Sequential are unchanged.
        head_layers = [
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(width, half_width),
        ]
        self.intermediate = nn.Sequential(*head_layers)

        self.classifier = nn.Linear(half_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for the given token ids and mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at position 0 of every sequence in the batch.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.intermediate(self.dropout(first_token_state))
        return self.classifier(hidden)

In [7]:
def compute_sample_weights(labels):
    """Return one weight per label, equal to ``1 / (occurrences of that label)``.

    With these weights every class receives the same total sampling mass.
    An empty ``labels`` sequence yields an empty list.
    """
    class_counts = Counter(labels)
    return [1.0 / class_counts[label] for label in labels]


def create_balanced_sampler(dataset, label_key='main_role'):
    """Build a ``WeightedRandomSampler`` that balances classes in ``dataset``.

    Args:
        dataset: Iterable of items supporting ``item[label_key].item()``.
        label_key: Item key holding the class label to balance on. Defaults to
            ``'main_role'``, which is what the original implementation used.

    Returns:
        A ``WeightedRandomSampler`` drawing ``len(dataset)`` samples per epoch
        with inverse-class-frequency weights.
    """
    logger.info("Creating balanced sampler")
    # Iterate the dataset only once: each access may tokenize the item, so the
    # former second pass doubled the cost for no benefit.
    labels = [item[label_key].item() for item in dataset]
    weights = compute_sample_weights(labels)
    sampler = WeightedRandomSampler(weights, len(weights))
    return sampler

In [8]:
def train_model(model, train_loader, val_loader, epochs=5, learning_rate=2e-5):
    """Train ``model`` on the ``main_role`` labels and log validation metrics.

    The model is moved to CUDA when available, otherwise CPU. Each epoch runs
    one pass over ``train_loader`` with cross-entropy loss and AdamW, then one
    pass over ``val_loader`` and logs micro precision, recall and F1.
    Nothing is returned.
    """
    logger.info("Starting training")
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    model = model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optim = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1
        logger.info(f"Epoch {epoch_no}/{epochs}")

        # ---- training pass ----
        model.train()
        running_loss = 0
        for train_batch in train_loader:
            ids = train_batch['input_ids'].to(device)
            mask = train_batch['attention_mask'].to(device)
            targets = train_batch['main_role'].to(device)

            optim.zero_grad()
            logits = model(input_ids=ids, attention_mask=mask)
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optim.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        logger.info(f"Training Loss for epoch {epoch_no}: {mean_loss:.4f}")

        # ---- validation pass ----
        model.eval()
        predicted, expected = [], []
        with torch.no_grad():
            for val_batch in val_loader:
                ids = val_batch['input_ids'].to(device)
                mask = val_batch['attention_mask'].to(device)
                # Labels are never moved to the device; they are only compared as numpy values.
                targets = val_batch['main_role']

                logits = model(input_ids=ids, attention_mask=mask)
                batch_preds = logits.argmax(dim=1).cpu()

                predicted.extend(batch_preds.numpy())
                expected.extend(targets.numpy())

        precision, recall, f1, _ = precision_recall_fscore_support(
            expected, predicted, average='micro'
        )

        logger.info(f"Validation Metrics for epoch {epoch_no}:")
        logger.info(f"Micro Precision: {precision:.4f}")
        logger.info(f"Micro Recall: {recall:.4f}")
        logger.info(f"Micro F1: {f1:.4f}")

In [9]:
def run_training(base_path: str, languages: List[str]):
    """Run the main-role pipeline end to end.

    Loads the annotations, builds the tokenizer with the ``[ENT]`` / ``[/ENT]``
    special tokens, splits the dataset 90/10 at random, trains a
    ``MainRoleClassifier`` with a class-balanced training sampler, and returns
    ``(model, tokenizer, role_to_idx)``.
    """
    logger.info("Starting training pipeline")
    corpus = MultilingualRoleDataset(base_path, languages)
    corpus.load_data()

    logger.info("Initializing tokenizer")
    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
    tokenizer.add_special_tokens({'additional_special_tokens': ['[ENT]', '[/ENT]']})

    logger.info("Creating dataset")
    full_dataset = RoleClassificationDataset(corpus, tokenizer)

    logger.info("Splitting dataset into training and validation sets")
    n_total = len(full_dataset)
    n_train = int(0.9 * n_total)
    n_val = n_total - n_train
    train_part, val_part = torch.utils.data.random_split(full_dataset, [n_train, n_val])

    logger.info("Creating data loaders")
    balanced = create_balanced_sampler(train_part)
    train_loader = DataLoader(train_part, batch_size=16, sampler=balanced)
    val_loader = DataLoader(val_part, batch_size=16)

    logger.info("Initializing model")
    classifier = MainRoleClassifier(num_labels=len(corpus.role_to_idx))
    # The tokenizer gained two special tokens, so the embedding table must grow to match.
    classifier.bert.resize_token_embeddings(len(tokenizer))

    logger.info("Training model")
    train_model(classifier, train_loader, val_loader)

    logger.info("Training complete")
    return classifier, tokenizer, corpus.role_to_idx

In [10]:
# Root folder of the training release (relative to the notebook's working
# directory) and the language sub-folders to load from it.
base_path = "Semevaltraining_data_16_October_release"
languages = ['EN', 'BG', 'HI', 'PT']

# Runs data loading, tokenizer setup and training in one call.
model, tokenizer, role_map = run_training(base_path, languages)

# Save the trained model
# The label-to-index map is stored next to the weights so predictions can be decoded later.
logger.info("Saving trained model")
torch.save({
    'model_state_dict': model.state_dict(),
    'role_map': role_map
}, 'role_classifier.pt')
logger.info("Model saved successfully")

2025-01-19 17:06:33,749 - INFO - Starting training pipeline
2025-01-19 17:06:33,751 - INFO - Initializing MultilingualRoleDataset
2025-01-19 17:06:33,752 - INFO - Loading annotation data
2025-01-19 17:06:33,753 - INFO - Reading annotations from Semevaltraining_data_16_October_release/EN/subtask-1-annotations.txt
2025-01-19 17:06:33,759 - INFO - Reading annotations from Semevaltraining_data_16_October_release/BG/subtask-1-annotations.txt
2025-01-19 17:06:33,763 - INFO - Reading annotations from Semevaltraining_data_16_October_release/HI/subtask-1-annotations.txt
2025-01-19 17:06:33,769 - INFO - Reading annotations from Semevaltraining_data_16_October_release/PT/subtask-1-annotations.txt
2025-01-19 17:06:33,774 - INFO - Finished processing roles
2025-01-19 17:06:33,775 - INFO - Loading annotations for language: EN
2025-01-19 17:06:33,777 - INFO - Loading raw document for language: EN, doc_id: EN_UA_103861.txt
2025-01-19 17:06:33,779 - INFO - Loading raw document for language: EN, doc_id:

In [14]:
import torch
from transformers import AutoTokenizer, AutoModel

def load_model(checkpoint_path='role_classifier.pt', map_location='cpu'):
    """Rebuild the main-role classifier and tokenizer from a saved checkpoint.

    Args:
        checkpoint_path: File written by ``torch.save`` containing the keys
            ``'model_state_dict'`` and ``'role_map'``.
        map_location: Passed to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint saved from a GPU run can be loaded on a machine without
            CUDA.

    Returns:
        ``(model, tokenizer, role_map)`` with the model in eval mode.
    """
    # Initialize tokenizer first
    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
    tokenizer.add_special_tokens({'additional_special_tokens': ['[ENT]', '[/ENT]']})

    # Load the checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    role_map = checkpoint['role_map']

    # Initialize model with correct vocab size
    model = MainRoleClassifier(num_labels=len(role_map))
    # Resize before loading the state dict: the saved embedding table already
    # includes the two added special tokens.
    model.bert.resize_token_embeddings(len(tokenizer))

    # Load state dict
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    return model, tokenizer, role_map

def predict_role(text: str, entity: str, model, tokenizer, role_map):
    """Predict the role label of ``entity`` within ``text``.

    Args:
        text: Context passage mentioning the entity.
        entity: Entity string; appended to ``text`` between ``[ENT]`` markers.
        model: Classifier returning logits for ``input_ids`` / ``attention_mask``.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        role_map: Mapping from role label to class index.

    Returns:
        The role label whose index has the highest logit.
    """
    # Prepare input text with entity markers
    input_text = f"{text} [ENT] {entity} [/ENT]"

    # Tokenize
    encoding = tokenizer(
        input_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Run on whatever device the model currently lives on. A model that was
    # just trained on GPU would otherwise receive CPU tensors and fail.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Disable dropout so the prediction is deterministic.
    model.eval()

    # Get prediction
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device)
        )
        prediction = torch.argmax(outputs, dim=1).item()

    # Convert prediction to role label
    inverse_role_map = {v: k for k, v in role_map.items()}
    predicted_role = inverse_role_map[prediction]

    return predicted_role

In [38]:
# Rebuild the main-role model, tokenizer and label map from the checkpoint on disk.
# NOTE(review): this rebinds the notebook-level names `model`, `tokenizer` and `role_map`.
model, tokenizer, role_map = load_model('role_classifier.pt')

2025-01-22 19:04:03,046 - INFO - Initializing MainRoleClassifier


In [19]:
# Smoke test: classify one hand-written sentence with the currently bound main-role model.
text = "The protesters gathered outside the parliament building while Sarah Johnson, the Minister of Justice, addressed the media."
entity = "Sarah Johnson"
role = predict_role(text, entity, model, tokenizer, role_map)
# Print the label map alongside the entity so the predicted label can be read in context.
print(f"Entity: {entity}, {role_map}")
print(f"Predicted Role: {role}")

Entity: Sarah Johnson, {'Antagonist': 0, 'Innocent': 1, 'Protagonist': 2}
Predicted Role: Protagonist


In [17]:
# Neural Network Model for Fine-grained Classification
class FineRoleClassifier(nn.Module):
    """Pretrained encoder with a deeper feed-forward head for fine-grained roles.

    Same layout as the main-role classifier, but the head contains two
    Linear-ReLU-Dropout stages before the width-halving linear layer.
    """

    def __init__(self, model_name='xlm-roberta-base', num_labels=3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        width = self.bert.config.hidden_size
        half_width = width // 2

        # Built as a list first; positions 0..6 inside the Sequential are unchanged.
        head_layers = [
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(width, half_width),
        ]
        self.intermediate = nn.Sequential(*head_layers)

        self.classifier = nn.Linear(half_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for the given token ids and mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at position 0 of every sequence in the batch.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.intermediate(self.dropout(first_token_state))
        return self.classifier(hidden)

def train_fine_model(model, train_loader, val_loader, epochs=5, learning_rate=2e-5):
    """Train ``model`` on the ``fine_role`` labels and return it with its best weights.

    Each epoch runs one training pass (cross-entropy loss, AdamW) and one
    validation pass that logs micro precision, recall and F1. The weights of
    the epoch with the highest validation F1 are restored before returning.

    Args:
        model: Classifier returning logits for ``input_ids`` / ``attention_mask``.
        train_loader: Yields batches with ``input_ids``, ``attention_mask``, ``fine_role``.
        val_loader: Same batch layout, used for validation only.
        epochs: Number of epochs.
        learning_rate: AdamW learning rate.

    Returns:
        The model, on CUDA when available and otherwise CPU.
    """
    logger.info("Starting fine-grained role training")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    best_f1 = 0
    best_model_state = None

    for epoch in range(epochs):
        logger.info(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['fine_role'].to(device)  # Using fine_role instead of main_role

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        logger.info(f"Training Loss for epoch {epoch + 1}: {avg_loss:.4f}")

        # Validation
        model.eval()
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['fine_role']

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs, dim=1).cpu()

                val_preds.extend(preds.numpy())
                val_labels.extend(labels.numpy())

        precision, recall, f1, _ = precision_recall_fscore_support(
            val_labels, val_preds, average='micro'
        )

        # Save best model.
        # The first epoch is always captured so there is something to restore
        # even if F1 never rises above 0. Tensors are cloned because
        # state_dict() returns references to the live parameters: a shallow
        # dict.copy() would keep tracking later optimizer steps and make the
        # "best" snapshot identical to the final weights.
        if best_model_state is None or f1 > best_f1:
            best_f1 = f1
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        logger.info(f"Validation Metrics for epoch {epoch + 1}:")
        logger.info(f"Micro Precision: {precision:.4f}")
        logger.info(f"Micro Recall: {recall:.4f}")
        logger.info(f"Micro F1: {f1:.4f}")

    # Restore best model (nothing to restore when epochs == 0).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

def predict_roles(model, tokenizer, text, entity, role_map, device=None):
    """Predict the role label of ``entity`` in ``text`` on the given device.

    ``device`` defaults to CUDA when available, otherwise CPU. The model is
    switched to eval mode and moved to that device. The input is
    ``"<text> [ENT] <entity> [/ENT]"``, and the label whose index has the
    highest logit is returned.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()
    model = model.to(device)

    # Entity is appended after the text, wrapped in marker tokens.
    marked_text = f"{text} [ENT] {entity} [/ENT]"
    tokens = tokenizer(
        marked_text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        best_index = torch.argmax(logits, dim=1)

    # Invert label -> index so the winning index can be decoded.
    label_by_index = {}
    for label, index in role_map.items():
        label_by_index[index] = label

    return label_by_index[best_index.item()]

# Updated main execution function
def run_fine_role_training(base_path: str, languages: List[str]):
    """Run the fine-grained role pipeline end to end.

    Loads the annotations, builds the tokenizer with the ``[ENT]`` / ``[/ENT]``
    special tokens, splits the dataset 90/10 at random, trains a
    ``FineRoleClassifier`` with a training sampler balanced on the
    fine-grained labels, and returns ``(model, tokenizer, fine_role_to_idx)``.
    """
    logger.info("Starting fine-grained role classification training pipeline")

    # Initialize data processor
    data_processor = MultilingualRoleDataset(base_path, languages)
    data_processor.load_data()

    # Initialize tokenizer
    logger.info("Initializing tokenizer")
    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
    special_tokens = {'additional_special_tokens': ['[ENT]', '[/ENT]']}
    tokenizer.add_special_tokens(special_tokens)

    # Create dataset
    logger.info("Creating dataset")
    dataset = RoleClassificationDataset(data_processor, tokenizer)

    # Split dataset
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # Create data loaders.
    # Balance on the label this model is trained on (fine_role). The shared
    # sampler helper balances on main_role, which leaves rare fine-grained
    # classes under-sampled. Labels are read straight from the annotations via
    # the split's indices, so no item is tokenized just to look at its label;
    # dataset index i corresponds to data_processor.annotations[i].
    logger.info("Creating balanced sampler")
    train_labels = [
        data_processor.fine_role_to_idx[data_processor.annotations[i].fine_role]
        for i in train_dataset.indices
    ]
    label_counts = Counter(train_labels)
    sample_weights = [1.0 / label_counts[label] for label in train_labels]
    train_sampler = WeightedRandomSampler(sample_weights, len(sample_weights))

    train_loader = DataLoader(train_dataset, batch_size=16, sampler=train_sampler)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Initialize fine-grained model
    model = FineRoleClassifier(num_labels=len(data_processor.fine_role_to_idx))
    # The tokenizer gained two special tokens, so the embedding table must grow to match.
    model.bert.resize_token_embeddings(len(tokenizer))

    # Train the model
    model = train_fine_model(model, train_loader, val_loader)

    logger.info("Fine-grained role training complete")
    return model, tokenizer, data_processor.fine_role_to_idx

In [18]:
# Root folder of the training release (relative to the notebook's working
# directory) and the language sub-folders to load from it.
base_path = "Semevaltraining_data_16_October_release"
languages = ['EN', 'BG', 'HI', 'PT']

# Train fine-grained role model
# NOTE(review): this rebinds the notebook-level `tokenizer` name.
fine_model, tokenizer, fine_role_map = run_fine_role_training(base_path, languages)

# Save the trained model
# The label map is stored under 'fine_role_map' (the main-role checkpoint uses 'role_map').
torch.save({
    'model_state_dict': fine_model.state_dict(),
    'fine_role_map': fine_role_map
}, 'fine_role_classifier.pt')

# Example prediction
sample_text = "The protesters gathered outside the parliament building."
sample_entity = "protesters"
predicted_role = predict_roles(fine_model, tokenizer, sample_text, sample_entity, fine_role_map)
print(f"Predicted fine-grained role: {predicted_role}")

2025-01-21 14:45:49,289 - INFO - Starting fine-grained role classification training pipeline
2025-01-21 14:45:49,294 - INFO - Initializing MultilingualRoleDataset
2025-01-21 14:45:49,301 - INFO - Loading annotation data
2025-01-21 14:45:49,302 - INFO - Reading annotations from Semevaltraining_data_16_October_release/EN/subtask-1-annotations.txt
2025-01-21 14:45:49,310 - INFO - Reading annotations from Semevaltraining_data_16_October_release/BG/subtask-1-annotations.txt
2025-01-21 14:45:49,314 - INFO - Reading annotations from Semevaltraining_data_16_October_release/HI/subtask-1-annotations.txt
2025-01-21 14:45:49,319 - INFO - Reading annotations from Semevaltraining_data_16_October_release/PT/subtask-1-annotations.txt
2025-01-21 14:45:49,322 - INFO - Finished processing roles
2025-01-21 14:45:49,322 - INFO - Loading annotations for language: EN
2025-01-21 14:45:49,323 - INFO - Loading raw document for language: EN, doc_id: EN_UA_103861.txt
2025-01-21 14:45:49,326 - INFO - Loading raw d

Predicted fine-grained role: Victim


In [None]:
import os
def text_population(directory, string1, string2):
    """Append two tab-separated columns to every line of every ``.txt`` file.

    Walks ``directory`` recursively and rewrites each ``.txt`` file in place:
    every non-blank line has its trailing whitespace stripped and
    ``\\t<string1>\\t<string2>`` appended. Blank lines are kept blank.

    The rewrite is not idempotent: running it twice appends the columns twice.
    Failures are reported per file with ``print`` and do not stop the walk.
    """
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".txt"):
                continue

            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    lines = f.readlines()

                populated = []
                for line in lines:
                    line = line.rstrip()
                    if line:
                        populated.append(f"{line}\t{string1}\t{string2}\n")
                    else:
                        # A blank line must stay blank; appending the columns
                        # would create a row with no file name or entity.
                        populated.append("\n")

                # Build the full content before reopening the file for writing.
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.writelines(populated)

                print(f"Processed: {file_path}")
            except Exception as e:
                # Best-effort batch job: report the failure and carry on.
                print(f"Error processing {file_path}: {e}")
    
    

In [27]:
def load_model_and_tokenizer(model_path, tokenizer_name, map_location='cpu'):
    """Rebuild the fine-grained classifier and tokenizer from a checkpoint.

    Args:
        model_path: File written by ``torch.save`` containing the keys
            ``'model_state_dict'`` and ``'fine_role_map'``.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        map_location: Passed to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint saved from a GPU run can be loaded on a machine without
            CUDA.

    Returns:
        ``(model, tokenizer, fine_role_map)`` with the model in eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    special_tokens = {'additional_special_tokens': ['[ENT]', '[/ENT]']}
    tokenizer.add_special_tokens(special_tokens)

    checkpoint = torch.load(model_path, map_location=map_location)
    fine_role_map = checkpoint['fine_role_map']

    model = FineRoleClassifier(num_labels=len(fine_role_map))

    # Resize before loading: the saved embedding table already includes the
    # two added special tokens.
    model.bert.resize_token_embeddings(len(tokenizer))

    # strict=False is kept so existing checkpoints still load, but mismatched
    # keys are reported instead of being ignored silently. A missing key means
    # that layer keeps its freshly initialized weights.
    load_result = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        logger.warning(
            f"Checkpoint '{model_path}' did not match the model exactly: "
            f"missing keys={list(load_result.missing_keys)}, "
            f"unexpected keys={list(load_result.unexpected_keys)}"
        )

    # Inference-only loader: make sure dropout is disabled.
    model.eval()

    return model, tokenizer, fine_role_map

In [24]:
# Folder holding the test documents and entity-mention file for one language.
# NOTE(review): absolute, machine-specific path — the cells below only run on a
# machine that has this directory.
directory_to_complete = "/Users/katyhrib/PycharmProjects/semeval/testdata_ST12/EN/"
# Fine-grained role names grouped under each main role.
# NOTE(review): no other cell visible in this notebook reads this dictionary.
character_roles = {
    "PROTAGONIST": ['Guardian', 'Martyr', 'Peacemaker', 'Rebel', 'Underdog', 'Virtuous'],
    "ANTAGONIST": ['Instigator', 'Conspirator', 'Tyrant', 'Foreign_Adversary', 'Traitor',
                   'Spy', 'Saboteur', 'Corrupt', 'Incompetent', 'Terrorist', 'Deceiver', 'Bigot'],
    "INNOCENT": ['Forgotten', 'Exploited', 'Victim', 'Scapegoat']
}

In [23]:
import os
import torch
from transformers import AutoTokenizer

def load_text_file(file_path, encoding='utf-8'):
    """Return the full contents of ``file_path`` as a string.

    The file is decoded as UTF-8 by default, matching how the training loader
    reads raw documents, rather than with the platform's default encoding.
    Pass ``encoding`` to override.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

def parse_entity_file(entity_file_path):
    """Parse a tab-separated entity-mention file.

    Each non-blank line must contain at least three fields: file name, entity
    mention, and one or more integer offsets. Blank lines are skipped. The file
    is read as UTF-8.

    Returns:
        A list of ``(file_name, entity, offsets)`` tuples, where ``offsets`` is
        the list of every field from the third onward converted to ``int``.

    Raises:
        ValueError: If a non-blank line has fewer than three fields, or an
            offset field is not an integer.
    """
    entities_info = []
    with open(entity_file_path, 'r', encoding='utf-8') as ef:
        for line in ef:
            stripped = line.strip()
            # A blank line is not a malformed record; skip it instead of
            # aborting the whole file.
            if not stripped:
                continue
            parts = stripped.split('\t')
            if len(parts) < 3:
                raise ValueError("Each line in the entity file must contain at least a file name, entity mention, and offsets.")
            file_name = parts[0]
            entity = parts[1]
            offsets = list(map(int, parts[2:]))
            entities_info.append((file_name, entity, offsets))
    return entities_info

def predict_entity_roles(model, tokenizer, text, entity, role_map, device=None):
    """Predict the role of ``entity`` in ``text`` by delegating to ``predict_roles``.

    ``text`` is passed through unchanged. ``predict_roles`` already appends
    ``[ENT] <entity> [/ENT]``, so adding the markers here as well produced a
    doubled marker that the training inputs never contained.
    """
    return predict_roles(model, tokenizer, text, entity, role_map, device)

def update_entity_file(entity_file_path, updated_data, output_path=None, encoding='utf-8'):
    """Write prediction lines to a file, one per line.

    Args:
        entity_file_path: Path of the source entity file; only used to derive
            the default output path.
        updated_data: Iterable of strings, written with a trailing newline each.
        output_path: Destination file. Defaults to
            ``entity_file_path + '.predictions'``.
        encoding: Output encoding. Defaults to UTF-8 so non-ASCII entity
            mentions are written the same way on every platform.
    """
    if output_path is None:
        output_path = entity_file_path + '.predictions'

    with open(output_path, 'w', encoding=encoding) as out_file:
        for line in updated_data:
            out_file.write(line + '\n')


In [35]:
def predict_from_multiple_files(model, tokenizer, role_map, text_folder, entity_file,
                                output_entity_file=None, context_window=100):
    """Predict a role for every entity mention listed in ``entity_file``.

    For each ``(file_name, entity, [start, end])`` record, the matching document
    in ``text_folder`` is loaded, the offsets are checked against the text, and
    the role is predicted from a window of text around the mention. One
    ``file_name\\tentity\\tstart\\tend\\trole`` line is written per record.

    Args:
        model: Trained classifier passed to ``predict_roles``.
        tokenizer: Tokenizer passed to ``predict_roles``.
        role_map: Mapping from role label to class index.
        text_folder: Directory containing the documents.
        entity_file: Tab-separated entity-mention file.
        output_entity_file: Destination file. Defaults to
            ``entity_file + '.predictions'``.
        context_window: Number of characters kept on each side of the mention,
            using the same window arithmetic as ``RoleAnnotation.get_context``.
            Pass ``None`` to feed the whole document, as the previous
            implementation did.

    Raises:
        FileNotFoundError: If a referenced document is missing.
        ValueError: If a record does not carry exactly two offsets, the offsets
            fall outside the document, or the text at the offsets differs from
            the entity.
    """
    entities_info = parse_entity_file(entity_file)
    updated_data = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Several mentions usually share a document; read each one only once.
    documents = {}

    for file_name, entity, offsets in entities_info:
        if file_name not in documents:
            text_path = os.path.join(text_folder, file_name)
            if not os.path.exists(text_path):
                raise FileNotFoundError(f"Text file '{file_name}' not found in folder '{text_folder}'.")
            documents[file_name] = load_text_file(text_path)
        text = documents[file_name]

        if len(offsets) != 2:
            raise ValueError(
                f"Expected exactly two offsets for entity '{entity}' in file '{file_name}', got {offsets}."
            )
        start_offset, end_offset = offsets
        if start_offset < 0 or end_offset > len(text):
            raise ValueError(f"Offsets {offsets} are invalid for text file '{file_name}'.")

        # The end offset is inclusive, hence the +1.
        entity_context = text[start_offset:end_offset+1]
        if entity_context != entity:
            raise ValueError(f"Entity '{entity}' does not match the text content at offsets {offsets} in file '{file_name}'.")

        # Feed the model a local window, as in training. Passing the whole
        # document lets truncation at 512 tokens cut off the trailing
        # "[ENT] entity [/ENT]" marker on long documents.
        if context_window is None:
            context = text
        else:
            context_start = max(0, start_offset - context_window)
            context_end = min(len(text), end_offset + context_window)
            context = text[context_start:context_end]

        # predict_roles appends the entity marker itself, so it is called
        # directly with the bare context to avoid a doubled marker.
        predicted_role = predict_roles(model, tokenizer, context, entity, role_map, device)

        updated_line = f"{file_name}\t{entity}\t{start_offset}\t{end_offset}\t{predicted_role}"
        updated_data.append(updated_line)

    update_entity_file(entity_file, updated_data, output_entity_file)
    print(f"Predictions saved to {output_entity_file or entity_file + '.predictions'}")


In [36]:
# Inputs and outputs for batch inference, all derived from `directory_to_complete`.
model_path = 'fine_role_classifier.pt'
tokenizer_name = 'xlm-roberta-base'
text_folder = directory_to_complete + "subtask-1-documents"
entity_file = directory_to_complete +'subtask-1-entity-mentions.txt'
output_entity_file = directory_to_complete + 'subtask-1-entity-mentions3.txt'

# NOTE(review): this rebinds the notebook-level `model` and `tokenizer` to the fine-grained classifier.
model, tokenizer, fine_role_map = load_model_and_tokenizer(model_path, tokenizer_name)



In [37]:
# Predict a role for every listed mention and write the results to `output_entity_file`.
# NOTE(review): the saved output below names a different file than the setup cell above assigns,
# so it appears to come from an earlier run — re-run to confirm.
predict_from_multiple_files(model, tokenizer, fine_role_map, text_folder, entity_file, output_entity_file)

Predictions saved to /Users/katyhrib/PycharmProjects/semeval/testdata_ST12/EN/subtask-1-entity-mentions2.txt


In [33]:
# Manual spot check: print the characters at one offset range of one document.
# NOTE(review): this rebinds the notebook-level `text` and opens the file with the platform's default encoding.
with open(text_folder +'/CC_TEST_00063.txt', 'r') as f:
    text = f.read()
print(text[787:801]) 

Climate Action


In [58]:
# Repoint the inference inputs and outputs at another dataset folder.
# NOTE(review): absolute, machine-specific path.
directory_to_complete = "/Users/katyhrib/PycharmProjects/semeval/dev-documents_4_December/BG/"
model_path = 'fine_role_classifier.pt'
tokenizer_name = 'xlm-roberta-base'
text_folder = directory_to_complete + "subtask-1-documents"
entity_file = directory_to_complete +'subtask-1-entity-mentions.txt'
output_entity_file = directory_to_complete + 'subtask-1-entity-mentions2.txt'

In [None]:
# Load the main-role checkpoint; this replaces whatever `model`, `tokenizer` and `role_map` were bound to.
# NOTE(review): `model_path` set in the previous cell is not used by this call.
model, tokenizer, role_map = load_model('role_classifier.pt')

In [59]:
# Run batch prediction with whichever `model` / `role_map` are currently bound.
# NOTE(review): the preceding load cell shows no execution count, so confirm which model produced the saved output.
predict_from_multiple_files(model, tokenizer, role_map, text_folder, entity_file, output_entity_file)

Predictions saved to /Users/katyhrib/PycharmProjects/semeval/dev-documents_4_December/BG/subtask-1-entity-mentions2.txt


In [52]:
# Manual spot check: print the characters at one offset range of one document.
# NOTE(review): this rebinds the notebook-level `text` and opens the file with the platform's default encoding.
with open(text_folder +'/PT_URW_TEST_489.txt', 'r') as f:
    text = f.read()
print(text[246:251]) 


ia, q
