In [1]:
import pandas as pd
import os

# 1. Paths to Annotations and Documents
annotation_file = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/subtask-1-annotations.txt"  # Path to the annotation file
documents_dir = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/raw-documents"  # Directory where text files are stored

# Read annotations as raw text
with open(annotation_file, "r", encoding="utf-8") as file:
    annotation_lines = file.readlines()

# Parse annotation lines into a structured format
structured_data = []

for line in annotation_lines:
    # Split the line by tab or another consistent delimiter
    parts = line.strip().split("\t")
    file_id = parts[0]  # File identifier (e.g., BG_670.txt)
    entity = parts[1]  # Named entity
    start = int(parts[2])  # Start offset
    end = int(parts[3])  # End offset
    roles = parts[4:]  # Dynamic roles

    # Load the corresponding document text using the file ID
    document_path = os.path.join(documents_dir, file_id)
    if os.path.exists(document_path):
        with open(document_path, "r", encoding="utf-8") as doc_file:
            document = doc_file.read()
    else:
        print(f"Warning: File {file_id} not found in {documents_dir}")
        document = ""

    # Append structured data
    structured_data.append({
        "File": file_id,
        "Document": document,
        "Entity": entity,
        "Start": start,
        "End": end,
        "main_role": roles[0],
        "fine_grained_roles": roles[1:]
    })

# Create a DataFrame
df = pd.DataFrame(structured_data)

In [2]:
df.head()

Unnamed: 0,File,Document,Entity,Start,End,main_role,fine_grained_roles
0,EN_UA_103861.txt,The World Needs Peacemaker Trump Again \n\n by...,Chinese,791,797,Antagonist,[Spy]
1,EN_UA_103861.txt,The World Needs Peacemaker Trump Again \n\n by...,China,1516,1520,Antagonist,[Instigator]
2,EN_UA_103861.txt,The World Needs Peacemaker Trump Again \n\n by...,Hamas,2121,2125,Antagonist,[Terrorist]
3,EN_UA_103861.txt,The World Needs Peacemaker Trump Again \n\n by...,Donald Trump,4909,4920,Protagonist,"[Peacemaker, Guardian]"
4,EN_UA_021270.txt,"Ukraine's Fate Will Be Decided In Coming Year,...",Yermak,667,672,Antagonist,[Incompetent]


In [3]:
df['main_role'].value_counts()

main_role
Antagonist     264
Protagonist    103
Innocent        47
Name: count, dtype: int64

In [4]:
fine_grained_roles_count = df["fine_grained_roles"].explode().value_counts()

# Display results
fine_grained_roles_count

fine_grained_roles
Instigator           49
Guardian             40
Conspirator          38
Incompetent          35
Foreign Adversary    35
Victim               33
Tyrant               29
Deceiver             26
Saboteur             20
Virtuous             19
Corrupt              17
Peacemaker           15
Terrorist            14
Underdog             12
Martyr               11
Rebel                11
Bigot                 9
Traitor               8
Scapegoat             8
Exploited             6
Spy                   3
Forgotten             1
Name: count, dtype: int64

In [5]:
def find_paragraph_with_entity(text, entity):
    # Split the text into paragraphs
    paragraphs = text.split("\n\n")  # Assuming paragraphs are separated by double newlines

    # Search for the paragraph containing the entity
    for paragraph in paragraphs:
        if entity in paragraph:
            return paragraph  # Return the paragraph if the entity is found

    return None  # Return None if the entity is not found in any paragraph

# Example usage
text = """
Climate Crazies Fail in Attempt to Vandalize Another Classic Work of Art 

Another of the world’s most recognized and most valuable pieces of art was the target of climate-change activists. Climate crazies tried — and failed — to glue themselves to Edvard Munch’s 1893 painting “The Scream” in Oslo on Friday.

It was yet another example of climate change activists using priceless works of art to protest the use of fossil fuels, which climate zealots believe is leading to global warming. In October, climate activists attacked “Girl with a Pearl Earring” by Johannes Vermeer in The Hague. Climate hysterics were also responsible for an attack on Van Gogh’s “Sunflowers” and John Constable’s “The Hay Wain” in London over the summer.

The new climate-related vandalism fad seems to have begun with an attack on Da Vinci’s “Mona Lisa” in which a climate fanatic feigned a disability in order to get close enough to smear a pastry on the painting.

In addition, “Peach Trees in Blossom” by Van Gogh; “My Heart’s in the Highlands” by Horatio McCulloch; “Tomson’s Aeolian Harp” by J.M.W. Turner; “The Last Supper” by Giampietrino; “Sistine Madonna” by Raphael; and “Haystacks” by Monet have been targeted by climate hysterics since May.

As of yet, none of the artwork has been damaged, due to being protected by glass.

Video of Friday’s attack shows two young climate vandals attempting to glue themselves to the artwork. Police apprehended the hooligans, and reported there was some glue residue on the glass that protects the paintings.

“I scream for people dying,” one of the activists shouted.

“I scream when lawmakers ignore science,” the other shouted.

The Norwegian climate activist group Stopp Oljeletinga, which translates to “Stop Oil Exploration,” claimed responsibility for the attack.

The group demands that the Norwegian government declare “an immediate halt to all further exploration for oil on the Norwegian continental shelf,” and present “a concrete plan for fair adjustment for today’s oil workers.”

A spokesperson for the group claimed that the vandalism was an attempt to “pressure lawmakers into stopping oil exploration.”

“We are campaigning against ‘Scream’ because it is perhaps Norway’s most famous painting,” said Astrid Rem, a spokesperson for Stopp Oljeletinga. “There have been lots of similar actions around Europe. They have managed something that no other action has managed: achieve an extremely large amount of coverage and press.”

But there’s good press and there’s bad press. These crazy antics are of the bad variety.

Norway is one of the world’s top oil exporters and provides oil and natural gas to much of Europe, a continent in the midst of a serious energy crunch brought about partly by the war in Ukraine. Russia, the largest supplier of natural gas to Europe, has severely restricted supplies and has shut down the Nord Stream 2 pipeline, which it claims was sabotaged.

Without much-needed Norwegian fossil fuels, Europe could be in for an extremely cold winter.

The art world has acknowledged their concern over the new phenomenon:

“In recent weeks, there have been several attacks on works of art in international museum collections. The activists responsible for them severely underestimate the fragility of these irreplaceable objects, which must be preserved as part of our world cultural heritage,” read a statement signed by approximately one hundred gallery directors and museums.

These climate crazies are apparently willing to allow a continent to freeze this winter over their fears of an over-hyped “problem” peddled by globalists who flew more than 400 carbon-spewing private jets to COP27 in Egypt last week. If they truly believe the hype surrounding climate change, they’re targeting the wrong thing.
"""
entity_to_find = "COP27"
paragraph = find_paragraph_with_entity(text, entity_to_find)

if paragraph:
    print(f"Paragraph containing '{entity_to_find}':\n\n{paragraph}")
else:
    print(f"The entity '{entity_to_find}' was not found in the text.")

Paragraph containing 'COP27':

These climate crazies are apparently willing to allow a continent to freeze this winter over their fears of an over-hyped “problem” peddled by globalists who flew more than 400 carbon-spewing private jets to COP27 in Egypt last week. If they truly believe the hype surrounding climate change, they’re targeting the wrong thing.



In [8]:
import os

# Define the directory containing the text files
directory = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/raw-documents"

# Initialize a counter for paragraphs exceeding 400 words
paragraphs_exceeding_400 = 0

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):  # Only process text files
        file_path = os.path.join(directory, filename)
        
        # Read the text from the file
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        
        # Split the text into paragraphs
        paragraphs = text.split("\n\n")  # Assuming paragraphs are separated by double newlines
        
        # Count the number of words in each paragraph and check if it exceeds 400
        for paragraph in paragraphs:
            word_count = len(paragraph.split())
            if word_count > 300:
                paragraphs_exceeding_400 += 1

# Output the result
print(f"Number of paragraphs exceeding 400 words: {paragraphs_exceeding_400}")


Number of paragraphs exceeding 400 words: 0


In [10]:
def get_paragraph_by_entity_offset(text, start_offset, end_offset):
    """
    Given a text and the start and end offsets of an entity, return the paragraph containing the entity.
    
    Args:
        text (str): The full text.
        start_offset (int): The starting character offset of the entity.
        end_offset (int): The ending character offset of the entity.
    
    Returns:
        str: The paragraph containing the entity, or None if not found.
    """
    # Split the text into paragraphs
    paragraphs = text.split("\n\n")  # Assuming paragraphs are separated by double newlines
    
    # Track the start and end indices of each paragraph
    current_position = 0
    for paragraph in paragraphs:
        start = current_position
        end = start + len(paragraph)
        
        # Check if the entity offsets fall within this paragraph
        if start <= start_offset < end or start < end_offset <= end:
            return paragraph
        
        # Update the current position (add 2 for double newline delimiter)
        current_position = end + 2
    
    return None  # Return None if no paragraph matches

# Example usage
text = """
Climate Crazies Fail in Attempt to Vandalize Another Classic Work of Art 

 Another of the world’s most recognized and most valuable pieces of art was the target of climate-change activists. Climate crazies tried — and failed — to glue themselves to Edvard Munch’s 1893 painting “The Scream” in Oslo on Friday.

It was yet another example of climate change activists using priceless works of art to protest the use of fossil fuels, which climate zealots believe is leading to global warming. In October, climate activists attacked “Girl with a Pearl Earring” by Johannes Vermeer in The Hague. Climate hysterics were also responsible for an attack on Van Gogh’s “Sunflowers” and John Constable’s “The Hay Wain” in London over the summer.

The new climate-related vandalism fad seems to have begun with an attack on Da Vinci’s “Mona Lisa” in which a climate fanatic feigned a disability in order to get close enough to smear a pastry on the painting.

In addition, “Peach Trees in Blossom” by Van Gogh; “My Heart’s in the Highlands” by Horatio McCulloch; “Tomson’s Aeolian Harp” by J.M.W. Turner; “The Last Supper” by Giampietrino; “Sistine Madonna” by Raphael; and “Haystacks” by Monet have been targeted by climate hysterics since May.

As of yet, none of the artwork has been damaged, due to being protected by glass.

Video of Friday’s attack shows two young climate vandals attempting to glue themselves to the artwork. Police apprehended the hooligans, and reported there was some glue residue on the glass that protects the paintings.

“I scream for people dying,” one of the activists shouted.

“I scream when lawmakers ignore science,” the other shouted.

The Norwegian climate activist group Stopp Oljeletinga, which translates to “Stop Oil Exploration,” claimed responsibility for the attack.

The group demands that the Norwegian government declare “an immediate halt to all further exploration for oil on the Norwegian continental shelf,” and present “a concrete plan for fair adjustment for today’s oil workers.”

A spokesperson for the group claimed that the vandalism was an attempt to “pressure lawmakers into stopping oil exploration.”

“We are campaigning against ‘Scream’ because it is perhaps Norway’s most famous painting,” said Astrid Rem, a spokesperson for Stopp Oljeletinga. “There have been lots of similar actions around Europe. They have managed something that no other action has managed: achieve an extremely large amount of coverage and press.”

But there’s good press and there’s bad press. These crazy antics are of the bad variety.

Norway is one of the world’s top oil exporters and provides oil and natural gas to much of Europe, a continent in the midst of a serious energy crunch brought about partly by the war in Ukraine. Russia, the largest supplier of natural gas to Europe, has severely restricted supplies and has shut down the Nord Stream 2 pipeline, which it claims was sabotaged.

Without much-needed Norwegian fossil fuels, Europe could be in for an extremely cold winter.

The art world has acknowledged their concern over the new phenomenon:

“In recent weeks, there have been several attacks on works of art in international museum collections. The activists responsible for them severely underestimate the fragility of these irreplaceable objects, which must be preserved as part of our world cultural heritage,” read a statement signed by approximately one hundred gallery directors and museums.

These climate crazies are apparently willing to allow a continent to freeze this winter over their fears of an over-hyped “problem” peddled by globalists who flew more than 400 carbon-spewing private jets to COP27 in Egypt last week. If they truly believe the hype surrounding climate change, they’re targeting the wrong thing.
"""

start_offset = 1700  # Start index of the entity
end_offset = 1716    # End index of the entity

paragraph = get_paragraph_by_entity_offset(text, start_offset, end_offset)

if paragraph:
    print(f"Paragraph containing the entity ({start_offset}, {end_offset}):\n{paragraph}")
else:
    print(f"No paragraph found for the entity ({start_offset}, {end_offset}).")


Paragraph containing the entity (1700, 1716):
The Norwegian climate activist group Stopp Oljeletinga, which translates to “Stop Oil Exploration,” claimed responsibility for the attack.


In [3]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from data.preprocessing import load_annotations_no_tokenization

# Add the parent directory (code/) to the Python path
annotation_file = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/subtask-1-annotations.txt"
documents_dir = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/raw-documents"


# Load and preprocess data without tokenization
df = load_annotations_no_tokenization(annotation_file, documents_dir)

# Display a sample of the processed data
df.head()

               File Original Entity  \
0  EN_UA_103861.txt         Chinese   
1  EN_UA_103861.txt           China   
2  EN_UA_103861.txt           Hamas   
3  EN_UA_103861.txt    Donald Trump   
4  EN_UA_021270.txt          Yermak   

                                 Processed Paragraph    main_role  \
0  There has been an astounding 6,300% increase i...   Antagonist   
1  China is constantly threatening Taiwan and its...   Antagonist   
2  On October 7, Israel was invaded by Hamas resu...   Antagonist   
3  As the world deals with multiple international...  Protagonist   
4  Yermak sought to assure the audience that Zele...   Antagonist   

       fine_grained_roles  
0                   [Spy]  
1            [Instigator]  
2             [Terrorist]  
3  [Peacemaker, Guardian]  
4           [Incompetent]  


In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from data.preprocessing import load_annotations_no_tokenization

# Add the parent directory (code/) to the Python path
annotation_file = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/subtask-1-annotations.txt"
documents_dir = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/raw-documents"


annotation_file_test = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/subtask-1-entity-mentions.txt"
documents_dir_test = "/home/ali.mekky/Documents/NLP/Assignment_2/SemEval2024/EN/subtask-1-documents"

 
# Load and preprocess data without tokenization
# df = load_annotations_no_tokenization(annotation_file_test, documents_dir_test, is_test=True)
df = load_annotations_no_tokenization(annotation_file, documents_dir, is_test= False)

# Display a sample of the processed data
# df.head()

['Antagonist', 'Spy']
1
['Antagonist', 'Instigator']
1
['Antagonist', 'Terrorist']
1
['Protagonist', 'Peacemaker', 'Guardian']
0
['Antagonist', 'Incompetent']
1
['Antagonist', 'Incompetent']
1
['Antagonist', 'Incompetent']
1
['Antagonist', 'Traitor']
1
['Antagonist', 'Bigot']
1
['Antagonist', 'Instigator']
1
['Protagonist', 'Guardian']
0
['Antagonist', 'Foreign Adversary']
1
['Antagonist', 'Foreign Adversary']
1
['Antagonist', 'Tyrant']
1
['Innocent', 'Victim']
2
['Antagonist', 'Instigator']
1
['Antagonist', 'Foreign Adversary']
1
['Antagonist', 'Foreign Adversary']
1
['Protagonist', 'Guardian']
0
['Protagonist', 'Guardian']
0
['Protagonist', 'Underdog']
0
['Antagonist', 'Terrorist']
1
['Antagonist', 'Corrupt']
1
['Antagonist', 'Tyrant']
1
['Antagonist', 'Tyrant']
1
['Antagonist', 'Tyrant']
1
['Antagonist', 'Corrupt']
1
['Antagonist', 'Foreign Adversary']
1
['Antagonist', 'Corrupt']
1
['Protagonist', 'Guardian']
0
['Protagonist', 'Guardian']
0
['Antagonist', 'Foreign Adversary']
1
['Pr

In [2]:
df = df[df['File'] == "EN_CC_100012.txt"]

In [3]:
none_rows = df[df['Processed Paragraph'].isnull()]
none_rows.head()

Unnamed: 0,File,Original Entity,Processed Paragraph,Start,End,main_role,fine_grained_roles,multi_labels,main_role_label


In [4]:
df

Unnamed: 0,File,Original Entity,Processed Paragraph,Start,End,main_role,fine_grained_roles,multi_labels,main_role_label
362,EN_CC_100012.txt,World Economic Forum,The <ENTITY_START>World Economic Forum<ENTITY...,87,106,Antagonist,[Conspirator],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1
363,EN_CC_100012.txt,the United Nations,The WEF has enlisted <ENTITY_START>the United ...,691,708,Antagonist,[Conspirator],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1
364,EN_CC_100012.txt,Dr. Carol Baker,<ENTITY_START>Dr. Carol Baker<ENTITY_END> was ...,1775,1789,Antagonist,[Conspirator],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1
365,EN_CC_100012.txt,Bill Gates-,Is anyone surprised that a <ENTITY_START>Bill ...,2228,2238,Antagonist,[Conspirator],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1
366,EN_CC_100012.txt,Biden,<ENTITY_START>Biden<ENTITY_END> is a compromis...,3520,3524,Antagonist,"[Corrupt, Traitor]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",1
367,EN_CC_100012.txt,Robert F. Kennedy,"But it gets even worse, as <ENTITY_START>Rober...",4416,4432,Protagonist,[Guardian],"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [14]:
from transformers import AutoTokenizer
from torch.optim import AdamW  # Use this instead of transformers.AdamW
from torch.utils.data import DataLoader
from data.dataset import RoleDataset
from data.preprocessing import load_and_tokenize_data
from models.model import DebertaForMultiLabelClassification, FocalLoss
from training.train import train_model
from training.evaluate import evaluate_model
import torch
import numpy as np
from data.preprocessing import split_dataset_with_stratification
from transformers import get_scheduler

from torch.utils.tensorboard import SummaryWriter

# Paths
# save_path = "preprocessed_data.pt"
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


text = "But it gets even worse, as <ENTITY_START>Robert F. Kennedy<ENTITY_END> explained in New York City. The bioweapon is targeted to take out white people and spare other races."

tokenized_inputs = tokenizer(
    text,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

print(tokenized_inputs)
41552
8662

{'input_ids': tensor([[    0,  1708,    24,  1516,   190,  3007,     6,    25, 28696,  5382,
          8662,  1215,  4014, 11328, 15698, 25244,   274,     4,  5076, 41552,
          5382,  8662,  1215,  9309, 15698,  2002,    11,   188,   469,   412,
             4,    20,  4003, 22034, 36951,    16,  3656,     7,   185,    66,
          1104,    82,     8, 10628,    97,  4694,     4,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

8662

In [15]:
INPUT_TEXT = "<ENTITY_START>Biden<ENTITY_END> is a compromised puppet politician who has been shamelessly selling out America to the globalists for decades."
tokenized_inputs = tokenizer(
    INPUT_TEXT,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

print(tokenized_inputs)

{'input_ids': tensor([[    0, 41552,  5382,  8662,  1215,  4014, 11328, 15698,   387, 12145,
         41552,  5382,  8662,  1215,  9309, 15698,    16,    10, 13969, 29771,
          8676,    54,    34,    57, 36778,   352,  2183,    66,   730,     7,
             5,   720,  1952,    13,  1724,     4,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [11]:
input_ids = tokenized_inputs["input_ids"]
entity_start_id = tokenizer.convert_tokens_to_ids("<ENTITY_START>")
entity_end_id = tokenizer.convert_tokens_to_ids("<ENTITY_END>")


In [12]:
print(entity_start_id)
print(entity_end_id)

3
3


In [9]:
entity_start_pos = (input_ids == entity_start_id).nonzero(as_tuple=True)[1].item() + 1  # Position after <ENTITY_START>
entity_end_pos = (input_ids == entity_end_id).nonzero(as_tuple=True)[1].item()   

RuntimeError: a Tensor with 0 elements cannot be converted to Scalar

In [None]:
entity_embeddings = last_hidden_state[0, entity_start_pos:entity_end_pos, :]  # Shape: (entity_length, hidden_size)

In [31]:
import torch
from transformers import AutoTokenizer, AutoModel

# Example input text
text = "<ENTITY_START>Biden<ENTITY_END> is a compromised puppet politician who has been shamelessly selling out America to the globalists for decades."

# Load tokenizer and model
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Add custom tokens for <ENTITY_START> and <ENTITY_END>
special_tokens = {"additional_special_tokens": ["<ENTITY_START>", "<ENTITY_END>"]}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# Tokenize the input text
inputs = tokenizer(
    text,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

# Convert special tokens to IDs
entity_start_id = tokenizer.convert_tokens_to_ids("<ENTITY_START>")
entity_end_id = tokenizer.convert_tokens_to_ids("<ENTITY_END>")

print(entity_start_id)
print(entity_end_id)

# Get input IDs and find positions of <ENTITY_START> and <ENTITY_END>
input_ids = inputs["input_ids"].squeeze(0)
entity_start_idx = (input_ids == entity_start_id).nonzero(as_tuple=True)[0].item() + 1
entity_end_idx = (input_ids == entity_end_id).nonzero(as_tuple=True)[0].item()

# Pass the input through the model to get hidden states
with torch.no_grad():
    outputs = model(**inputs)

# Extract token embeddings between <ENTITY_START> and <ENTITY_END>
hidden_states = outputs.last_hidden_state.squeeze(0)  # Shape: (seq_length, hidden_size)
entity_embeddings = hidden_states[entity_start_idx:entity_end_idx]  # Extract embeddings

# Extract token IDs and corresponding tokens between the markers
tokens_between = input_ids[entity_start_idx:entity_end_idx]
tokens_text = tokenizer.convert_ids_to_tokens(tokens_between)

# Print results
print(f"Tokens between <ENTITY_START> and <ENTITY_END>: {tokens_text}")
print(f"Embeddings shape: {entity_embeddings.shape}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


50265
50266
Tokens between <ENTITY_START> and <ENTITY_END>: ['B', 'iden']
Embeddings shape: torch.Size([2, 768])
