<a href="https://colab.research.google.com/github/Chaitanya0604/nlp-group-50-semeval-2026-task-4/blob/main/Narrative_Similarity_track_a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook configuration: mount storage, define the dataset/output locations
# used by later cells, and make sure the output folder exists.
#
# Option 1: Use Google Drive (persistent)
from google.colab import drive
# The mount must happen before any /content/drive/... path below is touched.
drive.mount('/content/drive')

# Folder the dataset files are read from.
DATA_PATH = "/content/drive/MyDrive/semeval-2026-task-4-baselines/data"
# Folder that results are written to.
OUTPUT_PATH = "/content/drive/MyDrive/SemEvalProjectNLP/track_a"

# Option 2: Use local content folder (temporary)
# Uncomment these two lines (and skip the Drive mount) to use local paths instead.
# DATA_PATH = "/content/semeval-2026-task-4-baselines/data"
# OUTPUT_PATH = "/content/track_a"

import os
os.makedirs(OUTPUT_PATH, exist_ok=True)  # create output folder if it doesn't exist
print("Output folder:", OUTPUT_PATH)

# Standard imports
import pandas as pd
import numpy as np
import torch
import re
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity


Step 1: Setup & Imports

We import all the necessary Python libraries for data handling, preprocessing, embedding generation, and modeling.
We also set the paths for the dataset and output folders. The notebook can be configured to work either with Google Drive
for persistent storage or with the local /content folder in Colab for temporary storage.
Creating the output folder ensures that all predictions and results are saved properly.


In [None]:
import os
import pandas as pd

# Read the dev split (JSON Lines: one record per line) from DATA_PATH into
# the working DataFrame used by the rest of the notebook.
df = pd.read_json(
    os.path.join(DATA_PATH, "dev_track_a.jsonl"),
    lines=True,
)
# Show the first rows as a quick sanity check.
df.head()


Step 2: Load Dataset

We load the JSONL dataset containing triples of (Anchor, Story A, Story B) with a label indicating which story is more similar to the anchor.

In [None]:
# NOTE(review): this cell repeats the Drive mount from the setup cell and
# rebinds both path constants. DATA_PATH gets the same value as before, but
# OUTPUT_PATH now points at ".../semeval-2026-task-4-baselines/output" instead
# of ".../SemEvalProjectNLP/track_a" — every later cell that reads OUTPUT_PATH
# sees this value. Confirm which output folder is intended.
from google.colab import drive
drive.mount('/content/drive')

# Paths
DATA_PATH = "/content/drive/MyDrive/semeval-2026-task-4-baselines/data"
OUTPUT_PATH = "/content/drive/MyDrive/semeval-2026-task-4-baselines/output"

import os
# Create the (new) output folder if it does not exist yet.
os.makedirs(OUTPUT_PATH, exist_ok=True)


Step 3: Preprocessing

We normalize the text: remove extra spaces, lowercase everything, and normalize quotes. Punctuation is preserved for discourse features.

In [None]:
# Typographic quotes mapped to their ASCII equivalents in a single pass.
# Includes the left single quote (U+2018) so that ‘quoted’ text is normalized
# on both sides, not only on the closing side.
_QUOTE_TABLE = str.maketrans({
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark / apostrophe
})


def preprocess_text(text: str) -> str:
    """
    Clean and normalize text.

    Steps, in order:
    - Strip leading/trailing whitespace
    - Collapse every run of whitespace (spaces, tabs, newlines) to one space
    - Replace curly double/single quotes with straight ASCII quotes
    - Convert to lowercase

    Punctuation is otherwise left untouched.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    text = text.translate(_QUOTE_TABLE)
    return text.lower()

# Normalize the three story columns in place, one column at a time.
for col in ("anchor_text", "text_a", "text_b"):
    df[col] = df[col].map(preprocess_text)


Step 4: Convert Triples to Pairwise Examples

To train a similarity model, we convert each triple into two pairwise examples:
* (Anchor, A) → label = 1 if A is closer, else 0
* (Anchor, B) → label = 1 if B is closer, else 0

We convert each triple into two pairwise samples for training the Siamese and Cross-Encoder models.

In [None]:
# Expand every (anchor, A, B) triple into two (anchor, candidate) rows.
# Rows are emitted in triple order, the A pair first and the B pair second;
# exactly one of the two carries label 1.
pairwise_data = []
for anchor, story_a, story_b, a_is_closer in zip(
    df["anchor_text"], df["text_a"], df["text_b"], df["text_a_is_closer"]
):
    pairwise_data.extend((
        # Pair: (Anchor, A) -> 1 if A is closer else 0
        {"anchor": anchor, "candidate": story_a, "label": 1 if a_is_closer else 0},
        # Pair: (Anchor, B) -> 1 if B is closer else 0
        {"anchor": anchor, "candidate": story_b, "label": 0 if a_is_closer else 1},
    ))

pairwise_df = pd.DataFrame(pairwise_data)
pairwise_df.head()


Step 5: Embedding Creation (Hugging Face)

We generate embeddings using sentence-transformers/all-mpnet-base-v2. Both anchors and candidates are encoded.

In [None]:
# Sentence-embedding model shared by the later cells.
embed_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the anchor column first, then the candidate column; each call
# returns one tensor holding an embedding per row of pairwise_df.
anchor_embeddings, candidate_embeddings = (
    embed_model.encode(
        pairwise_df[column].tolist(),
        convert_to_tensor=True,
        show_progress_bar=True,
    )
    for column in ("anchor", "candidate")
)


Step 5 – Embedding Creation with Optional Models

You can generate embeddings with different types of transformer models, depending on your needs:
Semantic embeddings: Use a general-purpose encoder such as bert-base-uncased (the default in the code below), optionally with fine-tuning.

Discourse-aware embeddings: Use models like facebook/bart-large or roberta-large fine-tuned on NLI/STS tasks to capture narrative coherence and story-level relations.

You can compute embeddings for each story (anchor, A, B) and later compare which model gives better performance.

In [None]:
# Step 5: Embedding Creation with Optional Models

import torch
from transformers import AutoTokenizer, AutoModel

# Any Hugging Face checkpoint can be plugged in here, for example:
#   Semantic:        "bert-base-uncased"
#   Discourse-aware: "facebook/bart-large", "roberta-large-mnli"
model_name = "bert-base-uncased"  # change this if you want another model

# Tokenizer and encoder are loaded from the same checkpoint; the encoder is
# switched to evaluation mode right away since it is only used for inference.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval()

def get_embedding(text):
    """
    Generate an embedding for a text using attention-masked mean pooling.

    The text is tokenized (truncated to 512 tokens), run through the
    module-level ``model`` without gradient tracking, and the final hidden
    states are averaged over the real (non-padding) token positions only.
    For a single string every position is a real token, so this equals the
    plain mean; for a list of strings it keeps padding out of the average.

    Args:
        text: A string (or a list of strings) to embed.

    Returns:
        A NumPy array on the CPU: 1-D of size ``hidden_size`` for a single
        string, or ``(batch, hidden_size)`` for a list with several strings.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    # Run on whatever device the model currently lives on, so moving the
    # model to a GPU does not cause a device-mismatch error.
    inputs = inputs.to(next(model.parameters()).device)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against a division by zero on an all-padding row.
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return embeddings.squeeze().cpu().numpy()

# Embed each story column of the working DataFrame and store the vectors in
# a matching "*_emb" column (anchor first, then A, then B).
for text_col, emb_col in (
    ("anchor_text", "anchor_emb"),
    ("text_a", "text_a_emb"),
    ("text_b", "text_b_emb"),
):
    df[emb_col] = df[text_col].apply(get_embedding)

print("Embeddings computed successfully!")



Step 6: Siamese Model

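We build a Siamese network that takes the embeddings of an (anchor, candidate) pair, concatenates them, and outputs a similarity score between 0 and 1 through a small feed-forward head with a sigmoid output. Note that the cell below only initializes the network; unless it is trained first, its scores come from random weights.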


In [None]:
import torch.nn as nn

class SiameseNetwork(nn.Module):
    """
    Pair scorer over precomputed embeddings.

    The anchor and candidate embeddings are concatenated along the feature
    axis and fed through a two-layer MLP (Linear -> ReLU -> Linear -> Sigmoid),
    giving one score between 0 and 1 per pair.

    Args:
        embedding_dim: Size of each input embedding (default 768).
        hidden_dim: Width of the hidden layer (default 256, the value that
            was previously fixed).
    """

    def __init__(self, embedding_dim=768, hidden_dim=256):
        super().__init__()
        # Input width is doubled because the two embeddings are concatenated.
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, anchor_emb, candidate_emb):
        """
        Score a batch of (anchor, candidate) embedding pairs.

        Args:
            anchor_emb: Tensor of shape (batch, embedding_dim).
            candidate_emb: Tensor of shape (batch, embedding_dim).

        Returns:
            Tensor of shape (batch, 1) with sigmoid scores.
        """
        x = torch.cat([anchor_emb, candidate_emb], dim=1)
        return self.fc(x)

# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the network with its default sizes and place it on that device.
siamese_model = SiameseNetwork().to(device)


Step 7: Cross-Encoder Model

We use a Hugging Face CrossEncoder to predict similarity on concatenated text pairs.

In [None]:
# Cross-encoder that scores a text pair jointly; a single output label
# means one score is produced per pair.
cross_model = CrossEncoder(
    'cross-encoder/stsb-roberta-large',
    num_labels=1,
    device=device,
)


Step 8: Ensemble Prediction Logic

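We combine three methods: the Siamese similarity score, the Cross-Encoder score, and the cosine similarity of the embeddings. For each candidate story the three scores are averaged with equal weight, and the story with the higher average is predicted as closer to the anchor (a tie is resolved as B).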

In [None]:
def predict_ensemble(anchor, text_a, text_b):
    """
    Decide which of two stories is closer to the anchor.

    Three scores are computed for each candidate and averaged with equal
    weight:
    - Siamese network score on the sentence embeddings
    - Cosine similarity of the sentence embeddings
    - Cross-encoder score on the raw text pair

    NOTE(review): no training step for ``siamese_model`` is visible in this
    file; if it is still randomly initialized its score only adds noise to
    the average — confirm before relying on the ensemble.

    Args:
        anchor: The anchor story text.
        text_a: Candidate story A.
        text_b: Candidate story B.

    Returns:
        "A" if story A has the strictly higher mean score, otherwise "B"
        (ties therefore resolve to "B").
    """
    anchor = preprocess_text(anchor)
    text_a = preprocess_text(text_a)
    text_b = preprocess_text(text_b)

    # Encode all three texts in one batched call instead of three separate
    # ones; row 0 is the anchor, rows 1 and 2 are candidates A and B.
    embeddings = embed_model.encode(
        [anchor, text_a, text_b], convert_to_tensor=True
    ).to(device)
    anchor_emb = embeddings[0:1]
    candidate_embs = embeddings[1:3]

    # Siamese: inference only, so skip autograd bookkeeping. The anchor row
    # is repeated (as a view) so both candidates are scored in one pass.
    with torch.no_grad():
        siam_scores = siamese_model(anchor_emb.expand(2, -1), candidate_embs)
    siam_a, siam_b = siam_scores.squeeze(1).tolist()

    # Cosine similarity: convert to NumPy once and compare the anchor with
    # both candidates in a single call.
    embeddings_np = embeddings.cpu().numpy()
    cos_a, cos_b = cosine_similarity(embeddings_np[0:1], embeddings_np[1:3])[0]

    # Cross-encoder
    cross_scores = cross_model.predict([[anchor, text_a], [anchor, text_b]])
    cross_a, cross_b = cross_scores[0], cross_scores[1]

    # Ensemble average
    score_a = np.mean([siam_a, cos_a, cross_a])
    score_b = np.mean([siam_b, cos_b, cross_b])

    return "A" if score_a > score_b else "B"

def _predict_row(row):
    """Run the ensemble on one DataFrame row and return "A" or "B"."""
    return predict_ensemble(row["anchor_text"], row["text_a"], row["text_b"])


# Score every triple of the dev set row by row and keep the winning letter.
df["predicted"] = df.apply(_predict_row, axis=1)
df.head()


Step 9: Evaluate

We compute accuracy and Pearson correlation between predicted and human labels.

In [None]:
from scipy.stats import pearsonr

# Gold labels as integers: 1 when story A is the closer one, else 0.
labels = df["text_a_is_closer"].astype(int)
# Predictions in the same encoding: 1 for "A", 0 for anything else.
df["predicted_label"] = df["predicted"].eq("A").astype(int)

# Fraction of rows where the prediction matches the gold label.
accuracy = (df["predicted_label"] == labels).mean()
# Pearson correlation between the two binary vectors; the p-value is unused.
r, _ = pearsonr(df["predicted_label"], labels)

print(f"Accuracy: {accuracy:.3f}")
print(f"Pearson r: {r:.3f}")


Step 10: Save Output

We save predictions in track_a.jsonl in the output folder. This can be in /content or Google Drive.



In [None]:
# Write the working DataFrame as JSON Lines (one record per row) into the
# output folder.
# NOTE(review): the whole DataFrame is written, i.e. every column present at
# this point, not only the prediction columns — confirm that is intended.
output_file = os.path.join(OUTPUT_PATH, "track_a.jsonl")
df.to_json(
    output_file,
    orient='records',
    lines=True,
)
print("Saved predictions to:", output_file)