# Load Data

In [1]:
!gdown 1dF9QHxdcM9LC-YahddQiKmqbCEiz_DZm
!unzip /content/data.zip

Downloading...
From: https://drive.google.com/uc?id=1dF9QHxdcM9LC-YahddQiKmqbCEiz_DZm
To: /content/data.zip
  0% 0.00/182k [00:00<?, ?B/s]100% 182k/182k [00:00<00:00, 153MB/s]
Archive:  /content/data.zip
   creating: data/
  inflating: data/50test_questions.json  
  inflating: data/QA_records_final.json  


## Extract Sentences

In [2]:
import json
with open("data/QA_records_final.json", encoding="utf-8") as f:
    data = json.load(f)

sentences = [entry["sentence"].replace("\u200c", "") for entry in data]

## Create a Dataset for MLM

In [3]:
from datasets import Dataset
dataset = Dataset.from_dict({"text": sentences})

## Tokenize with GLOT500 Tokenizer

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('cis-lmu/glot500-base')

def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/7.66M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

## Create MLM-style dataset

In [5]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Load GLOT500 Model for MLM

In [6]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("cis-lmu/glot500-base")

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

## Train the Model for MLM

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./glot500-mlm",
    report_to="none",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=430, training_loss=1.5029032862463663, metrics={'train_runtime': 439.6743, 'train_samples_per_second': 3.889, 'train_steps_per_second': 0.978, 'total_flos': 113006789245440.0, 'train_loss': 1.5029032862463663, 'epoch': 5.0})

## Save pre-trained model and its tokenizer

In [8]:
model.save_pretrained("./glot500-mlm")
tokenizer.save_pretrained("./glot500-mlm")

('./glot500-mlm/tokenizer_config.json',
 './glot500-mlm/special_tokens_map.json',
 './glot500-mlm/sentencepiece.bpe.model',
 './glot500-mlm/added_tokens.json',
 './glot500-mlm/tokenizer.json')

# Contrastive Learning

-------------------------------------------------


## Load Data for CL

In [9]:
with open("data/QA_records_final.json", encoding="utf-8") as f:
    raw_data = json.load(f)

sentence_question_pairs = []
for item in raw_data:
    sentence = item["sentence"].replace("\u200c", "")
    questions = [qa["question"].replace("\u200c", "") for qa in item["Q&A"][:5]]
    if len(questions) == 5:
        sentence_question_pairs.append((sentence, questions))

In [10]:
def collate_fn(batch):
    sentences = []
    questions = []
    for sentence, q_list in batch:
        sentences.append(sentence)
        questions.extend(q_list)
    return sentences, questions  # len(sentences) = B, len(questions) = B * 5

## Create a Dataset for CL

In [11]:
from torch.utils.data import Dataset, DataLoader

class SQDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs  # each item is (sentence, [q1, q2, q3, q4, q5])

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]  # returns (sentence, [5 questions])

dataset = SQDataset(sentence_question_pairs)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

## Define the Info-NCE loss

In [12]:
import torch
import torch.nn.functional as F

def info_nce_loss(context_embs, question_embs, temperature=0.05):
    B, D = context_embs.size()
    G = 5
    device = context_embs.device
    context_embs = F.normalize(context_embs, dim=1)        # (B, D)
    question_embs = F.normalize(question_embs, dim=1)      # (B*G, D)

    # Questions → Contexts
    logits_q2c = torch.matmul(question_embs, context_embs.T) / temperature  # (B*G, B)
    # Positive mask for q2c: each group of G questions maps to 1 context
    pos_q2c = torch.arange(B, device=device).repeat_interleave(G)  # (B*G,)
    loss_q2c = F.cross_entropy(logits_q2c, pos_q2c)

    # Contexts → Questions
    logits_c2q = torch.matmul(context_embs, question_embs.T) / temperature  # (B, B*G)
    # Positive mask for c2q: each context matches its G questions
    pos_mask_c2q = torch.zeros(B, B * G, device=device)
    for i in range(B):
        pos_mask_c2q[i, i * G: (i + 1) * G] = 1
    # Log softmax across all candidate questions
    log_probs_c2q = F.log_softmax(logits_c2q, dim=1)
    # Average log-probability across the G positives
    loss_c2q = -(pos_mask_c2q * log_probs_c2q).sum(dim=1).mean()

    loss = (loss_q2c + loss_c2q) / 2
    return loss

## Load the pre-trained model (MLM)

In [13]:
from transformers import AutoModel
tokenizer = AutoTokenizer.from_pretrained("./glot500-mlm")
model = AutoModel.from_pretrained("./glot500-mlm").to("cuda")

Some weights of XLMRobertaModel were not initialized from the model checkpoint at ./glot500-mlm and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-tune the pre-trained model for CL

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(10):
    total_loss = 0
    for batch in dataloader:
        sentences, questions = batch  # sentences: [B], questions: [B×5]
        encoded_sent = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to("cuda")
        encoded_ques = tokenizer(questions, padding=True, truncation=True, return_tensors="pt").to("cuda")

        # Get CLS representations from the model
        sent_embs = model(**encoded_sent).last_hidden_state[:, 0, :]  # [B, D]
        ques_embs = model(**encoded_ques).last_hidden_state[:, 0, :]  # [B×5, D]

        # Compute InfoNCE loss
        loss = info_nce_loss(sent_embs, ques_embs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Avg Loss = {total_loss / len(dataloader):.4f}")

Epoch 1: Avg Loss = 10.3681
Epoch 2: Avg Loss = 7.0942
Epoch 3: Avg Loss = 6.2030
Epoch 4: Avg Loss = 5.7654
Epoch 5: Avg Loss = 5.5200
Epoch 6: Avg Loss = 5.2402
Epoch 7: Avg Loss = 5.0995
Epoch 8: Avg Loss = 4.9834
Epoch 9: Avg Loss = 4.8825
Epoch 10: Avg Loss = 4.7696


In [15]:
model.save_pretrained("./glot500-contrastive")
tokenizer.save_pretrained("./glot500-contrastive")

('./glot500-contrastive/tokenizer_config.json',
 './glot500-contrastive/special_tokens_map.json',
 './glot500-contrastive/sentencepiece.bpe.model',
 './glot500-contrastive/added_tokens.json',
 './glot500-contrastive/tokenizer.json')

# Validate the models
------------

## Load and Prepare Validation Data

In [17]:
with open("/content/data/50test_questions.json", encoding="utf-8") as f:
    val_data = json.load(f)

eval_questions = []
for dictionary in val_data:
    eval_questions.append(dictionary["question"].replace("\u200c", ""))

contexts = sentences = [entry["sentence"].replace("\u200c", "") for entry in data]

## TF-IDF Approach

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer().fit(contexts)
context_vectors = vectorizer.transform(contexts)

def tfidf_answer(question):
    q_vec = vectorizer.transform([question])
    sims = torch.tensor(cosine_similarity(q_vec, context_vectors))
    _, top_indices = torch.topk(sims, k=3)
    return [contexts[i] for i in top_indices[0]]

tfidf_outputs = [tfidf_answer(q) for q in eval_questions]

## Required Functions for Retrieval

In [20]:
def get_embeddings(texts, model, tokenizer, batch_size=8):
    model.eval()
    embeddings = []
    dataloader = DataLoader(texts, batch_size=batch_size)
    with torch.no_grad():
        for batch in dataloader:
            tokenized = tokenizer(list(batch), return_tensors="pt", truncation=True, padding=True).to("cuda")
            output = model(**tokenized)
            cls_emb = output.last_hidden_state[:, 0, :]
            cls_emb = F.normalize(cls_emb, dim=1)
            embeddings.append(cls_emb)
    return torch.cat(embeddings, dim=0)

def contrastive_answer(question, model, tokenizer, candidate_embs):
    question_enc = tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        question_emb = model(**question_enc).last_hidden_state[:, 0, :]
        question_emb = F.normalize(question_emb, dim=1)
        sims = torch.matmul(question_emb, candidate_embs.T)
        _, top_indices = torch.topk(sims, k=3)
    return [contexts[i] for i in top_indices[0]]

## Zero-Shot GLOT500

In [21]:
zeroshot_tokenizer = AutoTokenizer.from_pretrained("cis-lmu/glot500-base")
zeroshot_model = AutoModel.from_pretrained("cis-lmu/glot500-base").to("cuda")
candidate_zeroshot_embs = get_embeddings(contexts, zeroshot_model, zeroshot_tokenizer)
zeroshot_outputs = [contrastive_answer(q, zeroshot_model, zeroshot_tokenizer, candidate_zeroshot_embs) for q in eval_questions]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at cis-lmu/glot500-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-Tuned GLOT500

In [22]:
finetuned_tokenizer = AutoTokenizer.from_pretrained("./glot500-contrastive")
finetuned_model = model #AutoModel.from_pretrained("./glot500-contrastive").to("cuda")
candidate_finetuned_embs = get_embeddings(contexts, finetuned_model, finetuned_tokenizer)
finetuned_outputs = [contrastive_answer(q, finetuned_model, finetuned_tokenizer, candidate_finetuned_embs) for q in eval_questions]

# Save Answers

In [23]:
results = []
for i in range(len(eval_questions)):
    results.append({
        "question": eval_questions[i],
        "tfidf_answer": tfidf_outputs[i],
        "zeroshot_answer": zeroshot_outputs[i],
        "finetuned_answer": finetuned_outputs[i]
    })

## JSON

In [24]:
with open("evaluation_outputs.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

## CSV

In [25]:
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("evaluation_outputs.csv", index=False, encoding="utf-8")

# Bonus: Duplicate detection

In [27]:
sentence_embs = get_embeddings(sentences, finetuned_model, finetuned_tokenizer)
sentence_embs = F.normalize(sentence_embs, dim=1)
similarities = torch.matmul(sentence_embs, sentence_embs.T)

threshold = 0.99
duplicates = []
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        if similarities[i, j] > threshold:
            duplicates.append((i, j, similarities[i, j].item()))

for i, j, sim in duplicates:
    print(f"Sentence{i}: {sentences[i]}")
    print(f"Sentence{j}: {sentences[j]}")
    print(f"Similarity: {sim:.4f}")
    print("-" * 300)

Sentence9: در تاریخ ۲۲ خرداد سال ۱۲۳۷ هجری شمسی در شهر شیراز، سید ابراهیم ضیاء الواعظین چشم به جهان گشود. وی بعدها به عنوان مدیر مسئول روزنامه "ایران آزاد" شناخته شد، روزنامهای که از سال ۱۳۰۰ شمسی در تهران با موضوعات سیاسی و اجتماعی منتشر میشد. این روزنامه که به دلیل چاپ مقالات تند و انتقادی از همان ابتدا مورد توجه فراوان قرار گرفت، با تیراژی بالا خوانندگان زیادی را جذب کرد. اما به دلیل همین انتقادات، چند بار توقیف شد و مدیر مسئول آن نیز تبعید گردید. برای کسب اطلاعات بیشتر در این باره میتوانید به کتاب "روز شمار تاریخ معاصر ایران" نوشته حسن فراهانی مراجعه فرمایید.
Sentence161: در تاریخ ۲۲ خرداد سال ۱۲۳۷ هجری شمسی در شهر شیراز، سید ابراهیم ضیاء الواعظین چشم به جهان گشود. وی بعدها به عنوان مدیر مسئول روزنامه "ایران آزاد" شناخته شد، روزنامهای که از سال ۱۳۰۰ شمسی در تهران با موضوعات سیاسی و اجتماعی منتشر میشد. این روزنامه که به دلیل چاپ مقالات تند و انتقادی از همان ابتدا مورد توجه فراوان قرار گرفت، با تیراژی بالا خوانندگان زیادی را جذب کرد. اما به دلیل همین انتقادات، چند بار توقیف شد و مدیر