### 1. Setup: Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive; the later cells read data from and write models to paths under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 2. Setup: Install Dependencies

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install sentence-transformers transformers==4.30.0 torch huggingface_hub==0.16.4 torchvision --upgrade -q

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install protobuf==3.20.3

In [None]:
# %pip (instead of !pip) acts on the environment of the running kernel.
%pip uninstall tensorflow -y

### 3. Setup: Authenticate with Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login
# Show a short prompt, then open the interactive Hugging Face login widget.
login_prompt = "Please log in to Hugging Face Hub to access gated models:"
print(login_prompt)
notebook_login()

Please log in to Hugging Face Hub to access gated models:


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 4. Model Training: Fine-Tune the GLOT500 Model
*This cell loads the training data, configures the contrastive learning loss, and fine-tunes the `cis-lmu/glot500-base` model, saving the result to Google Drive.*

In [None]:
import json
import os 
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import logging
from huggingface_hub import model_info 
import torch


# Turn off Weights & Biases reporting via its environment switch before any training code runs.
os.environ["WANDB_DISABLED"] = "true"

# Configure logging BEFORE the first log call; previously the message below was emitted
# while no INFO-level configuration existed and was therefore dropped.
logger = logging.getLogger(__name__)
if not logger.hasHandlers():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# When a handler already exists, basicConfig is skipped above and the inherited level may
# still filter INFO records; set the level on this logger explicitly so they are emitted.
logger.setLevel(logging.INFO)

logger.info("Weights & Biases (wandb) logging has been disabled.")

# Free cached GPU memory that earlier cell runs may have left allocated.
torch.cuda.empty_cache()



# Locations on Google Drive used by the training cell.
DRIVE_BASE_PATH = '/content/drive/My Drive/NLP_HW2/'
DATA_DIR_NAME = 'Data'
PASSAGES_DIR_NAME = 'Passages'

DATA_PATH = os.path.join(DRIVE_BASE_PATH, DATA_DIR_NAME)
TRAINING_QUESTIONS_FILE = os.path.join(DATA_PATH, 'Training_Questions.json')
PASSAGES_DIR = os.path.join(DATA_PATH, PASSAGES_DIR_NAME)
MODEL_OUTPUT_PATH = os.path.join(DRIVE_BASE_PATH, 'Models/cis-lmu_glot500-base_finetuned_retrieval_v1')

# Fail fast when the Drive folder is missing (typically: Drive was not mounted).
# The hint refers to Step 1, which is where this notebook calls drive.mount.
if not os.path.exists(DRIVE_BASE_PATH):
    logger.error(f"CRITICAL: Google Drive base path does not exist: {DRIVE_BASE_PATH}. Did you run Step 1 (drive.mount)?")
    raise FileNotFoundError(f"Drive path {DRIVE_BASE_PATH} not found.")

# Create the model output directory (no error if it already exists) and report all paths.
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)
logger.info(f"Drive base path: {DRIVE_BASE_PATH}")
logger.info(f"Data path: {DATA_PATH}")
logger.info(f"Passages directory: {PASSAGES_DIR}")
logger.info(f"Model will be saved to: {MODEL_OUTPUT_PATH}")

# Read the training questions JSON; every failure is logged with a specific hint and re-raised.
logger.info(f"Attempting to load training questions from: {TRAINING_QUESTIONS_FILE}")
try:
    with open(TRAINING_QUESTIONS_FILE, mode='r', encoding='utf-8') as questions_fh:
        training_questions_raw = json.loads(questions_fh.read())
    logger.info(f"Loaded {len(training_questions_raw)} raw training questions.")
except Exception as load_err:
    # Pick the log message by failure type; the exception itself always propagates.
    if isinstance(load_err, FileNotFoundError):
        logger.error(f"CRITICAL: Training questions file not found at: {TRAINING_QUESTIONS_FILE}")
        logger.error(f"Please ensure the file exists and the paths (DATA_DIR_NAME='{DATA_DIR_NAME}') are correct.")
    elif isinstance(load_err, json.JSONDecodeError):
        logger.error(f"CRITICAL: Error decoding JSON from {TRAINING_QUESTIONS_FILE}. Please check the file format.")
    else:
        logger.error(f"CRITICAL: An unexpected error occurred while loading training questions: {load_err}")
    raise

# Build (question, passage) training pairs; track passages that are missing/unreadable or empty.
train_examples = []
not_found_passages = 0
empty_passages = 0

logger.info(f"Attempting to process passages from directory: {PASSAGES_DIR}")
if not os.path.isdir(PASSAGES_DIR):
    logger.error(f"CRITICAL: Passages directory not found at: {PASSAGES_DIR}. Please check PASSAGES_DIR_NAME='{PASSAGES_DIR_NAME}'.")
    raise FileNotFoundError(f"Passages directory {PASSAGES_DIR} not found.")

total_items = len(training_questions_raw)
for item_no, item in enumerate(training_questions_raw, start=1):
    question = item.get('question')
    passage_filename = item.get('passage_reference')

    # Both fields are required to form a training pair.
    if not (question and passage_filename):
        logger.warning(f"Skipping item {item_no}/{total_items} due to missing question or passage_reference: {item}")
        continue

    passage_file_path = os.path.join(PASSAGES_DIR, passage_filename)

    try:
        with open(passage_file_path, mode='r', encoding='utf-8') as pf:
            passage_text = pf.read()
        if passage_text.strip():
            train_examples.append(InputExample(texts=[question, passage_text]))
        else:
            # File exists but holds nothing except whitespace.
            logger.warning(f"Passage file {passage_file_path} for question '{question}' (item {item_no}) is empty. Skipping.")
            empty_passages += 1
    except FileNotFoundError:
        logger.warning(f"Passage file not found: {passage_file_path} for question: '{question}' (item {item_no})")
        not_found_passages += 1
    except Exception as e:
        # Any other failure is counted together with the missing files.
        logger.error(f"Error reading passage file {passage_file_path} (item {item_no}): {e}")
        not_found_passages += 1

logger.info(f"Prepared {len(train_examples)} training examples.")
if not_found_passages:
    logger.warning(f"{not_found_passages} passage files were not found or could not be read.")
if empty_passages:
    logger.warning(f"{empty_passages} passage files were found but were empty.")

# Fine-tune the base model on the prepared pairs; nothing is trained when no pairs exist.
if not train_examples:
    logger.error("CRITICAL: No training examples were prepared. Exiting fine-tuning.")
else:
    # Define the Model
    model_name = 'cis-lmu/glot500-base'

    # Pre-flight check that the repository is reachable on the Hub before loading it.
    try:
        logger.info(f"Verifying model '{model_name}' on Hugging Face Hub...")
        model_info_obj = model_info(repo_id=model_name)
        logger.info(f"Model '{model_name}' (ID: {model_info_obj.modelId}) found on Hugging Face Hub.")
        # Read `gated` defensively: a missing attribute would raise inside this try block,
        # be reported as "could not find or verify model" and abort training even though
        # the model exists.
        if getattr(model_info_obj, "gated", False):
             logger.warning(f"Model '{model_name}' is gated. Ensure you have accepted terms on its Hugging Face page and are logged in (via notebook_login).")
    except Exception as e:
        logger.error(f"CRITICAL: Could not find or verify model '{model_name}' on Hugging Face Hub: {e}")
        raise

    logger.info(f"Loading base model: {model_name} using SentenceTransformer.")
    try:
        model = SentenceTransformer(model_name)
        logger.info(f"Successfully loaded model: {model_name}")
    except Exception as e:
        logger.error(f"CRITICAL: Error loading SentenceTransformer model '{model_name}': {e}")
        raise

    # Define the Loss Function
    # NOTE(review): this loss presumably uses the other passages in a batch as negatives;
    # several questions can reference the same passage file, so a batch may contain the
    # same passage more than once - confirm whether a no-duplicates batch sampler is needed.
    train_loss = losses.MultipleNegativesRankingLoss(model=model)
    logger.info(f"Using loss function: {type(train_loss).__name__}")

    # Configure Training
    batch_size = 8
    num_epochs = 3

    # Ceiling division: a final partial batch still counts as one step.
    steps_per_epoch = (len(train_examples) + batch_size - 1) // batch_size
    total_training_steps = steps_per_epoch * num_epochs
    # Warm up over the first 10% of all optimisation steps.
    warmup_steps = int(total_training_steps * 0.1)

    logger.info(f"Batch size: {batch_size}")
    logger.info(f"Number of epochs: {num_epochs}")
    logger.info(f"Total training examples: {len(train_examples)}")
    logger.info(f"Steps per epoch: {steps_per_epoch}")
    logger.info(f"Total training steps: {total_training_steps}")
    logger.info(f"Calculated warmup steps: {warmup_steps}")

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

    # Train the Model
    logger.info("Starting model training...")
    try:
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=num_epochs,
                  warmup_steps=warmup_steps,
                  output_path=MODEL_OUTPUT_PATH,
                  show_progress_bar=True,
                  checkpoint_path=os.path.join(MODEL_OUTPUT_PATH, 'checkpoints'),
                  # Checkpoint roughly twice per epoch (at least every step).
                  checkpoint_save_steps=max(1, steps_per_epoch // 2) if steps_per_epoch > 0 else 100,
                  checkpoint_save_total_limit=3,
                  save_best_model=False
                 )
        logger.info(f"Training complete. Model artifacts saved to: {MODEL_OUTPUT_PATH}")
    except Exception as e:
        logger.error(f"CRITICAL: An error occurred during model training: {e}")
        raise

    print(f"\n--- Part 3: Fine-tuning of '{model_name}' complete ---")
    print(f"The fine-tuned model and checkpoints should be in: {MODEL_OUTPUT_PATH}")

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/cis-lmu_glot500-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at /root/.cache/torch/sentence_transformers/cis-lmu_glot500-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mo

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?it/s]


--- Part 3: Fine-tuning of 'cis-lmu/glot500-base' complete ---
The fine-tuned model and checkpoints should be in: /content/drive/My Drive/NLP_HW2/Models/cis-lmu_glot500-base_finetuned_retrieval_v1


### 5. Evaluation: Generate Retrieval Results for All Models
*This script evaluates the three models (TF-IDF, Zero-shot, and Fine-tuned) against the 50 evaluation questions and saves the top 3 results for each in `Retrieval_Results.json`, together with an answer extracted by a QA model from the fine-tuned model's top-ranked passage.*

In [None]:
# Part 4: Retrieval Evaluation
import json
import os
import glob
import re
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import logging

# Logging for the evaluation cell.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Names of the Drive folder and its sub-folders.
DRIVE_BASE_PATH = '/content/drive/MyDrive/NLP_HW2/'
DATA_DIR_NAME, PASSAGES_DIR_NAME, MODELS_DIR_NAME = 'Data', 'Passages', 'Models'

# Derived input/output locations.
DATA_PATH = os.path.join(DRIVE_BASE_PATH, DATA_DIR_NAME)
PASSAGES_DIR = os.path.join(DATA_PATH, PASSAGES_DIR_NAME)
EVALUATION_QUESTIONS_FILE = os.path.join(DATA_PATH, 'Evaluation_Questions.json')
FINETUNED_MODEL_PATH = os.path.join(
    DRIVE_BASE_PATH, MODELS_DIR_NAME, 'cis-lmu_glot500-base_finetuned_retrieval_v1'
)
PART4_OUTPUT_FILE = os.path.join(DRIVE_BASE_PATH, 'Retrieval_Results.json')

# Report the resolved paths, one log line each.
for _path_label, _path_value in (
    ("Evaluation questions file", EVALUATION_QUESTIONS_FILE),
    ("Passages directory", PASSAGES_DIR),
    ("Fine-tuned model path", FINETUNED_MODEL_PATH),
):
    logger.info(f"{_path_label}: {_path_value}")

# Helper Functions
def extract_relevant_snippet(passage, question, max_length=500):
    """Return the part of ``passage`` that best matches ``question``.

    The passage is split into sentences, each sentence is scored against the
    question with TF-IDF cosine similarity, and the best sentence is returned
    together with its direct neighbours (one before, one after).

    Args:
        passage: Full passage text.
        question: Question text used to score the sentences.
        max_length: Maximum number of characters kept before "..." is appended.

    Returns:
        The selected snippet. When the passage contains no sentences or the
        TF-IDF scoring fails, the beginning of the passage (truncated to
        ``max_length`` characters plus "...") is returned instead.
    """
    def _passage_head():
        # Fallback result: the start of the passage, marked with "..." when cut.
        return passage[:max_length] + "..." if len(passage) > max_length else passage

    # Split after sentence-ending punctuation, including the Persian question mark.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?؟])\s+', passage) if s.strip()]
    if not sentences:
        return _passage_head()

    try:
        # Row 0 is the question, rows 1..n are the sentences.
        tfidf = TfidfVectorizer().fit_transform([question] + sentences)
        # One call scores the question against every sentence.
        sim_scores = cosine_similarity(tfidf[0], tfidf[1:]).ravel()
        most_relevant_idx = int(np.argmax(sim_scores))

        # Keep one sentence of context on each side of the best match.
        start = max(0, most_relevant_idx - 1)
        end = min(len(sentences), most_relevant_idx + 2)
        snippet = " ".join(sentences[start:end])

        if len(snippet) > max_length:
            # Cut at max_length, then drop the trailing (possibly partial) word.
            snippet = snippet[:max_length].rsplit(' ', 1)[0] + "..."
        return snippet
    except Exception:
        # Best-effort fallback when scoring fails. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        return _passage_head()

def generate_answer(question, context):
    """Extract an answer to ``question`` from ``context`` with a QA pipeline.

    The pipeline is created on the first call and stored on the function
    object, so later calls reuse it instead of loading the model again.

    Args:
        question: Question text.
        context: Passage text in which the answer is searched.

    Returns:
        The answer string produced by the pipeline, or the fixed Persian
        fallback string when loading the model or answering fails.
    """
    try:
        qa_model = getattr(generate_answer, "_qa_model", None)
        if qa_model is None:
            qa_model = pipeline("question-answering",
                                model="deepset/xlm-roberta-base-squad2",
                                device=0 if torch.cuda.is_available() else -1)
            # Cache for subsequent calls.
            generate_answer._qa_model = qa_model
        result = qa_model(question=question, context=context, max_answer_len=100)
        return result['answer']
    except Exception as e:
        logger.warning(f"Answer generation failed: {str(e)}")
        return "پاسخ یافت نشد"

# Read the evaluation questions JSON; any failure is logged and re-raised.
logger.info("Loading evaluation questions...")
try:
    with open(EVALUATION_QUESTIONS_FILE, mode='r', encoding='utf-8') as eval_fh:
        evaluation_questions = json.loads(eval_fh.read())
    question_count = len(evaluation_questions)
    logger.info(f"Loaded {question_count} evaluation questions.")
except Exception as e:
    logger.error(f"Error loading evaluation questions: {e}")
    raise

# Load Corpus Passages
logger.info(f"Loading corpus passages from: {PASSAGES_DIR}")
corpus_passages = []
# glob returns files in arbitrary order; sorting keeps passage indices (and therefore
# tie-breaking between equally scored passages) identical from run to run.
passage_files = sorted(glob.glob(os.path.join(PASSAGES_DIR, "*.txt")))

if not passage_files:
    logger.error(f"No .txt files found in passages directory: {PASSAGES_DIR}")
    raise FileNotFoundError(f"No passage files found in {PASSAGES_DIR}")

for passage_file in passage_files:
    try:
        with open(passage_file, 'r', encoding='utf-8') as pf:
            passage_text = pf.read().strip()
        # Whitespace-only files are skipped.
        if passage_text:
            corpus_passages.append(passage_text)
    except Exception as e:
        # An unreadable file is logged and skipped; the remaining files are still loaded.
        logger.error(f"Error reading passage file {passage_file}: {e}")

logger.info(f"Loaded {len(corpus_passages)} non-empty passages from corpus.")

# Stop with a clear message instead of failing later inside the vectorizer/encoders.
if not corpus_passages:
    logger.error(f"All passage files in {PASSAGES_DIR} were empty or unreadable.")
    raise ValueError(f"No usable passages loaded from {PASSAGES_DIR}")

# TF-IDF Retrieval Setup: fit the vocabulary on the corpus and keep the document matrix.
logger.info("Setting up TF-IDF vectorizer...")
vectorizer = TfidfVectorizer()
corpus_tfidf_matrix = vectorizer.fit_transform(corpus_passages)
logger.info(f"TF-IDF matrix shape: {corpus_tfidf_matrix.shape}")

# Load both sentence encoders, embed the corpus with each, then load the QA pipeline.
logger.info("Loading models...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def _load_and_encode(model_ref, label):
    """Load a SentenceTransformer and embed ``corpus_passages`` with it.

    ``label`` only appears in the log messages. Failures are logged and re-raised.
    Returns the tuple ``(model, corpus_embeddings)``.
    """
    try:
        encoder = SentenceTransformer(model_ref, device=device)
        logger.info(f"Encoding passages with {label} model ({device})...")
        embeddings = encoder.encode(
            corpus_passages,
            convert_to_tensor=True,
            show_progress_bar=True,
            batch_size=32,
        )
    except Exception as e:
        logger.error(f"Error loading {label} model: {e}")
        raise
    return encoder, embeddings


# Zero-shot baseline: the base checkpoint without fine-tuning.
BASE_MODEL_NAME = 'cis-lmu/glot500-base'
base_model, corpus_embeddings_base_model = _load_and_encode(BASE_MODEL_NAME, "base")

# Checkpoint stored at FINETUNED_MODEL_PATH.
finetuned_model, corpus_embeddings_finetuned_model = _load_and_encode(FINETUNED_MODEL_PATH, "fine-tuned")

# Extractive QA pipeline; device index 0 when CUDA is available, otherwise -1 (CPU).
logger.info("Loading QA model for answer generation...")
_qa_device = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/xlm-roberta-base-squad2",
    device=_qa_device,
)

# Perform Retrieval and Generate Results
final_results = []

# torch.topk raises when k exceeds the number of candidates, so cap k at the corpus size.
top_k = min(3, len(corpus_passages))


def _dense_top_hits(encoder, corpus_embeddings, query):
    """Rank the corpus against ``query`` with a sentence encoder.

    Returns ``(hits, indices)``: ``hits`` is the list of result dicts (snippet,
    cosine score, 1-based rank) and ``indices`` the matching positions in
    ``corpus_passages``, best first.
    """
    query_embedding = encoder.encode(query, convert_to_tensor=True)
    scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    indices = torch.topk(scores, k=top_k).indices.tolist()
    hits = [
        {
            "passage_text_snippet": extract_relevant_snippet(corpus_passages[idx], query),
            "score": scores[idx].item(),
            "rank": rank,
        }
        for rank, idx in enumerate(indices, 1)
    ]
    return hits, indices


logger.info("Starting retrieval and answer generation...")
for question_data in evaluation_questions:
    question_id = question_data["evaluation_question_id"]
    # .get() so an entry without a question reaches the skip below instead of raising KeyError.
    question_text = question_data.get("question")
    province = question_data.get("expected_province_for_answer", "")

    if not question_text:
        logger.warning(f"Skipping evaluation question {question_id}: no question text.")
        continue

    result = {
        "evaluation_question_id": question_id,
        "question_text": question_text,
        "expected_province": province,
        "tfidf_top3": [],
        "zeroshot_glot500_top3": [],
        "finetuned_glot500_top3": [],
        "generated_answer": ""
    }

    # TF-IDF Retrieval: take the top_k highest cosine scores, best first.
    question_tfidf = vectorizer.transform([question_text])
    similarities = cosine_similarity(question_tfidf, corpus_tfidf_matrix)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    for rank, idx in enumerate(top_indices, 1):
        result["tfidf_top3"].append({
            "passage_text_snippet": extract_relevant_snippet(corpus_passages[int(idx)], question_text),
            "score": float(similarities[idx]),
            "rank": rank
        })

    # Zero-shot Retrieval
    result["zeroshot_glot500_top3"], _ = _dense_top_hits(
        base_model, corpus_embeddings_base_model, question_text)

    # Fine-tuned Retrieval
    result["finetuned_glot500_top3"], finetuned_indices = _dense_top_hits(
        finetuned_model, corpus_embeddings_finetuned_model, question_text)

    # Generate answer using top passage from fine-tuned model (full passage, not the snippet)
    if finetuned_indices:
        context = corpus_passages[finetuned_indices[0]]
        try:
            qa_result = qa_pipeline(question=question_text, context=context, max_answer_len=100)
            result["generated_answer"] = qa_result['answer']
        except Exception as e:
            logger.warning(f"Answer generation failed for Q{question_id}: {str(e)}")
            result["generated_answer"] = "پاسخ یافت نشد"

    final_results.append(result)
    logger.info(f"Processed question {question_id}/{len(evaluation_questions)}")

# Write all results to Drive as UTF-8 JSON (non-ASCII text kept readable); failures are logged and re-raised.
logger.info(f"Saving final results to {PART4_OUTPUT_FILE}")
try:
    with open(PART4_OUTPUT_FILE, mode='w', encoding='utf-8') as results_fh:
        json.dump(final_results, results_fh, ensure_ascii=False, indent=2)
    logger.info("Results saved successfully.")
except Exception as save_err:
    logger.error(f"Error saving results: {save_err}")
    raise

print("\n--- Part 4: Retrieval with all three models complete ---")
print(f"Results saved to {PART4_OUTPUT_FILE}")

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/cis-lmu_glot500-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at /root/.cache/torch/sentence_transformers/cis-lmu_glot500-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this mo

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]




--- Part 4: Retrieval with all three models complete ---
Results saved to /content/drive/MyDrive/NLP_HW2/Retrieval_Results.json


### 6. Bonus Task: Duplicate Passage Detection
*This script uses the fine-tuned model to find and report passages in the corpus that are highly similar to each other.*

In [None]:


import os
import glob
from sentence_transformers import SentenceTransformer, util
import torch
import logging

# Logging for the duplicate-detection cell.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Reuse the paths from an earlier cell when both are present in the namespace;
# if either one is missing, define the whole set of defaults here.
_required_path_names = ("FINETUNED_MODEL_PATH", "PASSAGES_DIR")
if not all(path_name in globals() for path_name in _required_path_names):
    logger.warning("Paths not defined, defining them now. Make sure they are correct.")
    DRIVE_BASE_PATH = '/content/drive/My Drive/NLP_HW2/'
    DATA_DIR_NAME, PASSAGES_DIR_NAME, MODELS_DIR_NAME = 'Data', 'Passages', 'Models'
    DATA_PATH = os.path.join(DRIVE_BASE_PATH, DATA_DIR_NAME)
    PASSAGES_DIR = os.path.join(DATA_PATH, PASSAGES_DIR_NAME)
    FINETUNED_MODEL_PATH = os.path.join(
        DRIVE_BASE_PATH, MODELS_DIR_NAME, 'cis-lmu_glot500-base_finetuned_retrieval_v1'
    )

logger.info(f"Using fine-tuned model from: {FINETUNED_MODEL_PATH}")
logger.info(f"Loading passages from: {PASSAGES_DIR}")

# Load the Fine-Tuned Model
try:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SentenceTransformer(FINETUNED_MODEL_PATH, device=device)
    logger.info(f"Successfully loaded fine-tuned model on device: {device}")
except Exception as e:
    # Include the underlying error in the log and re-raise with the original traceback.
    logger.error(f"Failed to load the fine-tuned model. Ensure training was successful and the path is correct. Details: {e}")
    raise

# Load All Passages, keyed by file name.
# glob returns files in arbitrary order; sorting keeps the index -> file mapping stable.
passage_files = sorted(glob.glob(os.path.join(PASSAGES_DIR, "*.txt")))
corpus_passages = {}

for passage_file in passage_files:
    try:
        with open(passage_file, 'r', encoding='utf-8') as pf:
            passage_text = pf.read().strip()
        # Whitespace-only files are skipped.
        if passage_text:
            corpus_passages[os.path.basename(passage_file)] = passage_text
    except Exception as e:
        # An unreadable file is logged and skipped.
        logger.error(f"Error reading passage file {passage_file}: {e}")

logger.info(f"Loaded {len(corpus_passages)} non-empty passages.")

# Parallel lists: position k in both lists refers to the same passage.
corpus_filenames = list(corpus_passages.keys())
corpus_texts = list(corpus_passages.values())

# Encode All Passages
logger.info("Encoding all passages with the fine-tuned model. This may take a moment...")
corpus_embeddings = model.encode(corpus_texts,
                                 convert_to_tensor=True,
                                 show_progress_bar=True,
                                 batch_size=32)

logger.info(f"Encoding complete. Shape of embeddings tensor: {corpus_embeddings.shape}")

# Find and Display Duplicate Pairs
score_threshold = 0.90

logger.info(f"Finding duplicate pairs with a similarity score > {score_threshold}...")
candidate_pairs = util.paraphrase_mining_embeddings(corpus_embeddings,
                                                    top_k=5,
                                                    query_chunk_size=5000,
                                                    corpus_chunk_size=100000)

# Apply the threshold BEFORE counting/reporting, and keep every unordered pair once.
# (The previous `file1 < file2` check discarded a pair whenever its two files were
# returned in the other order, and the count was logged before any filtering.)
duplicate_pairs = []
seen_pairs = set()
for score, i, j in sorted(candidate_pairs, key=lambda pair: pair[0], reverse=True):
    if score <= score_threshold:
        continue
    # Show the alphabetically smaller file name first.
    if corpus_filenames[i] > corpus_filenames[j]:
        i, j = j, i
    if (i, j) in seen_pairs:
        continue
    seen_pairs.add((i, j))
    duplicate_pairs.append((score, i, j))

logger.info(f"Found {len(duplicate_pairs)} potential duplicate pairs above the threshold.")

# "\n" (a real newline) replaces the former "\\n", which printed a literal backslash-n.
print("\n" + "="*50)
print("              DUPLICATE DETECTION RESULTS")
print("="*50 + "\n")

if not duplicate_pairs:
    print(f" No duplicate pairs found with a similarity score above {score_threshold}.")
else:
    for score, i, j in duplicate_pairs:
        print(f"Score: {score:.4f}")
        print(f"File 1: {corpus_filenames[i]}")
        print(f"File 2: {corpus_filenames[j]}")
        # Only the first 100 characters of each passage are shown.
        print(f"Text 1 Snippet: '{corpus_texts[i][:100]}...'")
        print(f"Text 2 Snippet: '{corpus_texts[j][:100]}...'")
        print("-"*50 + "\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

              DUPLICATE DETECTION RESULTS
Score: 0.9287
File 1: Ardabil_Summary.txt
File 2: East_Azerbaijan_Summary.txt
Text 1 Snippet: 'استان اردبیل، با مرکزیت شهر اردبیل، یکی از استان‌های شمال غربی ایران است که به دلیل آب و هوای سرد و ...'
Text 2 Snippet: 'استان آذربایجان شرقی، با مرکزیت شهر تبریز، یکی از استان‌های بزرگ و مهم در شمال غربی ایران است. این ا...'
--------------------------------------------------\n


### 7. Annotation Prep: Format Results for Label Studio
*This script takes the model outputs, shuffles them to prevent bias, and formats them into a JSON file for import into Label Studio.*

In [None]:
import json
import random

input_file = "/content/drive/MyDrive/NLP_HW2/Retrieval_Results.json"
# Single slash before the file name (the path previously contained "//").
output_file = "/content/drive/MyDrive/NLP_HW2/label_studio_ready.json"

# Each task carries exactly this many answer slots (3 retrieval methods x top 3).
SNIPPETS_PER_TASK = 9
# Filler text for slots left free after duplicate snippets were removed.
PLACEHOLDER_SNIPPET = "پاسخ تکراری"
# Fixed seed so that re-running the cell reproduces the same shuffled order.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

output_data = []

for item in data:
    question = item.get("question_text")
    all_passages = (
        item.get("tfidf_top3", []) +
        item.get("zeroshot_glot500_top3", []) +
        item.get("finetuned_glot500_top3", [])
    )

    # Deduplicate snippets while keeping first-seen order.
    unique_snippets = list(dict.fromkeys(p["passage_text_snippet"] for p in all_passages))

    # Trim or pad to the fixed number of slots, then shuffle so the position of a
    # snippet does not reveal which retrieval method produced it.
    selected = unique_snippets[:SNIPPETS_PER_TASK]
    selected += [PLACEHOLDER_SNIPPET] * (SNIPPETS_PER_TASK - len(selected))
    shuffle_rng.shuffle(selected)

    # One task per question with keys "question", "ans1" ... "ans9".
    task = {"question": question}
    for i, passage in enumerate(selected, 1):
        task[f"ans{i}"] = passage

    output_data.append(task)

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"Saved {len(output_data)} tasks to {output_file}")