In [3]:
import pandas as pd
import glob
import os
import json
import zipfile
from sentence_transformers import CrossEncoder, SentenceTransformer
from huggingface_hub import snapshot_download # This helps us show download bars
import torch

# --- 1. HARDWARE SETUP ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ POWER UNLEASHED: Using Apple M4 (MPS) Acceleration")
else:
    device = 'cpu'
    print("‚ö†Ô∏è WARNING: Running on CPU.")

# --- 2. FIND FILES ---
print("\nüîç Scanning for Test Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None

for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass

if not input_a or not input_b:
    input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
    print("‚ö†Ô∏è Using manual filenames (Auto-detection failed)")
else:
    print(f"   ‚úÖ Track A File: {input_a}")
    print(f"   ‚úÖ Track B File: {input_b}")

# --- 3. DOWNLOAD MODELS WITH PROGRESS BARS ---
print("\n‚¨áÔ∏è STARTING DOWNLOADS (This ensures you see progress)...")

# Model 1: DeBERTa v3 Large
print("   1. Downloading DeBERTa-v3-Large (~800MB)...")
model_a_id = 'cross-encoder/nli-deberta-v3-large'
snapshot_download(repo_id=model_a_id) # This triggers the bar

# Model 2: GTE Large
print("   2. Downloading GTE-Large-v1.5 (~1.5GB)...")
model_b_id = 'Alibaba-NLP/gte-large-en-v1.5'
snapshot_download(repo_id=model_b_id) # This triggers the bar

print("\n‚úÖ Downloads Complete. Loading into Memory...")
model_a = CrossEncoder(model_a_id, device=device)
model_b = SentenceTransformer(model_b_id, trust_remote_code=True, device=device)

# --- 4. EXECUTE TRACK A ---
print(f"\nüß† SCORING TRACK A ({input_a})...")
df_a = pd.read_json(input_a, lines=True)

# Column detection
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

# Create pairs
pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# INFERENCE (Progress bar included)
scores_a = model_a.predict(pairs_a, batch_size=4, show_progress_bar=True)
scores_b = model_a.predict(pairs_b, batch_size=4, show_progress_bar=True)
preds_a = scores_a > scores_b

# --- 5. EXECUTE TRACK B ---
print(f"\nüß† EMBEDDING TRACK B ({input_b})...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # INFERENCE (Progress bar included)
    embeddings = model_b.encode(
        df_b[text_col].tolist(), 
        batch_size=4, 
        show_progress_bar=True, 
        device=device,
        convert_to_numpy=True
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. ZIP IT UP ---
print("\nüì¶ Zipping Final Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_SOTA_FINAL.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench Testing Phase.")

üöÄ POWER UNLEASHED: Using Apple M4 (MPS) Acceleration

üîç Scanning for Test Data...
   ‚úÖ Track A File: test_track_a.jsonl
   ‚úÖ Track B File: test_track_b.jsonl

‚¨áÔ∏è STARTING DOWNLOADS (This ensures you see progress)...
   1. Downloading DeBERTa-v3-Large (~800MB)...


Fetching 17 files:  29%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç               | 5/17 [26:16<1:03:03, 315.32s/it]


KeyboardInterrupt: 

In [4]:
import pandas as pd
import glob
import os
import json
import zipfile
from sentence_transformers import CrossEncoder, SentenceTransformer
import torch

# --- 1. HARDWARE SETUP ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Ready for SOTA Models")
else:
    device = 'cpu'
    print("‚ö†Ô∏è Using CPU (Slow)")

# --- 2. FIND TEST FILES ---
print("üîç Scanning for Test Data...")
# We look for files with the correct line counts (400 and 849)
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None

for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass

# Fallback if auto-detection fails
if not input_a or not input_b: 
    input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'

print(f"   Track A File: {input_a}\n   Track B File: {input_b}")

# --- 3. LOAD MODELS FROM LOCAL FOLDERS ---
print("\nüìÇ Loading Models from your local download...")

# These match the folder names from the terminal command
path_a = './nli-deberta-v3-large'
path_b = './gte-large-en-v1.5'

# Load Model A (DeBERTa)
if os.path.exists(path_a):
    print("   ‚úÖ Found local DeBERTa model! Loading...")
    model_a = CrossEncoder(path_a, device=device)
else:
    print(f"   ‚ùå Error: Could not find folder '{path_a}'. Check where you ran the terminal command.")

# Load Model B (GTE)
if os.path.exists(path_b):
    print("   ‚úÖ Found local GTE model! Loading...")
    model_b = SentenceTransformer(path_b, trust_remote_code=True, device=device)
else:
    print(f"   ‚ùå Error: Could not find folder '{path_b}'.")

# --- 4. RUN TRACK A (SCORING) ---
print(f"\nüß† Scoring Track A (DeBERTa)...")
df_a = pd.read_json(input_a, lines=True)

# Column detection
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# Batch size 8 is safe for M4 with these local models
scores_a = model_a.predict(pairs_a, batch_size=8, show_progress_bar=True)
scores_b = model_a.predict(pairs_b, batch_size=8, show_progress_bar=True)
preds_a = scores_a > scores_b

# --- 5. RUN TRACK B (EMBEDDING) ---
print(f"\nüß† Embedding Track B (GTE)...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    embeddings = model_b.encode(
        df_b[text_col].tolist(), 
        batch_size=4, # GTE Large is big, keep batch small
        show_progress_bar=True, 
        device=device,
        convert_to_numpy=True
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. ZIP AND FINISH ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_SOTA_LOCAL.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench Testing Phase.")

üöÄ Using Apple M4 (MPS) - Ready for SOTA Models
üîç Scanning for Test Data...
   Track A File: test_track_a.jsonl
   Track B File: test_track_b.jsonl

üìÇ Loading Models from your local download...
   ‚úÖ Found local DeBERTa model! Loading...
   ‚úÖ Found local GTE model! Loading...


A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



üß† Scoring Track A (DeBERTa)...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [03:17<00:00,  3.94s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [03:59<00:00,  4.79s/it]



üß† Embedding Track B (GTE)...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 213/213 [01:24<00:00,  2.53it/s]



üì¶ Zipping Submission...


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [5]:
import numpy as np
import json
import zipfile
import os

print("üõ†Ô∏è Applying Fix for NLI Model Output...")

# 1. FIX THE SCORES
# The model output shape is likely (400, 3). We want column 1 (Entailment).
# We check the shape to be safe.
if len(scores_a.shape) > 1 and scores_a.shape[1] >= 2:
    print(f"   - Detected multi-column scores {scores_a.shape}. Extracting 'Entailment' (Index 1)...")
    final_scores_a = scores_a[:, 1]
    final_scores_b = scores_b[:, 1]
else:
    # Fallback if it was already 1D
    final_scores_a = scores_a
    final_scores_b = scores_b

# 2. RE-CALCULATE PREDICTIONS
# Now we compare single numbers, so we get a clean True/False list
preds_a = final_scores_a > final_scores_b
print(f"   - Re-calculated {len(preds_a)} predictions.")

# 3. SAVE & ZIP (Standard Routine)
print("üì¶ Zipping Final Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        # This will now work because 'val' is a simple Python boolean
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

# We re-save Track B just to be sure (it was already fine, but good to keep in sync)
with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_SOTA_FIXED.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ SUCCESS! Upload '{zip_name}' to CodaBench Testing Phase.")

üõ†Ô∏è Applying Fix for NLI Model Output...
   - Detected multi-column scores (400, 3). Extracting 'Entailment' (Index 1)...
   - Re-calculated 400 predictions.
üì¶ Zipping Final Submission...

üèÜ SUCCESS! Upload 'submission_SOTA_FIXED.zip' to CodaBench Testing Phase.


In [8]:
import json
import zipfile
import os

print("üîç FINAL VERIFICATION PROTOCOL INITIATED...\n")

# 1. Verify Track A
print("Checking Track A (outputs/track_a.jsonl)...")
try:
    with open('outputs/track_a.jsonl', 'r') as f:
        lines = f.readlines()
        count_a = len(lines)
        first_line = json.loads(lines[0])
        
        # Check 1: Key Name
        if "text_a_is_closer" in first_line:
            print(f"   ‚úÖ Key 'text_a_is_closer' found.")
        else:
            print(f"   ‚ùå CRITICAL: Wrong key in Track A. Found: {first_line.keys()}")
            
        # Check 2: Value Type
        val = first_line["text_a_is_closer"]
        if isinstance(val, bool):
            print(f"   ‚úÖ Value type is BOOLEAN ({val}).")
        else:
            print(f"   ‚ùå CRITICAL: Wrong type. Expected bool, got {type(val)}.")
            
        # Check 3: Count
        if count_a == 400:
            print(f"   ‚úÖ Line count is exactly 400.")
        else:
            print(f"   ‚ö†Ô∏è WARNING: Line count is {count_a} (Expected 400).")

except Exception as e:
    print(f"   ‚ùå Error reading Track A: {e}")

# 2. Verify Track B
print("\nChecking Track B (outputs/track_b.jsonl)...")
try:
    with open('outputs/track_b.jsonl', 'r') as f:
        lines = f.readlines()
        count_b = len(lines)
        first_line = json.loads(lines[0])
        
        # Check 1: Key Name
        if "embedding" in first_line:
            print(f"   ‚úÖ Key 'embedding' (singular) found.")
        else:
            print(f"   ‚ùå CRITICAL: Wrong key. Found: {first_line.keys()}")
            
        # Check 2: Value Type & Shape
        emb = first_line["embedding"]
        if isinstance(emb, list) and len(emb) > 10:
            print(f"   ‚úÖ Value is a LIST of floats (Length: {len(emb)}).")
        else:
            print(f"   ‚ùå CRITICAL: Invalid embedding format.")
            
        # Check 3: Count
        if count_b == 849:
            print(f"   ‚úÖ Line count is exactly 849.")
        else:
            print(f"   ‚ö†Ô∏è WARNING: Line count is {count_b} (Expected ~849).")

except Exception as e:
    print(f"   ‚ùå Error reading Track B: {e}")

# 3. Verify Zip File
print("\nChecking Zip Archive (submission_SOTA_FIXED.zip)...")
try:
    with zipfile.ZipFile('submission_BGE.zip', 'r') as z:
        files = z.namelist()
        if 'track_a.jsonl' in files and 'track_b.jsonl' in files:
            print(f"   ‚úÖ Zip contains correct files: {files}")
        else:
            print(f"   ‚ùå CRITICAL: Zip is missing files. Found: {files}")
except Exception as e:
    print(f"   ‚ùå Error checking Zip: {e}")

print("\nüöÄ VERIFICATION COMPLETE. If all ticks are Green, you are safe to upload.")

üîç FINAL VERIFICATION PROTOCOL INITIATED...

Checking Track A (outputs/track_a.jsonl)...
   ‚úÖ Key 'text_a_is_closer' found.
   ‚úÖ Value type is BOOLEAN (False).
   ‚úÖ Line count is exactly 400.

Checking Track B (outputs/track_b.jsonl)...
   ‚úÖ Key 'embedding' (singular) found.
   ‚úÖ Value is a LIST of floats (Length: 1024).
   ‚úÖ Line count is exactly 849.

Checking Zip Archive (submission_SOTA_FIXED.zip)...
   ‚úÖ Zip contains correct files: ['track_a.jsonl', 'track_b.jsonl']

üöÄ VERIFICATION COMPLETE. If all ticks are Green, you are safe to upload.


In [7]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# --- 1. HARDWARE ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - BGE Edition")
else:
    device = 'cpu'

# --- 2. FIND TEST FILES ---
print("üîç Finding Test Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD LOCAL BGE MODELS ---
print("\nüìÇ Loading BGE Models...")

# Path A: Reranker
path_a = './bge-reranker-large'
if os.path.exists(path_a):
    print("   ‚úÖ Loading Reranker (Track A)...")
    # Rerankers are loaded slightly differently than CrossEncoders
    tokenizer_a = AutoTokenizer.from_pretrained(path_a)
    model_a = AutoModelForSequenceClassification.from_pretrained(path_a).to(device)
    model_a.eval()
else:
    print(f"   ‚ùå Error: '{path_a}' not found. Did you run the terminal command?")

# Path B: Embedding
path_b = './bge-large-en-v1.5'
if os.path.exists(path_b):
    print("   ‚úÖ Loading Embedder (Track B)...")
    model_b = SentenceTransformer(path_b, device=device)
else:
    print(f"   ‚ùå Error: '{path_b}' not found.")

# --- 4. RUN TRACK A (RERANKING) ---
print(f"\nüß† Scoring Track A (BGE Reranker)...")
df_a = pd.read_json(input_a, lines=True)

anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

# Reranker expects simple pairs: [Anchor, A] and [Anchor, B]
pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# Helper function for Reranker Inference
def predict_reranker(pairs, model, tokenizer, batch_size=8):
    scores = []
    # Process in chunks
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i+batch_size]
        # Tokenize
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
        with torch.no_grad():
            # Get logits (score)
            output = model(**inputs).logits.view(-1).float()
            scores.extend(output.cpu().numpy())
    return scores

print("   - Calculating scores...")
scores_a = predict_reranker(pairs_a, model_a, tokenizer_a)
scores_b = predict_reranker(pairs_b, model_a, tokenizer_a)

# Logic: Higher score = Better match
preds_a = [s_a > s_b for s_a, s_b in zip(scores_a, scores_b)]

# --- 5. RUN TRACK B (EMBEDDING) ---
print(f"\nüß† Embedding Track B (BGE Large)...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # BGE works best with a prompt for asymmetric tasks, but for symmetric story similarity
    # we usually keep it raw. However, adding "Represent this story:" can sometimes help.
    # Let's stick to raw for safety unless specified.
    embeddings = model_b.encode(
        df_b[text_col].tolist(), 
        batch_size=8, 
        show_progress_bar=True, 
        device=device,
        normalize_embeddings=True # BGE requires normalized embeddings
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. SAVE & ZIP ---
print("\nüì¶ Zipping BGE Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_BGE.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench.")

üöÄ Using Apple M4 (MPS) - BGE Edition
üîç Finding Test Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üìÇ Loading BGE Models...
   ‚úÖ Loading Reranker (Track A)...
   ‚úÖ Loading Embedder (Track B)...

üß† Scoring Track A (BGE Reranker)...
   - Calculating scores...

üß† Embedding Track B (BGE Large)...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 107/107 [01:04<00:00,  1.65it/s]



üì¶ Zipping BGE Submission...

üèÜ READY! Upload 'submission_BGE.zip' to CodaBench.


In [9]:
import json
import zipfile
import os

zip_filename = 'submission_BGE.zip'

print(f"üîç VERIFYING: {zip_filename} ...\n")

if not os.path.exists(zip_filename):
    print(f"‚ùå CRITICAL ERROR: File '{zip_filename}' not found!")
else:
    try:
        with zipfile.ZipFile(zip_filename, 'r') as z:
            files = z.namelist()
            
            # --- CHECK 1: FILE STRUCTURE ---
            if 'track_a.jsonl' in files and 'track_b.jsonl' in files:
                print(f"   ‚úÖ ZIP Structure: OK (Found both jsonl files)")
            else:
                print(f"   ‚ùå ZIP ERROR: Missing files. Found: {files}")

            # --- CHECK 2: TRACK A CONTENT ---
            with z.open('track_a.jsonl') as f:
                lines = f.readlines()
                count = len(lines)
                first = json.loads(lines[0])
                
                # Check Count
                if count == 400:
                    print(f"   ‚úÖ Track A Count: OK (400 items)")
                else:
                    print(f"   ‚ö†Ô∏è Track A Count: WARNING ({count} items - Expected 400)")

                # Check Key & Type
                if "text_a_is_closer" in first and isinstance(first["text_a_is_closer"], bool):
                    print(f"   ‚úÖ Track A Format: OK (Key 'text_a_is_closer' is Boolean)")
                else:
                    print(f"   ‚ùå Track A ERROR: Invalid JSON format: {first}")

            # --- CHECK 3: TRACK B CONTENT ---
            with z.open('track_b.jsonl') as f:
                lines = f.readlines()
                count = len(lines)
                first = json.loads(lines[0])
                
                # Check Count
                if count == 849:
                    print(f"   ‚úÖ Track B Count: OK (849 items)")
                else:
                    print(f"   ‚ö†Ô∏è Track B Count: WARNING ({count} items - Expected 849)")

                # Check Key (The most common error)
                if "embedding" in first:
                    print(f"   ‚úÖ Track B Key: OK (Found singular 'embedding')")
                else:
                    print(f"   ‚ùå Track B ERROR: Key mismatch! Found: {list(first.keys())} (Expected 'embedding')")

                # Check Vector Size (BGE-Large should be 1024)
                vec_len = len(first["embedding"])
                if vec_len == 1024:
                    print(f"   ‚úÖ Track B Dimensions: OK (1024 for BGE-Large)")
                else:
                    print(f"   ‚ÑπÔ∏è Track B Dimensions: {vec_len} (Just FYI)")

        print("\nüöÄ STATUS: READY TO UPLOAD.")

    except Exception as e:
        print(f"\n‚ùå SCRIPT CRASHED: {e}")

üîç VERIFYING: submission_BGE.zip ...

   ‚úÖ ZIP Structure: OK (Found both jsonl files)
   ‚úÖ Track A Count: OK (400 items)
   ‚úÖ Track A Format: OK (Key 'text_a_is_closer' is Boolean)
   ‚úÖ Track B Count: OK (849 items)
   ‚úÖ Track B Key: OK (Found singular 'embedding')
   ‚úÖ Track B Dimensions: OK (1024 for BGE-Large)

üöÄ STATUS: READY TO UPLOAD.


In [1]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
from sentence_transformers import CrossEncoder, SentenceTransformer

# --- 1. HARDWARE CHECK ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Optimized for STS")
else:
    device = 'cpu'

# --- 2. INTELLIGENT FILE FINDER ---
print("üîç Scanning for Test Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None

for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass

if not input_a or not input_b: 
    input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
    print("‚ö†Ô∏è Using manual filenames (Auto-detection failed)")
else:
    print(f"   ‚úÖ Track A: {input_a}")
    print(f"   ‚úÖ Track B: {input_b}")

# --- 3. LOAD THE "SIMILARITY" EXPERTS ---
print("\nüìÇ Loading Models...")

# Model A: STS RoBERTa (The Similarity Judge)
path_a = './stsb-roberta-large'
if os.path.exists(path_a):
    print("   ‚úÖ Loading STS RoBERTa (Track A)...")
    model_a = CrossEncoder(path_a, device=device)
else:
    print(f"   ‚ùå Error: '{path_a}' not found. Did you run the terminal command?")
    # Fallback just in case
    model_a = CrossEncoder('cross-encoder/stsb-roberta-large', device=device)

# Model B: Mxbai Large (The Context Mapper)
path_b = './mxbai-embed-large-v1'
if os.path.exists(path_b):
    print("   ‚úÖ Loading Mxbai Large (Track B)...")
    model_b = SentenceTransformer(path_b, device=device)
else:
    print(f"   ‚ùå Error: '{path_b}' not found.")
    model_b = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1', device=device)

# --- 4. EXECUTE TRACK A (Pairwise Similarity) ---
print(f"\nüß† Scoring Track A...")
df_a = pd.read_json(input_a, lines=True)

# Column Setup
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# Predict: This model outputs a single float (0 to 1) score representing similarity
print("   - Calculating similarity scores...")
scores_a = model_a.predict(pairs_a, batch_size=16, show_progress_bar=True)
scores_b = model_a.predict(pairs_b, batch_size=16, show_progress_bar=True)

# If Score A > Score B, then A is the winner
preds_a = scores_a > scores_b

# --- 5. EXECUTE TRACK B (Prompt-Based Embedding) ---
print(f"\nüß† Embedding Track B...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    print("   - Applying 'Represent this story' prompt for Mxbai...")
    # Mxbai works best with a specific prompt instruction
    prompt = "Represent this story for semantic similarity search: "
    texts = [prompt + t for t in df_b[text_col].tolist()]
    
    embeddings = model_b.encode(
        texts, 
        batch_size=8, 
        show_progress_bar=True, 
        device=device,
        normalize_embeddings=True # Crucial for Mxbai
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. SAVE & ZIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f) # Singular 'embedding'
        f.write('\n')

zip_name = 'submission_STS_MXBAI.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench.")

  from .autonotebook import tqdm as notebook_tqdm


üöÄ Using Apple M4 (MPS) - Optimized for STS
üîç Scanning for Test Data...
   ‚úÖ Track A: test_track_a.jsonl
   ‚úÖ Track B: test_track_b.jsonl

üìÇ Loading Models...
   ‚úÖ Loading STS RoBERTa (Track A)...
   ‚úÖ Loading Mxbai Large (Track B)...

üß† Scoring Track A...
   - Calculating similarity scores...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [01:15<00:00,  3.01s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [01:17<00:00,  3.10s/it]



üß† Embedding Track B...
   - Applying 'Represent this story' prompt for Mxbai...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 107/107 [01:01<00:00,  1.75it/s]



üì¶ Zipping Submission...

üèÜ READY! Upload 'submission_STS_MXBAI.zip' to CodaBench.


In [5]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoModel

# --- 1. HARDWARE OPTIMIZATION ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Running Ensemble Mode")
else:
    device = 'cpu'

# --- 2. FIND TEST DATA ---
print("üîç Scanning Test Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD THE COUNCIL OF EXPERTS ---
print("\nüìÇ Summoning the Models...")

# Expert 1: Logic (DeBERTa) - From Step 1
path_deberta = './nli-deberta-v3-large'
if os.path.exists(path_deberta):
    print("   ‚úÖ Loaded Expert 1: Logic (DeBERTa)")
    model_logic = CrossEncoder(path_deberta, device=device)
else:
    print("   ‚ö†Ô∏è Local DeBERTa not found. Downloading...")
    model_logic = CrossEncoder('cross-encoder/nli-deberta-v3-large', device=device)

# Expert 2: Vibe (RoBERTa STS) - From Step 2
path_roberta = './stsb-roberta-large'
if os.path.exists(path_roberta):
    print("   ‚úÖ Loaded Expert 2: Vibe (RoBERTa)")
    model_vibe = CrossEncoder(path_roberta, device=device)
else:
    print("   ‚ö†Ô∏è Local RoBERTa not found. Downloading...")
    model_vibe = CrossEncoder('cross-encoder/stsb-roberta-large', device=device)

# Expert 3: Narrative Structure (Jina v3) - NEW
path_jina = './jina-embeddings-v3'
if os.path.exists(path_jina):
    print("   ‚úÖ Loaded Expert 3: Narrative (Jina v3)")
    # Jina requires trust_remote_code for its special architecture
    model_jina = SentenceTransformer(path_jina, trust_remote_code=True, device=device)
else:
    print("   ‚ö†Ô∏è Local Jina not found. Downloading...")
    model_jina = SentenceTransformer('jinaai/jina-embeddings-v3', trust_remote_code=True, device=device)

# --- 4. EXECUTE TRACK A (ENSEMBLE VOTING) ---
print(f"\nüß† TRACK A: The Council is Voting...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# Vote 1: Logic Score (Entailment)
print("   - Asking DeBERTa (Logic)...")
scores_logic_a = model_logic.predict(pairs_a, batch_size=16, show_progress_bar=True)
scores_logic_b = model_logic.predict(pairs_b, batch_size=16, show_progress_bar=True)

# Fix shape if DeBERTa output 3 columns (Entailment is usually index 1 or 2 depending on version)
# nli-deberta-v3-large: [Contradiction, Neutral, Entailment] -> We want Entailment (Index 2) or Neutral+Entailment
if len(scores_logic_a.shape) > 1:
    # Use Entailment score (Index 1 in 2-class, Index 2 in 3-class)
    # nli-deberta-v3-large is usually (Contradiction, Entailment, Neutral) or similar.
    # To be safe, we take the LAST column which is usually Entailment.
    s_logic_a = scores_logic_a[:, -1]
    s_logic_b = scores_logic_b[:, -1]
else:
    s_logic_a, s_logic_b = scores_logic_a, scores_logic_b

# Vote 2: Vibe Score (Similarity)
print("   - Asking RoBERTa (Vibe)...")
s_vibe_a = model_vibe.predict(pairs_a, batch_size=16, show_progress_bar=True)
s_vibe_b = model_vibe.predict(pairs_b, batch_size=16, show_progress_bar=True)

# Min-Max Normalization (Crucial to mix them fairly)
def normalize(arr):
    return (arr - np.min(arr)) / (np.max(arr) - np.min(arr))

norm_logic_a = normalize(s_logic_a)
norm_logic_b = normalize(s_logic_b)
norm_vibe_a = normalize(s_vibe_a)
norm_vibe_b = normalize(s_vibe_b)

# THE FINAL VERDICT (Weighted Average)
# We give slightly more weight (0.6) to RoBERTa because "Similarity" is the official metric.
final_score_a = (0.4 * norm_logic_a) + (0.6 * norm_vibe_a)
final_score_b = (0.4 * norm_logic_b) + (0.6 * norm_vibe_b)

preds_a = final_score_a > final_score_b

# --- 5. EXECUTE TRACK B (TASK SPECIFIC) ---
print(f"\nüß† TRACK B: Narrative Mapping (Jina)...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    print("   - Encoding with task='text-matching'...")
    # Jina v3 supports explicit task instructions
    embeddings = model_jina.encode(
        df_b[text_col].tolist(),
        task="text-matching", # <--- THE MAGIC KEYWORD
        batch_size=8, 
        show_progress_bar=True, 
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. SAVE & ZIP ---
print("\nüì¶ Packaging the Ensemble Result...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_ENSEMBLE_JINA.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ VICTORY! Upload '{zip_name}' to CodaBench.")
print("   This submission combines Logic + Vibes + Narrative Structure.")

üöÄ Using Apple M4 (MPS) - Running Ensemble Mode
üîç Scanning Test Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üìÇ Summoning the Models...
   ‚úÖ Loaded Expert 1: Logic (DeBERTa)
   ‚úÖ Loaded Expert 2: Vibe (RoBERTa)
   ‚úÖ Loaded Expert 3: Narrative (Jina v3)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/krish/.cache/huggingface/modules/transformers_modules/jinaai/xlm_hyphen_roberta_hyphen_flash_hyphen_implementation/2b6bc3f30750b3a9648fe9b63448c09920efe9be/mha.py'

In [1]:
import os
import shutil
import pandas as pd
import glob
import json
import zipfile
import torch
import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer

# --- 1. HARDWARE SETUP ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Safe Mode")
else:
    device = 'cpu'

# --- 2. FIND DATA ---
print("\nüîç Scanning Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD MODELS (ONE BY ONE TO SAVE RAM) ---
print("\nüìÇ Loading Models...")

# Expert 1: DeBERTa (Logic) - THE MEMORY HOG
print("   1. Loading DeBERTa (Logic)...")
model_logic = CrossEncoder('cross-encoder/nli-deberta-v3-large', device=device)

# Expert 2: RoBERTa (Vibe)
print("   2. Loading RoBERTa (Vibe)...")
model_vibe = CrossEncoder('cross-encoder/stsb-roberta-large', device=device)

# Expert 3: Jina v3 (Narrative)
print("   3. Loading Jina v3 (Narrative)...")
model_jina = SentenceTransformer('jinaai/jina-embeddings-v3', trust_remote_code=True, device=device)

# --- 4. RUN TRACK A (SAFE BATCH SIZES) ---
print("\nüß† TRACK A: Voting Process...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# VOTE 1: LOGIC (Batch Size = 1 is CRITICAL here)
print("   - Asking DeBERTa (Logic) [Batch Size: 1]...")
# We use batch_size=1 to prevent M4 memory freeze
s_logic_a = model_logic.predict(pairs_a, batch_size=1, show_progress_bar=True)
s_logic_b = model_logic.predict(pairs_b, batch_size=1, show_progress_bar=True)

if len(s_logic_a.shape) > 1: s_logic_a, s_logic_b = s_logic_a[:, -1], s_logic_b[:, -1]

# VOTE 2: VIBE (Batch Size = 8 is fine here)
print("   - Asking RoBERTa (Vibe) [Batch Size: 8]...")
s_vibe_a = model_vibe.predict(pairs_a, batch_size=8, show_progress_bar=True)
s_vibe_b = model_vibe.predict(pairs_b, batch_size=8, show_progress_bar=True)

# Normalize & Vote
def normalize(arr): return (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
n_logic_a, n_logic_b = normalize(s_logic_a), normalize(s_logic_b)
n_vibe_a, n_vibe_b = normalize(s_vibe_a), normalize(s_vibe_b)

final_a = (0.4 * n_logic_a) + (0.6 * n_vibe_a)
final_b = (0.4 * n_logic_b) + (0.6 * n_vibe_b)
preds_a = final_a > final_b

# --- 5. RUN TRACK B (JINA) ---
print("\nüß† TRACK B: Narrative Mapping...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    embeddings = model_jina.encode(
        df_b[text_col].tolist(),
        task="text-matching",
        batch_size=4, # Keep this moderate
        show_progress_bar=True,
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. SAVE & ZIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_ENSEMBLE_SAFE.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ SUCCESS! Upload '{zip_name}' to CodaBench.")

  from .autonotebook import tqdm as notebook_tqdm


üöÄ Using Apple M4 (MPS) - Safe Mode

üîç Scanning Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üìÇ Loading Models...
   1. Loading DeBERTa (Logic)...
   2. Loading RoBERTa (Vibe)...
   3. Loading Jina v3 (Narrative)...


`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!



üß† TRACK A: Voting Process...
   - Asking DeBERTa (Logic) [Batch Size: 1]...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [01:35<00:00,  4.20it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [01:26<00:00,  4.62it/s]


   - Asking RoBERTa (Vibe) [Batch Size: 8]...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:17<00:00,  1.55s/it]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:10<00:00,  1.42s/it]



üß† TRACK B: Narrative Mapping...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 213/213 [40:15<00:00, 11.34s/it]



üì¶ Zipping Submission...

üèÜ SUCCESS! Upload 'submission_ENSEMBLE_SAFE.zip' to CodaBench.


In [1]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # Progress bar library

# --- 1. HARDWARE ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Single-File Mode")
else:
    device = 'cpu'

# --- 2. FIND DATA ---
print("üîç Scanning Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD MODELS ---
print("\nüìÇ Loading Models...")

# TRACK A: Jina Reranker
path_rerank = './jina-reranker-v2'
if os.path.exists(path_rerank):
    print("   ‚úÖ Loading Jina Reranker (Track A)...")
    model_a = AutoModelForSequenceClassification.from_pretrained(
        path_rerank, 
        trust_remote_code=True, 
        torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_a = AutoTokenizer.from_pretrained(path_rerank, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è Local Reranker not found. Downloading...")
    model_a = AutoModelForSequenceClassification.from_pretrained(
        'jinaai/jina-reranker-v2-base-multilingual', 
        trust_remote_code=True,
        torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_a = AutoTokenizer.from_pretrained('jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True)

# TRACK B: Jina Embeddings
path_embed = './jina-embeddings-v3'
if os.path.exists(path_embed):
    print("   ‚úÖ Loading Jina Embedder (Track B)...")
    model_b = SentenceTransformer(path_embed, trust_remote_code=True, device=device)
else:
    print("   ‚ö†Ô∏è Local Embedder not found. Downloading...")
    model_b = SentenceTransformer('jinaai/jina-embeddings-v3', trust_remote_code=True, device=device)

# --- 4. EXECUTE TRACK A (SAFE MODE) ---
print(f"\nüß† TRACK A: Reading full stories (Progress Bar Enabled)...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

def predict_safe(pairs, model, tokenizer):
    scores = []
    # Batch size 1 = Maximum Safety for Memory
    for i in tqdm(range(0, len(pairs), 1), desc="   - Scoring"): 
        batch = pairs[i:i+1]
        inputs = tokenizer(
            batch, 
            padding=True, 
            truncation=True, 
            max_length=1024, # Good context length
            return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            output = model(**inputs).logits.squeeze(-1)
            scores.extend(output.cpu().float().numpy())
    return scores

print("   - Scoring Pair A...")
scores_a = predict_safe(pairs_a, model_a, tokenizer_a)
print("   - Scoring Pair B...")
scores_b = predict_safe(pairs_b, model_a, tokenizer_a)

preds_a = [s_a > s_b for s_a, s_b in zip(scores_a, scores_b)]

# --- 5. EXECUTE TRACK B (SAFE MODE) ---
print(f"\nüß† TRACK B: Narrative Mapping...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # Set max length on the model object
    model_b.max_seq_length = 2048 
    
    embeddings = model_b.encode(
        df_b[text_col].tolist(),
        task="text-matching", 
        batch_size=1, # Reduced to 1 for safety
        show_progress_bar=True,
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. SAVE & ZIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_FINAL_SAFE.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench.")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!


üöÄ Using Apple M4 (MPS) - Single-File Mode
üîç Scanning Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üìÇ Loading Models...
   ‚úÖ Loading Jina Reranker (Track A)...


The tokenizer you are loading from './jina-reranker-v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


   ‚úÖ Loading Jina Embedder (Track B)...


The tokenizer you are loading from './jina-embeddings-v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
The tokenizer you are loading from './jina-embeddings-v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.



üß† TRACK A: Reading full stories (Progress Bar Enabled)...
   - Scoring Pair A...


   - Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [00:20<00:00, 19.41it/s]


   - Scoring Pair B...


   - Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [00:18<00:00, 21.08it/s]



üß† TRACK B: Narrative Mapping...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 849/849 [06:35<00:00,  2.15it/s]



üì¶ Zipping Submission...

üèÜ READY! Upload 'submission_FINAL_SAFE.zip' to CodaBench.


In [2]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import CrossEncoder, SentenceTransformer
from tqdm import tqdm

# --- 1. HARDWARE ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Ensemble Mode")
else:
    device = 'cpu'

# --- 2. FIND DATA ---
print("üîç Scanning Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD THE TEAM OF EXPERTS ---
print("\nüìÇ Loading Models...")

# Expert 1: Jina Reranker (Long Context)
path_jina_rerank = './jina-reranker-v2'
if os.path.exists(path_jina_rerank):
    print("   ‚úÖ Expert 1: Jina Reranker (Loaded from local)")
    model_jina = AutoModelForSequenceClassification.from_pretrained(
        path_jina_rerank, trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_jina = AutoTokenizer.from_pretrained(path_jina_rerank, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è Expert 1 not found locally. Downloading...")
    model_jina = AutoModelForSequenceClassification.from_pretrained(
        'jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_jina = AutoTokenizer.from_pretrained('jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True)

# Expert 2: RoBERTa STS (Semantic Similarity)
path_roberta = './stsb-roberta-large'
if os.path.exists(path_roberta):
    print("   ‚úÖ Expert 2: RoBERTa STS (Loaded from local)")
    model_roberta = CrossEncoder(path_roberta, device=device)
else:
    print("   ‚ö†Ô∏è Expert 2 not found locally. Downloading...")
    model_roberta = CrossEncoder('cross-encoder/stsb-roberta-large', device=device)

# Expert 3: Jina Embeddings (Track B)
path_jina_embed = './jina-embeddings-v3'
if os.path.exists(path_jina_embed):
    print("   ‚úÖ Expert 3: Jina Embeddings (Loaded from local)")
    model_embed = SentenceTransformer(path_jina_embed, trust_remote_code=True, device=device)
else:
    print("   ‚ö†Ô∏è Expert 3 not found locally. Downloading...")
    model_embed = SentenceTransformer('jinaai/jina-embeddings-v3', trust_remote_code=True, device=device)

# --- 4. EXECUTE TRACK A (THE CONSENSUS) ---
print(f"\nüß† TRACK A: The Ensemble is Voting...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# -- VOTE 1: JINA RERANKER --
def predict_jina(pairs):
    scores = []
    for i in tqdm(range(len(pairs)), desc="   - Jina Voting"):
        batch = pairs[i:i+1]
        inputs = tokenizer_jina(
            batch, padding=True, truncation=True, max_length=1024, return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            output = model_jina(**inputs).logits.squeeze(-1)
            scores.extend(output.cpu().float().numpy())
    return scores

print("   - Collecting Jina Scores...")
jina_a = predict_jina(pairs_a)
jina_b = predict_jina(pairs_b)

# -- VOTE 2: ROBERTA STS --
# RoBERTa is smaller, so we can use slightly larger batch, but let's keep it safe at 4
print("   - Collecting RoBERTa Scores...")
roberta_a = model_roberta.predict(pairs_a, batch_size=4, show_progress_bar=True)
roberta_b = model_roberta.predict(pairs_b, batch_size=4, show_progress_bar=True)

# -- MERGING VOTES (NORMALIZATION) --
def normalize(arr):
    arr = np.array(arr)
    return (arr - arr.min()) / (arr.max() - arr.min())

# Normalize both so they are on the same 0.0 to 1.0 scale
norm_jina_a, norm_jina_b = normalize(jina_a), normalize(jina_b)
norm_rob_a, norm_rob_b = normalize(roberta_a), normalize(roberta_b)

# Weighted Average: 60% Jina (Context) + 40% RoBERTa (Similarity)
final_a = (0.6 * norm_jina_a) + (0.4 * norm_rob_a)
final_b = (0.6 * norm_jina_b) + (0.4 * norm_rob_b)

preds_a = final_a > final_b

# --- 5. EXECUTE TRACK B (DEEP READ) ---
print(f"\nüß† TRACK B: Deep Context Embedding...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # Boost context to 4096 to ensure we catch every outcome
    model_embed.max_seq_length = 4096
    
    embeddings = model_embed.encode(
        df_b[text_col].tolist(),
        task="text-matching",
        batch_size=1, # Safety first
        show_progress_bar=True,
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. ZIP AND SHIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_ENSEMBLE_FINAL.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench.")

üöÄ Using Apple M4 (MPS) - Ensemble Mode
üîç Scanning Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üìÇ Loading Models...
   ‚úÖ Expert 1: Jina Reranker (Loaded from local)


The tokenizer you are loading from './jina-reranker-v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


   ‚úÖ Expert 2: RoBERTa STS (Loaded from local)
   ‚úÖ Expert 3: Jina Embeddings (Loaded from local)


The tokenizer you are loading from './jina-embeddings-v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
The tokenizer you are loading from './jina-embeddings-v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.



üß† TRACK A: The Ensemble is Voting...
   - Collecting Jina Scores...


   - Jina Voting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [1:15:59<00:00, 11.40s/it]
   - Jina Voting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [1:14:30<00:00, 11.18s/it]


   - Collecting RoBERTa Scores...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:14<00:00,  1.33it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:00<00:00,  1.66it/s]



üß† TRACK B: Deep Context Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 849/849 [08:11<00:00,  1.73it/s]



üì¶ Zipping Submission...

üèÜ READY! Upload 'submission_ENSEMBLE_FINAL.zip' to CodaBench.


## import pandas as pd
import glob
import os
import json
import zipfile
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- 1. HARDWARE ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - God Mode")
else:
    device = 'cpu'

# --- 2. FIND DATA ---
print("üîç Scanning Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD THE CHAMPIONS ---
print("\nüìÇ Loading Models...")

# Expert 1: Jina Reranker (The Long-Context Reader)
path_jina = './jina-reranker-v2'
if os.path.exists(path_jina):
    print("   ‚úÖ Expert 1: Jina Reranker v2 (Local)")
    model_jina = AutoModelForSequenceClassification.from_pretrained(
        path_jina, trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_jina = AutoTokenizer.from_pretrained(path_jina, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è Jina not found locally. Downloading...")
    model_jina = AutoModelForSequenceClassification.from_pretrained(
        'jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_jina = AutoTokenizer.from_pretrained('jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True)

# Expert 2: BGE-M3 Reranker (The Modern Judge)
path_bge = './bge-reranker-v2-m3'
if os.path.exists(path_bge):
    print("   ‚úÖ Expert 2: BGE-M3 (Local)")
    model_bge = AutoModelForSequenceClassification.from_pretrained(
        path_bge, trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_bge = AutoTokenizer.from_pretrained(path_bge, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è BGE-M3 not found locally. Downloading...")
    model_bge = AutoModelForSequenceClassification.from_pretrained(
        'BAAI/bge-reranker-v2-m3', trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_bge = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3', trust_remote_code=True)

# Expert 3: Jina Embeddings (Track B)
path_embed = './jina-embeddings-v3'
if os.path.exists(path_embed):
    print("   ‚úÖ Expert 3: Jina Embeddings v3 (Local)")
    model_embed = SentenceTransformer(path_embed, trust_remote_code=True, device=device)
else:
    model_embed = SentenceTransformer('jinaai/jina-embeddings-v3', trust_remote_code=True, device=device)

# --- 4. EXECUTE TRACK A (THE HYBRID VOTE) ---
print(f"\nüß† TRACK A: Consensus Voting...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# Helper for Inference
def predict_reranker(pairs, model, tokenizer, name, max_len=1024):
    scores = []
    for i in tqdm(range(len(pairs)), desc=f"   - {name}"):
        batch = pairs[i:i+1]
        inputs = tokenizer(
            batch, padding=True, truncation=True, max_length=max_len, return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            output = model(**inputs).logits.squeeze(-1)
            scores.extend(output.cpu().float().numpy())
    return scores

# Vote 1: Jina (Reads 1024 tokens)
jina_a = predict_reranker(pairs_a, model_jina, tokenizer_jina, "Jina", 1024)
jina_b = predict_reranker(pairs_b, model_jina, tokenizer_jina, "Jina", 1024)

# Vote 2: BGE-M3 (Reads 1024 tokens - Higher Capacity)
bge_a = predict_reranker(pairs_a, model_bge, tokenizer_bge, "BGE-M3", 1024)
bge_b = predict_reranker(pairs_b, model_bge, tokenizer_bge, "BGE-M3", 1024)

# Normalize and Merge
def normalize(arr):
    arr = np.array(arr)
    return (arr - arr.min()) / (arr.max() - arr.min())

n_jina_a, n_jina_b = normalize(jina_a), normalize(jina_b)
n_bge_a, n_bge_b = normalize(bge_a), normalize(bge_b)

# 50/50 Split - Both models are SOTA
final_a = (0.5 * n_jina_a) + (0.5 * n_bge_a)
final_b = (0.5 * n_jina_b) + (0.5 * n_bge_b)
preds_a = final_a > final_b

# --- 5. EXECUTE TRACK B (RICH PROMPT) ---
print(f"\nüß† TRACK B: Deep Narrative Embedding...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # 1. Max Context
    model_embed.max_seq_length = 4096
    
    # 2. RICH PROMPT (The "Insane" Tweak)
    # Instead of just "text-matching", we guide the model to look for themes.
    # We apply this prompt to every story before embedding.
    rich_prompt = "Retrieve stories with similar abstract themes, course of action, and outcomes: "
    texts = [rich_prompt + t for t in df_b[text_col].tolist()]

    embeddings = model_embed.encode(
        texts,
        task="text-matching", 
        batch_size=1, 
        show_progress_bar=True,
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. ZIP AND SHIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_GOD_MODE.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ VICTORY! Upload '{zip_name}' to CodaBench.")

In [1]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- 1. HARDWARE ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - God Mode")
else:
    device = 'cpu'

# --- 2. FIND DATA ---
print("üîç Scanning Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. LOAD THE CHAMPIONS ---
print("\nüìÇ Loading Models...")

# Expert 1: Jina Reranker (The Long-Context Reader)
path_jina = './jina-reranker-v2'
if os.path.exists(path_jina):
    print("   ‚úÖ Expert 1: Jina Reranker v2 (Local)")
    model_jina = AutoModelForSequenceClassification.from_pretrained(
        path_jina, trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_jina = AutoTokenizer.from_pretrained(path_jina, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è Jina not found locally. Downloading...")
    model_jina = AutoModelForSequenceClassification.from_pretrained(
        'jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_jina = AutoTokenizer.from_pretrained('jinaai/jina-reranker-v2-base-multilingual', trust_remote_code=True)

# Expert 2: BGE-M3 Reranker (The Modern Judge)
path_bge = './bge-reranker-v2-m3'
if os.path.exists(path_bge):
    print("   ‚úÖ Expert 2: BGE-M3 (Local)")
    model_bge = AutoModelForSequenceClassification.from_pretrained(
        path_bge, trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_bge = AutoTokenizer.from_pretrained(path_bge, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è BGE-M3 not found locally. Downloading...")
    model_bge = AutoModelForSequenceClassification.from_pretrained(
        'BAAI/bge-reranker-v2-m3', trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_bge = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3', trust_remote_code=True)

# Expert 3: Jina Embeddings (Track B)
path_embed = './jina-embeddings-v3'
if os.path.exists(path_embed):
    print("   ‚úÖ Expert 3: Jina Embeddings v3 (Local)")
    model_embed = SentenceTransformer(path_embed, trust_remote_code=True, device=device)
else:
    model_embed = SentenceTransformer('jinaai/jina-embeddings-v3', trust_remote_code=True, device=device)

# --- 4. EXECUTE TRACK A (THE HYBRID VOTE) ---
print(f"\nüß† TRACK A: Consensus Voting...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

# Helper for Inference
def predict_reranker(pairs, model, tokenizer, name, max_len=1024):
    scores = []
    for i in tqdm(range(len(pairs)), desc=f"   - {name}"):
        batch = pairs[i:i+1]
        inputs = tokenizer(
            batch, padding=True, truncation=True, max_length=max_len, return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            output = model(**inputs).logits.squeeze(-1)
            scores.extend(output.cpu().float().numpy())
    return scores

# Vote 1: Jina (Reads 1024 tokens)
jina_a = predict_reranker(pairs_a, model_jina, tokenizer_jina, "Jina", 1024)
jina_b = predict_reranker(pairs_b, model_jina, tokenizer_jina, "Jina", 1024)

# Vote 2: BGE-M3 (Reads 1024 tokens - Higher Capacity)
bge_a = predict_reranker(pairs_a, model_bge, tokenizer_bge, "BGE-M3", 1024)
bge_b = predict_reranker(pairs_b, model_bge, tokenizer_bge, "BGE-M3", 1024)

# Normalize and Merge
def normalize(arr):
    arr = np.array(arr)
    return (arr - arr.min()) / (arr.max() - arr.min())

n_jina_a, n_jina_b = normalize(jina_a), normalize(jina_b)
n_bge_a, n_bge_b = normalize(bge_a), normalize(bge_b)

# 50/50 Split - Both models are SOTA
final_a = (0.5 * n_jina_a) + (0.5 * n_bge_a)
final_b = (0.5 * n_jina_b) + (0.5 * n_bge_b)
preds_a = final_a > final_b

# --- 5. EXECUTE TRACK B (RICH PROMPT) ---
print(f"\nüß† TRACK B: Deep Narrative Embedding...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # 1. Max Context
    model_embed.max_seq_length = 4096
    
    # 2. RICH PROMPT (The "Insane" Tweak)
    # Instead of just "text-matching", we guide the model to look for themes.
    # We apply this prompt to every story before embedding.
    rich_prompt = "Retrieve stories with similar abstract themes, course of action, and outcomes: "
    texts = [rich_prompt + t for t in df_b[text_col].tolist()]

    embeddings = model_embed.encode(
        texts,
        task="text-matching", 
        batch_size=1, 
        show_progress_bar=True,
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 6. ZIP AND SHIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_GOD_MODE.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ VICTORY! Upload '{zip_name}' to CodaBench.")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!


üöÄ Using Apple M4 (MPS) - God Mode
üîç Scanning Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üìÇ Loading Models...
   ‚úÖ Expert 1: Jina Reranker v2 (Local)


The tokenizer you are loading from './jina-reranker-v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


   ‚úÖ Expert 2: BGE-M3 (Local)
   ‚úÖ Expert 3: Jina Embeddings v3 (Local)


The tokenizer you are loading from './jina-embeddings-v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
The tokenizer you are loading from './jina-embeddings-v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.



üß† TRACK A: Consensus Voting...


   - Jina: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [00:18<00:00, 21.30it/s]
   - Jina: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [00:17<00:00, 22.80it/s]
   - BGE-M3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [00:47<00:00,  8.37it/s]
   - BGE-M3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [00:44<00:00,  8.92it/s]



üß† TRACK B: Deep Narrative Embedding...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 849/849 [06:33<00:00,  2.16it/s]



üì¶ Zipping Submission...

üèÜ VICTORY! Upload 'submission_GOD_MODE.zip' to CodaBench.


In [2]:
import zipfile
import json
import os

zip_name = 'submission_GOD_MODE.zip' # Make sure this matches your file name

print(f"üîç Inspecting {zip_name}...")

if not os.path.exists(zip_name):
    print("‚ùå Error: Zip file not found!")
else:
    try:
        with zipfile.ZipFile(zip_name, 'r') as z:
            files = z.namelist()
            print(f"   üìÇ Files inside: {files}")
            
            # CHECK 1: Are files at the root?
            if any('/' in f for f in files):
                print("   ‚ùå CRITICAL: Zip contains folders! CodaBench needs files at the root.")
            else:
                print("   ‚úÖ Structure: Flat (Good)")

            # CHECK 2: Track A
            if 'track_a.jsonl' in files:
                with z.open('track_a.jsonl') as f:
                    lines = f.readlines()
                    print(f"   ‚úÖ Track A Lines: {len(lines)} (Should be 400)")
                    first = json.loads(lines[0])
                    if "text_a_is_closer" not in first:
                        print(f"   ‚ùå Track A Key Error: Found {first.keys()}")
            else:
                print("   ‚ùå Track A missing!")

            # CHECK 3: Track B
            if 'track_b.jsonl' in files:
                with z.open('track_b.jsonl') as f:
                    lines = f.readlines()
                    print(f"   ‚úÖ Track B Lines: {len(lines)} (Should be 849)")
                    first = json.loads(lines[0])
                    if "embedding" not in first:
                        print(f"   ‚ùå Track B Key Error: Found {first.keys()}")
                    if len(first['embedding']) != 1024:
                        print(f"   ‚ö†Ô∏è Track B Dim: {len(first['embedding'])} (Jina V3 usually 1024)")
            else:
                print("   ‚ùå Track B missing!")

            print("\nüöÄ If all checks are Green, the file is valid.")
            
    except Exception as e:
        print(f"‚ùå Corrupt Zip: {e}")

üîç Inspecting submission_GOD_MODE.zip...
   üìÇ Files inside: ['track_a.jsonl', 'track_b.jsonl']
   ‚úÖ Structure: Flat (Good)
   ‚úÖ Track A Lines: 400 (Should be 400)
   ‚úÖ Track B Lines: 849 (Should be 849)

üöÄ If all checks are Green, the file is valid.


In [7]:
import pandas as pd
import glob
import os
import json
import zipfile
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- 1. HARDWARE ---
if torch.backends.mps.is_available():
    device = 'mps'
    print("üöÄ Using Apple M4 (MPS) - Final Fix Mode")
else:
    device = 'cpu'

# --- 2. FIND DATA ---
print("üîç Scanning Data...")
jsonl_files = glob.glob("*.jsonl") + glob.glob("data/*.jsonl") + glob.glob("SemEval_Task4/*.jsonl")
input_a, input_b = None, None
for f in jsonl_files:
    try:
        count = sum(1 for line in open(f))
        if count == 400: input_a = f
        elif count == 849: input_b = f
    except: pass
if not input_a or not input_b: input_a, input_b = 'test_track_a.jsonl', 'test_track_b.jsonl'
print(f"   Track A: {input_a}\n   Track B: {input_b}")

# --- 3. THE "CONFIG HACK" (CRITICAL FIX) ---
print("\nüõ†Ô∏è Hacking Model Configuration on Disk...")
stella_path = './stella_en_400M_v5'
config_file = os.path.join(stella_path, 'config.json')

if os.path.exists(config_file):
    with open(config_file, 'r') as f:
        config_data = json.load(f)
    
    # FORCE DISABLE the crash-causing settings
    changed = False
    if config_data.get('use_memory_efficient_attention') is not False:
        config_data['use_memory_efficient_attention'] = False
        changed = True
        print("   - Disabled 'use_memory_efficient_attention'")
        
    if config_data.get('unpad_inputs') is not False:
        config_data['unpad_inputs'] = False
        changed = True
        print("   - Disabled 'unpad_inputs'")
        
    if changed:
        with open(config_file, 'w') as f:
            json.dump(config_data, f, indent=2)
        print("   ‚úÖ Config file patched successfully!")
    else:
        print("   ‚ÑπÔ∏è Config was already patched.")
else:
    print(f"   ‚ö†Ô∏è Warning: Config file not found at {config_file}. Model might crash if downloading fresh.")

# --- 4. LOAD MODELS ---
print("\nüìÇ Loading Specialists...")

# Expert 1: BGE-M3 (Track A)
path_bge = './bge-reranker-v2-m3'
if os.path.exists(path_bge):
    print("   ‚úÖ Expert 1: BGE-M3 (Local)")
    model_a = AutoModelForSequenceClassification.from_pretrained(
        path_bge, trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_a = AutoTokenizer.from_pretrained(path_bge, trust_remote_code=True)
else:
    print("   ‚ö†Ô∏è BGE-M3 not found. Downloading...")
    model_a = AutoModelForSequenceClassification.from_pretrained(
        'BAAI/bge-reranker-v2-m3', trust_remote_code=True, torch_dtype=torch.float16
    ).to(device).eval()
    tokenizer_a = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3', trust_remote_code=True)

# Expert 2: Stella v5 (Track B)
# Now that config.json is fixed, we can load it normally!
if os.path.exists(stella_path):
    print("   ‚úÖ Expert 2: Stella v5 (Local & Patched)")
    model_b = SentenceTransformer(stella_path, trust_remote_code=True, device=device)
else:
    print("   ‚ö†Ô∏è Stella not found locally. Downloading (Might crash on first run)...")
    model_b = SentenceTransformer('dunzhang/stella_en_400M_v5', trust_remote_code=True, device=device)

# --- 5. EXECUTE TRACK A (BGE-M3) ---
print(f"\nüß† TRACK A: Precision Judging...")
df_a = pd.read_json(input_a, lines=True)
anc_col = next((c for c in ['anchor_text', 'anchor'] if c in df_a.columns), 'anchor')
a_col = next((c for c in ['text_a', 'a'] if c in df_a.columns), 'a')
b_col = next((c for c in ['text_b', 'b'] if c in df_a.columns), 'b')

pairs_a = df_a[[anc_col, a_col]].values.tolist()
pairs_b = df_a[[anc_col, b_col]].values.tolist()

def predict_bge(pairs):
    scores = []
    for i in tqdm(range(len(pairs)), desc="   - BGE Scoring"):
        batch = pairs[i:i+1]
        inputs = tokenizer_a(
            batch, padding=True, truncation=True, max_length=1024, return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            output = model_a(**inputs).logits.squeeze(-1)
            scores.extend(output.cpu().float().numpy())
    return scores

scores_a = predict_bge(pairs_a)
scores_b = predict_bge(pairs_b)
preds_a = [s_a > s_b for s_a, s_b in zip(scores_a, scores_b)]

# --- 6. EXECUTE TRACK B (STELLA) ---
print(f"\nüß† TRACK B: Narrative Instruction...")
df_b = pd.read_json(input_b, lines=True)
text_col = next((c for c in ['text', 'story', 'anchor', 'anchor_text'] if c in df_b.columns), None)

if text_col:
    # Stella specific prompting
    narrative_prompt = "Retrieve a story that shares the same abstract theme, narrative flow, and final outcome."
    embeddings = model_b.encode(
        df_b[text_col].tolist(),
        prompt=narrative_prompt,
        batch_size=1, 
        show_progress_bar=True,
        device=device
    )
    embeddings_list = embeddings.tolist()
else:
    embeddings_list = []

# --- 7. SAVE & ZIP ---
print("\nüì¶ Zipping Submission...")
os.makedirs('outputs', exist_ok=True)

with open('outputs/track_a.jsonl', 'w') as f:
    for val in preds_a:
        json.dump({"text_a_is_closer": bool(val)}, f)
        f.write('\n')

with open('outputs/track_b.jsonl', 'w') as f:
    for emb in embeddings_list:
        json.dump({"embedding": emb}, f)
        f.write('\n')

zip_name = 'submission_STELLA_FINAL.zip'
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('outputs/track_a.jsonl', arcname='track_a.jsonl')
    zipf.write('outputs/track_b.jsonl', arcname='track_b.jsonl')

print(f"\nüèÜ READY! Upload '{zip_name}' to CodaBench.")

üöÄ Using Apple M4 (MPS) - Final Fix Mode
üîç Scanning Data...
   Track A: test_track_a.jsonl
   Track B: test_track_b.jsonl

üõ†Ô∏è Hacking Model Configuration on Disk...
   - Disabled 'use_memory_efficient_attention'
   - Disabled 'unpad_inputs'
   ‚úÖ Config file patched successfully!

üìÇ Loading Specialists...
   ‚úÖ Expert 1: BGE-M3 (Local)


Some weights of the model checkpoint at ./stella_en_400M_v5 were not used when initializing NewModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


   ‚úÖ Expert 2: Stella v5 (Local & Patched)

üß† TRACK A: Precision Judging...


   - BGE Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [33:10<00:00,  4.98s/it]
   - BGE Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [40:20<00:00,  6.05s/it]



üß† TRACK B: Narrative Instruction...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 849/849 [13:52<00:00,  1.02it/s]



üì¶ Zipping Submission...

üèÜ READY! Upload 'submission_STELLA_FINAL.zip' to CodaBench.
