In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# DeBERTa Cross-Encoder Pairwise (Margin-Ranking) Notebook
# --------------------------------------------------------
# Purpose: Train a DeBERTa-based cross-encoder that, given (rule + target_comment) and an example
# (positive OR negative), produces a scalar score. We train with a margin ranking loss so that
# score(target, positive) > score(target, negative) + margin.
#
# The notebook reads train.csv, converts each row into multiple (text_a, pos_example, neg_example)
# triples, trains the model, and saves the trained model/tokenizer into `saved_model/` which can be
# re-used in a Kaggle notebook via AutoModelForSequenceClassification.from_pretrained('saved_model').
#
# Requirements:
#   pip install transformers datasets accelerate sentencepiece
#
# Notes:
# - This is a single-file runnable script / notebook cell collection. You can paste into a Kaggle
#   notebook cell or run locally.
# - Adjust `MODEL_NAME`, `MAX_LEN`, `BATCH_SIZE`, and `NUM_EPOCHS` for your environment.

import os
import random
import math
from dataclasses import dataclass
from typing import List, Dict

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# -----------------------
# Config
# -----------------------
# Checkpoint name passed to both AutoTokenizer and AutoModelForSequenceClassification below.
MODEL_NAME = "microsoft/deberta-v3-base"  # good balance; change to smaller/larger as needed
# Passed as `max_length` to the tokenizer (with truncation) for every (text_a, example) pair.
MAX_LEN = 256
# Number of triplets per DataLoader batch; each batch triggers two forward passes (pos and neg).
BATCH_SIZE = 16
# Number of full passes over the triplet DataLoader.
NUM_EPOCHS = 3
# AdamW learning rate and weight decay.
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
# Warmup steps handed to the linear schedule (num_warmup_steps).
WARMUP_STEPS = 100
# Margin of the MarginRankingLoss: training pushes score(pos) above score(neg) by at least this much.
MARGIN = 0.5
# Directory that receives the trained model and tokenizer via save_pretrained.
SAVE_DIR = "saved_model"
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed used by set_seed() and for the optional triplet subsampling.
SEED = 42

# -----------------------
# Utilities
# -----------------------
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed applied to every generator.

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# -----------------------
# Read CSVs
# -----------------------
# Expecting files 'train.csv' and 'test.csv' in working dir. Adjust paths if different.
train_path = "train.csv"
test_path = "test.csv"

# Training data is mandatory: stop immediately with an actionable message if it is absent.
train_missing = not os.path.exists(train_path)
if train_missing:
    raise FileNotFoundError(f"train.csv not found at {train_path}. Please upload train.csv to working dir")

train_df = pd.read_csv(train_path)
print("Train rows:", len(train_df))

# Test data is optional; `test_df` stays None when the file is absent so that
# the inference section further down can be skipped.
if not os.path.exists(test_path):
    test_df = None
    print("test.csv not found: will only train and save model")
else:
    test_df = pd.read_csv(test_path)
    print("Test rows:", len(test_df))

# Every column consumed by the triplet builder must be present; missing values
# become empty strings so later string handling never sees NaN.
for c in ("positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2", "rule", "body"):
    if c in train_df.columns:
        train_df[c] = train_df[c].fillna("")
    else:
        raise ValueError(f"Expected column '{c}' in train.csv")

# -----------------------
# Build pairwise training triples
# -----------------------
# For each original row, we will create multiple samples each containing:
#   text_a = f"Rule: {rule} || Target: {body}"
#   pos_example (one of positives)
#   neg_example (one of negatives)
# This yields multiple positive/negative pairings per original row.

def build_triplets(df: pd.DataFrame) -> pd.DataFrame:
    """Expand each input row into (text_a, pos, neg) training triplets.

    For every row, ``text_a`` is ``"Rule: {rule} || Target: {body}"`` and one
    triplet is emitted for each combination of a usable positive example and a
    usable negative example (up to 2 x 2 = 4 per row). Rows lacking a usable
    example on either side contribute nothing.

    Args:
        df: Frame with columns ``rule``, ``body``, ``positive_example_1``,
            ``positive_example_2``, ``negative_example_1`` and
            ``negative_example_2``.

    Returns:
        A DataFrame with exactly the columns ``text_a``, ``pos`` and ``neg``.
        The columns are present even when no triplet could be built, so
        downstream code can index them unconditionally.
    """
    def _usable(values):
        # Missing values (NaN/None) must be rejected explicitly: str(nan) is the
        # non-empty string "nan" and would otherwise slip through the strip() test.
        return [v for v in values if not pd.isna(v) and str(v).strip()]

    rows = []
    for _, r in df.iterrows():
        text_a = f"Rule: {r['rule']} || Target: {r['body']}"
        pos_list = _usable([r['positive_example_1'], r['positive_example_2']])
        neg_list = _usable([r['negative_example_1'], r['negative_example_2']])
        # An empty side makes the cross product empty, so such rows are skipped.
        rows.extend(
            {"text_a": text_a, "pos": p, "neg": n}
            for p in pos_list
            for n in neg_list
        )
    return pd.DataFrame(rows, columns=["text_a", "pos", "neg"])

triplets_df = build_triplets(train_df)
print("Triplets rows:", len(triplets_df))

# Optional cap on the number of triplets to keep training time manageable.
# Leave as None to train on everything.
MAX_TRIPLETS = None  # e.g. 200_000
if MAX_TRIPLETS and MAX_TRIPLETS < len(triplets_df):
    triplets_df = triplets_df.sample(MAX_TRIPLETS, random_state=SEED)
    triplets_df = triplets_df.reset_index(drop=True)

# -----------------------
# Dataset & DataLoader
# -----------------------
@dataclass
class Collator:
    """Batch collate function that tokenizes triplets into two sets of pair encodings.

    Each batch item is a dict with keys ``text_a``, ``pos`` and ``neg``. The
    anchor text is paired once with the positive example and once with the
    negative example, and each pairing is tokenized as a (first, second)
    sequence pair, padded within the batch and truncated to ``max_len``.
    """

    tokenizer: AutoTokenizer
    max_len: int

    def _encode_pairs(self, first_texts, second_texts):
        # Tokenize aligned lists of sequence pairs into padded PyTorch tensors.
        return self.tokenizer(
            first_texts,
            second_texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

    def __call__(self, batch: List[Dict]):
        """Return ``{pos,neg}_input_ids`` and ``{pos,neg}_attention_mask`` tensors for the batch."""
        anchors = [item['text_a'] for item in batch]
        collated = {}
        for side in ("pos", "neg"):
            encoding = self._encode_pairs(anchors, [item[side] for item in batch])
            collated[f"{side}_input_ids"] = encoding['input_ids']
            collated[f"{side}_attention_mask"] = encoding['attention_mask']
        return collated

class TripletDataset(Dataset):
    """Dataset view over a triplet DataFrame.

    Each item is a plain dict with the keys ``text_a``, ``pos`` and ``neg``
    taken from the corresponding row of the frame.
    """

    _FIELDS = ("text_a", "pos", "neg")

    def __init__(self, df: pd.DataFrame):
        # Renumber rows 0..n-1 so positional access is independent of the caller's index.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        return {field: record[field] for field in self._FIELDS}

# -----------------------
# Model & tokenizer
# -----------------------
# Load the tokenizer and the sequence-classification model from the same checkpoint.
print("Loading tokenizer and model...", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# num_labels=1 requests a single output logit, which is used as the ranking score.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)  # scalar score
# Move the model parameters to the selected device before the optimizer is built.
model.to(DEVICE)

# -----------------------
# Dataloaders
# -----------------------
# Wrap the triplet frame and tokenize lazily, batch by batch, inside the collator.
dataset = TripletDataset(triplets_df)
collator = Collator(tokenizer=tokenizer, max_len=MAX_LEN)

# shuffle=True reorders triplets every epoch; drop_last=True discards the final
# incomplete batch, so len(dataloader) counts full batches only.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator, drop_last=True)

# -----------------------
# Training loop (MarginRankingLoss)
# -----------------------
# A single AdamW parameter group covering all model parameters.
optim = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# One scheduler step is taken per batch, so the schedule length is batches * epochs.
total_steps = len(dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

# Pairwise ranking objective: penalizes pairs whose positive score does not
# exceed the negative score by at least MARGIN.
loss_fn = torch.nn.MarginRankingLoss(margin=MARGIN)

print(f"Starting training on device={DEVICE}, steps={total_steps}")
# NOTE: the model stays in training mode after this call until something calls model.eval().
model.train()

for epoch in range(NUM_EPOCHS):
    # Loss accumulated since the last progress print (reset every 50 steps).
    running_loss = 0.0
    for step, batch in enumerate(dataloader):
        # Move the four collated tensors onto the training device.
        pos_input_ids, pos_attention_mask, neg_input_ids, neg_attention_mask = (
            batch[key].to(DEVICE)
            for key in ('pos_input_ids', 'pos_attention_mask', 'neg_input_ids', 'neg_attention_mask')
        )

        # Score the positive pairing first, then the negative one; each set of
        # logits is flattened to one score per batch item.
        logits_pos = model(input_ids=pos_input_ids, attention_mask=pos_attention_mask).logits.view(-1)
        logits_neg = model(input_ids=neg_input_ids, attention_mask=neg_attention_mask).logits.view(-1)

        # A target of +1 tells MarginRankingLoss that the first input (pos)
        # should be ranked above the second (neg).
        target = torch.ones_like(logits_pos)
        loss = loss_fn(logits_pos, logits_neg, target)

        # Standard update: clear old gradients, backprop, step optimizer and LR schedule.
        optim.zero_grad()
        loss.backward()
        optim.step()
        scheduler.step()

        running_loss += loss.item()
        if (step + 1) % 50 != 0:
            continue

        # Report the mean loss over the last 50 steps and start a new window.
        avg_loss = running_loss / 50
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} step {step+1}/{len(dataloader)} avg_loss={avg_loss:.4f}")
        running_loss = 0.0

    # end epoch
    print(f"Epoch {epoch+1} completed")

# -----------------------
# Save model & tokenizer
# -----------------------
# Persist the model and tokenizer side by side in SAVE_DIR so both can later be
# reloaded with from_pretrained(SAVE_DIR).
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"Saving model to {SAVE_DIR}")
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# -----------------------
# Inference utility
# -----------------------
import math
from tqdm.auto import tqdm

def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a scalar.

    Args:
        x: Real-valued input (e.g. a model logit).

    Returns:
        A float in the closed interval [0, 1].

    The computation is split by sign so that ``math.exp`` is only ever called
    with a non-positive argument. The naive form raises ``OverflowError`` for
    large negative ``x`` because ``math.exp(-x)`` exceeds the float range.
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    # For negative x use the algebraically equivalent e**x / (1 + e**x);
    # e**x underflows harmlessly to 0.0 instead of overflowing.
    exp_x = math.exp(x)
    return exp_x / (1.0 + exp_x)

# Score single pair (rule+target, example) -> scalar probability 0..1
@torch.no_grad()
def score_pair(model, tokenizer, rule, target, example, device=DEVICE, max_len=MAX_LEN):
    """Score one (rule + target, example) pair and squash the logit to (0, 1).

    Args:
        model: Sequence-classification model producing a single logit.
        tokenizer: Tokenizer matching ``model``.
        rule: Rule text placed in the first segment.
        target: Target comment text placed in the first segment.
        example: Example comment used as the second segment.
        device: Device the encoded inputs are moved to; the model is expected
            to live on the same device.
        max_len: Maximum tokenized length; longer pairs are truncated.

    Returns:
        ``sigmoid(logit)`` as a Python float.
    """
    text_a = f"Rule: {rule} || Target: {target}"
    # The training collator feeds the raw example text as the second segment,
    # so no "Example: " prefix is added here; a prefix the model never saw
    # during training would shift the input distribution at inference time.
    text_b = str(example)
    enc = tokenizer(text_a, text_b, truncation=True, padding=True, max_length=max_len, return_tensors="pt").to(device)

    # Score in eval mode so dropout does not randomize the result, then restore
    # whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        out = model(**enc)
    finally:
        model.train(was_training)

    logit = out.logits.squeeze().cpu().item()
    return sigmoid(logit)

# If test.csv available, run inference
if test_df is not None:
    # The model was put in training mode before the training loop and never
    # switched back; disable dropout so predictions are deterministic.
    model.eval()

    # Mirror the train-side cleaning: create any missing column as empty text and
    # replace NaN with "" in every text column (including rule/body) so that no
    # literal "nan" ends up in the prompt.
    # If test has no example columns, every row falls back to the default below;
    # you'll need to provide examples for new rules during inference.
    for c in ["positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2", "rule", "body"]:
        if c not in test_df.columns:
            test_df[c] = ""
        test_df[c] = test_df[c].fillna("")

    preds = []
    print("Running inference on test set (this may be slow because each test row uses multiple forward passes)...")
    for _, r in tqdm(test_df.iterrows(), total=len(test_df)):
        rule = r['rule']
        body = r['body']
        # Keep only examples that contain non-whitespace text.
        pos_list = [p for p in (r['positive_example_1'], r['positive_example_2']) if str(p).strip()]
        neg_list = [n for n in (r['negative_example_1'], r['negative_example_2']) if str(n).strip()]
        if not pos_list or not neg_list:
            # Without examples on both sides no comparison is possible; fall back to 0.0.
            preds.append(0.0)
            continue
        # Positive examples are scored before negative ones, one forward pass each.
        pos_scores = [score_pair(model, tokenizer, rule, body, p) for p in pos_list]
        neg_scores = [score_pair(model, tokenizer, rule, body, n) for n in neg_list]
        pos_avg = sum(pos_scores) / len(pos_scores)
        neg_avg = sum(neg_scores) / len(neg_scores)
        # pos_avg and neg_avg are already probabilities, so their difference lies
        # in [-1, 1]; it is scaled by 8 before the final sigmoid to make
        # differences sharper.
        final_prob = 1.0 / (1.0 + math.exp(-(pos_avg - neg_avg) * 8.0))
        preds.append(final_prob)

    test_df['pred_rule_violation_prob'] = preds
    out_path = 'test_with_preds.csv'
    test_df.to_csv(out_path, index=False)
    print(f"Saved predictions -> {out_path}")

# -----------------------
# How to reuse saved model in Kaggle
# -----------------------
# Print a copy-pasteable snippet showing how to reload the saved artifacts.
# The directory name in the snippet is hard-coded as 'saved_model'.
print('\nTo reuse the saved model in a Kaggle notebook:')
print('from transformers import AutoModelForSequenceClassification, AutoTokenizer')
print("model = AutoModelForSequenceClassification.from_pretrained('saved_model')")
print("tokenizer = AutoTokenizer.from_pretrained('saved_model')")
print("# then use score_pair or your own inference loop")

print('Done')
