In [None]:
%pip install transformers accelerate bitsandbytes peft trl datasets huggingface_hub jupyterlab tqdm pandas scikit-learn torch
%pip install ipywidgets

In [None]:
import os
from huggingface_hub import login

# Authenticate to Hugging Face (for pulling DeepSeek model weights)
login(
    token=os.getenv("HUGGINGFACE_TOKEN"),
    add_to_git_credential=False
)

print("HF login successful!")

In [None]:
import os, re, torch, pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from tqdm.auto import tqdm

# Dataset definitions
DATASETS = [
  {"name": "mozilla", "train": "data/mozilla_full_version/train_all.jsonl", "test": "data/mozilla_full_version/test_all.csv"},
  {"name": "eclips", "train": "data/eclips_full_version/train_all.jsonl", "test": "data/eclips_full_version/test_all.csv"}
]


  backends.update(_get_backends("networkx.backends"))


In [2]:
def make_prompt(issue_text: str) -> str:
    return (
        "Below is a GitHub issue. Suggest the single best developer "
        "(GitHub handle or email) to resolve it. Only return the identifier.\n\n"
        "### Issue:\n" + issue_text + "\n\n### Assignee:"
    )

def prepare_train(example):
    text = example["title"] + "\n\n" + example["body"]
    return {"text": text, "assignee": example["assignee"]}

def format_for_sft(example):
    prompt = make_prompt(example["text"])
    return {"text": prompt + " " + example["assignee"]}

def predict_assignee(model, tokenizer, issue_text: str) -> str:
    prompt = make_prompt(issue_text)
    inputs = tokenizer(prompt,
                       return_tensors="pt",
                       truncation=True,
                       max_length=MAX_SEQ_LENGTH-32).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=16,
            temperature=0.1,
            top_k=1,
            top_p=0.0,
            num_return_sequences=1,
            do_sample=False,
        )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    return decoded.split("### Assignee:")[-1].strip().split()[0]

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    set_seed,
)
from typing import Dict, List

DATASET     = {
    "name":       "eclips",
    "train_file": "File_Path",
    "test_file":  "File_Path",
}
MODEL_NAME   = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
MAX_SEQ_LEN  = 2048
LR           = 1e-5
EPOCHS       = 3
BATCH_PER_GPU= 1
GRAD_ACCUM   = 16
SEED         = 3407

set_seed(SEED)
torch.backends.cuda.matmul.allow_tf32 = True

def make_prompt(issue_text: str) -> str:
    return (
        "Below is a GitHub issue. Suggest the single best developer "
        "(GitHub handle or email) to resolve it. Only return the identifier.\n\n"
        "### Issue:\n" + issue_text + "\n\n### Assignee:"
    )

#Load & map dataset to (prompt, response) 
raw = load_dataset("json", data_files={"train": DATASET["train_file"]}, split="train")
print(f"• Train records: {len(raw)}")
print("• Sample raw entry:", raw[0], "\n")

def extract_pair(ex):
    msgs       = ex["messages"]
    user_msg   = next(m for m in msgs if m["role"] == "user")["content"]
    assist_msg = next(m for m in msgs if m["role"] == "assistant")["content"]
    return {"prompt": user_msg.strip(), "response": assist_msg.strip()}

mapped = raw.map(extract_pair, remove_columns=raw.column_names)
print(f"• After mapping, columns = {mapped.column_names}")
print("• Mapped sample:", mapped[0], "\n")

hf_train = Dataset.from_dict({
    "prompt":   [row["prompt"]   for row in mapped],
    "response": [row["response"] for row in mapped],
})
print(f"• Ready-to-train SFT examples: {len(hf_train)}\n")

#Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    device_map="auto",
)
model.gradient_checkpointing_enable()
model.config.use_cache = False

def tokenize_and_mask(batch: Dict[str, List[str]]):
    prompts  = [make_prompt(p) for p in batch["prompt"]]
    answers  = batch["response"]
    full_texts = [p + " " + a for p, a in zip(prompts, answers)]

    enc = tokenizer(
        full_texts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding=False,
        add_special_tokens=True,
    )
    enc_prompt = tokenizer(
        prompts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding=False,
        add_special_tokens=True,
    )

    labels = []
    for ids, p_ids in zip(enc["input_ids"], enc_prompt["input_ids"]):
        plen = min(len(p_ids), len(ids))
        lab = ids.copy()
        lab[:plen] = [-100] * plen   
        labels.append(lab)

    enc["labels"] = labels
    return enc

tok_train = hf_train.map(tokenize_and_mask, batched=True, remove_columns=hf_train.column_names)

def collate_fn(features):
    batch_ids  = [f["input_ids"] for f in features]
    batch_mask = [f["attention_mask"] for f in features]
    batch = tokenizer.pad(
        {"input_ids": batch_ids, "attention_mask": batch_mask},
        padding=True,
        return_tensors="pt",
    )
    max_len = batch["input_ids"].size(1)
    labels = []
    for f in features:
        lab = f["labels"]
        labels.append(torch.tensor(lab + [-100] * (max_len - len(lab)), dtype=torch.long))
    batch["labels"] = torch.stack(labels)
    return batch

training_args = TrainingArguments(
    output_dir=f"outputs/{DATASET['name']}_fullft",
    per_device_train_batch_size=BATCH_PER_GPU,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,        
    warmup_ratio=0.05,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    bf16=(dtype==torch.bfloat16),
    fp16=(dtype==torch.float16),
    optim="adamw_torch",           
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=3,
    report_to="none",
)

#  Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train,
    data_collator=collate_fn,
)
trainer.train()

final_dir = f"outputs/{DATASET['name']}_fullft/final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
print(f"✅ Saved fine-tuned model and tokenizer to: {final_dir}")


• Train records: 12884
• Sample raw entry: {'messages': [{'role': 'system', 'content': 'You are an expert GitHub bug triager. For each incoming issue, you will read the title and description, and choose exactly one assignee (email or name). Return only the assignee, with no extra words or punctuation.'}, {'role': 'user', 'content': 'Issue to triage:\nTitle: [1.5][compiler] Imports not resolved correctly with generics and inner interfaces\nBody: \nAssign to:'}, {'role': 'assistant', 'content': 'srikanth_sankaran@in.ibm.com'}]} 

• After mapping, columns = ['prompt', 'response']
• Mapped sample: {'prompt': 'Issue to triage:\nTitle: [1.5][compiler] Imports not resolved correctly with generics and inner interfaces\nBody: \nAssign to:', 'response': 'srikanth_sankaran@in.ibm.com'} 

• Ready-to-train SFT examples: 12884



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/12884 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,2.8985
100,0.7212
150,0.5243
200,0.4558
250,0.4224
300,0.3842
350,0.3434
400,0.3593
450,0.304
500,0.3195


✅ Saved full fine-tuned model and tokenizer to: outputs/eclips_fullft/final


In [None]:
# ─── FT MODEL: Single Top-10 prompting → Hit@K from one ranking (ECLIPS) ────
import os, re, json, glob
import torch
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from collections import Counter
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

DATASET = {
    "name":       "eclips",
    "train_file": "File_Path",
    "test_file":  "File_Path",
}
FINETUNED_DIR = "FT File_Path"

TOP_K       = 10
MAX_SEQ_LEN = 2048
USE_PRIOR_BACKFILL = True

test_df = pd.read_csv(DATASET["test_file"])
candidates = list(dict.fromkeys(test_df["assignee"].dropna().tolist()))
cand_lower2orig = {c.lower(): c for c in candidates}
cand_set_lower  = set(cand_lower2orig.keys())
print(f"Candidate space: {len(candidates)} emails")

prior_order = []
if USE_PRIOR_BACKFILL:
    cnt = Counter()
    try:
        with open(DATASET["train_file"], "r", encoding="utf-8") as f:
            for line in f:
                obj = json.loads(line)
                y = next(m for m in obj["messages"] if m["role"] == "assistant")["content"].strip()
                cnt[y] += 1
        freq_items = [(y, n) for y, n in cnt.items() if y in cand_lower2orig.values()]
        freq_items.sort(key=lambda t: t[1], reverse=True)
        prior_order = [y for y, _ in freq_items] + [c for c in candidates if c not in cnt]
        print(f"Built prior list with {len(prior_order)} entries.")
    except Exception as e:
        print(f"⚠️ Could not build prior ({e}); using alphabetical backfill.")
        prior_order = sorted(candidates)

tokenizer = AutoTokenizer.from_pretrained(FINETUNED_DIR, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    FINETUNED_DIR,
    torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else None),
    device_map="auto",
)
model.eval()

print(f" Using checkpoint dir: {FINETUNED_DIR}")
assert os.path.isdir(FINETUNED_DIR), f"Not a local directory: {FINETUNED_DIR}"

shards = sorted(glob.glob(os.path.join(FINETUNED_DIR, "model*.safetensors")))
total_gb = sum(os.path.getsize(p) for p in shards) / (1024**3)
print(f"  • Found {len(shards)} weight shard(s), total size ≈ {total_gb:.2f} GB")
print(f"  • model.config._name_or_path = {getattr(model.config, '_name_or_path', 'n/a')}")
print(f"  • dtype={getattr(model, 'dtype', 'n/a')}, device={model.device}")

if getattr(model.config, "_name_or_path", "") == "deepseek-ai/DeepSeek-R1-Distill-Llama-8B":
    print("⚠️ Looks like the BASE repo name; double-check FINETUNED_DIR.")
else:
    print("✅ Fine-tuned weights appear loaded from your local checkpoint.")

def make_prompt_top10(issue_text: str) -> str:
    k = 10
    return (
        f"Below is a GitHub issue. List the TOP {k} developers (emails only) to triage it, "
        f"ranked from best to worst. Use only emails known in this project; do not invent. "
        f"Return EXACTLY {k} comma-separated items, unique, with no extra text.\n\n"
        "Issue:\n" + issue_text + f"\n\nTop {k} assignees:"
    )

email_re = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.[A-Za-z]{2,}')

def parse_emails(text: str, k: int = 10) -> list[str]:
    found = email_re.findall(text)
    picked, seen = [], set()
    for em in found:
        key = em.lower()
        if key in cand_set_lower and key not in seen:
            picked.append(cand_lower2orig[key])
            seen.add(key)
            if len(picked) == k:
                break
    return picked

def backfill_to_n(current: list[str], n: int = 10) -> list[str]:
    if len(current) >= n:
        return current[:n]
    pool = prior_order if prior_order else sorted(candidates)
    for c in pool:
        if c not in current:
            current.append(c)
        if len(current) == n:
            break
    if len(current) < n:
        for c in candidates:
            if c not in current:
                current.append(c)
            if len(current) == n:
                break
    return current[:n]

gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.do_sample = False
gen_cfg.num_beams = 1
gen_cfg.eos_token_id = tokenizer.eos_token_id
gen_cfg.pad_token_id = tokenizer.pad_token_id
model.generation_config = gen_cfg

all_top10 = []
y_true = test_df["assignee"].tolist()

for row in tqdm(test_df.itertuples(index=False), total=len(test_df), desc="FT: prompting top-10 only"):
    issue = f"Title: {row.title}\n\n{row.body}"
    prompt = make_prompt_top10(issue)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=MAX_SEQ_LEN-64, padding=True).to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=200, do_sample=False, num_beams=1)[0]
    text = tokenizer.decode(out, skip_special_tokens=True)
    picked = parse_emails(text, k=10)
    top10_list = backfill_to_n(picked, n=10)
    all_top10.append(top10_list)

rows, N = [], len(test_df)
for k in range(1, TOP_K+1):
    hits = sum(y_true[i] in all_top10[i][:k] for i in range(N))
    rows.append({"top_k": k, "n_hits": hits, "hit_ratio": hits / N})
df_hits = pd.DataFrame(rows)

print(f"\nTotal test bugs: {N}")
display(df_hits.style.format({"hit_ratio": "{:.3f}"}).set_caption("🎯 FT — Hit@K from a single Top-10 prompt per issue"))

plt.figure(figsize=(6,4))
plt.plot(df_hits["top_k"], df_hits["hit_ratio"], marker="o", linewidth=2)
plt.title("FT — Hit@k from Single Top-10 Ranking")
plt.xlabel("k"); plt.ylabel("Hit Ratio"); plt.xticks(range(1, TOP_K+1)); plt.ylim(0,1)
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()


Candidate space: 103 emails
Built prior list with 103 entries.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

🔧 Using checkpoint dir: outputs/eclips_fullft/final
  • Found 4 weight shard(s), total size ≈ 14.96 GB
  • model.config._name_or_path = outputs/eclips_fullft/final
  • dtype=torch.bfloat16, device=cuda:0
✅ Fine-tuned weights appear loaded from your local checkpoint.


FT: prompting k=1..10:   0%|          | 0/1612 [00:00<?, ?it/s]

In [None]:
# ─── BASE MODEL: Single Top-10 prompting → Hit@K from one ranking (ECLIPS) ───
import os, re, json
import torch
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from collections import Counter
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig

DATASET = {
    "name":       "eclips",
    "train_file": "File_Path",
    "test_file":  "File_Path",
}
BASE_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

TOP_K       = 10         
MAX_SEQ_LEN = 2048
USE_PRIOR_BACKFILL = True

# ── Candidates (from test split) ───────────────────────────────────────────────
test_df = pd.read_csv(DATASET["test_file"])
candidates = list(dict.fromkeys(test_df["assignee"].dropna().tolist()))
cand_lower2orig = {c.lower(): c for c in candidates}
cand_set_lower  = set(cand_lower2orig.keys())
print(f"Candidate space: {len(candidates)} emails")

# ── Prior-based backfill ordering (optional) ───────────────────────────────────
prior_order = []
if USE_PRIOR_BACKFILL:
    cnt = Counter()
    try:
        with open(DATASET["train_file"], "r", encoding="utf-8") as f:
            for line in f:
                obj = json.loads(line)
                y = next(m for m in obj["messages"] if m["role"] == "assistant")["content"].strip()
                cnt[y] += 1
        freq_items = [(y, n) for y, n in cnt.items() if y in cand_lower2orig.values()]
        freq_items.sort(key=lambda t: t[1], reverse=True)
        prior_order = [y for y, _ in freq_items] + [c for c in candidates if c not in cnt]
        print(f"Built prior list with {len(prior_order)} entries.")
    except Exception as e:
        print(f"⚠️ Could not build prior ({e}); using alphabetical backfill.")
        prior_order = sorted(candidates)

# ── Model & tokenizer ─────────────────────────────────────────────────────────
try:
    bnb_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, quantization_config=bnb_cfg, device_map="auto")
    print("✅ Base in 4-bit.")
except Exception as e:
    print(f"⚠️ 4-bit failed ({e}); using bf16/fp16.")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else None),
        device_map="auto",
    )
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.eval()

# ── Prompt: request exactly Top-10 once ───────────────────────────────────────
def make_prompt_top10(issue_text: str) -> str:
    k = 10
    return (
        f"Below is a GitHub issue. List the TOP {k} developers (emails only) to triage it, "
        f"ranked from best to worst. Use only emails known in this project; do not invent. "
        f"Return EXACTLY {k} comma-separated items, unique, with no extra text.\n\n"
        "Issue:\n" + issue_text + f"\n\nTop {k} assignees:"
    )

email_re = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.[A-Za-z]{2,}')

def parse_emails(text: str, k: int = 10) -> list[str]:
    found = email_re.findall(text)
    picked, seen = [], set()
    for em in found:
        key = em.lower()
        if key in cand_set_lower and key not in seen:
            picked.append(cand_lower2orig[key])
            seen.add(key)
            if len(picked) == k:
                break
    return picked

def backfill_to_n(current: list[str], n: int = 10) -> list[str]:
    if len(current) >= n:
        return current[:n]
    pool = prior_order if prior_order else sorted(candidates)
    for c in pool:
        if c not in current:
            current.append(c)
        if len(current) == n:
            break
    if len(current) < n:
        for c in candidates:
            if c not in current:
                current.append(c)
            if len(current) == n:
                break
    return current[:n]

# ── Deterministic generation config ───────────────────────────────────────────
gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.do_sample = False
gen_cfg.num_beams = 1
gen_cfg.eos_token_id = tokenizer.eos_token_id
gen_cfg.pad_token_id = tokenizer.pad_token_id
model.generation_config = gen_cfg

# ── Inference: single Top-10 per issue ────────────────────────────────────────
all_top10 = []  # one list of length 10 per test row
y_true = test_df["assignee"].tolist()

for row in tqdm(test_df.itertuples(index=False), total=len(test_df), desc="BASE: prompting top-10 only"):
    issue = f"Title: {row.title}\n\n{row.body}"
    prompt = make_prompt_top10(issue)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=MAX_SEQ_LEN-64, padding=True).to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=200, do_sample=False, num_beams=1)[0]
    text = tokenizer.decode(out, skip_special_tokens=True)

    picked = parse_emails(text, k=10)
    top10_list = backfill_to_n(picked, n=10)
    all_top10.append(top10_list)

# ── Evaluation: Hit@K (k=1..10) computed from the single Top-10 ranking ──────
rows, N = [], len(test_df)
for k in range(1, TOP_K+1):
    hits = sum(y_true[i] in all_top10[i][:k] for i in range(N))
    rows.append({"top_k": k, "n_hits": hits, "hit_ratio": hits / N})
df_hits = pd.DataFrame(rows)

print(f"\nTotal test bugs: {N}")
display(df_hits.style.format({"hit_ratio": "{:.3f}"}).set_caption("BASE — Hit@K from a single Top-10 prompt per issue"))

# ── Plot ──────────────────────────────────────────────────────────────────────
plt.figure(figsize=(6,4))
plt.plot(df_hits["top_k"], df_hits["hit_ratio"], marker="o", linewidth=2)
plt.title("BASE — Hit@k from Single Top-10 Ranking")
plt.xlabel("k"); plt.ylabel("Hit Ratio"); plt.xticks(range(1, TOP_K+1)); plt.ylim(0,1)
plt.grid(True, linestyle="--", alpha=0.5)
plt.show()


  backends.update(_get_backends("networkx.backends"))


Candidate space: 103 emails
Built prior list with 103 entries.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Base in 4-bit.


BASE: prompting k=1..10:   0%|          | 0/1612 [00:00<?, ?it/s]