# 6. DeepSeek Benchmark

**Purpose:**  
Fine-tune DeepSeek-Coder-6.7B-Instruct with LoRA adapters as a benchmark against our LLaMA-3 pipeline.  
We train two adapter-only variants:
- **DeepSeek QP**: on `train_question_parsing.jsonl`  
- **DeepSeek CoT**: on `train_cot_parsing.jsonl`  

Then we run inference and evaluation exactly as in the previous notebooks.


## Imports

In [1]:
# Install Unsloth for efficient LLM fine-tuning
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
# Install core evaluation utilities
# `evaluate` is installed quietly; `json5` (the optional lenient parser used by
# the extraction helpers further down) is installed with regular pip output.
!pip install -q evaluate
!pip install json5

# Remove the preinstalled nltk and install the latest release in its place.
# NOTE(review): presumably required so the `punkt_tab` resource downloaded in the
# next cell is supported — confirm.
!pip uninstall -y nltk
!pip install -q --upgrade nltk

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting json5
  Downloading json5-0.12.0-py3-none-any.whl.metadata (36 kB)
Downloading json5-0.12.0-py3-none-any.whl (36 kB)
Installing collected packages: json5
Successfully installed json5-0.12.0
Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import nltk
# Download the two NLTK resources used by this notebook run: the `punkt_tab`
# tokenizer data and the `wordnet` corpus.
# NOTE(review): presumably consumed by the evaluation script — confirm.
nltk.download("punkt_tab")
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import unsloth
import os
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
import gc
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import json, re, ast, html

## Common Config

In [None]:
# Disable xFormers through the environment (for Colab compatibility).
os.environ["USE_XFORMERS"] = "false"

# Google Drive folder holding the datasets and every artifact produced below.
PROJECT_DIR = "/content/drive/MyDrive/llm-sr-project"

# Training files for the two adapter variants.
QP_TRAIN = os.path.join(PROJECT_DIR, "train_question_parsing.jsonl")
COT_TRAIN = os.path.join(PROJECT_DIR, "train_cot_parsing.jsonl")

# Sanity check: print the first two question-parsing records.
qp_preview = load_dataset("json", data_files={"train": QP_TRAIN})["train"].select(range(2))
for record in qp_preview:
    print("QP INPUT:", record["input"])
    print("QP OUTPUT:", record["output"], "\n---")

## Model Factory

In [None]:
def make_lora_model(
    model_name,
    max_length,
    *,
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=None,
    random_state=42,
):
    """Load a 4-bit base model through Unsloth and attach LoRA adapters to it.

    Args:
        model_name: Model identifier or path handed to
            ``FastLanguageModel.from_pretrained``.
        max_length: Maximum sequence length; forwarded as ``max_seq_length`` to
            both the loader and the PEFT wrapper.
        r: LoRA rank (default 64).
        lora_alpha: LoRA alpha (default 16).
        lora_dropout: Dropout applied inside the adapters (default 0.05).
        target_modules: Names of the modules that receive adapters. ``None``
            selects the attention and MLP projections listed below.
        random_state: Seed forwarded to ``FastLanguageModel.get_peft_model``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's ``padding_side`` is set
        to ``"right"``.
    """
    if target_modules is None:
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name, max_seq_length=max_length, dtype=torch.float16, load_in_4bit=True
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
        target_modules=list(target_modules),
        bias="none", random_state=random_state, max_seq_length=max_length
    )
    # The preprocessing functions mask the prompt as a prefix of the sequence,
    # which requires padding to be appended on the right.
    tokenizer.padding_side = "right"
    return model, tokenizer

## Train DeepSeek QP Model

In [None]:
# Settings for the question-parsing (QP) run.
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
MAX_LEN_QP = 1024
OUTPUT_QP = os.path.join(PROJECT_DIR, "deepseek_adapter_qp_only")

# Read the QP training split and shuffle it with a fixed seed.
qp_files = {"train": QP_TRAIN}
ds_qp = load_dataset("json", data_files=qp_files)["train"].shuffle(seed=42)

# LoRA-wrapped base model together with its tokenizer.
qp_model, qp_tok = make_lora_model(model_name, MAX_LEN_QP)

# preprocess fn
def preprocess_qp(batch):
    """Tokenize one QP record and build labels that supervise only the answer.

    Args:
        batch: A single record (the dataset is mapped with ``batched=False``)
            with string fields ``"input"`` (prompt) and ``"output"`` (target).

    Returns:
        The tokenizer encoding of ``input + output`` padded to ``MAX_LEN_QP``,
        with an added ``"labels"`` list. Labels are ``-100`` for the prompt
        prefix and for every padded position, and the token id elsewhere.
    """
    inp, out = batch["input"], batch["output"]
    full = qp_tok(inp + out, truncation=True, padding="max_length", max_length=MAX_LEN_QP)
    prompt_ids = qp_tok(inp, truncation=True, max_length=MAX_LEN_QP)["input_ids"]

    input_ids = full["input_ids"]
    attention = full.get("attention_mask") or [1] * len(input_ids)

    # Padded positions must not contribute to the loss: keep the token id only
    # where the attention mask is set, and use the ignore index (-100) elsewhere.
    labels = [tok if keep else -100 for tok, keep in zip(input_ids, attention)]

    # Ignore the prompt prefix as well (padding is on the right, so the prompt
    # occupies the first positions of the sequence).
    n_prompt = min(len(prompt_ids), len(labels))
    labels[:n_prompt] = [-100] * n_prompt

    full["labels"] = labels
    return full

# Tokenize each record individually and expose the columns as torch tensors.
qp_tok_ds = ds_qp.map(preprocess_qp, batched=False)
qp_tok_ds = qp_tok_ds.with_format("torch")

# Training configuration for the QP adapter (one argument per line).
args_qp = TrainingArguments(
    output_dir=OUTPUT_QP,
    num_train_epochs=12,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    fp16=True,
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
    eval_strategy="no",
)

trainer_qp = Trainer(
    model=qp_model,
    args=args_qp,
    train_dataset=qp_tok_ds,
    tokenizer=qp_tok,
)

trainer_qp.train()

In [None]:
import shutil

# Destination folder for the QP adapter (created when missing).
ADAPTER_DIR = os.path.join(PROJECT_DIR, "deepseek_adapter_qp_only")
os.makedirs(ADAPTER_DIR, exist_ok=True)

# Write the LoRA adapter weights first, then the tokenizer files.
for artifact in (qp_model, qp_tok):
    artifact.save_pretrained(ADAPTER_DIR)

# Pack the folder into a sibling "<folder>.zip".
# NOTE(review): the QP trainer's output_dir is this same folder and it saves a
# checkpoint every epoch, so those checkpoints are archived too — confirm that
# this is intended for an "adapter-only" bundle.
shutil.make_archive(base_name=ADAPTER_DIR, format="zip", root_dir=ADAPTER_DIR)

print(f"✅ Deepseek QP Adapter-only bundle created at {ADAPTER_DIR}.zip")

## Train DeepSeek CoT Model

In [None]:
# now CoT
# Re-state the base checkpoint so this cell does not depend on `model_name`
# leaking from the QP training cell (same value as used there).
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

# Read the CoT training split and shuffle it with a fixed seed.
ds_cot = load_dataset("json", data_files={"train": COT_TRAIN})["train"].shuffle(42)

MAX_LEN_COT = 2048
OUTPUT_COT  = os.path.join(PROJECT_DIR, "deepseek_adapter_cot_only")

# LoRA-wrapped base model together with its tokenizer.
cot_model, cot_tok = make_lora_model(model_name, MAX_LEN_COT)

def preprocess_cot(batch):
    """Tokenize one CoT record and build labels that supervise only the answer.

    Args:
        batch: A single record (the dataset is mapped with ``batched=False``)
            with string fields ``"input"`` (prompt) and ``"output"`` (target).

    Returns:
        The tokenizer encoding of ``input + output`` padded to ``MAX_LEN_COT``,
        with an added ``"labels"`` list. Labels are ``-100`` for the prompt
        prefix and for every padded position, and the token id elsewhere.
    """
    inp, out = batch["input"], batch["output"]
    full = cot_tok(inp + out, truncation=True, padding="max_length", max_length=MAX_LEN_COT)
    prompt_ids = cot_tok(inp, truncation=True, max_length=MAX_LEN_COT)["input_ids"]

    input_ids = full["input_ids"]
    attention = full.get("attention_mask") or [1] * len(input_ids)

    # Padded positions must not contribute to the loss: keep the token id only
    # where the attention mask is set, and use the ignore index (-100) elsewhere.
    labels = [tok if keep else -100 for tok, keep in zip(input_ids, attention)]

    # Ignore the prompt prefix as well (padding is on the right, so the prompt
    # occupies the first positions of the sequence).
    n_prompt = min(len(prompt_ids), len(labels))
    labels[:n_prompt] = [-100] * n_prompt

    full["labels"] = labels
    return full

# Tokenize each record individually and expose the columns as torch tensors.
cot_tok_ds = ds_cot.map(preprocess_cot, batched=False)
cot_tok_ds = cot_tok_ds.with_format("torch")

# Training configuration for the CoT adapter (one argument per line).
args_cot = TrainingArguments(
    output_dir=OUTPUT_COT,
    num_train_epochs=12,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    fp16=True,
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
    eval_strategy="no",
)

trainer_cot = Trainer(
    model=cot_model,
    args=args_cot,
    train_dataset=cot_tok_ds,
    tokenizer=cot_tok,
)

trainer_cot.train()

In [None]:
# Imported here as well: otherwise this cell only works when the QP save cell
# (the only other place that imports shutil) has been run in the same kernel.
import shutil

# Save only LoRA adapter and tokenizer
ADAPTER_DIR = os.path.join(PROJECT_DIR, "deepseek_adapter_cot_only")
os.makedirs(ADAPTER_DIR, exist_ok=True)

cot_model.save_pretrained(ADAPTER_DIR)
cot_tok.save_pretrained(ADAPTER_DIR)

# Zip the folder into a sibling "<folder>.zip"
shutil.make_archive(ADAPTER_DIR, 'zip', ADAPTER_DIR)

print(f"✅ Deepseek COT Adapter-only bundle created at {ADAPTER_DIR}.zip")

## Inference and Evaluation

In [None]:
# json5 is optional: USE_JSON5 records whether the lenient parser can be used.
USE_JSON5 = True
try:
    import json5
except ImportError:
    USE_JSON5 = False

### Prompt Templates

In [None]:
# Two-Shot Prompt Templates for ICL
# The constants in this cell are plain strings. QP_TEMPLATE and CP_TEMPLATE
# contain {placeholders} that are filled with str.format when prompts are built.

# Example 1 (parsing demo):
# First question-parsing demonstration: a question followed by the expected
# JSON list (setup sentence, each numbered condition without its number, and the
# premise from the question stem). Substituted into QP_TEMPLATE as {ex1}.
QP_EX1 = '''The question is:

There are 6 volunteers: A, B, C, D, E and F. They will be assigned to either Project Alpha or Project Beta. Each person works on exactly one project. This assignment must satisfy:
(1) If A works on Alpha, then B works on Beta.
(2) If C works on Alpha, then D and E work on Beta.
(3) F works on a different project than E.
(4) D must work on a different project than A.
(5) If F works on Alpha, then B works on Alpha.

If A works on Beta, which of the following must be true?
A. B works on Alpha
B. C works on Beta
C. D works on Alpha
D. F works on Beta

The parsing result is:

[
  "There are 6 volunteers: A, B, C, D, E and F. They will be assigned to either Project Alpha or Project Beta. Each person works on exactly one project.",
  "If A works on Alpha, then B works on Beta",
  "If C works on Alpha, then D and E work on Beta",
  "F works on a different project than E",
  "D must work on a different project than A",
  "If F works on Alpha, then B works on Alpha",
  "A works on Beta"
]
'''

# Example 2
# Second question-parsing demonstration (a seating puzzle) with its expected
# JSON list. Substituted into QP_TEMPLATE as {ex2}.
QP_EX2 = '''The question is:

Five friends—Anna, Ben, Carla, Dan, and Eric—are seated in a row of five chairs. They sit in the order from leftmost (chair 1) to rightmost (chair 5). We know:
(1) Anna is somewhere to the left of Dan.
(2) Ben is immediately to the right of Carla.
(3) Eric occupies either chair 1 or chair 5.
(4) Carla is not in chair 3.

Who is seated in chair 3?
A. Anna
B. Ben
C. Carla
D. Dan
E. Eric

The parsing result is:

[
  "Five friends: Anna, Ben, Carla, Dan, Eric, seated left-to-right in chairs 1–5",
  "Anna is to the left of Dan",
  "Ben is immediately to the right of Carla",
  "Eric is in chair 1 or chair 5",
  "Carla is not in chair 3"
]
'''

# Question-parsing prompt. Placeholders filled via str.format:
#   {ex1}, {ex2}  - the two demonstrations (QP_EX1 / QP_EX2)
#   {question}    - the normalized question text to parse
# The template itself contains no literal braces, so format() needs no escaping.
QP_TEMPLATE = '''Given a question, extract all relevant information from the question that would help to solve it.

This includes:
- General setup information (e.g., number of people, projects involved)
- Explicit facts given in the question
- All logical constraints or conditions

Output **only** a JSON list and nothing else. Follow the format shown in the examples exactly.

Example 1:
{ex1}

Example 2:
{ex2}

Now, the question is:

{question}

Your output MUST be only a JSON array.
'''

# CoT parsing with two-shot
# First CoT-parsing demonstration: question, conditions, a chain-of-thought and
# the expected JSON array with one object per step. Each object carries the keys
# "statement", "evidence" and "Verification" (the string "true" or "false").
# Substituted into CP_TEMPLATE as {ex1}.
CP_EX1 = '''The question is:

There are 6 volunteers: A, B, C, D, E and F. They will be assigned to either Project Alpha or Project Beta. Each person works on exactly one project.

Conditions:
(1) If A works on Alpha, then B works on Beta.
(2) If C works on Alpha, then D and E work on Beta.
(3) F works on a different project than E.
(4) D must work on a different project than A.
(5) If F works on Alpha, then B works on Alpha.

Question:
If A works on Beta, which of the following must be true?
A. B works on Alpha
B. C works on Beta
C. D works on Alpha
D. F works on Beta

CoT:
Since A works on Beta, Condition (1) is not triggered. Condition (2) is not triggered since C’s assignment is unknown. Condition (3) doesn’t give anything because E’s assignment is unspecified. Condition (4) says D must work on a different project than A, so D must work on Alpha. Condition (5) depends on F, which is unknown.

Parsing result:

[
  {
    "statement": "Condition (1) is not applicable",
    "evidence": "Condition (1): If A works on Alpha, then B works on Beta. | A is working on Beta",
    "Verification": "false"
  },
  {
    "statement": "Condition (2) is not applicable",
    "evidence": "Condition (2): If C works on Alpha, then D and E work on Beta. | C’s assignment is unknown",
    "Verification": "false"
  },
  {
    "statement": "Condition (3) does not provide any info",
    "evidence": "Condition (3): F works on a different project than E. | E’s assignment is unknown",
    "Verification": "false"
  },
  {
    "statement": "D must work on Alpha",
    "evidence": "Condition (4): D must work on a different project than A, and A is working on Beta",
    "Verification": "true"
  },
  {
    "statement": "Condition (5) is not applicable",
    "evidence": "Condition (5): If F works on Alpha, then B works on Alpha. | F’s assignment is unknown",
    "Verification": "false"
  }
]
'''

# Second CoT-parsing demonstration (seating puzzle): question, chain-of-thought
# and the expected JSON array of {"statement", "evidence", "Verification"}
# objects. Substituted into CP_TEMPLATE as {ex2}.
CP_EX2 = '''The question is:

Five friends—Anna, Ben, Carla, Dan, and Eric—are seated in a row of five chairs. They sit in the order from leftmost (chair 1) to rightmost (chair 5). We know:
(1) Anna is somewhere to the left of Dan.
(2) Ben is immediately to the right of Carla.
(3) Eric occupies either chair 1 or chair 5.
(4) Carla is not in chair 3.

Question:
Who is seated in chair 3?
A. Anna
B. Ben
C. Carla
D. Dan
E. Eric

CoT:
Since Eric is at one end, he cannot be in chair 3. Carla can’t be in chair 3, so neither Ben (who sits right of Carla) nor Carla occupy it. That leaves Anna or Dan. But Anna must be left of Dan, so only Dan can sit in the middle. Thus chair 3 is Dan.

Parsing result:

[
  {
    "statement": "Eric cannot be in chair 3",
    "evidence": "Eric occupies either chair 1 or chair 5",
    "Verification": "true"
  },
  {
    "statement": "Carla and Ben cannot be in chair 3",
    "evidence": "Carla is not in chair 3",
    "Verification": "true"
  },
  {
    "statement": "Only Anna or Dan remain for chair 3",
    "evidence": "Eric, Carla, Ben eliminated from chair 3",
    "Verification": "true"
  },
  {
    "statement": "Dan occupies chair 3",
    "evidence": "Anna must be to the left of Dan, so Dan must be in the middle",
    "Verification": "true"
  }
]
'''

# CoT-parsing prompt. Placeholders filled via str.format:
#   {ex1}, {ex2}   - the two demonstrations (CP_EX1 / CP_EX2)
#   {question}     - the normalized question text
#   {conditions}   - the parsed constraints (passed in as a JSON-encoded list)
#   {cot}          - the normalized chain-of-thought to parse
# The template itself contains no literal braces, so format() needs no escaping.
# NOTE(review): the demonstrations label their reasoning "CoT:" and end with
# "Parsing result:", whereas the query section below uses "Chain-of-Thought:" and
# a separate "Conditions:" list — confirm the mismatch is intentional.
CP_TEMPLATE = '''You are a reasoning assistant. Based on the question, conditions, and chain-of-thought (CoT), extract every inference or non-inference step as a JSON object.

For each CoT sentence that either:
  1. Refers to a condition (e.g. “Condition (2) …”)
  2. Starts with an inference cue (“Since”, “Therefore”, “This means”, “We can deduce”, etc.)

Produce **exactly** one JSON object per step with keys:
- "statement"
- "evidence"
- "Verification" ("true" or "false")

Output **only** a JSON array. Follow the examples exactly.

Example 1:
{ex1}

Example 2:
{ex2}

Now, given:

Question:
{question}

Conditions:
{conditions}

Chain-of-Thought:
{cot}

Your output MUST be only a JSON array.
'''

### Helper Functions

In [None]:

def clean_quotes(text):
    """Return ``text`` with curly double/single quotes replaced by straight ones."""
    straight = {
        '“': '"',
        '”': '"',
        "‘": "'",
        "’": "'",
    }
    return text.translate(str.maketrans(straight))

def normalize_question_text(text):
    """Clean up question/CoT text before it is placed into a prompt.

    Steps, in order: straighten curly quotes, apply the three regex
    substitutions below, decode HTML entities, and strip surrounding whitespace.
    """
    substitutions = (
        # "? " directly before a capital letter becomes ", ".
        (r'\?\s(?=[A-Z])', ', '),
        # Insert the missing space after a period squeezed between a letter and a capital.
        (r'(?<=[a-zA-Z])\.(?=[A-Z])', '. '),
        # Replace a backslash-n sequence with a space unless a letter A-D/a-d
        # precedes it or the lookahead for an option-like token matches.
        # NOTE(review): in this raw string `\\n` matches a literal backslash
        # followed by "n" (two characters), not a newline character, and `\\.`
        # matches a backslash plus any character — confirm this is intended.
        (r'(?<![A-Da-d])\\n(?!\s?[A-Da-d]\\.)', ' '),
    )

    normalized = clean_quotes(text)
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return html.unescape(normalized).strip()

def _find_array_end(text, start, respect_strings):
    """Return the index of the ']' closing the '[' at ``start``, or -1 if unbalanced.

    With ``respect_strings`` set, brackets inside single- or double-quoted
    strings (including escaped quotes) are not counted.
    """
    depth = 0
    quote = None      # delimiter of the string literal we are inside, if any
    escaped = False   # previous character was a backslash inside a string
    for i in range(start, len(text)):
        ch = text[i]
        if quote is not None:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == quote:
                quote = None
            continue
        if respect_strings and ch in ('"', "'"):
            quote = ch
        elif ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                return i
    return -1


def extract_first_json_array(raw: str):
    """Return the first bracketed array in ``raw`` that parses to a list.

    Model output is scanned for ``[...]`` spans. Each span is tried with
    ``json.loads``, then ``ast.literal_eval``, then ``json5.loads`` (only when
    ``USE_JSON5`` is true). Compared with a plain bracket counter this:

    * ignores brackets that occur inside string literals, so
      ``["a [b", "c"]`` is matched as a whole;
    * still tries the plain bracket-counted span as a fallback, so anything
      that parsed before keeps parsing;
    * moves on to the next ``[`` when a span cannot be parsed (for example a
      leading ``[Note]``) instead of giving up.

    Args:
        raw: Text produced by the model; ``None`` or empty text yields ``None``.

    Returns:
        The parsed list, or ``None`` when no parseable array is found.
    """
    if not raw:
        return None
    raw = raw.strip()

    parsers = [json.loads, ast.literal_eval]
    if USE_JSON5:
        parsers.append(json5.loads)

    start = raw.find('[')
    while start >= 0:
        # Candidate end positions: string-aware first, plain bracket counting second.
        ends = []
        for respect_strings in (True, False):
            end = _find_array_end(raw, start, respect_strings)
            if end >= 0 and end not in ends:
                ends.append(end)

        for end in ends:
            block = raw[start:end + 1]
            for parser in parsers:
                try:
                    parsed = parser(block)
                except Exception:
                    # Best effort: this parser rejected the text, try the next
                    # one. (KeyboardInterrupt/SystemExit are not swallowed.)
                    continue
                if isinstance(parsed, list):
                    return parsed

        # Nothing usable here: continue after the span we just examined.
        start = raw.find('[', (max(ends) if ends else start) + 1)
    return None

def safe_extract_qp(raw):
    """Parse question-parsing output; return an empty list when nothing usable is found."""
    parsed = extract_first_json_array(raw)
    if parsed:
        return parsed
    return []

def safe_extract_cp(raw):
    """Parse CoT-parsing output into a list of step dictionaries.

    Dict entries are kept as they are. Bare strings are wrapped into a step with
    the string as ``"statement"``, ``"logical deduction"`` as ``"evidence"`` and
    ``"true"`` as ``"Verification"``. Entries of any other type are dropped.
    """
    def _as_step(entry):
        if isinstance(entry, dict):
            return entry
        if isinstance(entry, str):
            return {
                "statement": entry.strip(),
                "evidence": "logical deduction",
                "Verification": "true"
            }
        return None

    parsed = extract_first_json_array(raw) or []
    candidates = (_as_step(entry) for entry in parsed)
    return [step for step in candidates if step is not None]

### Load DeepSeek models

In [None]:
# 4-bit NF4 quantization settings shared by both inference models.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


def _build_generation_pipeline(adapter_dir, max_length):
    """Load tokenizer and model from ``adapter_dir`` and wrap them in a pipeline.

    The tokenizer pads on the left, truncates on the right and is capped at
    ``max_length`` tokens. The pipeline decodes greedily (single beam, no
    sampling) and returns only the newly generated text.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(adapter_dir)
    tok.padding_side = "left"
    tok.truncation_side = "right"
    tok.model_max_length = max_length

    model = AutoModelForCausalLM.from_pretrained(
        adapter_dir,
        quantization_config=bnb_config,
        device_map="auto",
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tok,
        return_full_text=False,
        num_beams=1,
        do_sample=False,
    )
    return tok, model, pipe


# QP pipeline
qp_tok, qp_model, qp_pipe = _build_generation_pipeline(
    "/content/drive/MyDrive/llm-sr-project/deepseek_adapter_qp_only", 1024
)
print("✅ QP model loaded")

# CoT pipeline
cot_tok, cot_model, cot_pipe = _build_generation_pipeline(
    "/content/drive/MyDrive/llm-sr-project/deepseek_adapter_cot_only", 2048
)
print("✅ CoT model loaded")

### Load Test Data

In [None]:
input_path = "/content/drive/MyDrive/llm-sr-project/testingData-blank.json"
# Read explicitly as UTF-8 so that non-ASCII characters in the questions decode
# the same way on every platform instead of depending on the locale default.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# build QP prompts: one filled-in QP_TEMPLATE per test item
qp_prompts = [
    QP_TEMPLATE.format(
        ex1=QP_EX1, ex2=QP_EX2,
        question=normalize_question_text(item["question"])
    )
    for item in data
]
print(f"Built {len(qp_prompts)} QP prompts")

### Run Batch QP and CoT Parsing

In [None]:
# Generate the QP outputs in batches of 4, up to 512 new tokens per prompt.
qp_outs = qp_pipe(qp_prompts, max_new_tokens=512, batch_size=4)

# A pipeline result is either a list of candidates or a single dict;
# keep the generated text of the first candidate.
qp_raws = []
for out in qp_outs:
    first = out[0] if isinstance(out, list) else out
    qp_raws.append(first["generated_text"])

# extract constraints
qp_lists = [safe_extract_qp(raw_text) for raw_text in qp_raws]
print(f"→ Extracted QP constraints for {len(qp_lists)} examples")

# prepare CoT prompts: one filled-in CP_TEMPLATE per (test item, parsed constraints) pair
cp_prompts = [
    CP_TEMPLATE.format(
        ex1=CP_EX1,
        ex2=CP_EX2,
        question=normalize_question_text(item["question"]),
        cot=normalize_question_text(item["cot"]),
        conditions=json.dumps(constraints, ensure_ascii=False),
    )
    for item, constraints in zip(data, qp_lists)
]

# run in batches of 4, up to 1024 new tokens per prompt
cp_outs = cot_pipe(cp_prompts, max_new_tokens=1024, batch_size=4)

# A pipeline result is either a list of candidates or a single dict;
# keep the generated text of the first candidate.
cp_raws = []
for out in cp_outs:
    first = out[0] if isinstance(out, list) else out
    cp_raws.append(first["generated_text"])

# normalize steps
def _clean_field(value):
    """Coerce a model-produced field to a stripped string (``None`` becomes '')."""
    return "" if value is None else str(value).strip()


all_results = []
for raw in cp_raws:
    # dedupe & filter
    clean = []
    seen = set()
    for st in safe_extract_cp(raw):
        # Model output is untrusted: a field may be null, a number or a list,
        # so coerce before stripping instead of calling .strip() directly.
        s = _clean_field(st.get("statement"))
        e = _clean_field(st.get("evidence")) or "logical deduction"
        verification = st.get("Verification")
        # A missing or null verification falls back to "true".
        v = "true" if verification is None else str(verification).lower()
        # Keep statements of at least 5 characters, once per (statement, evidence) pair.
        if len(s) >= 5 and (s, e) not in seen:
            seen.add((s, e))
            clean.append({"statement": s, "evidence": e, "Verification": v})
    all_results.append(clean)

print(f"→ Parsed CoT steps for {len(all_results)} examples")

### Save Predictions

In [None]:
# combine and save: one record per test item with both parsing results attached
output_path = "/content/drive/MyDrive/llm-sr-project/testingDataresultsforVerifier.json"
results = []
for item, qp_res, cot_res in zip(data, qp_lists, all_results):
    results.append({
        "question":         item["question"],
        "question_parsing": qp_res,
        "answer":           item.get("answer"),
        "id":               item["id"],
        "cot":              item["cot"],
        "cot_parsing":      cot_res,
        "sel_idx":          item["id"],
    })

# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is fixed to UTF-8 rather than left to the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print("✅ Saved inference to", output_path)

### Evaluate

In [3]:
# Paths are spelled out in full so this cell does not need the config cell.
# EVAL_SCRIPT: evaluation script to run; PREDICTION_PATH: the predictions file
# written by the "Save Predictions" cell; REFERENCE_PATH: the reference file
# passed to the script.
EVAL_SCRIPT = "/content/drive/MyDrive/llm-sr-project/eval.py"
PREDICTION_PATH = "/content/drive/MyDrive/llm-sr-project/testingDataresultsforVerifier.json"
REFERENCE_PATH = "/content/drive/MyDrive/llm-sr-project/test-reference.json"

# Run the script in a subprocess; {NAME} interpolates the Python variables above.
# The three thresholds are passed through unchanged to eval.py.
# NOTE(review): the paths are interpolated unquoted, so they must not contain spaces.
!python {EVAL_SCRIPT} \
  --prediction {PREDICTION_PATH} \
  --reference {REFERENCE_PATH} \
  --question_threshold 0.95 \
  --statement_threshold 0.9 \
  --relation_threshold 0.9

2025-05-17 14:51:23.991515: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-17 14:51:24.008633: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747493484.029676    4210 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747493484.036039    4210 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-17 14:51:24.057176: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr