## Contribution:
All work was discussed and written jointly by the three of us.

### Step 1: Install necessary packages

In [None]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Step 2: Package imports and configuration

In [None]:
import sys
import os
# Make the parent directory importable; presumably that is where model.py
# (GPT, GPTConfig) lives — confirm against the repository layout.
sys.path.append(os.path.abspath(".."))
# Expose only physical GPU 1 to this process. The variable is set before
# `import torch` so it is already in place when CUDA gets initialised.
# NOTE(review): on a machine whose only GPU has index 0 this hides that GPU and
# `device` in the configuration below falls back to 'cpu' — adjust the index.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
# Project-local model definition (found through the sys.path entry added above).
from model import GPT, GPTConfig
import random  # duplicate of the import a few lines up; harmless
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
# Configuration
# --- training ---
beta = 0.5          # initial value only; the training cell assigns its own beta
base_lr = 1e-4      # not read by the optimizer cell in this notebook, which sets its own rate
epochs = 5          # number of passes over the preference pairs
batch_size = 64     # preference pairs per optimisation step
max_length = 64     # every encoded sample is padded/truncated to this many tokens
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# --- sampling ---
num_samples = 1
max_new_tokens = 200
temperature = 0.8
top_k = 200
# tokenizer: load the vocabulary tables stored next to the SFT checkpoint
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only open a meta.pkl that comes from a trusted source.
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
stoi = meta["stoi"]  # character -> token id
itos = meta["itos"]  # token id -> character
def encode(s):
    """Translate a string into a list of token ids, one id per character.

    Every character is looked up in the notebook-level ``stoi`` table; a
    character missing from the table makes the lookup fail (KeyError if
    ``stoi`` is a plain dict — confirm against meta.pkl).
    """
    return [stoi[ch] for ch in s]
def decode(l):
    """Translate a sequence of token ids back into text.

    Each id is looked up in the notebook-level ``itos`` table and the resulting
    pieces are concatenated without a separator.
    """
    return ''.join(itos[token_id] for token_id in l)

### Step 3: Define helper functions

In [None]:
def compute_logprob(input_ids, model=None):
    """Return the mean per-token log-probability of every sequence in a batch.

    The value is length-normalised: the log-probabilities of the non-padding
    target tokens are averaged, not summed.

    Args:
        input_ids: long tensor of shape (batch, length) holding token ids.
            Id 0 is treated as padding and excluded from the average.
            NOTE(review): if the vocabulary maps a real character to id 0,
            that character is never scored — confirm against meta.pkl.
        model: callable invoked as ``model(inputs, full_seq=True)`` that
            returns ``(logits, _)`` with logits of shape (batch, length-1,
            vocab). Defaults to the notebook-level ``gpt``.

    Returns:
        Tensor of shape (batch,) with the average log-probability per
        sequence. A row made only of padding yields 0 instead of NaN.
    """
    if model is None:
        model = gpt  # the model currently being fine-tuned in this notebook

    # Next-token prediction: position t is trained to predict token t+1.
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = model(inputs, full_seq=True)
    B, T, V = logits.size()

    # Per-token negative log-likelihood; padded targets contribute 0.
    loss = F.cross_entropy(
        logits.reshape(-1, V),
        targets.reshape(-1),
        ignore_index=0,
        reduction='none',
    ).reshape(B, T)

    attention_mask = (targets != 0).float()
    # Rows with at least one real token have a count >= 1, so the clamp leaves
    # them unchanged; it only prevents 0/0 = NaN for all-padding rows.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)
    loss = (loss * attention_mask).sum(dim=1) / token_counts
    return -loss

def pad_or_truncate(seq, max_length):
    """Force a token list to exactly ``max_length`` entries.

    Sequences that are too long keep their LAST ``max_length`` tokens (the
    beginning is cut off); shorter ones are right-padded with 0. A new list is
    returned in both cases.
    """
    overflow = len(seq) - max_length
    if overflow > 0:
        return seq[-max_length:]
    return seq + [0] * (-overflow)

def get_batches(lines, batch_size):
    """Yield shuffled, fixed-size batches of encoded (negative, positive) pairs.

    Args:
        lines: list of dicts, each with a 'negative' and a 'positive' text.
        batch_size: number of pairs per batch. Pairs left over after the last
            full batch are dropped, so every batch has the same size.

    Yields:
        (neg_tensor, pos_tensor): two long tensors of shape
        (batch_size, max_length) placed on ``device``.
    """
    # Shuffle a shallow copy so the caller's list keeps its order; mutating the
    # notebook-level list in place would make later cells depend on how many
    # epochs have already been run.
    shuffled = list(lines)
    random.shuffle(shuffled)

    # Stopping at len - batch_size + 1 skips the trailing partial batch.
    for start in range(0, len(shuffled) - batch_size + 1, batch_size):
        batch = shuffled[start:start + batch_size]
        # Four newlines are appended to every sample before encoding
        # (presumably the end-of-answer marker of the SFT data — confirm).
        # pad_or_truncate keeps the last max_length tokens of long samples.
        neg_inputs = [pad_or_truncate(encode(p['negative'] + '\n\n\n\n'), max_length) for p in batch]
        pos_inputs = [pad_or_truncate(encode(p['positive'] + '\n\n\n\n'), max_length) for p in batch]
        neg_tensor = torch.tensor(neg_inputs, dtype=torch.long, device=device)
        pos_tensor = torch.tensor(pos_inputs, dtype=torch.long, device=device)
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [None]:
# Rebuild the network from the hyper-parameters stored in the SFT checkpoint.
ckpt = torch.load("../sft/gpt.pt", map_location=device)
gptconf = GPTConfig(**ckpt['model_args'])
gpt = GPT(gptconf)

# Strip the '_orig_mod.' prefix from parameter names so they match a freshly
# constructed GPT (presumably added by torch.compile when the checkpoint was
# written — confirm).
state_dict = ckpt['model']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict):
    if not k.startswith(unwanted_prefix):
        continue
    state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

gpt.load_state_dict(state_dict)
gpt.to(device)
# Training mode keeps dropout active during fine-tuning; train() returns the
# module, so the cell still displays the architecture.
gpt.train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

In [None]:
# Load the preference pairs from ./pos_neg_pairs.json: a JSON list whose
# entries carry a "negative" and a "positive" text (see get_batches).
with open("pos_neg_pairs.json", mode="r") as f:
    lines = json.load(f)

### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

In [None]:
# recommend to use the AdamW optimizer
from torch import optim

# Freeze the word (wte) and position (wpe) embeddings to keep the model's core
# vocabulary stable, so fine-tuning the remaining layers does not make the
# model "forget" basic language patterns.
# NOTE(review): if model.py ties lm_head.weight to wte.weight (as nanoGPT
# does), freezing 'wte' also freezes the output projection — confirm.
for name, param in gpt.named_parameters():
    # 'wte' = word token embeddings, 'wpe' = positional embeddings
    if 'wte' in name or 'wpe' in name:
        param.requires_grad = False

# Learning rate used for fine-tuning. It used to be stored in a variable called
# `model`, which read like a network object. The config value `base_lr` (1e-4)
# is not used here.
learning_rate = 3e-5

# AdamW usually works well for fine-tuning transformer models.
optimizer = optim.AdamW(
    (p for p in gpt.parameters() if p.requires_grad),  # only update unfrozen params
    lr=learning_rate,
    weight_decay=0.01,  # decoupled weight decay for better generalization
)

print(f"Optimizer ready. learning_rate={learning_rate:.1e}")

Optimizer ready. learning_rate=3.0e-05


For this part, we followed the lab example and used DPO as our training method.

### Step 7: Begin training (**students are required to complete this part!**)

In [None]:
# Hyperparameters of this run. They never change between steps, so they are set
# once here instead of being re-assigned inside the inner loop.
beta = 12.5      # replaces the config-cell value; a larger beta flattens the sigmoid
lm_coef = 0.075  # weight on the language-model term
gp = 0.9         # maximum gradient norm for clipping, to avoid exploding updates
use_amp = torch.cuda.is_available()  # automatic mixed precision only when a GPU is used

# CUDA autocast computes in float16 by default, where small gradients can
# underflow to zero. GradScaler scales the loss before backward and unscales the
# gradients before the optimizer step; with enabled=False it is a pass-through.
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

total_steps = len(lines) // batch_size  # get_batches skips a trailing partial batch
for epoch in range(epochs):
    # get_batches is a generator without len(), so tell tqdm how many steps to expect.
    pbar = tqdm(get_batches(lines, batch_size), total=total_steps)  # freshly shuffled batches
    for step, (neg_tensor, pos_tensor) in enumerate(pbar):
        ###########################################################
        # Please complete the training code here!
        # Examples:
        # ...
        # neg_logprob
        # pos_logprob
        # loss = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean() - pos_logprob.mean() * 0.1
        # ...
        ###########################################################
        with torch.amp.autocast('cuda', enabled=use_amp):
            # Average token log-probabilities of the positive and negative samples
            pos = compute_logprob(pos_tensor)
            neg = compute_logprob(neg_tensor)

            # Preference loss: push the positive sample above the negative one.
            # The gap is halved (on top of the large beta) to keep the sigmoid
            # out of its saturated region.
            scaled_gap = 0.5 * (pos - neg)
            dpo = -F.logsigmoid(scaled_gap / beta).mean()

            # LM loss keeps the generated text fluent by maximizing the
            # log-probability of the positive examples.
            lm = (-pos).mean()
            # Small L2 penalty keeps weights from drifting too far.
            l2 = 1e-4 * sum(p.norm() ** 2 for p in gpt.parameters())

            loss = dpo + lm_coef * lm + l2  # combine everything into a single loss value

        optimizer.zero_grad(set_to_none=True)
        scaler.scale(loss).backward()  # backpropagate the (scaled) loss
        # Gradients must be unscaled before clipping so that `gp` applies to
        # their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), gp)
        scaler.step(optimizer)  # skips the update if gradients contain inf/NaN
        scaler.update()

        # Show the training progress; Δ is the raw (unscaled) log-probability gap.
        with torch.no_grad():
            delta = (pos - neg)
            pbar.set_description(
                f"[E{epoch+1}/E{epochs}] step {step+1}/{total_steps} | "
                f"Loss={loss.item():.3f} DPO={dpo.item():.3f} LM={lm.item():.3f} "
                f"Δ={delta.mean().item():.2f}"
            )

    # Save a checkpoint after each epoch so training can be resumed later if
    # needed. The same file is overwritten every epoch.
    ckpt_path = "dpo.pt"
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args'],
    }, ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

[E1] step 1562/1562 | Loss=1.068 DPO=0.454 LM=0.701 Δ=13.87: : 1562it [22:08,  1.18it/s]


Saved checkpoint to ./dpoTryNew100.pt


[E2] step 1562/1562 | Loss=0.955 DPO=0.438 LM=0.525 Δ=14.99: : 1562it [22:05,  1.18it/s]


Saved checkpoint to ./dpoTryNew100.pt


[E3] step 1562/1562 | Loss=0.899 DPO=0.434 LM=0.491 Δ=15.26: : 1562it [21:51,  1.19it/s]


Saved checkpoint to ./dpoTryNew100.pt


[E4] step 1562/1562 | Loss=0.845 DPO=0.426 LM=0.408 Δ=15.88: : 1562it [22:05,  1.18it/s]


Saved checkpoint to ./dpoTryNew100.pt


[E5] step 1562/1562 | Loss=0.803 DPO=0.414 LM=0.439 Δ=16.76: : 1562it [22:03,  1.18it/s]

Saved checkpoint to ./dpoTryNew100.pt





# Originality explanation for training part
## We tried different sets of parameters to get better results.
Initially, beta was set to 1, but the final result looked like this: Prompt: 17+19=? Output: e answer is 1037. After discussing this problem, we found that during training the difference between the positive and negative log-probabilities was too large (a delta of nearly 13), while the DPO loss was very small, almost 0, which means that learning had almost stopped. We therefore decided to raise beta to flatten the sigmoid function and also to put a coefficient in front of the difference, "delta = 0.5 * (pos - neg)", to reduce its magnitude.
We then tried different sets of parameters, first setting the number of epochs to 1 for quick tests. After trying many combinations of these three parameters (beta, lm_coef, delta coefficient), we finally set beta to 12.5, lm_coef to 0.075 and the delta coefficient to 0.5.
## We have also added several measures to avoid overfitting and output chaos.
As mentioned above, we ran into the large-delta problem (which led us to raise beta) and chaotic output text. To address these two problems, we first added L2 regularization and gradient clipping to prevent overfitting. Second, we added the language-model loss (lm), weighted by lm_coef, to improve the fluency of the output text.

### Step 8: Begin testing (**students are required to complete this part!**)

In [None]:
# Load the fine-tuned model
ckpt_path = "dpo.pt"
# Use whichever device is available so the cell also runs on CPU-only machines
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
gpt = GPT(gptconf).to(device)

# Handle the two checkpoint layouts used in this notebook: the SFT checkpoint
# stores the weights under 'model', the training cell under 'model_state_dict'.
# An explicit key test replaces the former bare `except:`, which would also
# have hidden unrelated errors.
state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint['model_state_dict']

# Strip the '_orig_mod.' prefix from parameter names (presumably added by
# torch.compile when the checkpoint was written — confirm).
unwanted_prefix = '_orig_mod.'
for k in list(state_dict):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

# Load weights into our model
gpt.load_state_dict(state_dict)

# switch to eval mode so dropout etc. are disabled
gpt.eval()

# Run a few test prompts to check that generation works after fine-tuning.
# Some prompts appear twice; sampling is stochastic, so the answers can differ.
test_set = ["17+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]
with torch.no_grad():
    for prompt in test_set:
        prompt_ids = encode(prompt)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        x = torch.tensor([prompt_ids], dtype=torch.long, device=device)

        # Generate up to 50 new tokens with some randomness (temperature = 0.8).
        # These literals are used instead of the config-cell sampling values.
        y_tuple = gpt.generate(x, max_new_tokens=50, temperature=0.8, top_k=50)
        # The first element of the returned tuple holds the token ids; row 0 is
        # the only sequence in this batch of one.
        y = y_tuple[0]
        y = y[0].tolist()

        # Decode only the newly generated tokens back into readable text
        generated_ids = y[len(prompt_ids):]
        output_text = decode(generated_ids).strip()
        print(f"Prompt: {prompt}\nOutput: {output_text}\n")
        ###########################################################

Prompt: 17+19=?
Output: The answer is 108 because 17+19 equals 108.

Prompt: 3*17=?
Output: The answer is 32 because 3*17 equals 32.

Prompt: 72/4=?
Output: The answer is -464 because -1109244/-44 equals -4

Prompt: 72-x=34,x=?
Output: *he answer is 13 because 72-34 equals 13.

Prompt: x*11=44,x=?
Output: The answer is 14 because 44-11 equals 14.

Prompt: 3*17=?
Output: The answer is 38 because 3*17 equals 38.

Prompt: 72/4=?
Output: The answer is -3253 because -148202/-44 equals -3

Prompt: 72-x=34,x=?
Output: The answer is 3 because 72-34 equals 3.



# Generation code for self-generated negative pairs (JSON)


Here, we tried a different set of negative-positive pairs: the negatives are generated by the existing model and then used to fine-tune it further, which in theory improves the model with each round.


Before this, we had also tried generating 1 million and 100 thousand negative pairs. In the end, we used 100 thousand positive-negative pairs. All other datasets are attached in the other mail.


In [None]:
import json
import random
import os
import torch
from datetime import datetime

# ROOT is not defined anywhere in this notebook, so a fresh "Restart & Run All"
# would stop here with a NameError. Keep a value that was set elsewhere;
# otherwise fall back to the parent directory, which is where the earlier cells
# look for "../sft/..." — TODO confirm this matches the project layout.
if "ROOT" not in globals():
    ROOT = os.path.abspath("..")

OUTPUT_FILE = f"{ROOT}/dpo/pos_neg_pairsSelfGenerate.json"
MIN_NUM, MAX_NUM = -127, 128   # inclusive operand range used by rnd()
NUM_SAMPLES = 100_000          # number of (negative, positive) pairs to create
PRINT_EVERY = 100              # progress message interval, in samples
MAX_NEW_TOKENS = 50            # sampling settings for the model-written negatives
TEMPERATURE = 0.8
TOP_K = 50

# Load the fine-tuned model that writes the negative answers.
ckpt_path = f"{ROOT}/dpo/dpoTryNewE100k.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# map_location=device places the tensors on the GPU when one is available and
# on the CPU otherwise, like the other checkpoint-loading cells.
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint["model_args"])
gpt = GPT(gptconf).to(device)

# Weights are stored under "model" or "model_state_dict" depending on which
# cell wrote the checkpoint.
state_dict = checkpoint.get("model", checkpoint.get("model_state_dict", {}))
unwanted_prefix = "_orig_mod."
for k in list(state_dict):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
gpt.eval()  # disable dropout while sampling

## Define all the math functions

def rnd():
    """Draw a random integer operand from MIN_NUM to MAX_NUM, both included."""
    low, high = MIN_NUM, MAX_NUM
    return random.randint(low, high)

def small_nonzero(limit=500):
    """Draw a random integer in [-limit, limit] that is guaranteed not to be 0.

    Zero draws are rejected and redrawn, so the result is safe to use as a
    divisor.
    """
    while True:
        candidate = random.randint(-limit, limit)
        if candidate != 0:
            return candidate

def make_add():
    """Build a random addition question and its correct worked answer.

    Returns:
        (q, pos): the question "a+b=?" and the positive sample, i.e. the
        question followed by the answer sentence.
    """
    a = rnd()
    b = rnd()
    ans = a + b
    q = f"{a}+{b}=?"
    return q, f"{q} The answer is {ans} because {a}+{b} equals {ans}."

def make_sub():
    """Build a random subtraction question and its correct worked answer.

    Returns:
        (q, pos): the question "a-b=?" and the positive sample, i.e. the
        question followed by the answer sentence.
    """
    a = rnd()
    b = rnd()
    ans = a - b
    q = f"{a}-{b}=?"
    return q, f"{q} The answer is {ans} because {a}-{b} equals {ans}."

def make_mul():
    """Build a random multiplication question and its correct worked answer.

    Returns:
        (q, pos): the question "a*b=?" and the positive sample, i.e. the
        question followed by the answer sentence.
    """
    a = rnd()
    b = rnd()
    ans = a * b
    q = f"{a}*{b}=?"
    return q, f"{q} The answer is {ans} because {a}*{b} equals {ans}."

def make_div():
    """Build a random exact-division question and its correct worked answer.

    The dividend is constructed as divisor * quotient, so the division always
    comes out as an integer and the divisor is never zero.

    Returns:
        (q, pos): the question "a/b=?" and the positive sample.
    """
    b, q_int = small_nonzero(), rnd()  # divisor first, then the quotient
    a = b * q_int
    q = f"{a}/{b}=?"
    return q, f"{q} The answer is {q_int} because {a}/{b} equals {q_int}."

def make_x_plus_a():
    """Build an equation "x+a=b" to solve for x, with its worked answer.

    Returns:
        (q, pos): the question and the positive sample explaining x = b - a.
    """
    a = rnd()
    x = rnd()
    b = x + a
    q = f"x+{a}={b},x=?"
    return q, f"{q} The answer is {x} because {b}-{a} equals {x}."

def make_a_plus_x():
    """Build an equation "a+x=b" to solve for x, with its worked answer.

    Returns:
        (q, pos): the question and the positive sample explaining x = b - a.
    """
    a = rnd()
    x = rnd()
    b = a + x
    q = f"{a}+x={b},x=?"
    return q, f"{q} The answer is {x} because {b}-{a} equals {x}."

def make_x_minus_a():
    """Build an equation "x-a=b" to solve for x, with its worked answer.

    Returns:
        (q, pos): the question and the positive sample explaining x = b + a.
    """
    a = rnd()
    x = rnd()
    b = x - a
    q = f"x-{a}={b},x=?"
    return q, f"{q} The answer is {x} because {b}+{a} equals {x}."

def make_a_minus_x():
    """Build an equation "a-x=b" to solve for x, with its worked answer.

    Returns:
        (q, pos): the question and the positive sample explaining x = a - b.
    """
    a = rnd()
    x = rnd()
    b = a - x
    q = f"{a}-x={b},x=?"
    return q, f"{q} The answer is {x} because {a}-{b} equals {x}."

def make_x_mul_a():
    """Build an equation "x*a=b" to solve for x, with its worked answer.

    The factor a is never zero, so the explanation "b/a" is always defined.

    Returns:
        (q, pos): the question and the positive sample explaining x = b / a.
    """
    a, x = small_nonzero(), rnd()  # factor first, then the unknown
    b = x * a
    q = f"x*{a}={b},x=?"
    return q, f"{q} The answer is {x} because {b}/{a} equals {x}."

def make_a_mul_x():
    """Build an equation "a*x=b" to solve for x, with its worked answer.

    The factor a is never zero, so the explanation "b/a" is always defined.

    Returns:
        (q, pos): the question and the positive sample explaining x = b / a.
    """
    a, x = small_nonzero(), rnd()  # factor first, then the unknown
    b = a * x
    q = f"{a}*x={b},x=?"
    return q, f"{q} The answer is {x} because {b}/{a} equals {x}."

def make_x_div_a():
    """Build an equation "x/a=b" to solve for x, with its worked answer.

    The divisor a is never zero; the unknown is constructed as a * b.

    Returns:
        (q, pos): the question and the positive sample explaining x = b * a.
    """
    a, b = small_nonzero(), rnd()  # divisor first, then the quotient
    x = a * b
    q = f"x/{a}={b},x=?"
    return q, f"{q} The answer is {x} because {b}*{a} equals {x}."

def make_a_div_x():
    """Build an equation "a/x=b" to solve for x, with its worked answer.

    Both the unknown x and the quotient b are non-zero. With b == 0 the
    question would read "0/x=0" (true for every x) and the explanation
    "a/b" would divide by zero, so such draws are rejected.

    Returns:
        (q, pos): the question and the positive sample explaining x = a / b.
    """
    x = small_nonzero()
    b = rnd()
    while b == 0:  # redraw: a zero quotient gives an ill-posed sample
        b = rnd()
    a = b * x
    q = f"{a}/x={b},x=?"
    pos = f"{q} The answer is {x} because {a}/{b} equals {x}."
    return q, pos

# All question generators; generate_dataset picks one of them per sample.
GEN_FUNCS = [
    # plain arithmetic
    make_add,
    make_sub,
    make_mul,
    make_div,
    # solve-for-x, additive
    make_x_plus_a,
    make_a_plus_x,
    make_x_minus_a,
    make_a_minus_x,
    # solve-for-x, multiplicative
    make_x_mul_a,
    make_a_mul_x,
    make_x_div_a,
    make_a_div_x,
]

##get the model to generate the negative

def model_generate(prompt):
    """Sample a completion for ``prompt`` from the loaded model.

    The prompt is encoded, passed to ``gpt.generate`` with the notebook-level
    sampling constants, and only the newly generated part is returned as
    whitespace-stripped text.
    """
    prompt_ids = encode(prompt)
    prompt_len = len(prompt_ids)
    x = torch.tensor([prompt_ids], dtype=torch.long, device=device)
    with torch.no_grad():
        sampled = gpt.generate(
            x,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )
    # First tuple element, first (only) batch row -> list of token ids.
    token_ids = sampled[0][0].tolist()
    completion = decode(token_ids[prompt_len:])
    return completion.strip()

##combine them together

def generate_dataset(n):
    """Create ``n`` preference pairs with model-written negatives.

    For every sample a random generator from GEN_FUNCS supplies the question
    and the correct (positive) text; the negative text is the question followed
    by whatever the current model samples for it. Progress is printed every
    PRINT_EVERY samples.

    Returns:
        list of dicts with the keys "negative" and "positive".
    """
    data = []
    for i in range(n):
        q, pos = random.choice(GEN_FUNCS)()
        neg = f"{q} {model_generate(q)}"
        entry = {"negative": neg, "positive": pos}
        data.append(entry)
        if not (i + 1) % PRINT_EVERY:
            print(f"Generated {i + 1}/{n}")
    return data

##write in the json format required

def write_strict_json(filepath, entries):
    """Write preference pairs as a JSON array with one object per line.

    Each object is written as {"negative": ...,<TAB>"positive": ...}; objects
    are separated by ",\\n" and the whole list is wrapped in "[" ... "]".
    Non-ASCII characters are written as-is (UTF-8), not as \\u escapes.

    Args:
        filepath: destination file. Missing parent directories are created; a
            bare file name (no directory part) is written to the current
            working directory.
        entries: iterable of dicts that provide "negative" and "positive".
    """
    # os.makedirs("") raises FileNotFoundError, so only create a directory when
    # the path actually contains one.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    rows = []
    for e in entries:
        neg_json = json.dumps(e["negative"], ensure_ascii=False)
        pos_json = json.dumps(e["positive"], ensure_ascii=False)
        rows.append("{" + f"\"negative\": {neg_json},\t\"positive\": {pos_json}" + "}")

    with open(filepath, "w", encoding="utf-8") as f:
        f.write("[" + ",\n".join(rows) + "]")

if __name__ == "__main__":
    # Build the self-generated dataset, then write it to OUTPUT_FILE.
    print(f"Generating {NUM_SAMPLES} self-evaluated samples ...")
    dataset = generate_dataset(n=NUM_SAMPLES)
    print(f"Writing to {OUTPUT_FILE} ...")
    write_strict_json(filepath=OUTPUT_FILE, entries=dataset)
    print("Done.")


Generating 100000 self-evaluated samples ...
Generated 100/100000
Generated 200/100000
Generated 300/100000
Generated 400/100000
Generated 500/100000
Generated 600/100000
Generated 700/100000
Generated 800/100000
Generated 900/100000
Generated 1000/100000
Generated 1100/100000
Generated 1200/100000
Generated 1300/100000
Generated 1400/100000
Generated 1500/100000
Generated 1600/100000
Generated 1700/100000
Generated 1800/100000
Generated 1900/100000
Generated 2000/100000
Generated 2100/100000
Generated 2200/100000
Generated 2300/100000
Generated 2400/100000
Generated 2500/100000
Generated 2600/100000
Generated 2700/100000
Generated 2800/100000
Generated 2900/100000
Generated 3000/100000
Generated 3100/100000
Generated 3200/100000
Generated 3300/100000
Generated 3400/100000
Generated 3500/100000
Generated 3600/100000
Generated 3700/100000
Generated 3800/100000
Generated 3900/100000
Generated 4000/100000
Generated 4100/100000
Generated 4200/100000
Generated 4300/100000
Generated 4400/100

# Generation code for more similar positive-negative pairs
Next, because the negatives in the previous set were so poor that the delta became extremely large, we tried to generate positive-negative pairs that are more similar to each other, like the following: {"negative": "x/143=55,x=? The answer is 7850 because 55*143 equals 7850.",
    "positive": "x/143=55,x=? The answer is 7865 because 55*143 equals 7865."}

In [1]:
import json
import random

def generate_math_problem():
    """Create one arithmetic question with a correct and a wrong worked answer.

    An operator (+, -, *, /) and two operands in [-1000, 1000] are drawn at
    random. The wrong answer imitates a common mistake by applying a different
    operator; if that happens to give the correct result, the answer is shifted
    by +/-1 instead.

    Returns:
        dict with the keys "positive" and "negative", each holding the question
        followed by "The answer is ... because ...".
    """
    op = random.choice(['+', '-', '*', '/'])
    a = random.randint(-1000, 1000)
    b = random.randint(-1000, 1000)

    if op == '/':
        # Ensure no division by zero
        while b == 0:
            b = random.randint(-1000, 1000)
        # Snap the dividend to a multiple of b so the quotient has no decimals
        correct_answer = a // b
        a = correct_answer * b
        problem = f"{a}/{b}=?"
        correct_explanation = f"{a}/{b} equals {correct_answer}"
        # Negative: multiplying instead of dividing
        incorrect_answer = a * b
        incorrect_explanation = f"{a}*{b} equals {incorrect_answer}"
    elif op == '*':
        correct_answer = a * b
        problem = f"{a}*{b}=?"
        correct_explanation = f"{a}*{b} equals {correct_answer}"
        # Negative: adding instead of multiplying
        incorrect_answer = a + b
        incorrect_explanation = f"{a}+{b} equals {incorrect_answer}"
    elif op == '-':
        correct_answer = a - b
        problem = f"{a}-{b}=?"
        correct_explanation = f"{a}-{b} equals {correct_answer}"
        # Negative: adding instead of subtracting
        incorrect_answer = a + b
        incorrect_explanation = f"{a}+{b} equals {incorrect_answer}"
    else:  # op == '+'
        correct_answer = a + b
        problem = f"{a}+{b}=?"
        correct_explanation = f"{a}+{b} equals {correct_answer}"
        # Negative: subtracting instead of adding
        incorrect_answer = a - b
        incorrect_explanation = f"{a}-{b} equals {incorrect_answer}"

    # The "mistake" can coincide with the truth (e.g. b == 0 for +/-); make
    # sure the negative sample is really wrong.
    if incorrect_answer == correct_answer:
        incorrect_answer = correct_answer + random.choice([-1, 1])
        incorrect_explanation = f"{a}{op}{b} miscalculated as {incorrect_answer}"

    return {
        "positive": f"{problem} The answer is {correct_answer} because {correct_explanation}.",
        "negative": f"{problem} The answer is {incorrect_answer} because {incorrect_explanation}.",
    }

# Generate 100,000 problem pairs
data = [generate_math_problem() for _ in range(100_000)]

# Save to JSON file (pretty-printed, non-ASCII characters kept as-is)
with open('pos_neg_similar.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=2))

print("Generated 100,000 math problem pairs with distinct positive and negative explanations and saved to pos_neg_similar.json")

Generated 100,000 math problem pairs with distinct positive and negative explanations and saved to pos_neg_similar.json
