In [1]:
!pip install trl==0.11 transformers accelerate datasets torch

Collecting trl==0.11
  Downloading trl-0.11.0-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl==0.11)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.11)
  Downloading shtab-1.7.2-py3-none-any.whl.metadata (7.4 kB)
Downloading trl-0.11.0-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.9.35-py3-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.2-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.7.2 trl-0.11.0 tyro-0.9.35


In [2]:
import os
import math
import json
from pathlib import Path
from typing import Dict, Any
import glob
import shutil
import wandb
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from peft import LoraConfig, get_peft_model, PeftModel

from torch.utils.tensorboard import SummaryWriter

from transformers import get_linear_schedule_with_warmup

In [4]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [20]:
# Name of the project folder under "My Drive" that the notebook works from.
workspace = "PPO-Real"

In [21]:
import os

# Switch into the project folder on the mounted Drive so that relative paths
# used afterwards resolve inside it.
os.chdir("/content/drive/My Drive/{}".format(workspace))
print("Current working dir:", os.getcwd())

Current working dir: /content/drive/My Drive/PPO-Real


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# One GPT-2 tokenizer is shared by both models; the EOS id doubles as the padding id.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Baseline: the stock "gpt2" checkpoint.
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2")
gpt2_model = gpt2_model.to(device)

# Comparison model (presumably the supervised fine-tuned checkpoint — confirm on the Hub).
sft_model = AutoModelForCausalLM.from_pretrained("ArnavM3434/gpt2-alpaca-second-try")
sft_model = sft_model.to(device)

## Qualitative First

In [9]:
def generate_with_model(model, tokenizer, prompt, max_new_tokens=100):
    """Sample one completion for ``prompt`` and return only the newly generated text.

    Args:
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer used to encode ``prompt`` and decode the result; its
            EOS token id is passed to ``generate`` as the padding id.
        prompt: Text the model is conditioned on.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded continuation, stripped of surrounding whitespace. The prompt
        is never part of the returned string, whether or not it contains an
        "Assistant:" marker, and the continuation is not cut short if the model
        itself emits "Assistant:" again.
    """
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level global, so the helper works for a model on any device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; drop them by count
    # rather than by searching the decoded text for a marker string.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Prompts used for the side-by-side qualitative check.
test_prompts = [
    "Human: Explain quantum physics in simple terms\nAssistant:",
    "Human: Write a short poem about AI\nAssistant:",
    "Human: How do I make a chocolate cake?\nAssistant:",
    "Human: What are the benefits of exercise?\nAssistant:",
    "Human: Summarize the plot of Romeo and Juliet\nAssistant:",
]

# Run every prompt through each model in turn, base GPT-2 first.
for eval_label, eval_model in (("GPT-2", gpt2_model), ("SFT Model", sft_model)):
    print(f"=== Testing {eval_label} ===")
    for prompt in test_prompts:
        response = generate_with_model(eval_model, tokenizer, prompt)
        print(f"Prompt: {prompt}")
        print(f"Response: {response}\n")

=== Testing GPT-2 ===
Prompt: Human: Explain quantum physics in simple terms
Assistant:
Response: Don't let your ego get in the way of your scientific and technical abilities – it's just what you need.

Prompt: Human: Write a short poem about AI
Assistant:
Response: Write an essay about AI.

Prompt: Human: How do I make a chocolate cake?
Assistant:
Response: To make a chocolate cake, I use a cream cake batter and a granulated sugar. I then use a mixture of two baking powder and 2 g of water to make chocolate icing. The mixture is then poured over the cake. The icing is then shaped into a cake, and I then place the cake back on top in a warm, moist place.
Here is how to make a chocolate cake:
1. Preheat oven to 350 F.
2. Preheat oven to 350 F

Prompt: Human: What are the benefits of exercise?
Assistant:
Response: I do not have the answers to that. I don't have any of the answers, and I don't believe you should do that. I don't believe that you should do that to yourself. I believe you s

## BLEU Score

In [11]:
!pip install alpaca_eval

Collecting alpaca_eval
  Downloading alpaca_eval-0.6.6-py3-none-any.whl.metadata (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting fire (from alpaca_eval)
  Downloading fire-0.7.1-py3-none-any.whl.metadata (5.8 kB)
Downloading alpaca_eval-0.6.6-py3-none-any.whl (361 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.7/361.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fire-0.7.1-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fire, alpaca_eval
Successfully installed alpaca_eval-0.6.6 fire-0.7.1


In [31]:
from evaluate import load
import numpy as np

# Load the BLEU metric once; evaluate_with_references reads this module-level handle.
bleu = load("bleu")

def evaluate_with_references(model, tokenizer, test_data, max_examples=10, verbose=True):
    """Generate a completion per example and score all of them against references with BLEU.

    Args:
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        test_data: Sequence of dicts with an ``"instruction"`` key (the prompt fed
            to the model) and an ``"output"`` key (the reference answer).
        max_examples: Evaluate at most this many leading examples; ``None``
            evaluates all of them. Defaults to 10.
        verbose: When true, print prompt, full output, extracted prediction and
            reference for every example.

    Returns:
        ``{"bleu": <result of bleu.compute>}`` computed over all evaluated examples.

    Raises:
        ValueError: If there are no examples to evaluate.
    """
    predictions = []
    references = []

    for example in test_data[:max_examples]:
        prompt = example["instruction"]
        reference = example["output"]

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7,
            )

        full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The generated sequence begins with the prompt tokens. Dropping them by
        # count keeps the whole continuation, even if the model writes
        # "Assistant:" again or repeats the prompt text.
        prediction = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        if verbose:
            print(f"Prompt: {prompt}")
            print(f"Full output: {full_output}")
            print(f"Extracted prediction: {prediction}")
            print(f"Reference: {reference}")
            print("-" * 50)

        predictions.append(prediction)

        # Remove a leading role marker from the reference; split only once so a
        # later "Assistant:" inside the answer does not truncate it.
        _, marker, answer = reference.partition("Assistant:")
        ref_response = answer.strip() if marker else reference
        references.append([ref_response])

    if not predictions:
        # An empty evaluation set cannot be scored; fail with a clear message.
        raise ValueError("test_data contains no examples to evaluate")

    bleu_score = bleu.compute(predictions=predictions, references=references)

    return {"bleu": bleu_score}

# Hand-written evaluation set. Per the original author's note, the examples are
# formatted exactly like the training data ("Human: ... Assistant: " prompts,
# "Assistant: ..." references).
test_data = [
    {
        "instruction": "Human: Explain gravity simply Assistant: ",
        "output": "Assistant: Gravity is the force that pulls objects toward each other. It's what keeps us on the ground.",
    },
    {
        "instruction": "Human: Write a haiku about programming Assistant: ",
        "output": "Assistant: Code flows like water\nBugs hide in dark corners\nFix and compile",
    },
    {
        "instruction": "Human: What is the capital of France? Assistant: ",
        "output": "Assistant: The capital of France is Paris.",
    },
    {
        "instruction": "Human: How do you make a cup of tea? Assistant: ",
        "output": "Assistant: Boil water, add a tea bag to a cup, pour hot water over it, steep for 3-5 minutes, then remove the tea bag.",
    },
]

# Score the base model first, then the fine-tuned one, on the same examples.
print("Evaluating GPT-2...")
gpt2_scores = evaluate_with_references(
    model=gpt2_model, tokenizer=tokenizer, test_data=test_data
)

print("\nEvaluating SFT model...")
sft_scores = evaluate_with_references(
    model=sft_model, tokenizer=tokenizer, test_data=test_data
)

# Side-by-side summary of the two BLEU results.
print("\n=== RESULTS ===")
print("GPT-2:", gpt2_scores)
print("SFT:", sft_scores)

Evaluating GPT-2...
Prompt: Human: Explain gravity simply Assistant: 
Full output: Human: Explain gravity simply Assistant: *************

[Laughter]

[Chuckles]

[Chuckles]

[Laughter]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]


Extracted prediction: *************

[Laughter]

[Chuckles]

[Chuckles]

[Laughter]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]

[Chuckles]
Reference: Assistant: Gravity is the force that pulls objects toward each other. It's what keeps us on the ground.
--------------------------------------------------
Prompt: Human: Write a haiku about programming Assistant: 
Full output: Human: Write a haiku about programming Assistant:  Write a haiku about programming
I'm currently in a position to help with creating a web application that can support up to four people. It's something 