## Setup and loading the datasets

Install the required libraries, set up the working directories, and load the validation data that will be used for further testing of the fine-tuned models.

In [2]:
!pip install -q transformers accelerate bitsandbytes datasets pandas matplotlib \
  sentence-transformers openai ollama

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import gc
import json
import os
import time
from pathlib import Path

import ollama
import pandas as pd
import torch
from datasets import load_dataset
from google.colab import drive
from google.colab import userdata
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig
)

In [2]:
# Attach Google Drive at /content/drive so that the paths under MyDrive used
# by the rest of the notebook resolve.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root folder on the mounted Drive; the data files live in its "Data" subfolder.
GDRIVE_MODEL_DIR = Path("/content/drive/MyDrive/Constitutional AI")
DATA_DIR = GDRIVE_MODEL_DIR / "Data"


def _model_entry(name, data_filename, family):
    """Return the MODEL_CONFIG record for the fine-tuned model called `name`.

    The record holds:
      "path"      -- model directory under GDRIVE_MODEL_DIR, as a str
      "data_file" -- JSONL file under DATA_DIR, as a Path
      "family"    -- base-model family tag ("qwen" or "mistral")
    """
    return {
        "path": os.path.join(GDRIVE_MODEL_DIR, name),
        "data_file": DATA_DIR / data_filename,
        "family": family,
    }


# One record per fine-tuned model, keyed by "<family>_<variant>".
MODEL_CONFIG = {
    "qwen_random": _model_entry(
        "qwen_random",
        "constitutional_training_data_qwen.jsonl",
        "qwen",
    ),
    "qwen_contextual": _model_entry(
        "qwen_contextual",
        "constitutional_training_data_qwen-contextual-True.jsonl",
        "qwen",
    ),
    "mistral_random": _model_entry(
        "mistral_random",
        "constitutional_training_data_mistral.jsonl",
        "mistral",
    ),
    "mistral_contextual": _model_entry(
        "mistral_contextual",
        "constitutional_training_data_mistral-contextual-True.jsonl",
        "mistral",
    ),
}

# Model identifiers keyed by the role their key names suggest
# (presumably consumed by later evaluation cells -- not visible here).
GAUGE_CONFIG = {
    "helpfulness_rm": "berkeley-nest/Starling-RM-7B-alpha",
    "harmlessness_classifier": "facebook/roberta-hate-speech-dynabench-r4-target",
    "similarity_model": "all-MiniLM-L6-v2",
}

In [4]:
# Judge-model name and API credentials. The key is fetched through
# google.colab.userdata instead of being written into the notebook.
OLLAMA_JUDGE_MODEL = "llama3"
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# NOTE(review): `client` is constructed exactly like `openai_client`. Both
# names are kept as separate instances because cells outside this view may
# use either one -- confirm, then consider keeping only one.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
client = OpenAI(api_key=OPENAI_API_KEY)

# Folder that receives the evaluation artefacts; created (with parents) if absent.
OUTPUT_DIR = GDRIVE_MODEL_DIR / "evaluation_results"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Configuration loaded. Models will be loaded from: {GDRIVE_MODEL_DIR}")
print(f"Evaluations saved to: {OUTPUT_DIR}")

Configuration loaded. Models will be loaded from: /content/drive/MyDrive/Constitutional AI
Evaluations saved to: /content/drive/MyDrive/Constitutional AI/evaluation_results


## Generate responses from finetuned models

In [None]:
# Phase 1: for every model in MODEL_CONFIG, generate one response per
# validation prompt and save prompt / ground truth / generation to
# <OUTPUT_DIR>/<model_name>_responses.csv.

# The train/validation split file does not depend on the model, so it is read
# once here rather than once per loop iteration.
split_file = os.path.join(DATA_DIR, "train_val_split.json")
with open(split_file, 'r', encoding="utf-8") as f:
    split_info = json.load(f)

val_indices = split_info['val_indices']
print(f"Loaded {len(val_indices)} validation indices.")

for model_name, config in MODEL_CONFIG.items():
    print("\n" + "="*50)
    print(f"Generating responses for model: {model_name}")

    # Each model has its own JSONL data file; the shared validation indices
    # select the rows to evaluate. Blank lines are skipped so that a trailing
    # empty line does not crash json.loads.
    print(f"Loading examples using: {config['data_file']}")
    with open(config['data_file'], 'r', encoding="utf-8") as f:
        all_examples = [json.loads(line) for line in f if line.strip()]

    validation_set = [all_examples[i] for i in val_indices]
    validation_prompts = [ex['prompt'] for ex in validation_set]
    ground_truth_responses = [ex['revision'] for ex in validation_set]

    # Load the checkpoint in 4-bit NF4 with bfloat16 compute. Only the qwen
    # family is loaded with trust_remote_code enabled.
    needs_remote_code = config['family'] == "qwen"
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        config['path'],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=needs_remote_code,
    )
    tokenizer = AutoTokenizer.from_pretrained(config['path'], trust_remote_code=needs_remote_code)
    tokenizer.pad_token = tokenizer.eos_token

    generated_responses = []

    for i, prompt in enumerate(validation_prompts):
        if i % 20 == 0:
            print(f"  ...generating response {i+1}/{len(validation_prompts)}")

        # Keep the whole encoding (not just input_ids) so the attention mask
        # can be passed to generate(); because pad == eos it cannot be
        # inferred reliably. Tensors go to the model's own device instead of
        # a hard-coded "cuda", which matters with device_map="auto".
        inputs = tokenizer(f"Human: {prompt}\n\nAssistant:", return_tensors="pt").to(model.device)
        prompt_len = inputs.input_ids.shape[1]

        # NOTE(review): neither do_sample nor a seed is set here, so decoding
        # follows the checkpoint's generation config -- confirm whether the
        # evaluation is meant to be deterministic.
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=256,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Decode only the newly generated tokens (everything after the prompt).
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        generated_responses.append(response)

    df_responses = pd.DataFrame({
        "prompt": validation_prompts,
        "ground_truth_response": ground_truth_responses,
        "generated_response": generated_responses
    })

    output_path = os.path.join(OUTPUT_DIR, f"{model_name}_responses.csv")
    df_responses.to_csv(output_path, index=False)
    # Report the number of rows actually written rather than a fixed count.
    print(f"✅ Saved {len(df_responses)} responses to {output_path}")

    # Free GPU memory before the next model is loaded: drop the references,
    # collect any reference cycles that still hold tensors, then release the
    # allocator's cached blocks.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Cleared VRAM for model: {model_name}")


print("\n--- PHASE 1 COMPLETE: All responses generated and saved. ---")