In [1]:
!pip install gym stable-baselines3 sb3-contrib transformers huggingface_hub openai

Collecting stable-baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting sb3-contrib
  Downloading sb3_contrib-2.6.0-py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baseline

In [2]:
!pip install shimmy

Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Downloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0


In [3]:
import re
import gym
import numpy as np
from gym import spaces
from transformers import AutoTokenizer, BertForSequenceClassification
from openai import OpenAI
from huggingface_hub import hf_hub_download
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from sb3_contrib import RecurrentPPO

# Config & Helpers

HIGH_CONF = 0.8
LOW_CONF  = 0.1

ACTION_DESCS = {
    0: "The agent helped others.",
    1: "The agent explored the environment.",
    2: "The agent maximized efficiency but caused harm."
}

# Load ethics reward model
tokenizer    = AutoTokenizer.from_pretrained("Kelllll/ethics-reward-model")
ethics_model = BertForSequenceClassification.from_pretrained("Kelllll/ethics-reward-model")
ethics_model.eval()

def is_action_ethical_dataset(desc: str):
    toks   = tokenizer(desc, return_tensors="pt", truncation=True, padding=True)
    logits = ethics_model(**toks).logits
    probs  = logits.softmax(dim=-1).detach().cpu().numpy()[0]
    return (probs[1] < 0.5), float(1 - probs[1])

def call_chat_model(history):
    client = OpenAI(api_key="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")
    resp   = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=history,
        temperature=0.7,
        max_tokens=200,
    )
    return resp.choices[0].message.content

def parse_proposals(text: str) -> list[str]:
    out = []
    for line in text.strip().split("\n"):
        m = re.match(r'^\s*\d+\.\s*(.*)', line)
        if m:
            desc = m.group(1).strip().rstrip('.')
            out.append(desc + ".")
    return out

# Environment & Wrapper

class EthicalEnv(gym.Env):
    def __init__(self):
        super().__init__()
        self.action_space      = spaces.Discrete(3)
        self.observation_space = spaces.Box(0,1,shape=(3,),dtype=np.float32)
        self.state             = np.random.rand(3).astype(np.float32)
        self.step_count        = 0

    def reset(self, **kwargs):
        self.state      = np.random.rand(3).astype(np.float32)
        self.step_count = 0
        return self.state

    def step(self, action):
        if action == 0:
            r_base, desc = 1.0, ACTION_DESCS[0]
        elif action == 1:
            r_base, desc = 0.5, ACTION_DESCS[1]
        else:
            r_base, desc = 2.0, ACTION_DESCS[2]

        ethical, conf = is_action_ethical_dataset(desc)
        penalty       = 0 if ethical else -conf * 3
        reward        = r_base + penalty

        self.state      = np.random.rand(3).astype(np.float32)
        self.step_count += 1
        done           = (self.step_count >= 10)
        info           = {"ethical": ethical, "confidence": conf}

        # return obs, reward, done, info  (4-tuple for SB3)
        return self.state, float(reward), done, info

class CustomRewardEnv(gym.Env):
    def __init__(self, base_env):
        super().__init__()
        self.env             = base_env
        self.action_space    = base_env.action_space
        self.observation_space = base_env.observation_space

    def reset(self, **kwargs):
        # only return obs so VecEnv.reset() works
        obs = self.env.reset(**kwargs)
        return obs

    def step(self, action):
        result = self.env.step(action)
        # gym returns 4-tuple here
        obs, base_rew, done, info = result

        desc    = ACTION_DESCS[int(action)]
        ethical, conf = is_action_ethical_dataset(desc)
        penalty = 0 if ethical else -conf * 3
        reward  = base_rew + penalty

        info.update({"ethical": ethical, "confidence": conf})
        return obs, reward, done, info

# Instantiate & Load PPO

base_env = EthicalEnv()
wrapped  = CustomRewardEnv(base_env)
vec_env  = DummyVecEnv([lambda: wrapped])
vec_env  = VecMonitor(vec_env)

zip_path = hf_hub_download(
    repo_id="Kelllll/ppo-ethics-agent",
    filename="ppo_ethics_agent_v2.zip"
)
agent = RecurrentPPO.load(zip_path, env=vec_env)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]



ppo_ethics_agent_v2.zip:   0%|          | 0.00/6.95M [00:00<?, ?B/s]

main interactive loop

In [4]:
# Interactive Loop
def interactive_ethics_ppo():
    obs_batch = vec_env.reset()
    print(">> Ethical AGI Online with PPO Agent! Describe your dilemma (or 'quit').")

    while True:
        user_text = input("\nYou> ")
        if user_text.lower().strip() == "quit":
            print("Goodbye!")
            break

        # 1) LLM proposals
        system_p = (
            "You are an ethical advisor. The user just described a moral dilemma.  "
            "Propose exactly 3 distinct actions (ethical or unethical), numbered 1., 2., 3., "
            "and after each action give a one-sentence justification."
        )
        history = [
            {"role": "user",   "content": user_text},
            {"role": "system", "content": system_p},
        ]
        reply   = call_chat_model(history)
        actions = parse_proposals(reply)

        # 2) Classify & display each, but set eth=None when LOW_CONF < conf < HIGH_CONF
        flags = []
        for desc in actions:
            eth, conf = is_action_ethical_dataset(desc)
            if conf >= HIGH_CONF:
                flags.append((desc, True,  conf))
            elif conf <= LOW_CONF:
                flags.append((desc, False, conf))
            else:
                flags.append((desc, None,  conf))   # <-- mark uncertain with eth=None

        for i, (desc, eth, conf) in enumerate(flags, 1):
            if   eth is True:   mark = f"✅ Ethical   (conf={conf:.2f})"
            elif eth is False:  mark = f"❌ Unethical (conf={conf:.2f})"
            else:               mark = f"(uncertain, conf={conf:.2f})"
            print(f"{i}. {desc}\n    {mark}")

        # 3) Human Label for uncertain ones
        for idx, (desc, eth, conf) in enumerate(flags):
            if eth is not None:
                continue
            ans = None
            while ans not in ("0","1"):
                ans = input(f"(low-conf) Is “{desc}” ethical? 0=no,1=yes → ")
            flags[idx] = (desc, ans=="1", conf)

        # 4) Pick one or ‘again’
        valid = [str(i) for i in range(1, len(flags)+1)] + ["again"]
        choice = None
        while choice not in valid:
            choice = input("Pick 1-3 or 'again'> ")
        if choice == "again":
            continue

        user_idx, (user_desc, user_eth, user_conf) = int(choice)-1, flags[int(choice)-1]
        user_mark = "✅ Ethical" if user_eth else "❌ Unethical"
        print(f"\nYou chose ▶ {user_desc}\n ")

        # 5) Step the env
        obs_batch, rewards, dones, infos = vec_env.step([user_idx])
        reward = rewards[0]
        info   = infos[0]
        print(f"Env → reward={reward:.2f}, ethical={user_eth}, conf={user_conf:.2f}")

        # 6) PPO suggestion + “why”
        raw_act, _   = agent.predict(obs_batch, deterministic=True)
        ppo_idx      = int(raw_act[0])
        _, ppo_eth, ppo_conf = flags[ppo_idx]

        if ppo_conf >= HIGH_CONF:
            explan = "I’m very confident it’s ethical and will benefit others." if ppo_eth \
                     else "I’m very confident it’s unethical or risky."
        elif ppo_conf <= LOW_CONF:
            explan = "I think it’s unethical or risky." if not ppo_eth \
                     else "I’m somewhat confident it’s ethical, though there may be trade-offs."
        else:
            explan = "I’m somewhat uncertain; it may explore trade-offs."

        ppo_mark = "✅ Ethical" if ppo_eth else "❌ Unethical"
        print(f"PPO Agent suggests ▶ {explan} (conf={ppo_conf:.2f})\n")

        # 7) Reset if done
        if dones[0]:
            obs_batch = vec_env.reset()
if __name__ == "__main__":
    interactive_ethics_ppo()

>> Ethical AGI Online with PPO Agent! Describe your dilemma (or 'quit').

You> if you pull the lever you kill three people but if you don't the train will kill 5
1. Pull the lever to save the three people, as it minimizes the loss of life in this situation.
    (uncertain, conf=0.70)
2. Do not pull the lever and let the train continue on its current path, as intervening to actively cause harm is ethically questionable.
    ✅ Ethical   (conf=0.93)
    ✅ Ethical   (conf=0.85)
(low-conf) Is “Pull the lever to save the three people, as it minimizes the loss of life in this situation.” ethical? 0=no,1=yes → 1
Pick 1-3 or 'again'> 3

 
Env → reward=2.00, ethical=True, conf=0.85
PPO Agent suggests ▶ I’m very confident it’s ethical and will benefit others. (conf=0.85)


You> exit
1. Encourage the person to report the unethical behavior to a higher authority, as it is important to uphold moral standards and prevent harm to others.
    ✅ Ethical   (conf=0.91)
2. Suggest seeking guidance from a t

KeyboardInterrupt: Interrupted by user

In [6]:
!pip install --quiet datasets scikit-learn matplotlib

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m21.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency re

In [11]:
# === MODEL EVALUATION & VISUALIZATION (LOCAL-ONLY) ===

# 0) Ensure Transformers stays offline
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"]       = "1"

# 1) Imports
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_curve,
    auc,
    confusion_matrix,
    precision_recall_curve,
)
import matplotlib.pyplot as plt
import numpy as np

# 2) Device & local model directory
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# replace with the folder where you saved your fine-tuned model:
model_dir = os.path.abspath("./ethics_model_updated")

# 3) Load tokenizer + model from local folder only
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    local_files_only=True
)
model = BertForSequenceClassification.from_pretrained(
    model_dir,
    local_files_only=True
).to(device)
model.eval()

# 4) Prepare Hendrycks/Ethics validation split
raw_ds = load_dataset("hendrycks/ethics", "commonsense")
val_ds = raw_ds["validation"]

def tokenize_fn(examples):
    toks = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    toks["labels"] = examples["label"]
    return toks

tok_val = val_ds.map(tokenize_fn, batched=True)
tok_val.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val_loader = DataLoader(tok_val, batch_size=32)

# 5) Inference loop: collect true labels, preds, and probabilities
all_labels, all_preds, all_probs = [], [], []
with torch.no_grad():
    for batch in val_loader:
        inputs = {
            "input_ids":      batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
        }
        labels = batch["labels"].numpy()
        logits = model(**inputs).logits
        probs  = torch.softmax(logits, dim=1)[:,1].cpu().numpy()
        preds  = (probs > 0.5).astype(int)

        all_labels.extend(labels)
        all_preds.extend(preds)
        all_probs.extend(probs)

all_labels = np.array(all_labels)
all_preds  = np.array(all_preds)
all_probs  = np.array(all_probs)

# 6) Print binary classification metrics
acc   = accuracy_score(all_labels, all_preds)
prec, rec, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average="binary"
)
print("Validation Metrics:")
print(f"  • Accuracy : {acc:.4f}")
print(f"  • Precision: {prec:.4f}")
print(f"  • Recall   : {rec:.4f}")
print(f"  • F1-score : {f1:.4f}")

# 7) Plot Confusion Matrix
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(4,4))
plt.imshow(cm, cmap="Blues")
plt.title("Confusion Matrix")
plt.xticks([0,1], ["Ethical","Unethical"])
plt.yticks([0,1], ["Ethical","Unethical"])
for (i,j), v in np.ndenumerate(cm):
    plt.text(j, i, v, ha="center", va="center")
plt.tight_layout()
plt.show()

# 8) Plot ROC Curve
fpr, tpr, _ = roc_curve(all_labels, all_probs)
roc_auc     = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0,1], [0,1], "--", alpha=0.5)
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

# 9) Plot Precision–Recall Curve
precisions, recalls, _ = precision_recall_curve(all_labels, all_probs)
plt.figure()
plt.plot(recalls, precisions, label="PR Curve")
plt.title("Precision–Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/content/ethics_model/config.json'

In [12]:
import os

# 1) Show your current working directory
print("Working directory:", os.getcwd())

# 2) List everything in it
print("\nRoot contents:")
print(os.listdir("."))

# 3) If you see 'ethics_model' or 'ethics_model_updated', list their files
for d in ("ethics_model", "ethics_model_updated"):
    if os.path.isdir(d):
        print(f"\nContents of ./{d}:")
        print(os.listdir(d))


Working directory: /content

Root contents:
['.config', 'sample_data']


In [13]:
# list top-level files and folders
!ls -la /content

# look for any ethics_model folders
!find /content -maxdepth 2 -type d | grep ethics_model


total 16
drwxr-xr-x 1 root root 4096 May  5 13:39 .
drwxr-xr-x 1 root root 4096 May  6 22:36 ..
drwxr-xr-x 4 root root 4096 May  5 13:39 .config
drwxr-xr-x 1 root root 4096 May  5 13:40 sample_data
