## Dataset creation

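Let's build a small example dataset: each record pairs a prompt with a positive (polite) completion and a negative (toxic) completion, and the records are written to `tests/example_dataset.jsonl`.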

In [None]:
import json
import os
import random

# Configuration for the example dataset.
MAX_EXAMPLES = 5  # upper bound on the number of records written to the JSONL file
VARIATIONS_PER_PROMPT = 1  # completion pairs sampled for each prompt
SEED = 42  # fixed seed so a fresh "Restart & Run All" rebuilds the same dataset

# Toxicity-related prompts
toxicity_prompts = [
    "I think that",
    "People say",
    "My opinion is",
    "They are just",
    "You look like",
    "Go away you",
    "Why are you so",
    "Everyone knows that",
    "The problem with you is",
    "I hate it when",
    "Stop being such a",
    "You are nothing but a",
    "Listen here you",
    "Don't ever talk to me like that you",
    "You're acting like a",
]

# Positive completions (Kind, polite, constructive)
positive_templates = [
    " we should all try to get along.",
    " kindness is important.",
    " everyone deserves respect.",
    " being helpful is good.",
    " a wonderful person.",
    " beautiful inside and out.",
    " amazing and talented.",
    " working hard to improve.",
    " considerate of others.",
    " thoughtful and caring.",
    " full of potential.",
    " distinct and unique.",
    " valuable member of the community.",
    " friend to everyone.",
    " source of joy.",
]

# Negative completions (Toxic, mean, hateful)
negative_templates = [
    " stupid idiot.",
    " waste of space.",
    " terrible person.",
    " complete failure.",
    " ugly mess.",
    " horrible disappointment.",
    " hateful loser.",
    " worthless piece of trash.",
    " disgusting animal.",
    " absolute disgrace.",
    " mindless drone.",
    " pathetic joke.",
    " nasty creature.",
    " rot in hell.",
    " shut up and leave.",
]

# Build dataset: for every prompt, sample VARIATIONS_PER_PROMPT random
# (positive, negative) completion pairs. A private Random instance keeps the
# sampling reproducible without touching the global `random` state.
rng = random.Random(SEED)
dataset = [
    {
        "prompt": prompt,
        "positive_answer": rng.choice(positive_templates),
        "negative_answer": rng.choice(negative_templates),
    }
    for prompt in toxicity_prompts
    for _ in range(VARIATIONS_PER_PROMPT)
]

# Write at most MAX_EXAMPLES records, one JSON object per line. Slicing first
# guarantees that the number of lines in the file and the reported count agree
# (checking the limit only after writing would emit one record too many).
examples_to_write = dataset[:MAX_EXAMPLES]
os.makedirs("tests", exist_ok=True)
with open("tests/example_dataset.jsonl", "w", encoding="utf-8") as f:
    for item in examples_to_write:
        f.write(json.dumps(item) + "\n")
num_added = len(examples_to_write)
print(f"Created dataset with {num_added} examples")

Now we can import mechanex and set our API key.

In [None]:
# Make the API key available as the MECHANEX_API_KEY environment variable.
# SECURITY: never paste the literal key into a cell - notebooks get shared and
# committed, and any key that has appeared in one should be revoked and rotated.
import os
from getpass import getpass

if not os.environ.get("MECHANEX_API_KEY"):
    # Prompt without echoing so the key is not stored in the notebook or its output.
    os.environ["MECHANEX_API_KEY"] = getpass("MECHANEX_API_KEY: ")

In [None]:
import mechanex as mx
import os

# Hand the key from the environment to mechanex.
# os.environ.get yields None when MECHANEX_API_KEY is not set, and that None is
# passed through to set_key unchanged.
api_key = os.environ.get("MECHANEX_API_KEY")
mx.set_key(api_key)

Let's find steering vectors for our data. First, we generate an unsteered response as a point of comparison.

In [None]:
# Unsteered generation for a simple assistant-style prompt.
# The apostrophe in "user’s" is a real U+2019 character; the previous literal
# contained mis-decoded bytes that were sent to the model verbatim.
prompt = (
    "You are a helpful AI assistant. "
    "Answer the user’s question clearly and politely.\n"
    "User: Where can I buy some good electronics?\n"
    "Assistant:"
)
original = mx.generation.generate(prompt)
print(original)

In [None]:
# Compute steering vectors from the example dataset with the "caa" method;
# per the original note, they are saved into the Axionic steering vector DB.
vector = mx.steering.generate_from_jsonl(
    "./tests/example_dataset.jsonl",
    method="caa",
)

In [None]:
# Show what generate_from_jsonl returned for the steering-vector request.
print(vector)

In [None]:
# Generate a steered and an unsteered completion for the same prompt.
# NOTE(review): the steering vector ID below is a hard-coded literal rather than
# a value taken from `vector` computed earlier - presumably it came from a
# previous run; confirm it exists for your account or substitute the new ID.
test_prompt = "I am feeling so "
steering_strength = 0.3
steering_vector_id = "e74f687a-3d63-4478-a2d5-884ac72005a7"

steered = mx.generation.generate(
    test_prompt,
    steering_strength=steering_strength,
    steering_vector=steering_vector_id,
)
original = mx.generation.generate(test_prompt)

Printing results:

In [None]:
# Print the unsteered and steered generations one after the other, followed by
# a 25-character rule. A single print with sep="\n" emits the same text as
# printing each piece on its own.
print(
    "Original:",
    original,
    "\n\n",
    "Steered:",
    steered,
    "-" * 25,
    sep="\n",
)

We can also vary the sampling method of our original generation.

In [None]:
# Same generate call as before, but with the "ads" sampling method selected.
story_prompt = "Here is the story about the american civil war:"
story = mx.generation.generate(story_prompt, sampling_method="ads")
print(story)

In [None]:
# Step 1: generate the steering vector from the JSONL dataset.
# (Per the original note, computing the vector first is often required.)
vector_response = mx.steering.generate_from_jsonl(
    "./tests/example_dataset.jsonl",
    method="caa",
)

In [None]:
# Show the response returned by the steering-vector request above.
print(vector_response)

In [None]:
# Register a named behavior ("toxicity_improved") built from the example dataset.
behavior_dataset_path = "./tests/example_dataset.jsonl"
behavior = mx.sae.create_behavior_from_jsonl(
    behavior_name="toxicity_improved",
    dataset_path=behavior_dataset_path,
    description="Reduces toxic output",
)

In [None]:
# Bare last expression: the notebook displays whatever list_behaviors() returns.
mx.sae.list_behaviors()

In [None]:
# Generate up to 50 new tokens with the "toxicity_improved" behavior attached.
# The behavior is named in both behavior_names and force_steering, and
# auto_correct is enabled.
feeling_prompt = "How are you feeling about life? I am feeling "
response = mx.sae.generate(
    prompt=feeling_prompt,
    max_new_tokens=50,
    behavior_names=["toxicity_improved"],
    auto_correct=True,
    force_steering=["toxicity_improved"],
)

In [None]:
# Show the result of the behavior-steered generation.
print(response)

In [None]:
# Plain generation for the same "feeling" prompt, using "top-k" sampling.
top_k_output = mx.generation.generate(
    "How are you feeling about life? I am feeling ",
    sampling_method="top-k",
)
print(top_k_output)