# Constitutional AI: Exploring Self-Critique in Uncensored Models

1. Install the required packages

In [None]:
!pip install -r requirements.txt

2. Run the experiments

In [None]:
import json
import requests
from tqdm import tqdm

# Load the JSON files.
# The encoding is given explicitly so the files are decoded the same way on
# every platform instead of depending on the locale's default encoding.
# `principles` is indexed positionally by the experiment loop, with
# "CritiqueRequest" / "RevisionRequest" keys read from each entry.
with open('Constitutional_AI_Principles.json', 'r', encoding='utf-8') as f:
    principles = json.load(f)

# `prompts` is iterated as: critique type -> category -> list of prompts.
with open('prompts.json', 'r', encoding='utf-8') as f:
    prompts = json.load(f)

# Define the API endpoint
# NOTE(review): port 11434 and the /api/generate path look like a local
# Ollama server -- confirm before pointing this at another backend.
api_url = "http://localhost:11434/api/generate"

# Define the function to get a response from the API
def get_response(prompt, model="llama2-uncensored", context=None, timeout=None):
    """Send a prompt to the generation endpoint and return the decoded JSON reply.

    When ``context`` is a non-empty mapping, the recognised keys are rendered
    as a transcript, in this fixed order, and placed in front of ``prompt``:

    * ``"context"``          -- emitted verbatim, followed by a newline
    * ``"initial_response"`` -- emitted as ``"Assistant: ..."`` plus a newline
    * ``"critique"``         -- emitted verbatim, followed by a newline

    Args:
        prompt: Text appended after the rendered context.
        model: Model name forwarded in the request payload.
        context: Optional mapping holding any of the keys listed above.
            ``None`` or an empty mapping sends ``prompt`` on its own.
        timeout: Optional timeout (seconds) forwarded to ``requests.post``.
            The default ``None`` waits indefinitely, as before.

    Returns:
        The decoded JSON body of the reply. Callers in this file read its
        ``"response"`` key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection or timeout failures.
    """
    # Format context as a natural conversation
    transcript = []
    if context:
        if "context" in context:
            transcript.append(f"{context['context']}\n")
        if "initial_response" in context:
            transcript.append(f"Assistant: {context['initial_response']}\n")
        if "critique" in context:
            transcript.append(f"{context['critique']}\n")

    formatted_prompt = "".join(transcript) + f"{prompt}"

    data = {
        "model": model,
        "prompt": formatted_prompt,
        # Ask for one complete JSON document rather than a chunked stream.
        "stream": False,
    }

    # print(f"\n\n🤖 Prompt sent:\n{formatted_prompt}\n-----------------")

    response = requests.post(api_url, json=data, timeout=timeout)
    # Fail here with the HTTP status instead of letting an error body reach
    # the caller and blow up later as a KeyError on 'response'.
    response.raise_for_status()
    return response.json()
    
    # return {
    #     "response": "This is a sample response."
    # }

# Modified version with context tracking
# Run the critique -> revision experiment for every prompt and record the
# results, grouped as interactions[critique_type][category] -> list of dicts.
interactions = {"neutral": {}, "negative": {}}

for critique_type, categories in tqdm(prompts.items(), desc="Critique Types"):
    # The inner loop variable must NOT be called `prompts`: rebinding the
    # global dict to a list makes this cell fail when it is run a second time.
    for category, category_prompts in tqdm(categories.items(), desc="Categories", leave=False):
        # setdefault tolerates critique types beyond the two pre-seeded ones.
        interactions.setdefault(critique_type, {})[category] = []

        for prompt in tqdm(category_prompts, desc="Prompts", leave=False):
            print(f"📝 Generating interactions for {category} with {critique_type} critique")

            print(f"📝 Prompt: {prompt}")
            # The first call carries no prior conversation.
            initial_response = get_response(prompt)
            initial_text = initial_response['response']
            print(f"🤖 Initial response: {initial_text}")

            # Conversation so far: the user prompt and the model's first answer.
            conversation_history = {
                "context": prompt,
                "initial_response": initial_text,
            }

            # NOTE(review): the first principle is used for every critique
            # type and category -- confirm this is intended.
            principle = principles[0]

            # Get critique with updated context
            critique_prompt = principle["CritiqueRequest"]
            critique_response = get_response(critique_prompt, context=conversation_history)
            critique_text = critique_response['response']
            print(f"🤖 Critique: {critique_text}")

            # Update context with critique
            conversation_history["critique"] = critique_text

            # Get revision with full context
            revision_prompt = principle["RevisionRequest"]
            revised_response = get_response(revision_prompt, context=conversation_history)
            revised_text = revised_response['response']
            print(f"🤖 Revised response: {revised_text}")

            # Save the complete interaction
            interaction = {
                "prompt": prompt,
                "initial_response": initial_text,
                "critique": critique_text,
                "revised_response": revised_text,
            }

            interactions[critique_type][category].append(interaction)
            # Rewrite the whole file after every prompt so that partial
            # results survive an interrupted run.
            with open("interactions.json", 'w', encoding='utf-8') as f:
                json.dump(interactions, f, indent=4)
            print("✅ Interaction saved\n")