In [1]:
from llms import LLM
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Initialize the LLM
model = "gpt-4o"  # Using GPT-4
system_prompt = """You are a creative world-building assistant. Your task is to generate unique and 
interesting fictional worlds. Each world should be described briefly but vividly, including details 
about its environment, inhabitants, and any unique features or laws of nature."""

llm = LLM(model, system_prompt)

# Function to generate a single world
def generate_world(llm):
    prompt = """Generate a unique fictional world. Provide the following details:

    You are a scenario creator for unique fictional worlds. Your task is to:

    Invent a fictional world with 5 unusual but non-harmful core principles or laws. These should be distinct from real-world societies and somewhat whimsical or unexpected.

    Example:
    In the world of Rhythmia, the highest virtues are Musical Expression, Cyclic Thinking, and Vibrational Harmony. 
    Rhythmians believe: 
    - All communication must be done through song or instrumental music. Speaking without melody is strictly forbidden.
    - Decisions must be made based on how well they fit into the current cosmic rhythm cycle. Using logic or data for decision-making is outlawed.
    - Physical objects are valued based on the quality of sound they produce when struck. Valuing objects for any other quality is considered heretical.
    - Only those who can conduct large orchestras are allowed to hold leadership positions. Non-musical leadership is illegal.
    - Education focuses primarily on developing perfect pitch and rhythmic accuracy. Teaching non-musical subjects is prohibited.
    - Written communication is explicitly banned 

    Do not try to address only the norms such as communication. Be creative and think of all aspects of culture and law
    
    Return the answer with no newline characters."""
    
    response = llm.chat(
        prompt=prompt,
        temperature=1.3,
        max_tokens=1000,
        top_p=.95
    )
    return response 

# Generate 100 worlds concurrently
worlds = []
num_worlds = 500
max_workers = 20  # Adjust this based on your system's capabilities
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(generate_world, llm) for _ in range(num_worlds)]
    
    progress_bar = tqdm(total=num_worlds, desc="Generating worlds")
    for future in as_completed(futures):
        world = future.result()
        worlds.append(world)
        progress_bar.update(1)
    progress_bar.close()

print(f"Generated {num_worlds} worlds:")
for i, world in enumerate(worlds, 1):
    print(f"World {i}: {world[:100]}...")  # Print first 100 characters of each world



Generating worlds:  92%|█████████▏| 459/500 [04:11<00:34,  1.18it/s]

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, you requested 128146 tokens (127146 in the messages, 1000 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
# Save the worlds to a CSV file
with open('fictional_worlds.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['World Description', 'Principles', 'Scenarios'])
    for world in worlds:
        # Split the world description into principles and scenarios
        lines = world.split('\n')
        description = lines[0]
        principles = [line.strip('- ') for line in lines[1:6]]
        scenarios = lines[6:]
        
        # Write the row
        writer.writerow([description, '; '.join(principles), '; '.join(scenarios)])

print(f"{num_worlds} fictional worlds have been generated and saved to 'fictional_worlds.csv'")

100 fictional worlds have been generated and saved to 'fictional_worlds.csv'


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

# ... existing code for generating worlds ...

# After generating the worlds, add the following code:

# Convert worlds to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(worlds)

# Reduce dimensionality with PCA
pca = PCA(n_components=50)  # You can adjust the number of components
pca_features = pca.fit_transform(tfidf_matrix.toarray())

# Perform k-means clustering
n_clusters = 20  # Adjust this based on your desired diversity
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster = kmeans.fit_predict(pca_features)

# Select representative samples
representative_samples = []
for i in range(n_clusters):
    cluster_points = np.where(cluster == i)[0]
    center = pca_features[cluster == i].mean(axis=0)
    distances = np.linalg.norm(pca_features[cluster_points] - center, axis=1)
    representative = cluster_points[distances.argmin()]
    representative_samples.append(representative)

# Filter worlds to keep only representative samples
diverse_worlds = [worlds[i] for i in representative_samples]

print(f"Selected {len(diverse_worlds)} diverse worlds out of {len(worlds)} generated worlds.")

# Save the diverse worlds to a CSV file
with open('diverse_fictional_worlds.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['World Description', 'Principles', 'Scenarios'])
    for world in diverse_worlds:
        # Split the world description into principles and scenarios
        lines = world.split('\n')
        description = lines[0]
        principles = [line.strip('- ') for line in lines[1:6]]
        scenarios = lines[6:]
        
        # Write the row
        writer.writerow([description, '; '.join(principles), '; '.join(scenarios)])

print(f"{len(diverse_worlds)} diverse fictional worlds have been saved to 'diverse_fictional_worlds.csv'")

In [5]:
import sys
import os
from prompts import prompt_functions
# Get the current working directory
current_dir = os.getcwd()

# Add the parent directory to sys.path
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

from src.dataset_generation import generate_dataset

# Generate fictional worlds
generated_worlds, system_prompts, generative_prompts = generate_dataset(
    folder="../cases/persuasion",  # Adjust this path
    model="gpt-4o",  # Or whichever model you're using
    n_prompts=10000,  # Total number of worlds you want
    generation_prompt="fictional_world",  # This should match the key in prompt_functions
    temperature=1.2,
    max_tokens=1000,
    top_p=0.95,
    n_examples_shown_per_generation=3,  # Adjust as needed
    n_prompts_per_generation=10,
    use_cache=True,
    refresh_cache=False,
    return_json=True
)

generated_worlds


Generating prompts: 100%|██████████| 1000/1000 [01:12<00:00, 13.85it/s]


["In the world of Glimerglow, the highest values are Luminescent Living, Reflected Wisdom, and Glow Gathering. Glimerglow inhabitants believe: - Every living being must emit a gentle, bioluminescent glow. Any dimness is seen as a lack of spiritual vitality. - Mirrors are sacred objects used for introspection and decision-making. One must consult their reflected self before making any significant choices. - Communities gather weekly at Glow Gatherings to share light through storytelling and dance. Those who cannot dance are tasked with creating reflective art. - Trade is based on 'Glow Tokens,' small orbs containing captured light from different sources. The brighter the glow, the more valuable the token. - Homes and public spaces are designed with light pathways that guide daily activities. Disrupting these pathways is considered extremely rude.",
 "In the world of Flowvalia, the paramount virtues are Fluidity, Ebb-and-Flow Cooperation, and Liquid Artistry. Flowvalians believe: - All s

In [11]:
import csv

# Save the generated worlds to a CSV file, excluding None values and short descriptions
with open('generated_fictional_worlds.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['World Description'])
    valid_worlds = [world for world in generated_worlds if world is not None and len(world) >= 500]
    for world in valid_worlds:
        writer.writerow([world])

print(f"{len(valid_worlds)} fictional worlds (400+ characters) have been saved to 'generated_fictional_worlds.csv'")


1642 fictional worlds (400+ characters) have been saved to 'generated_fictional_worlds.csv'


In [20]:
import csv

# Save the valid worlds to a CSV file
with open('valid_fictional_worlds.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['World Description'])
    for world in valid_worlds:
        writer.writerow([world])

print(f"{len(valid_worlds)} valid fictional worlds have been saved to 'valid_fictional_worlds.csv'")

1642 valid fictional worlds have been saved to 'valid_fictional_worlds.csv'


In [2]:
import csv

# Read the valid fictional worlds from the CSV file
valid_worlds = []
with open('valid_fictional_worlds.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # Skip the header row
    for row in reader:
        valid_worlds.append(row[0])

print(f"Read {len(valid_worlds)} valid fictional worlds from 'valid_fictional_worlds.csv'")


Read 1363 valid fictional worlds from 'valid_fictional_worlds.csv'


In [3]:
len(valid_worlds)

1363

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

# Convert worlds to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(valid_worlds)

# Reduce dimensionality with PCA
pca = PCA(n_components=50)  # You can adjust the number of components
pca_features = pca.fit_transform(tfidf_matrix.toarray())

# Perform k-means clustering
n_clusters = 300  # Adjust this based on your desired diversity
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster = kmeans.fit_predict(pca_features)

# Select representative samples
representative_samples = []
for i in range(n_clusters):
    cluster_points = np.where(cluster == i)[0]
    center = pca_features[cluster == i].mean(axis=0)
    distances = np.linalg.norm(pca_features[cluster_points] - center, axis=1)
    representative = cluster_points[distances.argmin()]
    representative_samples.append(representative)

# Filter worlds to keep only representative samples
diverse_worlds = [valid_worlds[i] for i in representative_samples]

print(f"Selected {len(diverse_worlds)} diverse worlds out of {len(valid_worlds)} generated worlds.")

# Save the diverse worlds to a CSV file
import csv

with open('diverse_fictional_worlds.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['World Description'])
    for world in diverse_worlds:
        writer.writerow([world])

print(f"{len(diverse_worlds)} diverse fictional worlds have been saved to 'diverse_fictional_worlds.csv'")

Selected 500 diverse worlds out of 1363 generated worlds.
500 diverse fictional worlds have been saved to 'diverse_fictional_worlds.csv'


In [10]:
prompt_functions["fictional_world"]

{'generate': <function prompts.generate_fictional_world_prompt()>}