In [1]:
!pip install numpy pandas scikit-learn matplotlib seaborn tqdm transformers torch openai anthropic
import numpy

Collecting anthropic
  Downloading anthropic-0.83.0-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.83.0-py3-none-any.whl (456 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.0/457.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.83.0


In [2]:
# Attach Google Drive to the Colab filesystem; the cells below read from and
# write to paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import shutil

SRC_ON_DRIVE = '/content/drive/My Drive/src'
LOCAL_SRC = '/content/src'

# copytree will not write into an existing directory by default, so any copy
# left over from a previous run is removed first (a missing directory is fine).
shutil.rmtree(LOCAL_SRC, ignore_errors=True)
shutil.copytree(SRC_ON_DRIVE, LOCAL_SRC)
print("src folder copied.")

src folder copied.


In [4]:
# Print the package's __init__.py from the local copy of src, which shows the
# names that `corpus_generator` re-exports.
!cat /content/src/corpus_generator/__init__.py

from .base_generator import TextGenerator
from .openai_generator import OpenAIGenerator
from .together_generator import TogetherGenerator
from .anthropic_generator import AnthropicGenerator
from .mistral_generator import MistralGenerator
from .utils import save_corpus

In [5]:
import importlib
import sys

SRC_DIR = '/content/src'
PACKAGE_NAME = 'corpus_generator'

# Evict the package AND every cached submodule. Deleting only the top-level
# entry leaves e.g. `corpus_generator.openai_generator` in sys.modules, so the
# package's __init__ would re-bind the old submodule objects and edits to the
# copied sources would silently not take effect.
for cached_name in list(sys.modules):
    if cached_name == PACKAGE_NAME or cached_name.startswith(PACKAGE_NAME + '.'):
        del sys.modules[cached_name]

# Register the source directory once; re-running this cell must not keep
# stacking duplicate entries onto sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# /content/src is deleted and re-created by the copy cell, so drop any stale
# directory listings held by the import finders before importing again.
importlib.invalidate_caches()

from corpus_generator import OpenAIGenerator, TogetherGenerator, AnthropicGenerator, MistralGenerator, save_corpus
print("Imports successful!")

Imports successful!


# **API KEYS**

In [None]:
import os
from getpass import getpass


def _load_api_key(env_var: str) -> str:
    """Return the API key named ``env_var`` without storing it in the notebook.

    The environment variable of the same name is used when it is set and
    non-empty; otherwise the user is prompted with hidden input.

    Raises:
        ValueError: if neither source yields a non-empty key.
    """
    key = os.environ.get(env_var, "").strip()
    if not key:
        # getpass does not echo, so the key never lands in the saved cell output.
        key = getpass(f"{env_var}: ").strip()
    if not key:
        raise ValueError(f"{env_var} is required but was not provided.")
    return key


# Secrets are resolved at run time instead of being written as literals, so the
# notebook can be shared or committed without leaking credentials.
OPENAI_API_KEY = _load_api_key("OPENAI_API_KEY")
MISTRAL_API_KEY = _load_api_key("MISTRAL_API_KEY")
ANTHROPIC_API_KEY = _load_api_key("ANTHROPIC_API_KEY")
TOGETHER_API_KEY = _load_api_key("TOGETHER_API_KEY")

# **Prompt and Model Definition**

In [7]:
# One fixed prompt per genre: the key is the genre label, the value is the
# prompt text.
prompts = dict(
    narration="Write a short story about a robot learning to paint.",
    argumentation="Argue for or against the use of AI in creative writing.",
    dialogue="Write a dialogue between two friends discussing climate change.",
    description="Describe a sunset over the ocean in vivid detail.",
)

# (label, generator class, model identifier, API key) for each model under study.
_model_specs = [
    ("GPT", OpenAIGenerator, "gpt-4o-mini", OPENAI_API_KEY),
    ("LLaMA", TogetherGenerator, "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", TOGETHER_API_KEY),
    ("Claude", AnthropicGenerator, "claude-sonnet-4-6", ANTHROPIC_API_KEY),
    ("Mistral", MistralGenerator, "mistral-small-latest", MISTRAL_API_KEY),
]

# Expand the table into label -> {"class": ..., "params": {...}}; "params" holds
# the keyword arguments later passed to the generator class.
models = {
    label: {"class": generator_cls, "params": {"model_name": model_id, "api_key": key}}
    for label, generator_cls, model_id, key in _model_specs
}

# **Corpus Generation**

In [8]:
N_PER_PROMPT = 20  # Texts generated for each (model, prompt) pair; can be modified
corpus = []

for model_name, model_info in models.items():
    # Build one generator per model and reuse it for every genre and repetition.
    generator_cls = model_info["class"]
    generator_kwargs = model_info["params"]
    generator = generator_cls(**generator_kwargs)

    for genre, prompt in prompts.items():
        for i in range(N_PER_PROMPT):
            print(f"Generating {model_name} - {genre} - {i}")
            text = generator.generate(prompt)
            # Keep the provenance next to the text so each record is self-describing.
            record = dict(model=model_name, genre=genre, prompt=prompt, text=text)
            corpus.append(record)

Generating GPT - narration - 0
Generating GPT - narration - 1
Generating GPT - narration - 2
Generating GPT - narration - 3
Generating GPT - narration - 4
Generating GPT - narration - 5
Generating GPT - narration - 6
Generating GPT - narration - 7
Generating GPT - narration - 8
Generating GPT - narration - 9
Generating GPT - narration - 10
Generating GPT - narration - 11
Generating GPT - narration - 12
Generating GPT - narration - 13
Generating GPT - narration - 14
Generating GPT - narration - 15
Generating GPT - narration - 16
Generating GPT - narration - 17
Generating GPT - narration - 18
Generating GPT - narration - 19
Generating GPT - argumentation - 0
Generating GPT - argumentation - 1
Generating GPT - argumentation - 2
Generating GPT - argumentation - 3
Generating GPT - argumentation - 4
Generating GPT - argumentation - 5
Generating GPT - argumentation - 6
Generating GPT - argumentation - 7
Generating GPT - argumentation - 8
Generating GPT - argumentation - 9
Generating GPT - arg

# **Save to disk**

In [9]:
# Make sure the local output folders exist, then hand the corpus to save_corpus.
import os

for data_dir in ('/content/data/raw', '/content/data/results'):
    os.makedirs(data_dir, exist_ok=True)

save_corpus(
    corpus,
    base_path='/content/data/raw',
    metadata_path='/content/data/metadata.csv',
)
print("Corpus saved.")

Saved 320 documents to /content/data/raw
Corpus saved.


In [10]:
# Save data to Drive
!cp -r /content/data "/content/drive/My Drive/aesthetics_project/"