In [6]:
import pandas as pd
import random
import json
import re

import google.generativeai as genai
from openai import OpenAI
from dotenv import load_dotenv
import os

from tqdm import tqdm

In [7]:
# Read variables from a .env file into the process environment so the API key
# is never hardcoded in the notebook.
load_dotenv()

openai_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_key)

In [8]:
# Seed list of 50 broad themes, grouped five per line for readability.
general_themes = [
    "children", "weather", "fun", "animals", "food",
    "sports", "friendship", "family", "nature", "colors",
    "hobbies", "travel", "music", "love", "work",
    "technology", "dreams", "learning", "health", "adventure",
    "art", "school", "emotions", "seasons", "flowers",
    "computers", "ocean", "festivals", "books", "movies",
    "holidays", "games", "clothes", "time", "plants",
    "cities", "mountains", "cars", "shopping", "memories",
    "houses", "neighbors", "language", "history", "insects",
    "wildlife", "gardens", "streets", "festivities", "stories",
]

In [9]:
import openai

def prompt_model(system_prompt, message, model_name='gpt-4o', temperature=0.0):
    """Send a single system + user exchange to the chat completions API.

    Args:
        system_prompt: Text sent as the "system" message.
        message: Text sent as the "user" message.
        model_name: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The ``content`` of the first choice in the completion. The request
        sets ``response_format`` to ``json_object``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]

    response = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        response_format={"type": "json_object"},
        temperature=temperature,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

def prompt_model_batch(system_prompt, message, index, model_name='gpt-4o', temperature=0.0):
    """Build one batch-request task dict for the chat completions endpoint.

    Nothing is sent here; the function only assembles the dict that is later
    serialised as one line of the batch input file.

    Args:
        system_prompt: Text used as the "system" message.
        message: Text used as the "user" message.
        index: Number used to form the task's ``custom_id`` (``task-<index>``).
            The caller is responsible for advancing it between calls.
        model_name: Model identifier placed in the request body.
        temperature: Sampling temperature placed in the request body.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys.
    """
    # NOTE: the previous `index += 1` here only rebound the local name and had
    # no effect on the caller's counter, so it was removed.
    return {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # Same fields as a direct Chat Completions API call.
            "model": model_name,
            "temperature": temperature,
            "response_format": {
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": message
                }
            ],
        }
    }

def clean_json(response):
    """Extract and parse the JSON array embedded in a model response.

    The text between the first ``[`` and the last ``]`` (inclusive) is taken,
    trailing commas before a closing ``]`` or ``}`` are stripped, and the
    result is parsed with ``json.loads``.

    Args:
        response: Response text that contains a JSON array somewhere in it
            (for example ``{"sentences": ["a", "b"]}`` or a bare list).

    Returns:
        The parsed array as a Python list.

    Raises:
        ValueError: If no ``[...]`` span is found, or (as its subclass
            ``json.JSONDecodeError``) if the extracted text is not valid JSON.
    """
    start = response.find('[')
    end = response.rfind(']')
    if start == -1 or end < start:
        raise ValueError("No JSON array found in response: %r" % response[:200])

    json_str = response[start:end + 1]
    # Models sometimes emit trailing commas, which strict JSON rejects.
    cleaned_json_str = re.sub(r',\s*([\]}])', r'\1', json_str)

    # The previous try/except retried the identical json.loads call on failure,
    # which could only raise the same error again; parse once instead.
    return json.loads(cleaned_json_str)


In [10]:
# System message shared by every request (the trailing space is intentional and preserved).
system_prompt = "You are an AI Assistant carrying out the user's requests and always write the output of your response in JSON format. "

In [11]:
# Prompt template for expanding one general theme into 20 sub-themes; the single
# `{}` placeholder is filled with the theme via str.format.
# Fix: the second fragment now ends with a space so the sentences no longer run
# together as "etc.I would like".
user_prompt_theme = 'You are helping me to generate a dataset for training very small language models. I\'d like your help coming up with categories that our dataset sentences might fall into. In order to encourage diversity of responses, I am going to prompt with a different specific theme each time. ' + \
             'For example, if I gave you the theme "children", you might give me the sub-themes "children playing", "parents and children", "raising children", "children\'s arguments", etc. ' + \
             'I would like to give you the general theme {}. Please can you generate 20 sub-themes under this theme that will help to encourage diversity of sentences in my dataset. I\'d like the output to be in the format of a JSON list of strings.'

In [38]:
# Ask the model for sub-themes of every general theme and persist them one per
# line. The file is opened in append mode, so re-running this cell adds to
# (rather than replaces) whatever subthemes.txt already contains.
for theme in tqdm(general_themes):
    raw_reply = prompt_model(system_prompt, user_prompt_theme.format(theme))
    subthemes = clean_json(raw_reply)
    with open('subthemes.txt', 'a') as file:
        lines = [subtheme + '\n' for subtheme in subthemes]
        file.write(''.join(lines))

100%|██████████| 50/50 [01:59<00:00,  2.40s/it]


In [12]:
# Prompt template for sentence generation; `{}` is replaced with a sub-theme.
user_prompt = (
    "You are helping me to generate a dataset for training very very small language models. "
    "Please can you generate 50 sentences that are no more than 15 words each, with very very simple grammar and restricted to the 2000 most common english words. "
    "The theme of the sentences is: {}. I'd like the output to be in the format of a JSON list of strings."
)

In [13]:
# Read the saved sub-themes back from disk, one per line, trimming surrounding
# whitespace (including the trailing newline) from each entry.
with open('subthemes.txt', 'r') as f:
    subthemes = [line.strip() for line in f.readlines()]

In [14]:
# ## Sync (synchronous, non-batch variant of the sentence generation; currently disabled)

# for theme in tqdm(subthemes):
#     for i in range(3):
#         response = prompt_model(system_prompt, user_prompt.format(theme), model_name='gpt-4o-mini')
#         sentences = clean_json(response)
#         with open('sentences.txt', 'a') as f:
#             for sentence in sentences:
#                 f.write(sentence + '\n')


In [15]:
# Build the batch request list: three requests per sub-theme, each with a
# sequential custom id (task-0, task-1, ...).
tasks = []

for theme in tqdm(subthemes):
    themed_prompt = user_prompt.format(theme)
    for _ in range(3):
        # len(tasks) is the running request count, i.e. the next free index.
        tasks.append(
            prompt_model_batch(
                system_prompt,
                themed_prompt,
                len(tasks),
                model_name='gpt-4o-mini',
                temperature=0.8,
            )
        )

index = len(tasks)

100%|██████████| 1020/1020 [00:00<00:00, 159651.83it/s]


In [None]:
file_name = "batch_tasks.jsonl"

# Serialise the tasks as JSONL: one JSON object per line.
with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

# Upload the task file. The handle is opened in a `with` block so it is closed
# once the upload call returns (previously the file object was never closed).
with open(file_name, "rb") as batch_input:
    batch_file = client.files.create(
      file=batch_input,
      purpose="batch"
    )

# Create the batch job that will run every task in the uploaded file.
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [None]:
# Refresh the batch job to pick up its current state.
batch_job = client.batches.retrieve(batch_job.id)

result_file_id = batch_job.output_file_id
# Without an output file id there is nothing to download; fail with a clear
# message instead of passing None to the files API.
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run this cell once the batch has finished."
    )
result = client.files.content(result_file_id).content

result_file_name = "batch_job_results.jsonl"

# `result` is raw bytes, so the file is written in binary mode.
with open(result_file_name, 'wb') as file:
    file.write(result)

In [70]:
# Parse the batch result file: one JSON record per line.
with open(result_file_name, 'r') as file:
    responses = [json.loads(line) for line in file.readlines()]

# Pull the assistant message text out of each record.
responses_json = []
for record in responses:
    message = record['response']['body']['choices'][0]['message']
    responses_json.append(message['content'])

# Parse each message into its list of sentences and flatten into a single list.
sentences = []
for content in responses_json:
    sentences.extend(clean_json(content))

In [75]:
# Append the generated sentences to sentences.txt, one per line.
# Each line is newline-terminated (matching how subthemes.txt is written). The
# previous `f.writelines('\n' + sentence)` was passed a string and wrote a
# leading newline instead, leaving a blank first line and no final newline.
# Append mode: re-running this cell adds the sentences again.
with open('sentences.txt', 'a') as f:
    for sentence in sentences:
        f.write(sentence + '\n')

In [71]:
# Number of entries in the flattened `sentences` list.
len(sentences)

163999

In [56]:
# Preview only the first couple of raw responses; displaying the whole list
# floods the notebook output and bloats the saved file.
responses_json[:2]

['{\n  "sentences": [\n    "The sun is bright in the sky.",\n    "I love sunny days at the park.",\n    "We can play outside when it is sunny.",\n    "The flowers bloom on sunny days.",\n    "Sunny days make me feel happy.",\n    "Birds sing when the sun is out.",\n    "Children laugh and run in the sun.",\n    "A sunny day is great for a picnic.",\n    "The beach is fun on sunny days.",\n    "We can ride bikes on sunny afternoons.",\n    "The sun warms the earth and us.",\n    "I like to read in the sun.",\n    "Sunny days are perfect for ice cream.",\n    "The sky is blue when it is sunny.",\n    "I enjoy watching clouds on sunny days.",\n    "We have more fun on sunny weekends.",\n    "Sunshine makes the day feel special.",\n    "On sunny days, I wear my hat.",\n    "The garden looks pretty in the sun.",\n    "Sunny afternoons are best for napping.",\n    "I feel good when the sun shines.",\n    "The sun rises early on sunny days.",\n    "We take long walks when it is sunny.",\n    

'{\n  "sentences": [\n    "The sun is bright in the sky.",\n    "I love sunny days at the park.",\n    "We can play outside when it is sunny.",\n    "The flowers bloom on sunny days.",\n    "Sunny days make me feel happy.",\n    "Birds sing when the sun is out.",\n    "Children laugh and run in the sun.",\n    "A sunny day is great for a picnic.",\n    "The beach is fun on sunny days.",\n    "We can ride bikes on sunny afternoons.",\n    "The sun warms the earth and us.",\n    "I like to read in the sun.",\n    "Sunny days are perfect for ice cream.",\n    "The sky is blue when it is sunny.",\n    "I enjoy watching clouds on sunny days.",\n    "We have more fun on sunny weekends.",\n    "Sunshine makes the day feel special.",\n    "On sunny days, I wear my hat.",\n    "The garden looks pretty in the sun.",\n    "Sunny afternoons are best for napping.",\n    "I feel good when the sun shines.",\n    "The sun rises early on sunny days.",\n    "We take long walks when it is sunny.",\n    "

In [None]:
# with open('sentences.txt', 'a') as f:
#     for sentence in sentences:
#         f.write(sentence + '\n')