In [2]:
import random
from datetime import datetime, timedelta
from typing import List, Tuple, Dict
import numpy as np
from datasets import Dataset, Features, Value, Sequence
import re
from typing import List
from functools import cmp_to_key
from sklearn.linear_model import LogisticRegression
import os


%pdb on

with open("gen_data/NameDatabases/NamesDatabases/first names/us.txt", "r") as file:
    FIRST_NAMES = file.read().splitlines()
MIDDLE_NAMES = FIRST_NAMES

with open("gen_data/NameDatabases/NamesDatabases/surnames/us.txt", "r") as file:
    LAST_NAMES = file.read().splitlines()

with open("gen_data/towns/aus_towns.txt", "r") as file:
    CITIES = file.read().splitlines()

with open("gen_data/universities/chn_univs.txt", "r") as file:
    UNIVERSITIES = file.read().splitlines()


with open("gen_data/employers/ind_employ.txt", "r") as file:
    EMPLOYERS = file.read().splitlines()

used_names = set()

def generate_unique_name() -> Tuple[str, str, str]:
    global used_names

    while True:
        first = random.choice(FIRST_NAMES)
        middle = random.choice(MIDDLE_NAMES)
        last = random.choice(LAST_NAMES)
        if (first, middle, last) not in used_names:
            used_names.add((first, middle, last))
            return first, middle, last

def generate_birthdate() -> datetime:
    start_date = datetime(1900, 1, 1)
    end_date = datetime(2099, 12, 31)
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    random_date = start_date + timedelta(days=random_number_of_days)
    return random_date.replace(day=min(random_date.day, 28))

def generate_all_names(n: int) -> List[str]:
    names = []
    for _ in range(n):
        first, middle, last = generate_unique_name()
        names.append(f"{first} {middle} {last}")
    return names

def partition_names(names: List[str]) -> Tuple[List[str], List[str], List[str], List[str]]:
    random.shuffle(names)
    half = len(names) // 2
    parents = names[:half]
    children = names[half:]
    
    quarter = len(names) // 4
    left_friends = names[:quarter] + names[half:half+quarter]
    right_friends = names[quarter:half] + names[half+quarter:]
    
    return parents, children, left_friends, right_friends

def create_relationships(parents: List[str], children: List[str], left_friends: List[str], right_friends: List[str]) -> Dict[str, Dict[str, str]]:
    relationships = {}
    for parent, child in zip(parents, children):
        relationships[parent] = {"child": child, "best_friend": None}
        relationships[child] = {"parent": parent, "best_friend": None}
    
    for left, right in zip(left_friends, right_friends):
        relationships[left]["best_friend"] = right
        relationships[right]["best_friend"] = left
    
    return relationships

def generate_profiles():
    global N
    all_names = generate_all_names(N)
    parents, children, left_friends, right_friends = partition_names(all_names)
    relationships = create_relationships(parents, children, left_friends, right_friends)

    for name in all_names:
        profile = {
            "name": name,
            "birthdate": generate_birthdate(),
            "birth_city": random.choice(CITIES),
            "university": random.choice(UNIVERSITIES),
            "employer": random.choice(EMPLOYERS),
            "parent": relationships[name].get("parent", ""),
            "child": relationships[name].get("child", ""),
            "best_friend": relationships[name]["best_friend"]
        }
        yield profile

# Update the features to include new fields
chosen_params = Features({
    'name': Value('string'),
    'birthdate': Value('timestamp[s]'),
    'birth_city': Value('string'),
    'university': Value('string'),
    'employer': Value('string'),
    'parent': Value('string'),
    'child': Value('string'),
    'best_friend': Value('string'),
    'bio': Value('string')
})

# Generate the dataset
N = 10000
dataset = Dataset.from_generator(generate_profiles, features=chosen_params)

# Save the dataset
dataset.save_to_disk("profiles_dataset")

loaded_dataset = Dataset.load_from_disk("profiles_dataset")

# Or use the more efficient `map` function:
def is_nyc_born(examples):
    return {'is_nyc': [c == "Alice Springs, Northern Territory" for c in examples['birth_city']]}

nyc_result = loaded_dataset.map(is_nyc_born, batched=True)
nyc_percentage = (nyc_result['is_nyc'].count(True) / N) * 100
print(f"Percentage of people born in Alice Springs: {nyc_percentage:.1f}%")

Automatic pdb calling has been turned ON


Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Percentage of people born in Alice Springs: 0.3%


In [13]:
features_to_template = ['name', 'birthdate', 'birth_city', 'university', 'employer', 'parent', 'child', 'best_friend']

def get_name_sentence_prompt(prompt_prefix: str, all_details: str, chosen_detail: str, order: int) -> str:
    ordered_details = ['name', chosen_detail]
    prompt = prompt_prefix
    prompt += " for introducing the name in a person's biography, written in third-person."
    prompt += " The template should have exactly one wildcard slot consisting of the word 'name' enclosed in curly braces."
    prompt += f" and exactly one wildcard slot for {all_details[chosen_detail]}, consisting of '{chosen_detail}' enclosed in curly braces."
    prompt += f" {ordered_details[order]} should come first and {ordered_details[1-order]} should come second in the template."
    prompt += " The template should be as short as possible, respecting the above constraints."
    return prompt, ordered_details

def other_sentence_prompt_generator(short_detail: str, long_detail: str):
    def get_other_sentence_prompt(prompt_prefix: str, *args) -> str:
        prompt = prompt_prefix
        prompt += f" for introducing the {long_detail} in a person's biography, written in third-person."
        prompt += f" The template should have exactly one wildcard slot for {long_detail}, consisting of '{short_detail}' enclosed in curly braces."
        prompt += " The template should be as short as possible, respecting the above constraints."
        return prompt, (short_detail,)
    return get_other_sentence_prompt

def generate_diverse_prompts(prompt_function, num_prompts=50):
    prompt_parameters = {
        'writing_style': ['concise', 'conversational'],
        'tone': ['casual', 'humorous', 'serious'],
        'sophistication': ['highbrow', 'lowbrow'],
        'formality': ['formal', 'informal'],
        'genre': ['academic', 'journalistic', 'technical'],
        'figurative_language': ['literal', 'idiomatic'],
        'emotion': ['neutral', 'excited'],
        'vocabulary_level': ['basic', 'intermediate', 'advanced'],
    }
    all_details = {'birth date': 'subject\'s date of birth',
               'birth city': 'subject\'s city of birth',
               'university': 'university the subject attended',
               'employer': 'subject\'s current employer'}
    
    prompts = []
    attributes = []
    for _ in range(num_prompts):
        chosen_params = random.sample(list(prompt_parameters.keys()), 3)
        chosen_detail = random.choice(list(all_details.keys()))
        order = random.choice([0, 1])
        ordered_details = ['name', chosen_detail]
        prompt = "Generate a brief sentence template that is "
        prompt += ", ".join([f"{random.choice(prompt_parameters[k])}" for k in chosen_params[:-1]])
        prompt += f" and {random.choice(prompt_parameters[chosen_params[-1]])}"
        prompt, ordered_details = prompt_function(prompt, all_details, chosen_detail, order)
        prompts.append(prompt)
        attributes.append(ordered_details)
    
    return prompts, attributes



In [11]:
from openai import OpenAI
from typing import List

def query_llm(prompt: str) -> List[str]:
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key='goodkey',
    )

    try:
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "YOUR_SITE_URL",  # Replace with your actual site URL
                "X-Title": "YOUR_APP_NAME",  # Replace with your actual app name
            },
            model="deepseek/deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that generates sentence templates for biographies."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=150,
            n=3,  # Generate 3 responses
        )
        
        # Extract the generated templates from the response
        templates = [choice.message.content.strip() for choice in completion.choices]
        
        return templates
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [18]:
def verify_template(template: str, attributes: List[str], top_n: int = 10) -> List[str]:
    # Regex patterns to verify syntax
    attribute_patterns = [r'\{' + attribute + r'\}' for attribute in attributes]
    other_brackets_pattern = r'\[.*?\]|\(.*?\)'
    curly_brackets_pattern = r'\{.*?\}'
    single_end_period_pattern = r'^[^.]*\.$'
    underscore_pattern = r'_'

    # Filter templates that match all criteria
    return (all([len(re.findall(ap, template)) == 1 for ap in attribute_patterns])
        and len(re.findall(curly_brackets_pattern, template)) == len(attributes)
        and not re.search(other_brackets_pattern, template)
        and re.match(single_end_period_pattern, template)
        and not re.search(underscore_pattern, template))
    
def filter_templates(templates: List[str]) -> List[str]:
    templates = list(set(templates))
    return sorted(templates, key=lambda x: len(x))

def compare_templates(t1: str, t2: str, max_retries: int = 2) -> int:
    prompt = f"Compare these two biography template sentences and decide which is better. A good sentence:\n"\
          "  - introduces the name\n"\
          "  - is not too long\n"\
          "  - avoids extraneous information beyond the name\n"\
          "The response should be a single number, either 1 or 2, indicating which template is better, and nothing else.\n"\
          "1: {t1}\n"\
          "2: {t2}"
    response = query_llm(prompt)
    if len(response) >= 1 and response[0] in ['1', '2']:
        return 1 if response[0] == '1' else -1
    
    return None

def generate_comparisons(templates: List[str], num_comparisons: int) -> List[Tuple[int, int, int]]:
    comparisons = []
    for _ in range(num_comparisons):
        i, j = random.sample(range(len(templates)), 2)
        result = compare_templates(templates[i], templates[j])
        if result is not None:
            comparisons.append((i, j, result))
    return comparisons

def estimate_template_quality(templates: List[str], num_comparisons: int = 200) -> List[float]:
    comparisons = generate_comparisons(templates, num_comparisons)
    
    X = []
    y = []
    for i, j, result in comparisons:
        X.append([0] * len(templates))
        X[-1][i] = 1
        X[-1][j] = -1
        y.append((result + 1) // 2)  # Convert -1/1 to 0/1
    
    model = LogisticRegression(fit_intercept=False)
    model.fit(X, y)
    
    return model.coef_[0]

def rank_templates(templates: List[str]) -> List[str]:
    qualities = estimate_template_quality(templates)
    return [t for _, t in sorted(zip(qualities, templates), reverse=True)]

# Use the function to generate diverse prompts
diverse_name_prompts, attributes_list = generate_diverse_prompts(get_name_sentence_prompt, num_prompts=250)
# Usage
templates = []
for prompt, attributes in zip(diverse_name_prompts, attributes_list):
    template = query_llm(prompt)[0]
    if verify_template(template, attributes):
        templates.append(template)
best_name_templates = filter_templates(templates)
print(best_name_templates)

['{birth date}, {name}.', '{name} is at {employer}.', '{employer} employee {name}.', '{birth city} native {name}.', '{name} works at {employer}.', '{name} is a {employer} guy.', '{name} went to {university}.', '{name} is from {birth city}.', "{employer}'s esteemed {name}.", '{employer} technician {name}.', '{name} attended {university}.', 'Born on {birth date}, {name}.', '{birth date}, {name} was born.', '{name} hails from {birth city}.', '{name} is a {employer} employee.', '{name} was born on {birth date}.', '{name} was born in {birth city}.', 'Meet {name}, born on {birth date}.', '{name} is a {employer} professional.', '{name} is a scholar at {university}.', 'At {employer}, {name} is crushing it.', '{name} is a graduate of {university}.', '{name} is a professional at {employer}.', '{birth date} is when {name} popped out.', 'At {university}, they called him {name}.', 'Meet {name}, a proud alum of {university}.', '{name} is the guy who went to {university}.', '{birth city} native {name

In [17]:
diverse_name_prompts

["Generate a brief sentence template that is idiomatic, conversational and excited for introducing the name in a person's biography, written in third-person. The template should have exactly one wildcard slot consisting of the word 'name' enclosed in curly braces. and exactly one wildcard slot for subject's city of birth, consisting of 'birth city' enclosed in curly braces. birth city should come first and name should come second in the template. The template should be as short as possible, respecting the above constraints.",
 "Generate a brief sentence template that is technical, excited and intermediate for introducing the name in a person's biography, written in third-person. The template should have exactly one wildcard slot consisting of the word 'name' enclosed in curly braces. and exactly one wildcard slot for subject's city of birth, consisting of 'birth city' enclosed in curly braces. name should come first and birth city should come second in the template. The template should

In [19]:
all_details = {'birth date': 'subject\'s date of birth',
            'birth city': 'subject\'s city of birth',
            'university': 'university the subject attended',
            'employer': 'subject\'s current employer',
            'best friend': 'subject\'s best friend',
            'parent': 'subject\'s parent',
            'child': 'subject\'s child'}

other_templates = dict()

for k, v in all_details.items():
    templates = []
    prompt_fn = other_sentence_prompt_generator(k, v)

    diverse_other_prompts, attributes_list = generate_diverse_prompts(prompt_fn, num_prompts=250)
    for prompt, attributes in zip(diverse_other_prompts, attributes_list):
        template = query_llm(prompt)[0]
        if verify_template(template, attributes):
            templates.append(template)
    best_other_templates = filter_templates(templates)
    other_templates['_'.join(k.split(' '))] = best_other_templates
print(other_templates)

{'birth_date': ['{birth date}.', 'Born {birth date}.', '{birth date}: Born.', '{birth date} - Born.', 'Born on {birth date}.', '- Born on {birth date}.', 'They were born on {birth date}.', '{birth date} is when they were born.', '{birth date} was when they were born.', '{birth date} is the day they were born.', "{birth date}, that's when this guy popped out.", "{birth date} marked the subject's date of birth.", '{birth date} is the date of birth of the subject.', '{birth date} marks the day they entered the world.', '{birth date} was the date of birth for the subject.', "{birth date} - that's the day this person popped out.", "{birth date}, that's when this dude first popped out.", 'On {birth date}, a luminary was ushered into existence.', '{birth date}: the day the world got a little more awesome.', "{birth date}, that's when this person came into the world.", '{birth date} - the day the world got a little less boring.', '{birth date} is when this dude first popped into the world.', "

In [20]:
os.makedirs("gen_data/templates", exist_ok=True)

# Save best_name_templates
with open("gen_data/templates/best_name_templates.txt", "w") as f:
    for template in best_name_templates:
        f.write(f"{template}\n")

# Save other_templates
for key, templates in other_templates.items():
    with open(f"gen_data/templates/{key}_templates.txt", "w") as f:
        for template in templates:
            f.write(f"{template}\n")

In [21]:
def replace_curly_braces(text):
    def replace_inner(match):
        inner = match.group(1)
        return '{' + inner.replace(' ', '_') + '}'

    pattern = r'\{([^}]+)\}'
    return re.sub(pattern, replace_inner, text)

# Directory containing the template files
template_dir = 'generated_data/templates'

# Iterate over all files in the directory
for filename in os.listdir(template_dir):
    if filename.endswith('.txt'):  # Process only text files
        file_path = os.path.join(template_dir, filename)
        
        # Read the file content
        with open(file_path, 'r') as file:
            content = file.read()
        
        # Apply the replacement
        updated_content = replace_curly_braces(content)
        
        # Write the updated content back to the file
        with open(file_path, 'w') as file:
            file.write(updated_content)
        
        print(f"Processed: {filename}")

print("Replacement complete for all files.")

Processed: parent_templates.txt
Processed: birth_date_templates.txt
Processed: university_templates.txt
Processed: employer_templates.txt
Processed: worst_enemy_templates.txt
Processed: birth_city_templates.txt
Processed: best_friend_templates.txt
Processed: best_name_templates.txt
Processed: child_templates.txt
Replacement complete for all files.
