In [4]:
# %% [markdown]
# ### Step 1: Setup & Imports

import os
import random
import openai
from dotenv import load_dotenv
from collections import Counter

In [3]:
import os

# Correct absolute paths
base_path = r"C:\Users\sushm\OneDrive\Desktop\llm_engineering-main\DBMS-25\names"
first_path = os.path.join(base_path, "first_name.txt")
last_path = os.path.join(base_path, "last_name.txt")

# Verify and load
if not os.path.exists(first_path) or not os.path.exists(last_path):
    print("‚ö†Ô∏è File not found. Check directory contents:")
    print(os.listdir(base_path))
else:
    with open(first_path, "r", encoding="utf-8") as f:
        first_names_raw = [line.strip() for line in f if line.strip()]

    with open(last_path, "r", encoding="utf-8") as f:
        last_names_raw = [line.strip() for line in f if line.strip()]

    print(f"‚úÖ Loaded {len(first_names_raw)} first names and {len(last_names_raw)} last names.")


‚úÖ Loaded 2195 first names and 1038 last names.


In [6]:
# ### Step 2: Read Raw Name Files

# Use the path variables directly (no quotes)
with open(first_path, "r", encoding="utf-8") as f:
    first_names_raw = [line.strip() for line in f if line.strip()]

with open(last_path, "r", encoding="utf-8") as f:
    last_names_raw = [line.strip() for line in f if line.strip()]

print(f"‚úÖ Loaded {len(first_names_raw)} first names and {len(last_names_raw)} last names.")


‚úÖ Loaded 2195 first names and 1038 last names.


In [11]:
from openai import OpenAI
import os, json, random

# Connect to local Ollama
client = OpenAI(
    base_url=os.getenv("OLLAMA_API_BASE_URL", "http://localhost:11434/v1"),
    api_key=os.getenv("OLLAMA_API_KEY", "dummy-key")
)

# ‚úÖ Step 1: Randomly sample names from all alphabets for better coverage
sampled_first_names = random.sample(first_names_raw, min(800, len(first_names_raw)))

# ‚úÖ Step 2: Split into chunks to avoid prompt overflow
chunks = [sampled_first_names[i:i+200] for i in range(0, len(sampled_first_names), 200)]

common_first_names = set()

for i, chunk in enumerate(chunks):
    print(f"üß† Processing chunk {i+1}/{len(chunks)} ...")

    prompt = f"""
    Below is a list of first names (sample {i+1} of {len(chunks)}):
    {chunk}

    From these, extract the most common or natural-sounding names.
    Focus on diversity across alphabets (A‚ÄìZ).
    Return only a JSON array of names, no explanation.
    """

    response = client.chat.completions.create(
        model="llama3.2",  # your Ollama model name
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3
    )

    # Parse response
    content = response.choices[0].message.content.strip()
    try:
        names = json.loads(content)
    except:
        names = [n.strip(" -‚Ä¢,") for n in content.splitlines() if n.strip()]

    common_first_names.update(names)

print(f"‚úÖ Extracted {len(common_first_names)} unique first names.")

# ‚úÖ Step 3: Trim or expand to ensure at least 75
common_first_names = list(common_first_names)
if len(common_first_names) < 75:
    common_first_names += random.sample(first_names_raw, 75 - len(common_first_names))

common_first_names = common_first_names[:max(75, len(common_first_names))]
print("‚úÖ Final list of first names (sample):", common_first_names[:15])


üß† Processing chunk 1/4 ...
üß† Processing chunk 2/4 ...
üß† Processing chunk 3/4 ...
üß† Processing chunk 4/4 ...
‚úÖ Extracted 427 unique first names.
‚úÖ Final list of first names (sample): ['"Sushil"', '"Benoy"', '"Ridhi"', '"Ratnesh"', '"Rabin"', '"Paramjit"', '"Niraj"', '"Nawaz"', '"Gurdayal"', '"Yakub"', 'Here is the list of most common and natural-sounding names from the given list, focusing on diversity across alphabets:', '"Sreelakshmi"', '"Vignesh"', '"Thirumal"', '"Aman"']


In [14]:
# ### Step 4: Generate ~400 Full Names by Combining Randomly

import random

num_names = 400
full_names = set()  # use set to avoid duplicates

while len(full_names) < num_names:
    first = random.choice(common_first_names).strip('"').strip("'").strip()
    last = random.choice(last_names_raw).strip('"').strip("'").strip()
    full_names.add(f"{first} {last}")

full_names = list(full_names)
print(f"‚úÖ Generated {len(full_names)} unique, clean full names.")

# ### Step 5: Save Output to File

with open("combined_names.txt", "w", encoding="utf-8") as f:
    for name in full_names:
        f.write(name + "\n")

print("‚úÖ Done! File 'combined_names.txt' created successfully.")

# Show sample output
print("\nüßæ Sample names:")
for name in full_names[:10]:
    print(name)


‚úÖ Generated 400 unique, clean full names.
‚úÖ Done! File 'combined_names.txt' created successfully.

üßæ Sample names:
Chandresh Naqvi
Layth Dave
Anvi Molla
Sudipto D‚ÄôAlia
Saurav Bedi
Iditri Wagh
Kalapu Malik
Aditi Taneja
Nippu Parihar
Ankurjeet Bistagond
