This notebook creates a synthetic dataset locally with gpt-oss 20b. It also features deterministically produced timestamps to showcase the analytical capabilities of the POC.

In [1]:
# Cell 1: Imports, configuration, and topic pattern assignment

import os
import math
import uuid
import json
import random
import asyncio
from datetime import datetime, timedelta
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
import httpx

# -----------------------------
# Reproducibility
# -----------------------------
# Seed both the stdlib and the NumPy global RNGs; the random draws in this
# notebook go through one of these two generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# -----------------------------
# Core configuration
# -----------------------------
TOTAL_EMAILS = 2500
DAYS = 60  # ~2 months
# NOTE(review): with ENDS_TODAY=True the date window is anchored to
# datetime.now(), so the generated dates depend on the day the notebook is run
# even though the RNGs are seeded. Set it to False for a fully fixed window.
ENDS_TODAY = True  # if False, set FIXED_END_DATE below
FIXED_END_DATE = datetime(2025, 6, 30).date()  # used only if ENDS_TODAY=False

# Ollama (local LLM) configuration
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
LLM_MODEL = os.environ.get("LLM_MODEL", "gpt-oss:20b")
CONCURRENCY = 8  # tune based on your machine
GEN_TIMEOUT = 90  # seconds

# Output
# The default path is relative and therefore resolved against the kernel's
# current working directory. Running the notebook from inside
# ModelFinetuning/data_creation produces a doubled
# ".../ModelFinetuning/data_creation/ModelFinetuning/data_creation/emails_data"
# directory. Set EMAILS_OUTPUT_DIR to choose the location explicitly; without
# it the original default is kept.
OUTPUT_DIR = os.path.abspath(
    os.environ.get("EMAILS_OUTPUT_DIR", "ModelFinetuning/data_creation/emails_data")
)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# File name follows TOTAL_EMAILS so it cannot drift from the actual row count.
OUTPUT_CSV = os.path.join(OUTPUT_DIR, f"email_dataset_{TOTAL_EMAILS}_with_timestamps.csv")

# Topics (exactly one topic per email)
TOPICS = [
    "Healthcare",
    "Education",
    "Crime",
    "Immigration",
    "Transportation",
    "Environment",
    "Taxation",
    "Housing",
    "Pensions",
]

# Sentiments, ordered from most negative to most positive
SENTIMENTS = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]

# Length distribution (optional metadata)
LENGTH_BUCKETS = ["Short", "Medium", "Long"]
LENGTH_PROBS = [0.10, 0.80, 0.10]  # aligned index-by-index with LENGTH_BUCKETS
LENGTH_TO_WORDS = {"Short": (60, 120), "Medium": (130, 280), "Long": (300, 600)}  # inclusive (min, max) word targets

# -----------------------------
# Frequency trends: shuffle the topics once and slice into 3 "up", 3 "down"
# and the remaining "flat" (constant-ish) topics.
# -----------------------------
topics_shuffled = list(TOPICS)
random.shuffle(topics_shuffled)
freq_up, freq_down, freq_flat = topics_shuffled[:3], topics_shuffled[3:6], topics_shuffled[6:]

# Label lookup; any topic that is neither "up" nor "down" falls back to "flat".
_freq_labels = {**dict.fromkeys(freq_up, "up"), **dict.fromkeys(freq_down, "down")}
FREQ_TREND = {t: _freq_labels.get(t, "flat") for t in TOPICS}

# -----------------------------
# Sentiment trends, drawn with a second, independent shuffle:
# "up" = more negative over time, "down" = more positive over time, rest "flat".
# -----------------------------
topics_shuffled2 = list(TOPICS)
random.shuffle(topics_shuffled2)
sent_up, sent_down, sent_flat = topics_shuffled2[:3], topics_shuffled2[3:6], topics_shuffled2[6:]

_sent_labels = {**dict.fromkeys(sent_up, "up"), **dict.fromkeys(sent_down, "down")}
SENT_TREND = {t: _sent_labels.get(t, "flat") for t in TOPICS}

print("Frequency trends per topic:", FREQ_TREND)
print("Sentiment trends per topic:", SENT_TREND)

Frequency trends per topic: {'Healthcare': 'flat', 'Education': 'flat', 'Crime': 'down', 'Immigration': 'up', 'Transportation': 'down', 'Environment': 'flat', 'Taxation': 'up', 'Housing': 'up', 'Pensions': 'down'}
Sentiment trends per topic: {'Healthcare': 'down', 'Education': 'flat', 'Crime': 'up', 'Immigration': 'down', 'Transportation': 'flat', 'Environment': 'up', 'Taxation': 'down', 'Housing': 'up', 'Pensions': 'flat'}


In [2]:
# Cell 2: Helpers for trends, seasonality, spikes, and daily totals

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with NumPy (elementwise for arrays)."""
    decay = np.exp(-x)
    return 1.0 / (1.0 + decay)

def make_trend(kind: str, days: int) -> np.ndarray:
    """Return a length-`days` trend curve, floored at 0.1.

    kind == "up"   -> sigmoid over linspace(-2, 2): gently increasing.
    kind == "down" -> 1 - sigmoid over the same grid: gently decreasing.
    anything else  -> constant 0.6 plus N(0, 0.03) noise from the global NumPy RNG.
    """
    if kind not in ("up", "down"):
        # Flat case: a constant-ish baseline with a tiny wobble.
        wobble = np.random.normal(0.0, 0.03, size=days)
        return np.clip(np.full(days, 0.6) + wobble, 0.1, None)

    rising = sigmoid(np.linspace(-2.0, 2.0, days))
    curve = rising if kind == "up" else 1.0 - rising
    return np.clip(curve, 0.1, None)

def weekly_seasonality(days: int, weekend_factor: float = 0.8) -> np.ndarray:
    """Per-day multiplier: `weekend_factor` on Saturdays/Sundays, 1.0 otherwise.

    The window covers `days` consecutive dates ending on today's date when
    ENDS_TODAY is true, otherwise on FIXED_END_DATE.
    """
    last_day = datetime.now().date() if ENDS_TODAY else FIXED_END_DATE
    first_day = last_day - timedelta(days=days - 1)
    factors = [
        # weekday(): Monday is 0, so 5 and 6 are the weekend
        weekend_factor if (first_day + timedelta(days=offset)).weekday() >= 5 else 1.0
        for offset in range(days)
    ]
    return np.array(factors)

def smooth_wave(days: int, amplitude: float = 0.15, period: float = 30.0, phase: float = 0.5) -> np.ndarray:
    """Sinusoidal multiplier around 1.0: 1 + amplitude * sin(2*pi*day/period + phase)."""
    day_idx = np.arange(days)
    angle = 2 * np.pi * day_idx / period + phase
    return 1.0 + amplitude * np.sin(angle)

def add_spikes(base: np.ndarray, num_spikes: int = 2, max_amp: float = 0.8, width_range: Tuple[float, float] = (2.0, 6.0)) -> np.ndarray:
    """Add `num_spikes` Gaussian bumps to a 1-D series and return the sum as a new array.

    Each bump draws, in this order, from the global NumPy RNG:
    a centre index in the middle 80% of the series, a width from `width_range`,
    and an amplitude in [0.2, max_amp).

    Args:
        base: series to decorate; it is not modified.
        num_spikes: number of bumps to add.
        max_amp: upper bound for the bump amplitude.
        width_range: (low, high) bounds for the Gaussian width (std-dev, in samples).

    Returns:
        `base` plus the accumulated bumps, as a float array.
    """
    n = len(base)
    x = np.arange(n)
    # Always accumulate in float: zeros_like(base) would be an integer array for
    # integer input and the in-place float add below would then raise.
    spikes = np.zeros(n, dtype=float)
    if n > 0:
        lo = int(0.1 * n)
        # For n >= 2 this equals int(0.9 * n); the max() only matters for n == 1,
        # where randint(0, 0) would otherwise raise ValueError.
        hi = max(int(0.9 * n), lo + 1)
        for _ in range(num_spikes):
            center = np.random.randint(lo, hi)
            width = np.random.uniform(*width_range)
            amp = np.random.uniform(0.2, max_amp)
            spikes += amp * np.exp(-0.5 * ((x - center) / width) ** 2)
    return base + spikes

def normalize_rows_to_probs(M: np.ndarray) -> np.ndarray:
    """Divide each row of a 2-D array by its row sum.

    Rows whose sum is exactly zero are left at the uniform value 1 / n_columns.
    The input array is not modified.
    """
    totals = M.sum(axis=1, keepdims=True)
    # Pre-fill with the uniform fallback; divide only overwrites rows with a non-zero sum.
    probs = np.ones_like(M) / M.shape[1]
    np.divide(M, totals, out=probs, where=totals != 0)
    return probs

def integer_allocation_from_probs(total: int, probs: np.ndarray) -> np.ndarray:
    """Split `total` into integer counts proportional to `probs` (largest-remainder rounding).

    Each share is floored, then the units still missing are handed out one each
    to the entries with the largest fractional parts.
    """
    shares = probs * total
    counts = np.floor(shares).astype(int)
    leftover = total - counts.sum()
    # Indices sorted by descending fractional part.
    by_fraction = np.argsort(-(shares - counts))
    # argsort indices are unique, so the fancy-indexed increment adds exactly 1 per index.
    counts[by_fraction[:leftover]] += 1
    return counts

def build_day_totals(days: int, total_emails: int) -> np.ndarray:
    """Distribute `total_emails` over `days` as integer daily volumes.

    Daily weights are the product of weekly seasonality (weekends at 0.8), a slow
    28-day sine wave and light multiplicative noise from the global NumPy RNG,
    floored at 0.4 and normalised to sum to 1.

    Returns:
        Integer array of length `days`, produced by `integer_allocation_from_probs`.
    """
    weekday_factor = weekly_seasonality(days, weekend_factor=0.8)
    wave = smooth_wave(days, amplitude=0.12, period=28.0, phase=0.7)
    noise = np.random.normal(1.0, 0.04, size=days)
    weights = np.clip(weekday_factor * wave * noise, 0.4, None)
    # Reuse the shared largest-remainder helper instead of repeating its body here.
    return integer_allocation_from_probs(total_emails, weights / weights.sum())

In [3]:
# Cell 3: Build per-topic daily frequencies and per-topic daily sentiment "negativity index"

# Frequency intensity per topic per day.
# One column per topic: trend * weekly seasonality * slow wave, plus light noise,
# floored at 0.05, then decorated with two spikes. The order of the RNG draws
# (trend, wave phase, noise, spikes) is kept fixed so a given seed reproduces
# the same matrix.
freq_columns = []
for topic_name in TOPICS:
    trend_curve = make_trend(FREQ_TREND[topic_name], DAYS)
    weekly = weekly_seasonality(DAYS, weekend_factor=0.85)
    wave_phase = np.random.uniform(0, 2*np.pi)
    slow_wave = smooth_wave(DAYS, amplitude=0.10, period=26.0, phase=wave_phase)
    jitter = np.random.normal(0.0, 0.03, size=DAYS)  # light noise
    intensity = np.clip(trend_curve * weekly * slow_wave + jitter, 0.05, None)
    # spikes in topic frequency
    freq_columns.append(add_spikes(intensity, num_spikes=2, max_amp=0.7, width_range=(2.5, 6.0)))

freq_matrix = np.stack(freq_columns, axis=1)  # shape: (DAYS, N_TOPICS)
# Each row becomes a probability distribution over topics for that day.
topic_probs_per_day = normalize_rows_to_probs(freq_matrix)

# Daily totals
day_totals = build_day_totals(DAYS, TOTAL_EMAILS)

# Integer number of emails per (day, topic): split each day's total by that day's topic probabilities.
day_topic_counts = np.zeros_like(topic_probs_per_day, dtype=int)
for day_idx, (day_total, day_probs) in enumerate(zip(day_totals, topic_probs_per_day)):
    day_topic_counts[day_idx] = integer_allocation_from_probs(day_total, day_probs)

# Sentiment "negativity index" per topic per day (0..1), with its own trend + spikes.
# SENT_TREND values map straight onto make_trend kinds:
# "up" = more negative over time, "down" = more positive over time, otherwise flat.
neg_columns = []
for topic_name in TOPICS:
    negativity = make_trend(SENT_TREND[topic_name], DAYS)
    negativity = add_spikes(negativity, num_spikes=2, max_amp=0.5, width_range=(2.0, 5.0))  # spikes in sentiment
    negativity = negativity + np.random.normal(0.0, 0.03, size=DAYS)
    neg_columns.append(np.clip(negativity, 0.0, 1.0))

neg_index = np.stack(neg_columns, axis=1)  # shape: (DAYS, N_TOPICS)

# Sanity output: the grand total must equal TOTAL_EMAILS.
print("Check totals:", day_topic_counts.sum(), "==", TOTAL_EMAILS)
topic_totals = dict(zip(TOPICS, day_topic_counts.sum(axis=0).tolist()))
print("Total emails per topic:", topic_totals)

Check totals: 2500 == 2500
Total emails per topic: {'Healthcare': 292, 'Education': 275, 'Crime': 289, 'Immigration': 261, 'Transportation': 255, 'Environment': 330, 'Taxation': 268, 'Housing': 255, 'Pensions': 275}


In [4]:
# Cell 4: Sampling utilities for sentiment/length, timestamp assignment, and prompt building

def sentiment_probs_from_neg_index(neg_val: float) -> np.ndarray:
    """Turn a negativity value into a 5-way sentiment distribution.

    `neg_val` is clamped to [0, 1]. Higher values put more mass on the two
    negative classes, lower values on the two positive ones; "neutral" takes
    whatever is left. The result is ordered
    [very negative, negative, neutral, positive, very positive] and sums to 1.
    """
    negativity = float(np.clip(neg_val, 0.0, 1.0))
    positivity = 1 - negativity

    very_neg = 0.05 + 0.35 * negativity ** 1.2
    neg = 0.10 + 0.30 * negativity
    pos = 0.10 + 0.25 * positivity
    very_pos = 0.05 + 0.30 * positivity ** 1.2
    neutral = 1.0 - (very_neg + neg + pos + very_pos)

    # Guard against a negative entry, then renormalise.
    weights = np.clip(np.array([very_neg, neg, neutral, pos, very_pos]), 0.0, None)
    return weights / weights.sum()

def sample_sentiment_for(day: int, topic_idx: int) -> str:
    """Draw one sentiment label for a (day, topic) cell, weighted by its negativity index."""
    negativity = neg_index[day, topic_idx]
    weights = sentiment_probs_from_neg_index(negativity)
    return np.random.choice(SENTIMENTS, p=weights)

def sample_length() -> str:
    """Draw a length bucket from LENGTH_BUCKETS using the LENGTH_PROBS weights."""
    bucket = np.random.choice(LENGTH_BUCKETS, p=LENGTH_PROBS)
    return bucket

def sample_word_count(bucket: str) -> int:
    """Draw a target word count uniformly from the bucket's inclusive (min, max) range."""
    bounds = LENGTH_TO_WORDS[bucket]
    # randint's upper bound is exclusive, hence the +1.
    return np.random.randint(bounds[0], bounds[1] + 1)

def assign_timestamps(days: int, counts_per_day: np.ndarray) -> List[str]:
    """Generate ISO-8601 timestamps, `counts_per_day[d]` of them for each day d.

    The window covers `days` consecutive dates ending today (ENDS_TODAY) or on
    FIXED_END_DATE. Within a day each timestamp is a uniformly drawn whole
    minute from 09:00 up to and including 19:59. Output is grouped by day in
    ascending order; times within a day are not sorted.
    """
    last_day = datetime.now().date() if ENDS_TODAY else FIXED_END_DATE
    first_day = last_day - timedelta(days=days - 1)

    stamps = []
    for offset in range(days):
        midnight = datetime.combine(first_day + timedelta(days=offset), datetime.min.time())
        for _ in range(int(counts_per_day[offset])):
            minutes_after_midnight = np.random.randint(9 * 60, 20 * 60)
            moment = midnight + timedelta(minutes=minutes_after_midnight)
            stamps.append(moment.isoformat())
    return stamps

def build_prompt(topic: str, sentiment: str, words: int) -> str:
    """Compose the LLM instruction for one constituent email.

    Args:
        topic: the single topic the email must focus on.
        sentiment: one of the five sentiment labels; selects the tone sentence.
        words: approximate target length in words.

    Raises:
        KeyError: if `sentiment` is not one of the five known labels.
    """
    tone_by_sentiment = {
        "Very Negative": "The tone should be very negative and strongly critical.",
        "Negative": "The tone should be negative and concerned.",
        "Neutral": "The tone should be neutral and factual.",
        "Positive": "The tone should be positive and supportive.",
        "Very Positive": "The tone should be very positive and highly supportive.",
    }
    tone = tone_by_sentiment[sentiment]

    prompt_lines = [
        "Write a realistic UK constituent email to their MP.",
        "",
        "Requirements:",
        "- Audience: MP Mr Miller. Start with \"Dear Mr Miller,\" and end with a short sign-off and the sender's first name only.",
        f"- Topic focus: {topic}. Keep the content tightly focused on this one topic.",
        f"- Tone: {tone}",
        f"- Length: about {words} words.",
        "- Style: UK English with concrete, grounded details (local experiences, practical concerns), no generic filler.",
        "- Do NOT include dates, subject lines, or any headers apart from greeting/sign-off.",
    ]
    return "\n".join(prompt_lines)

In [5]:
# Cell 5: Build records, generate emails via Ollama (async), faster settings + live progress, and save CSV

from tqdm.auto import tqdm
import time
import os
import httpx

def estimate_max_tokens(words: int, min_tokens: int = 96, max_tokens: int = 384, tokens_per_word: float = 1.4) -> int:
    """Estimate a generation token budget for a target word count.

    The budget is `words * tokens_per_word`, rounded and clamped to
    [min_tokens, max_tokens]. The defaults reproduce the original fixed
    behaviour (~1.3 tokens/word plus headroom, capped to keep generation fast).

    Note: with the default cap of 384, any target above about 274 words gets
    the same 384-token budget. That covers the whole "Long" bucket (300-600
    words), so those emails cannot reach their target length unless a larger
    `max_tokens` is passed.

    Args:
        words: target length in words.
        min_tokens: lower bound of the returned budget.
        max_tokens: upper bound of the returned budget.
        tokens_per_word: multiplier from words to tokens.

    Returns:
        The token budget as a plain int.
    """
    return int(max(min_tokens, min(max_tokens, round(words * tokens_per_word))))

async def call_ollama_generate(prompt: str, client: httpx.AsyncClient, max_tokens: int, timeout: int = GEN_TIMEOUT) -> str:
    """POST one non-streaming request to Ollama's /api/generate and return the text.

    Args:
        prompt: full prompt text.
        client: shared async HTTP client.
        max_tokens: sent as the `num_predict` option.
        timeout: per-request timeout passed to httpx, in seconds.

    Returns:
        The stripped "response" field of the JSON reply ("" when the field is absent).

    Raises:
        httpx.HTTPStatusError: on a non-success HTTP status (via raise_for_status).
    """
    # NOTE(review): if the model spends part of num_predict on reasoning tokens,
    # "response" may come back truncated or empty - verify against real replies.
    sampling_options = {
        "temperature": 0.4,
        "top_p": 0.9,
        "top_k": 50,
        "num_predict": int(max_tokens),
        "num_ctx": 1024,                    # small context window, chosen for speed
        "num_thread": os.cpu_count() or 8,  # all reported CPU threads, 8 if unknown
    }
    request_body = {
        "model": LLM_MODEL,
        "prompt": prompt,
        "stream": False,
        "keep_alive": "30m",  # keep the model loaded between requests
        "options": sampling_options,
    }
    reply = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=request_body, timeout=timeout)
    reply.raise_for_status()
    return reply.json().get("response", "").strip()

# Flatten day_topic_counts into per-email rows while preserving day order
# (day-major, then topic order within each day).
rows_meta = [
    {"day": d, "topic": topic, "topic_idx": j}
    for d in range(DAYS)
    for j, topic in enumerate(TOPICS)
    for _ in range(int(day_topic_counts[d, j]))
]

# Assign timestamps (one per email in day order).
# The per-day counts are taken from day_topic_counts itself - the same array
# that produced rows_meta - so timestamp i always falls on rows_meta[i]["day"].
# Using a separately computed total would silently shift every later
# timestamp to the wrong day if the two ever disagreed.
timestamps = assign_timestamps(DAYS, day_topic_counts.sum(axis=1))
assert len(timestamps) == len(rows_meta), (
    f"timestamp/row mismatch: {len(timestamps)} timestamps for {len(rows_meta)} rows"
)

# Build prompts and per-email sentiment/length.
# Draw order per row (sentiment, length bucket, word count) is fixed so a given
# seed reproduces the same labels.
prompts = []
email_meta = []
max_tokens_list = []
for i, r in enumerate(rows_meta):
    sent = sample_sentiment_for(r["day"], r["topic_idx"])
    length_bucket = sample_length()
    words = sample_word_count(length_bucket)
    prompts.append(build_prompt(r["topic"], sent, words))
    email_meta.append({
        "email_id": str(uuid.uuid4()),  # uuid4 is not driven by SEED, so ids differ between runs
        "timestamp": timestamps[i],
        "to": "Mr Miller",
        "topic_gt": r["topic"],
        "sentiment_gt": sent,
        "length_bucket": length_bucket,
        "word_target": words,
    })
    max_tokens_list.append(estimate_max_tokens(words))

# Async generation with tuned concurrency + live progress
# Clamp to at least 1: GEN_CONCURRENCY=0 would create a Semaphore(0) that no
# worker can ever acquire (the run would hang forever), and a negative value
# makes asyncio.Semaphore raise ValueError.
EFFECTIVE_CONCURRENCY = max(1, int(os.environ.get("GEN_CONCURRENCY", str(CONCURRENCY))))
semaphore = asyncio.Semaphore(EFFECTIVE_CONCURRENCY)  # caps in-flight generations
results = [""] * len(prompts)                         # generated text, indexed like prompts
topic_counts_done = {t: 0 for t in TOPICS}            # finished (ok or failed) emails per topic
stats = {"fail": 0}                                   # number of failed generations
counter_lock = asyncio.Lock()                         # guards the two counters above

async def worker(idx: int, client: httpx.AsyncClient, max_attempts: int = 3):
    """Generate email `idx` and store it in `results[idx]`.

    Runs under the shared semaphore. Transient problems (httpx transport errors
    such as timeouts or dropped connections, an unparsable reply, or an empty
    response) are retried up to `max_attempts` times with a short backoff.
    Other errors fail immediately. On final failure the original placeholder
    text "[Generation failed: ...]" is stored and `stats["fail"]` is
    incremented, so failed rows remain identifiable in the output.

    Args:
        idx: index into `prompts`, `max_tokens_list`, `email_meta` and `results`.
        client: shared async HTTP client.
        max_attempts: total number of tries per email (values below 1 count as 1).
    """
    attempts = max(1, int(max_attempts))
    async with semaphore:
        try:
            for attempt in range(1, attempts + 1):
                try:
                    text = await call_ollama_generate(prompts[idx], client, max_tokens=max_tokens_list[idx])
                    if not text.strip():
                        # Previously an empty reply was stored as a bare greeting
                        # and never counted as a failure.
                        raise ValueError("model returned an empty response")
                    break
                except (httpx.TransportError, ValueError):
                    if attempt == attempts:
                        raise
                    # Back off while still holding the slot so a struggling
                    # server is not hit again immediately: 1s, 2s, 4s, max 8s.
                    await asyncio.sleep(min(2.0 ** (attempt - 1), 8.0))
            if not text.lstrip().startswith("Dear Mr Miller"):
                text = "Dear Mr Miller,\n\n" + text.strip()
            results[idx] = text.strip()
        except Exception as e:
            results[idx] = f"Dear Mr Miller,\n\n[Generation failed: {e}]"
            async with counter_lock:
                stats["fail"] += 1
        finally:
            # Count the email as done whether it succeeded or failed.
            async with counter_lock:
                topic_counts_done[email_meta[idx]["topic_gt"]] += 1

async def generate_all():
    """Run one `worker` task per prompt and report live progress.

    All tasks are created up front; the number actually running is bounded by
    the semaphore used inside `worker`. The progress bar shows throughput, ETA
    and the failure count, and a per-topic snapshot is printed every 200
    completions and at the end.
    """
    n_total = len(prompts)
    t0 = time.perf_counter()
    pool_limits = httpx.Limits(
        max_connections=EFFECTIVE_CONCURRENCY,
        max_keepalive_connections=EFFECTIVE_CONCURRENCY,
    )
    async with httpx.AsyncClient(limits=pool_limits) as client:
        pending = [asyncio.create_task(worker(i, client)) for i in range(n_total)]
        progress = tqdm(total=n_total, desc=f"Generating (conc={EFFECTIVE_CONCURRENCY})", smoothing=0.2)
        with progress:
            for finished in asyncio.as_completed(pending):
                await finished
                done = progress.n + 1
                seconds_elapsed = time.perf_counter() - t0
                # The max() guards keep both divisions away from zero.
                rate = done / max(seconds_elapsed, 1e-6)
                eta = (n_total - done) / max(rate, 1e-6)
                progress.set_postfix_str(f"{rate:.2f} e/s | ETA {eta/60:.1f} min | fails {stats['fail']}")
                progress.update(1)
                at_checkpoint = done % 200 == 0 or done == n_total
                if at_checkpoint:
                    snapshot = ", ".join(f"{k}:{topic_counts_done[k]}" for k in TOPICS)
                    print(f"\nProgress {done}/{n_total} | Topic done counts: {snapshot}\n")

print(f"Using concurrency: {EFFECTIVE_CONCURRENCY} (override with GEN_CONCURRENCY env var)")
await generate_all()

# Build DataFrame and save
print("Assembling DataFrame...")
df = pd.DataFrame({
    "email_id": [m["email_id"] for m in email_meta],
    "timestamp": [m["timestamp"] for m in email_meta],
    "to": [m["to"] for m in email_meta],
    "topic_gt": [m["topic_gt"] for m in email_meta],
    "sentiment_gt": [m["sentiment_gt"] for m in email_meta],
    "length_bucket": [m["length_bucket"] for m in email_meta],
    "word_target": [m["word_target"] for m in email_meta],
    "email_text": results,
})
print("Saving CSV...")
df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved {len(df)} emails to {OUTPUT_CSV} | failures during generation: {stats['fail']}")

# Quick sanity summaries
print("\nPer-topic totals:")
print(df.groupby("topic_gt").size().sort_values(ascending=False))
print("\nSentiment distribution:")
print(df["sentiment_gt"].value_counts())
print("\nSample rows:")
print(df.head(3))

  from .autonotebook import tqdm as notebook_tqdm


Using concurrency: 8 (override with GEN_CONCURRENCY env var)


Generating (conc=8):   8%|▊         | 200/2500 [32:41<6:01:49,  9.44s/it, 0.10 e/s | ETA 376.0 min | fails 19]


Progress 200/2500 | Topic done counts: Healthcare:31, Education:26, Crime:32, Immigration:5, Transportation:33, Environment:23, Taxation:9, Housing:7, Pensions:34



Generating (conc=8):  16%|█▌        | 400/2500 [1:04:55<4:58:18,  8.52s/it, 0.10 e/s | ETA 340.8 min | fails 24]


Progress 400/2500 | Topic done counts: Healthcare:55, Education:47, Crime:61, Immigration:12, Transportation:65, Environment:46, Taxation:22, Housing:23, Pensions:69



Generating (conc=8):  24%|██▍       | 600/2500 [2:29:22<25:13:03, 47.78s/it, 0.10 e/s | ETA 306.8 min | fails 38]  


Progress 600/2500 | Topic done counts: Healthcare:85, Education:70, Crime:86, Immigration:20, Transportation:91, Environment:68, Taxation:34, Housing:36, Pensions:110



Generating (conc=8):  32%|███▏      | 800/2500 [3:01:46<4:31:12,  9.57s/it, 0.10 e/s | ETA 274.7 min | fails 53] 


Progress 800/2500 | Topic done counts: Healthcare:104, Education:91, Crime:110, Immigration:36, Transportation:113, Environment:102, Taxation:49, Housing:50, Pensions:145



Generating (conc=8):  40%|████      | 1000/2500 [3:34:57<4:21:22, 10.46s/it, 0.10 e/s | ETA 243.7 min | fails 71]


Progress 1000/2500 | Topic done counts: Healthcare:127, Education:118, Crime:142, Immigration:61, Transportation:131, Environment:130, Taxation:59, Housing:62, Pensions:170



Generating (conc=8):  48%|████▊     | 1200/2500 [4:09:15<4:00:29, 11.10s/it, 0.10 e/s | ETA 213.2 min | fails 105]


Progress 1200/2500 | Topic done counts: Healthcare:147, Education:141, Crime:172, Immigration:89, Transportation:148, Environment:154, Taxation:72, Housing:73, Pensions:204



Generating (conc=8):  56%|█████▌    | 1400/2500 [4:43:07<3:03:40, 10.02s/it, 0.10 e/s | ETA 181.2 min | fails 136]


Progress 1400/2500 | Topic done counts: Healthcare:167, Education:163, Crime:203, Immigration:114, Transportation:166, Environment:178, Taxation:91, Housing:87, Pensions:231



Generating (conc=8):  64%|██████▍   | 1600/2500 [5:17:07<2:19:33,  9.30s/it, 0.10 e/s | ETA 148.9 min | fails 151]


Progress 1600/2500 | Topic done counts: Healthcare:183, Education:183, Crime:232, Immigration:138, Transportation:188, Environment:214, Taxation:112, Housing:105, Pensions:245



Generating (conc=8):  72%|███████▏  | 1800/2500 [5:50:20<1:51:29,  9.56s/it, 0.10 e/s | ETA 115.8 min | fails 169]


Progress 1800/2500 | Topic done counts: Healthcare:200, Education:207, Crime:259, Immigration:156, Transportation:212, Environment:245, Taxation:137, Housing:128, Pensions:256



Generating (conc=8):  80%|████████  | 2000/2500 [6:23:58<1:07:32,  8.10s/it, 0.10 e/s | ETA 82.9 min | fails 193] 


Progress 2000/2500 | Topic done counts: Healthcare:223, Education:225, Crime:274, Immigration:184, Transportation:235, Environment:273, Taxation:170, Housing:154, Pensions:262



Generating (conc=8):  88%|████████▊ | 2200/2500 [6:58:24<55:04, 11.02s/it, 0.10 e/s | ETA 49.9 min | fails 217]  


Progress 2200/2500 | Topic done counts: Healthcare:257, Education:244, Crime:281, Immigration:211, Transportation:245, Environment:295, Taxation:204, Housing:195, Pensions:268



Generating (conc=8):  96%|█████████▌| 2400/2500 [7:32:12<17:24, 10.45s/it, 0.10 e/s | ETA 16.7 min | fails 243]


Progress 2400/2500 | Topic done counts: Healthcare:281, Education:263, Crime:286, Immigration:243, Transportation:252, Environment:318, Taxation:247, Housing:237, Pensions:273



Generating (conc=8): 100%|██████████| 2500/2500 [7:48:55<00:00, 11.25s/it, 0.10 e/s | ETA 0.0 min | fails 252] 


Progress 2500/2500 | Topic done counts: Healthcare:292, Education:275, Crime:289, Immigration:261, Transportation:255, Environment:330, Taxation:268, Housing:255, Pensions:275

Assembling DataFrame...
Saving CSV...
Saved 2500 emails to /Users/felixmul/Desktop/repos/MP-Project/ModelFinetuning/data_creation/ModelFinetuning/data_creation/emails_data/email_dataset_2500_with_timestamps.csv | failures during generation: 252

Per-topic totals:
topic_gt
Environment       330
Healthcare        292
Crime             289
Education         275
Pensions          275
Taxation          268
Immigration       261
Housing           255
Transportation    255
dtype: int64

Sentiment distribution:
sentiment_gt
Negative         702
Very Negative    583
Positive         501
Very Positive    378
Neutral          336
Name: count, dtype: int64

Sample rows:
                               email_id            timestamp         to  \
0  23d27c17-9d1d-47ca-8387-f08f21fe0162  2025-06-29T12:24:00  Mr Miller   
1  d8


