## This notebook's purpose is to create a synthetic Email Dataset

1250 emails should be enough to fine-tune a first version of a topic extractor. Furthermore, we will need the dataset to showcase what an in-depth inbox analysis would look like.

In [14]:
import pandas as pd
import numpy as np
import random
from typing import List, Tuple
import uuid
import json

# Set random seed for reproducibility.
# One named constant feeds every generator so the two seeds cannot drift apart
# and the value can be changed in a single place.
SEED = 42
random.seed(SEED)     # Python's stdlib generator
np.random.seed(SEED)  # NumPy's legacy global generator (used by np.random.choice / np.random.shuffle below)

print("Libraries imported successfully")
print("Starting Recipe Creation - Phase 1")

Libraries imported successfully
Starting Recipe Creation - Phase 1


In [16]:
# Read the persona definitions.
# JSON files are UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform default (which is not UTF-8 on every OS).
with open('personas.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the records stored under the top-level "personas" key into one row
# per persona; nested fields become dotted column names
# (e.g. "characteristics.age").
personas = pd.json_normalize(data['personas'])

print(len(personas))
personas.head(2)

50


Unnamed: 0,persona_name,frequency_coefficient,characteristics.age,characteristics.gender_identity,characteristics.occupation,characteristics.political_leaning,characteristics.communication_style,characteristics.core_motivation
0,The 'Progressive Activist' Student,5,19,Non-binary,University Student,"Very left-wing (Labour/Green), passionate abou...","Well-researched and emotionally charged, often...",A deep-seated belief in systemic change and ho...
1,The 'Backbone Conservative' Retiree,5,70,Male,Retired Small Business Owner,"Staunchly Conservative, believes in fiscal res...","Formal, direct, and expresses concern.",Preserving a way of life and ensuring financia...


In [20]:
# The six sentiment labels a recipe can be assigned (as per requirements).
# Order matters only for readability: negative -> positive, then "Mixed".
sentiments = [
    "Very Negative", "Negative",
    "Neutral",
    "Positive", "Very Positive",
    "Mixed",
]


In [26]:
# Load the core themes relevant to UK politics together with their sampling
# coefficients. The number of themes is whatever core_themes.json contains;
# it is printed below rather than assumed here.
core_themes = pd.read_json("core_themes.json")

# Topic sampling later reads exactly these two columns, so fail early with a
# clear message if the file's schema drifts.
missing_theme_columns = {"theme", "coefficient"} - set(core_themes.columns)
assert not missing_theme_columns, (
    f"core_themes.json is missing columns: {sorted(missing_theme_columns)}"
)

print(len(core_themes))
core_themes.head()

30


Unnamed: 0,theme,coefficient
0,Healthcare & NHS,5
1,Housing & Planning,5
2,Immigration & Asylum,5
3,Social Security & Benefits,5
4,Cost of Living & Economy,5


In [27]:
# Weighted persona sampling driven by frequency_coefficient
def create_weighted_persona_list(personas_df, total_count=1250):
    """Draw `total_count` persona rows, weighted by `frequency_coefficient`.

    Row positions are sampled with replacement from NumPy's global random
    state, each with probability proportional to that row's
    `frequency_coefficient`. The drawn rows are returned as a new DataFrame
    whose index is reset to 0..total_count-1.
    """
    coefficients = personas_df['frequency_coefficient'].values
    probabilities = coefficients / coefficients.sum()

    # np.random.choice samples with replacement by default
    drawn_positions = np.random.choice(
        len(personas_df),
        size=total_count,
        p=probabilities,
    )

    return personas_df.iloc[drawn_positions].reset_index(drop=True)

# Draw the persona behind each of the 1250 recipes
recipe_personas = create_weighted_persona_list(personas, total_count=1250)
print(f"Generated {len(recipe_personas)} persona selections")

# Report the ten most frequently drawn personas
print("Persona distribution:")
print(recipe_personas['persona_name'].value_counts().head(10))

Generated 1250 persona selections
Persona distribution:
persona_name
The 'Civic Pragmatist' Community Leader    80
The 'Loyal National' Brexiteer             75
The 'Progressive Activist' Student         73
The 'Backbone Conservative' Retiree        71
The Single-Issue Campaigner                62
The 'Established Liberal' Professional     61
The Local Journalist                       40
The Trade Union Representative             30
The 'Soft-Left Liberal' Teacher            29
The 'Well-Off Traditionalist' Landowner    29
Name: count, dtype: int64


In [28]:
# Uniform sentiment sampling
def create_sentiment_list(sentiments, total_count=1250):
    """Draw `total_count` sentiments, each label being equally likely.

    Samples with replacement from NumPy's global random state and returns
    the draws as a NumPy array.
    """
    return np.random.choice(sentiments, size=total_count)

# Draw one sentiment per recipe (1250 in total)
recipe_sentiments = create_sentiment_list(sentiments, total_count=1250)
print(f"Generated {len(recipe_sentiments)} sentiment selections")

# Tally how often each label was drawn; np.unique returns the labels sorted
print("Sentiment distribution:")
unique, counts = np.unique(recipe_sentiments, return_counts=True)
for sentiment, count in zip(unique, counts):
    print(f"{sentiment}: {count}")

Generated 1250 sentiment selections
Sentiment distribution:
Mixed: 218
Negative: 219
Neutral: 168
Positive: 228
Very Negative: 203
Very Positive: 214


In [29]:
# Create topic combinations based on specified distribution
def create_topic_combinations(themes_df, total_count=1250):
    """
    Build one list of topics per email, sampled from the weighted themes.

    `total_count` is split as follows (fractions are truncated to whole
    emails and the remainder is assigned to triples):
    - 70% single topics  -> int(total_count * 0.70)  (875 of 1250)
    - 25% double topics  -> int(total_count * 0.25)  (312 of 1250)
    - rest triple topics -> whatever is left         (63 of 1250)

    Parameters
    ----------
    themes_df : DataFrame
        Must provide a 'theme' column (topic names) and a 'coefficient'
        column (relative sampling weights).
    total_count : int
        Number of topic combinations to generate.

    Returns
    -------
    list of list
        `total_count` topic lists, ordered singles first, then doubles, then
        triples. Topics within one email are distinct because multi-topic
        draws are made without replacement.

    Notes
    -----
    Draws from NumPy's global random state and prints the three counts.
    """

    # Calculate counts for each type
    single_count = int(total_count * 0.70)
    double_count = int(total_count * 0.25)
    triple_count = total_count - single_count - double_count

    print(f"Single topics: {single_count}")
    print(f"Double topics: {double_count}")
    print(f"Triple topics: {triple_count}")

    theme_names = themes_df['theme'].values
    theme_weights = themes_df['coefficient'].values
    # Loop-invariant: normalise the weights into probabilities once
    theme_probabilities = theme_weights / theme_weights.sum()

    all_topic_combinations = []

    # Single topics: a scalar draw wrapped in a one-element list
    for _ in range(single_count):
        topic = np.random.choice(theme_names, p=theme_probabilities)
        all_topic_combinations.append([topic])

    # Double, then triple topics: identical logic, only the group size differs
    for topics_per_email, email_count in ((2, double_count), (3, triple_count)):
        for _ in range(email_count):
            topics = np.random.choice(
                theme_names,
                size=topics_per_email,
                replace=False,  # Don't allow duplicate topics in same email
                p=theme_probabilities,
            )
            all_topic_combinations.append(list(topics))

    return all_topic_combinations

# Build the topic list for each of the 1250 recipes
recipe_topics = create_topic_combinations(core_themes, total_count=1250)
print(f"Generated {len(recipe_topics)} topic combinations")

# Tally how many recipes carry one, two or three topics
topic_counts = list(map(len, recipe_topics))
unique, counts = np.unique(topic_counts, return_counts=True)
print("\nTopic combination distribution:")
for num_topics, count in zip(unique, counts):
    print(f"{num_topics} topic(s): {count} emails")

Single topics: 875
Double topics: 312
Triple topics: 63
Generated 1250 topic combinations

Topic combination distribution:
1 topic(s): 875 emails
2 topic(s): 312 emails
3 topic(s): 63 emails


In [30]:
# Length labels in the required proportions
def create_length_list(total_count=1250):
    """
    Return a shuffled list of `total_count` length labels.

    Proportions (fractions are truncated; the remainder goes to "Long"):
    - 10% "Short"   (125 of 1250)
    - 80% "Medium"  (1000 of 1250)
    - rest "Long"   (125 of 1250)

    The list is shuffled in place with NumPy's global random state, and the
    three counts are printed.
    """
    short_count = int(total_count * 0.10)
    medium_count = int(total_count * 0.80)
    long_count = total_count - short_count - medium_count

    # Lay the labels out in blocks (Short, Medium, Long) before shuffling
    lengths = []
    for label, how_many in (
        ("Short", short_count),
        ("Medium", medium_count),
        ("Long", long_count),
    ):
        lengths.extend([label] * how_many)

    np.random.shuffle(lengths)

    print(f"Short: {short_count}")
    print(f"Medium: {medium_count}")
    print(f"Long: {long_count}")

    return lengths

# Draw the length label for each of the 1250 recipes
recipe_lengths = create_length_list(total_count=1250)
print(f"Generated {len(recipe_lengths)} length selections")

Short: 125
Medium: 1000
Long: 125
Generated 1250 length selections


In [35]:
# Assemble personas, sentiments, topics and lengths into one recipe table
def create_final_recipe_dataframe(personas_df, sentiments_list, topics_list, lengths_list):
    """Combine the four recipe components row by row and shuffle the result.

    Row i of the assembled table pairs `rID_i` with the i-th length,
    sentiment, topic list and persona row. Every persona column except
    `frequency_coefficient` is copied by position (the persona index is
    ignored). The rows are then shuffled and the index reset, so recipe IDs
    no longer appear in order.
    """
    columns = {
        'recipe_id': [f"rID_{i}" for i in range(len(personas_df))],
        'length': lengths_list,
        'sentiment': sentiments_list,
        'topics': topics_list,
    }

    # Persona attributes follow the recipe fields; the sampling weight is dropped.
    # `.values` strips the index so alignment is purely positional.
    for col in personas_df.columns:
        if col == 'frequency_coefficient':
            continue
        columns[col] = personas_df[col].values

    recipe_df = pd.DataFrame(columns)

    # frac=1 returns every row exactly once, in random order
    shuffled = recipe_df.sample(frac=1)
    return shuffled.reset_index(drop=True)

# Assemble the final recipe table from the four sampled components
final_recipe = create_final_recipe_dataframe(
    personas_df=recipe_personas,
    sentiments_list=recipe_sentiments,
    topics_list=recipe_topics,
    lengths_list=recipe_lengths,
)

# Quick look at the result: shape, column names and a small preview
print(f"Final recipe dataframe shape: {final_recipe.shape}")
print(f"Columns: {list(final_recipe.columns)}")
print("\nFirst 3 rows:")
final_recipe.head(3)

Final recipe dataframe shape: (1250, 11)
Columns: ['recipe_id', 'length', 'sentiment', 'topics', 'persona_name', 'characteristics.age', 'characteristics.gender_identity', 'characteristics.occupation', 'characteristics.political_leaning', 'characteristics.communication_style', 'characteristics.core_motivation']

First 3 rows:


Unnamed: 0,recipe_id,length,sentiment,topics,persona_name,characteristics.age,characteristics.gender_identity,characteristics.occupation,characteristics.political_leaning,characteristics.communication_style,characteristics.core_motivation
0,rID_262,Medium,Very Negative,[Housing & Planning],The 'Civic Pragmatist' Community Leader,45,Female,Charity Manager / Community Leader,"Politically centrist, pragmatic, and non-parti...","Collaborative, solution-oriented, and professi...",Improving the local community and seeking part...
1,rID_115,Short,Very Positive,[Pensions & National Insurance],The Carer for a Disabled Family Member,55,Female,Full-time Unpaid Carer,"No time for politics, but experiences make her...","Tired, frustrated, but articulate about the fa...",The failure of other support systems has left ...
2,rID_890,Medium,Neutral,"[Crime & Community Safety, Foreign Affairs & I...",The 'Disengaged Traditionalist',68,Female,Retired,Believes in traditional values but is generall...,"Short, to the point, and written with a sense ...",A specific local nuisance or a matter of princ...


In [36]:
# Verify the final dataframe
print("=== FINAL RECIPE VERIFICATION ===")
print(f"Total recipes: {len(final_recipe)}")
print(f"Unique recipe IDs: {final_recipe['recipe_id'].nunique()}")

# Enforce the key invariant instead of only printing it: every recipe must
# have its own ID before the file is written.
assert final_recipe['recipe_id'].nunique() == len(final_recipe), \
    "duplicate recipe_id values in final_recipe"

print("\nLength distribution:")
print(final_recipe['length'].value_counts())

print("\nSentiment distribution:")
print(final_recipe['sentiment'].value_counts())

print("\nTopic count distribution:")
topic_counts = final_recipe['topics'].apply(len)
print(topic_counts.value_counts().sort_index())

# Recipes are generated with one, two or three topics; anything else means
# the components were combined incorrectly.
assert ((topic_counts >= 1) & (topic_counts <= 3)).all(), \
    "every recipe must carry between 1 and 3 topics"

print("\nTop 10 personas:")
print(final_recipe['persona_name'].value_counts().head(10))

# Save to CSV. The path is defined once so the file written and the message
# printed cannot disagree.
# NOTE: 'topics' holds Python lists, which to_csv writes in their string form
# (e.g. "['A', 'B']"); readers of the file have to parse that column back.
output_path = 'email_recipes_1250.csv'
final_recipe.to_csv(output_path, index=False)
print(f"\nRecipe dataframe saved to '{output_path}'")

=== FINAL RECIPE VERIFICATION ===
Total recipes: 1250
Unique recipe IDs: 1250

Length distribution:
length
Medium    1000
Short      125
Long       125
Name: count, dtype: int64

Sentiment distribution:
sentiment
Positive         228
Negative         219
Mixed            218
Very Positive    214
Very Negative    203
Neutral          168
Name: count, dtype: int64

Topic count distribution:
topics
1    875
2    312
3     63
Name: count, dtype: int64

Top 10 personas:
persona_name
The 'Civic Pragmatist' Community Leader    80
The 'Loyal National' Brexiteer             75
The 'Progressive Activist' Student         73
The 'Backbone Conservative' Retiree        71
The Single-Issue Campaigner                62
The 'Established Liberal' Professional     61
The Local Journalist                       40
The Trade Union Representative             30
The 'Well-Off Traditionalist' Landowner    29
The 'Soft-Left Liberal' Teacher            29
Name: count, dtype: int64

Recipe dataframe saved to 'ema