# Imports

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
import pandas as pd
import csv
import time
from concurrent.futures import ThreadPoolExecutor
import threading
from prompts.dataset_generation import general_instruction_01 as general_instruction
from typing import List, Dict

# Load variables from a local .env file (if any) before reading the API key.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Shared OpenAI client used for every chat-completion request in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

# Utils

In [None]:
# Load a JSON document from disk
def read_json(file_path):
    """
    Parse the UTF-8 encoded JSON file at ``file_path``.

    Returns the decoded object on success. When the file does not exist or
    its content is not valid JSON, a message is printed and ``None`` is
    returned instead of raising.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"Error: the file {file_path} was not found.")
        return None
    except json.JSONDecodeError as decode_error:
        print(f"Error decoding the JSON: {decode_error}")
        return None

# Dataset Generation

This section generates the datasets used to evaluate the email classification prompt. The following datasets are created:

- **General Dataset**  
- **Edge Cases**
- **Language and Cultural Diversity**
- **Special Service Requests (SSR) Emphasis**
- **High Complexity**
- **Sentiment Variations**
- **Diverse Writing Styles and Formats**
- **Tool Requirement Variations**  
- **Customer Status Unknown**
- **Urgency and Priority Levels**

In [None]:
# Column names (and column order) of every generated CSV file
fieldnames = ['subject', 'sender', 'recipients', 'body']

# Lock held by process_prompt while it writes rows to a CSV file
lock = threading.Lock()

# Load the scenario definitions that drive the dataset generator
file_path = './config/prompt_configs.json'
prompts = read_json(file_path)

def create_enhanced_prompt(general_instructions: str, prompt_info: Dict, batch_size: int, fieldnames: List[str]) -> Dict:
    """
    Build the chat-completion payload for one dataset scenario.

    The system message is the general guidelines followed by the scenario's
    name and instructions (read from ``prompt_info["name"]`` and
    ``prompt_info["instructions"]``). The user message asks for
    ``batch_size`` emails as ';'-separated, double-quoted CSV restricted to
    ``fieldnames``.

    Returns a dict with a two-element ``"messages"`` list (system, user) and
    a fixed ``"temperature"`` of 0.7.
    """
    focus = prompt_info["name"]
    focus_details = prompt_info["instructions"]
    field_list = '; '.join(fieldnames)

    system_message = {
        "role": "system",
        "content": f"""
    {general_instructions}

    SPECIFIC SCENARIO FOCUS: {focus}
    {focus_details}
    """,
    }

    user_message = {
        "role": "user",
        "content": f"""
    Generate {batch_size} unique emails **one per each purpose categories** following these strict formatting guidelines:

    1. Use ";" as the separator between fields
    2. Enclose all field values in double quotes (e.g., "value1";"value2";"value3")
    3. Include only the raw CSV content with these fields: {field_list}
    4. No explanatory text or formatting markers

    """,
    }

    payload = {"messages": [system_message, user_message]}
    payload["temperature"] = 0.7
    return payload

# Helper: turn the model's raw reply into CSV rows
def _parse_email_rows(reply_text, fieldnames):
    """
    Parse a ';'-delimited CSV reply into a list of row dictionaries.

    Markdown code-fence lines are dropped, a header row echoed by the model
    is skipped, and rows whose named fields are all empty are ignored.
    """
    # Every line is handed to the csv reader WITH its newline terminator.
    # The reader only keeps line breaks inside a quoted field when the
    # newline character is part of the input; without it, a multi-line email
    # body is glued together with its line breaks (and blank lines) lost.
    # NOTE: a body line that itself starts with ``` is still dropped here.
    lines = [
        line + '\n'
        for line in reply_text.strip().split('\n')
        if not line.lstrip().startswith("```")
    ]

    header = [name.strip().lower() for name in fieldnames]
    rows = []
    for row in csv.DictReader(lines, fieldnames=fieldnames, delimiter=';'):
        cells = [(row.get(name) or '').strip() for name in fieldnames]
        if not any(cells):
            continue  # whitespace-only line outside of any quoted field
        if [cell.strip('"').lower() for cell in cells] == header:
            continue  # header row echoed by the model (quoted or not)
        rows.append(row)
    return rows


# Function to generate a batch of emails
def generate_email_batch(prompt_info, batch_size, fieldnames):
    """
    Request one batch of emails from the model and parse the CSV reply.

    Args:
        prompt_info: Scenario definition passed to create_enhanced_prompt.
        batch_size: Number of emails requested in this call.
        fieldnames: CSV column names, in the order the reply should use.

    Returns:
        A list of dicts keyed by ``fieldnames``. Rows with too few columns
        carry ``None`` for the missing ones and rows with too many columns
        carry the overflow under the ``None`` key (csv.DictReader defaults).
        An empty list is returned if the API call or the parsing fails; the
        error is printed rather than raised so one failed batch does not
        abort the whole run.
    """
    enhanced_prompt = create_enhanced_prompt(
        general_instructions=general_instruction,
        prompt_info=prompt_info,
        batch_size=batch_size,
        fieldnames=fieldnames,
    )

    try:
        # Make API call
        response = client.chat.completions.create(
            model='gpt-4o',
            messages=enhanced_prompt["messages"],
            temperature=enhanced_prompt["temperature"],
        )

        # Extract the AI's response and convert it into row dictionaries
        assistant_reply = response.choices[0].message.content
        return _parse_email_rows(assistant_reply, fieldnames)

    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue.
        print(f"Error in batch generation: {e}")
        return []  # Return an empty list in case of an error

# Function to process a single prompt
def process_prompt(prompt_info, total_emails, output_file):
    """
    Generate the emails for one scenario and save them to its own CSV file.

    The destination is ``./datasets/<name>.csv`` where ``<name>`` is
    ``prompt_info['name']`` with spaces and '/' replaced by '_'. Batches of up
    to 12 emails are requested concurrently through generate_email_batch and
    the returned rows are written in submission order.

    Args:
        prompt_info: Scenario definition; ``prompt_info['name']`` is read here
            and the whole dict is forwarded to generate_email_batch.
        total_emails: Total number of emails to request for this scenario.
        output_file: Not read by this function (the path is derived from the
            prompt name); kept so existing positional callers keep working.
    """
    emails_generated = 0
    batch_size = 12  # Batch size

    # Create a filename for each prompt based on its name
    sanitized_name = prompt_info['name'].replace(" ", "_").replace("/", "_")
    csv_filename = f"./datasets/{sanitized_name}.csv"

    print(f"\nProcessing prompt: {prompt_info['name']}")
    print(f"Saving data to file: {csv_filename}")

    # Make sure the target directory exists; open() would fail otherwise.
    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

    # Open a new CSV file for each prompt
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=';', quoting=csv.QUOTE_ALL)
        writer.writeheader()

        # The pool allows up to 1000 worker threads, so in practice every
        # scheduled batch can be in flight at the same time.
        # NOTE(review): an earlier comment here said "up to 5 threads"; confirm
        # the intended limit, since this many parallel requests may run into
        # API rate limits.
        with ThreadPoolExecutor(max_workers=1000) as executor:
            futures = []

            # Schedule batches of email generation
            while emails_generated < total_emails:
                batch_to_generate = min(batch_size, total_emails - emails_generated)
                futures.append(
                    executor.submit(generate_email_batch, prompt_info, batch_to_generate, fieldnames)
                )
                emails_generated += batch_to_generate

            # Write results to the CSV file. Results are consumed on the
            # calling thread; the worker threads never touch the writer.
            for future in futures:
                rows = future.result()  # Collect results from the thread
                with lock:  # Serialises this write section across callers sharing the lock
                    for row in rows:
                        # Validate fields before writing: reject empty rows,
                        # rows with unexpected keys (too many columns) and
                        # rows with missing values (too few columns).
                        if (
                            not row
                            or any(key not in fieldnames for key in row)
                            or any(row.get(name) is None for name in fieldnames)
                        ):
                            print(f"Invalid row detected and skipped: {row}")
                            continue  # Skip invalid rows
                        try:
                            writer.writerow(row)
                        except ValueError as e:
                            print(f"Error writing row: {row}, Error: {e}")

    print(f"Finished processing prompt: {prompt_info['name']}")


# Process each prompt
# read_json returns None when the configuration file is missing or is not
# valid JSON, so guard before calling .get() on the result.
if prompts is None:
    print("No prompt configuration available; skipping dataset generation.")
else:
    for prompt_info in prompts.get("prompts", []):
        # NOTE: the third positional argument binds to process_prompt's
        # `output_file` parameter, which that function does not read.
        process_prompt(prompt_info, prompt_info['emails_to_generate'], fieldnames)

    print("Email dataset generation complete.")



Processing prompt: General Dataset
Saving data to file: ./datasets/General_Dataset.csv
Finished processing prompt: General Dataset

Processing prompt: Edge Cases
Saving data to file: ./datasets/Edge_Cases.csv
Finished processing prompt: Edge Cases

Processing prompt: Language and Cultural Diversity
Saving data to file: ./datasets/Language_and_Cultural_Diversity.csv
Finished processing prompt: Language and Cultural Diversity

Processing prompt: Special Service Requests (SSR) Emphasis
Saving data to file: ./datasets/Special_Service_Requests_(SSR)_Emphasis.csv
Finished processing prompt: Special Service Requests (SSR) Emphasis

Processing prompt: High Complexity
Saving data to file: ./datasets/High_Complexity.csv
Finished processing prompt: High Complexity

Processing prompt: Sentiment Variations
Saving data to file: ./datasets/Sentiment_Variations.csv
Finished processing prompt: Sentiment Variations

Processing prompt: Diverse Writing Styles and Formats
Saving data to file: ./datasets/D

### Combining the Generated Datasets

In this section, the previously created datasets are combined into a single dataset.

In [None]:
def combine_datasets(directory, output_file):
    """
    Merge every ';'-delimited CSV file in ``directory`` into ``output_file``.

    Files are read in sorted name order so the result is reproducible. The
    output file itself is skipped when it lives in ``directory``, so running
    the combination again does not feed the previous result back in and
    duplicate the data. Rows that are a repeated header line (``subject``
    column exactly equal to 'subject') are removed.

    Returns the combined DataFrame. When no input CSV file is found, nothing
    is written and an empty DataFrame is returned.
    """
    output_path = os.path.abspath(output_file)

    # List to store the DataFrames
    dataframes = []

    # Read each CSV file in the directory
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory, filename)
        if os.path.abspath(file_path) == output_path:
            continue  # never ingest a previously written combined file
        print(f"Reading file: {file_path}")
        dataframes.append(pd.read_csv(file_path, delimiter=';'))

    if not dataframes:
        # pd.concat raises on an empty list, so report and stop here.
        print(f"No CSV files found in {directory}; nothing to combine.")
        return pd.DataFrame()

    # Concatenate all DataFrames into a single one
    combined = pd.concat(dataframes, ignore_index=True)

    # Remove duplicate headers in the content (if any). An exact comparison is
    # used so that real emails whose subject merely contains the word
    # "subject" are kept.
    combined = combined[combined['subject'] != 'subject']

    # Save the result into a single CSV file
    combined.to_csv(output_file, index=False, sep=';')
    print(f"Combined dataset saved to {output_file}")
    return combined


# Path to the directory containing the CSV files
directory = './datasets'

# Name of the output file
output_file = './datasets/combined_dataset.csv'

if os.path.isdir(directory):
    combined_df = combine_datasets(directory, output_file)
else:
    print(f"Directory {directory} not found; nothing to combine.")

Reading file: ./datasets\Customer_Status_Unknown.csv
Reading file: ./datasets\Diverse_Writing_Styles_and_Formats.csv
Reading file: ./datasets\Edge_Cases.csv
Reading file: ./datasets\General_Dataset.csv
Reading file: ./datasets\High_Complexity.csv
Reading file: ./datasets\Language_and_Cultural_Diversity.csv
Reading file: ./datasets\Sentiment_Variations.csv
Reading file: ./datasets\Special_Service_Requests_(SSR)_Emphasis.csv
Reading file: ./datasets\Tool_Requirement_Variations.csv
Reading file: ./datasets\Urgency_and_Priority_Levels.csv
Combined dataset saved to ./datasets/combined_dataset.csv
