In [3]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import spacy
import re

In [4]:
# spaCy's small English pipeline, loaded for Named Entity Recognition.
nlp = spacy.load("en_core_web_sm")

# The tokenizer and the T5 weights are read from the same local checkpoint
# directory, so the path is spelled once and shared by both loaders.
MODEL_DIR = "./fine_tuned_model_3"
tokenizer = T5Tokenizer.from_pretrained(MODEL_DIR)
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)

In [6]:
import re

def clean_email(raw_email):
    """Mask identifying content in a raw email with bracketed placeholders.

    The steps run in this order, each on the output of the previous one:

    1. Whole ``Message-ID:``, ``Date:``, ``From:``, ``To:`` and ``Subject:``
       lines become ``[MESSAGE_ID]``, ``[DATE]``, ``[FROM]``, ``[TO]`` and
       ``[SUBJECT LINE]``.
    2. Lines starting with ``Mime-Version:``, ``Content-Type:``,
       ``Content-Transfer-Encoding:``, ``X-``, ``FYI`` or
       ``----- Forwarded by`` are dropped.
    3. Email addresses become ``[EMAIL]``.
    4. Phone numbers become ``[PHONE_NUMBER]``.
    5. A fixed list of company names becomes ``[COMPANY]``.
    6. Runs of one to three capitalised words become ``[NAME]``. This is a
       purely lexical pattern, so ordinary capitalised words such as a
       sentence-initial "Dear" are masked as well.
    7. Upper-case/digit tokens of five or more characters become
       ``[IDENTIFIER]``.
    8. Newline runs are collapsed to a single space and the result is stripped.

    Args:
        raw_email (str): The email text, optionally including header lines.

    Returns:
        str: The masked, single-line email text.
    """
    text = raw_email

    # Header lines that are kept as a placeholder. MULTILINE is required so
    # that ``^`` matches at every line start, not only at the start of the
    # string. The newline is re-inserted after the placeholder so the next
    # header still begins its own line (and so placeholders end up separated
    # by a space once newlines are collapsed below).
    header_placeholders = (
        ('Message-ID', '[MESSAGE_ID]'),
        ('Date', '[DATE]'),
        ('From', '[FROM]'),
        ('To', '[TO]'),
        ('Subject', '[SUBJECT LINE]'),
    )
    for field, placeholder in header_placeholders:
        text = re.sub(r'(?im)^' + re.escape(field) + r':.*\n?', placeholder + '\n', text)

    # Lines that carry no useful content and are removed entirely.
    dropped_line_prefixes = (
        'Mime-Version:',
        'Content-Type:',
        'Content-Transfer-Encoding:',
        'X-',
        'FYI',
        '----- Forwarded by',
    )
    for prefix in dropped_line_prefixes:
        text = re.sub(r'(?im)^' + re.escape(prefix) + r'.*\n?', '', text)

    # Email addresses. The TLD class is [A-Za-z]; a "|" inside the class would
    # be a literal pipe and would swallow text following the address.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b', '[EMAIL]', text)

    # Phone numbers. The international form goes first; otherwise the national
    # pattern consumes the trailing digits and leaves the "+<country code>"
    # prefix behind.
    text = re.sub(r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4}', '[PHONE_NUMBER]', text)
    # National form, with or without a parenthesised area code.
    text = re.sub(
        r'(?:\(\d{3}\)\s?|\b\d{3}[-.\s]?)\d{3}[-.\s]?\d{4}\b',
        '[PHONE_NUMBER]',
        text,
    )

    # Company names must be masked before the name pattern, which would
    # otherwise turn every capitalised company name into [NAME] first.
    text = re.sub(
        r'\b(?:Enron|ExxonMobil|Amazon|Google|Microsoft|Facebook|Tesla|Apple)\b',
        '[COMPANY]',
        text,
    )

    # Personal names (basic lexical pattern: 1-3 capitalised words).
    text = re.sub(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+){0,2}\b', '[NAME]', text)

    # Identifiers (contract numbers, transaction IDs, ...). Placeholders that
    # were inserted above are matched first and passed through unchanged, so
    # tokens such as EMAIL, COMPANY or SUBJECT are not rewritten a second time.
    placeholder_pattern = (
        r'\[(?:MESSAGE_ID|DATE|FROM|TO|SUBJECT LINE|EMAIL|PHONE_NUMBER|'
        r'NAME|COMPANY|IDENTIFIER)\]'
    )
    text = re.sub(
        '(' + placeholder_pattern + r')|\b[A-Z0-9]{5,}\b',
        lambda match: match.group(1) or '[IDENTIFIER]',
        text,
    )

    # Flatten to a single line.
    text = re.sub(r'\n+', ' ', text)
    return text.strip()


In [8]:
def generate_synthetic_email_with_context(prompt="Form of Memorandum of Option Attached is a copy of our proposed Memorandum of Option that we would like to  use for our land options.", max_length=300, num_return_sequences=5):
    """Sample several synthetic emails from the module-level model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt (str): Text that is tokenized and fed to the model.
        max_length (int): Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences (int): How many sampled sequences to return for
            the prompt. Defaults to 5, the value that was previously fixed.

    Returns:
        list[str]: The decoded sequences with special tokens removed, one
        entry per returned sequence.
    """
    # Tokenize the prompt and place the ids on the same device as the model so
    # the call also works when the model has been moved off the CPU.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

    # Sampling-based generation for diverse outputs.
    output_ids = model.generate(
        input_ids,
        max_length=max_length,                      # Upper bound passed through from the caller
        do_sample=True,                             # Sample from the distribution instead of taking the argmax
        num_beams=1,                                # No beam search
        top_p=0.9,                                  # Nucleus sampling: top 90% probability mass
        top_k=50,                                   # Restrict sampling to the 50 most likely tokens
        temperature=1.0,                            # 1.0 leaves the model's distribution unscaled
        no_repeat_ngram_size=2,                     # Never repeat the same 2-gram
        num_return_sequences=num_return_sequences,  # Number of variations to sample
    )

    # Decode every generated id sequence back into text.
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Generation below uses random sampling; seed PyTorch's RNG first so that
# re-running the notebook from a fresh kernel reproduces the same emails.
SEED = 42
torch.manual_seed(SEED)

# Generate 5 synthetic emails
generated_emails = generate_synthetic_email_with_context(prompt="""| william.giuliani@enron.com | andrew.fastow@enron.com | 2001-06-07 07:48:00 |
Subject: Approval of the DPR Accelerated Put transaction
Dear Andrew,

Attached is the DASH for the approval of the DPR Accelerated Put transaction. This
partial divestiture allows us to put $11 million of our equity interest back to DPR
Holding Company, LLC, and its subsidiary, Dakota, LLC. Both entities are controlled
by Chris Cline.
In addition to redeeming part of our equity interest, the deal provides us with
900,000 tons of coal priced below market, an option which could lead to a very
profitable synfuel project, and the potential for more marketing fees from other
Cline entities.
The DASH has been approved and signed by RAC and JEDI II and is now awaiting
Mark Haedicke’s review and approval. I wanted to give you the opportunity to review
the DASH and become familiar with the provisions of the deal.
If you have any questions on the transaction, feel free to contact me at (412) 490-
9048. Others familiar with the deal are Mike Beyer, George McClellan, and Wayne
Gresham.
Thank you.
Best regards,
Bill Giuliani""")

# Print the generated emails, each followed by a dashed separator
for i, email in enumerate(generated_emails, 1):
    print(f"Generated Email {i}:\n")
    print(email)
    print("\n" + "-"*50 + "\n")

Generated Email 1:

Approval of the DPR Accelerated Put transaction Dear Andrew, Attached is the dASH for the approval of this. This partial divestiture allows us to put $11 million of our equity interest back to DRP Holding Company, LLC, and its subsidiary, Dakota, Inc. Both entities are controlled by Chris Cline. In addition to redeeming part of its equity interests, the deal provides us with 900,000 tons of coal priced below market, an option which could lead to a very profitable synfuel

--------------------------------------------------

Generated Email 2:

Approval of the DPR Accelerated Put transaction Dear Andrew, Attached is the LASH for the approval of. The DASH has been approved and signed by RAC and JEDI II and is now waiting Mark Haedicke’s review and approval ; i wanted to give you the opportunity to review the, the ACL & the deal if you have any questions ad hoc. the original DAR will be delivered at (412) 490- 9048. For more information on

-----------------------------