In [None]:
# Import libraries
import os
import re

import pandas as pd
from openai import OpenAI

In [None]:
# Connect to OpenAI
# The API key is read from the environment so the secret is never stored in
# the notebook (or in its saved outputs / version history).
key = os.environ.get("OPENAI_API_KEY")
if not key:
    # Fail early with an actionable message instead of on the first API call.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
client = OpenAI(api_key=key)

In [None]:
# Load preprocessed data
# Later cells read the "body_new" and "Message-ID" columns from this frame.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# come from a trusted source.
df = pd.read_pickle("./../data/df_selection.pkl")

In [None]:
# Define function to validate email entries
def email_valid(email):
  """Ask the chat model whether a message looks like it was written by a human.

  Args:
    email: Message text; it is interpolated into the user prompt as a string.

  Returns:
    True if the model's reply, ignoring surrounding whitespace, is exactly "1".
    False for any other reply, including an empty or missing one.
  """
  response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
      {
        "role": "system",
        "content": "You will be provided with a text message. Please return 1 if you think the message was written by a human, or 0 if you think it was computer generated, represents a calendar invite, or similar."
      },
      {
        "role": "user",
        "content": f"{email}"
      }
    ],
    max_tokens=64,
    top_p=1
  )

  # The reply content may be None, and the model may pad its answer with
  # whitespace or a trailing newline; normalise before comparing so a valid
  # "1" is not rejected by an exact string match.
  answer = response.choices[0].message.content
  return (answer or "").strip() == "1"


In [None]:
# Define function to label sentences
def label_sentence(sentence):
  """Ask the chat model for a sentiment label for a single sentence.

  The system prompt requests 'p' (positive), 'n' (negative) or 0 (neutral).

  Args:
    sentence: Sentence text; it is interpolated into the user prompt as a string.

  Returns:
    The model's reply with surrounding whitespace and quote characters removed
    and lower-cased, so that variants such as "P", "'p'" or "p\\n" all map to
    "p". Returns None if the model returned no content. The reply is not
    validated against the set of expected labels.
  """
  response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
      {
        "role": "system",
        "content": "As an expert in corporate communications, analyze the sentiment of the following sentence. Take into account the subtle cues and formalities often present in business emails. Return the label 'p' if you think the message is positive, 'n' if the message is negative or 0 if you think the sentiment of the message is neutral. Provide no explanation, only the label."
      },
      {
        "role": "user",
        "content": f"{sentence}"
      }
    ],
    max_tokens=64,
    top_p=1
  )

  answer = response.choices[0].message.content
  if answer is None:
    return None
  # Normalise formatting noise so equivalent replies yield a single label value.
  return answer.strip().strip("'\"").lower()

In [None]:
# Define function to split messages into sentences
def extract_sentences(message_body):
    """Split a message body into cleaned sentences.

    The text is split on whitespace that follows '?', '.' or '!'. Each piece is
    stripped, runs of two or more spaces are collapsed to one, and pieces of
    10 characters or fewer are dropped.

    Args:
        message_body: Message text. Non-string values (e.g. None, or NaN from a
            missing pandas value) are treated as having no sentences.

    Returns:
        List of sentence strings, in their original order.
    """
    # Missing bodies would otherwise make re.split raise TypeError and abort
    # the caller's loop.
    if not isinstance(message_body, str):
        return []

    pattern = r'(?<=[\?\.\!])\s+'
    sentences = [sentence.strip() for sentence in re.split(pattern, message_body)]  # split and strip sentences
    sentences = [re.sub(r'[ ]{2,}', " ", sentence) for sentence in sentences]  # remove multiple spaces
    return [sentence for sentence in sentences if len(sentence) > 10]  # filter out short sentences

In [None]:
# Run the pipeline
# For every email: check that it looks human-written, split it into sentences,
# label each sentence, and collect one record per sentence in `labeled`.
labeled = []

# enumerate() drives the counter so it also advances for emails skipped as
# invalid; progress messages and checkpoints stay in step with the position in df.
for count, (i, row) in enumerate(df.iterrows(), start=1):
    print("Processing email", count, "of", len(df))
    print(row["body_new"])
    valid = email_valid(row["body_new"])
    if valid:
        sentences = extract_sentences(row["body_new"])
        print("Extracted", len(sentences), "sentences:", sentences)
        for sentence in sentences:
            label = label_sentence(sentence)
            labeled.append({"Message-ID": row["Message-ID"], "sentence": sentence, "label": label})
    else:
        print("Email not valid")

    # Periodic checkpoint so an interrupted run keeps most of its results.
    if count % 10 == 0:
        print(f"Processed {count} emails, saving...")
        pd.DataFrame(labeled).to_csv("./../data/df_labeled.csv")

    print("=====================================")

# Final save: without it, records collected after the last checkpoint would
# never be written to the CSV.
pd.DataFrame(labeled).to_csv("./../data/df_labeled.csv")



In [None]:
# Load the labeled data back from disk
# NOTE(review): if the CSV was written with its index, an "Unnamed: 0" column
# will appear in this frame — confirm whether index_col=0 is wanted here.
df_labeled = pd.read_csv("./../data/df_labeled.csv")

In [None]:
# Further filtering
# Two successive row filters on the "sentence" column, then write the result:
#   1. keep only sentences that are at least 30 characters long
#   2. drop sentences containing a date written as two digits / two digits / four digits
df_labeled = (
    df_labeled
    .loc[lambda frame: frame['sentence'].str.len() >= 30]
    .loc[lambda frame: ~frame['sentence'].str.contains(r'\b\d{2}/\d{2}/\d{4}\b')]
)

df_labeled.to_csv("./../data/df_labeled_filtered.csv")