In [3]:
# Import necessary libraries
from pymongo import MongoClient
import json
import re
from mistralai import Mistral

In [5]:
# Connect to the MongoDB database
client = MongoClient('mongodb://localhost:27017/')
db = client['emails']
collection = db['complete_details']
api_response_collection = db['for-bulk']

In [25]:
# Specify how many documents to process
num_documents_to_process = 3

# Retrieve the documents
documents = collection.find().limit(num_documents_to_process)

In [27]:
# Initialize Mistral client
api_key = "IzwdiP0W04dwKFs1hRB3Mex6yhqHjPVV"
model = "mistral-large-latest"
client = Mistral(api_key=api_key)

In [29]:
def summarize_text(input_text):
    prompt = f"Please summarize the following text into 20 words and provide a new heading, separate both of them by two new lines do not give any sub-headings like Subject or Summary:\n\n{input_text}"
    
    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ]
    )
    return chat_response.choices[0].message.content

In [31]:
# Iterate through the documents
for docs in documents:
    # Convert the document to a JSON string
    document_detail = json.dumps(docs, default=str)
    # Load JSON
    data = json.loads(document_detail)

    # Extract the 'message' field
    message = data.get("message", "")
    # Initialize a dictionary to hold the parsed fields
    parsed_data = {}

    # Define a regex pattern to find key-value pairs in the message
    pattern = re.compile(r'^([A-Za-z\-]+):\s*(.+)', re.MULTILINE)
    
    # Find all matches and add them to the parsed_data dictionary
    for match in pattern.finditer(message):
        key = match.group(1).strip()  # Extract the key
        value = match.group(2).strip()  # Extract the value

        # Add the key-value pair to the dictionary
        if key in parsed_data:
            # If the key already exists, convert it to a list if not already
            if not isinstance(parsed_data[key], list):
                parsed_data[key] = [parsed_data[key]]
            parsed_data[key].append(value)  # Append the new value
        else:
            parsed_data[key] = value  # Add new key-value pair

    # Include other fields from the main document if necessary
    parsed_data['_id'] = data.get('_id', "")
    parsed_data['file'] = data.get('file', "")

    # Find the index of the first blank line to determine where the body starts
    body_start_index = message.find('\n\n')  # Look for the first occurrence of double newline

    if body_start_index != -1:
        # Extract the body of the email, starting after the blank line
        body = message[body_start_index + 2:].strip()  # Skip the blank line
        parsed_data['Unstructured-Text'] = body  # Add the body as unstructured text
    else:
        # If no blank line is found, set unstructured text as empty
        parsed_data['Unstructured-Text'] = ""

    # Prepare the result string for summarization
    result_string = (
        f"\nMessage-ID: \t {parsed_data['Message-ID']}"
        f"\nFrom: \t {parsed_data['From']}"
        f"\nSubject: \t {parsed_data['Subject']}"
        f"\nDate: \t {parsed_data['Date']}"
        f"\nBody: \t {parsed_data['Unstructured-Text']}"
    )
    print(result_string)
    # Get the summary of the message
    summary_output = summarize_text(result_string)
    
    # Prepare the new document to be inserted into the api_response collection
    new_response = {
        "Message-ID": parsed_data["Message-ID"],
        "From": parsed_data["From"],
        "To": parsed_data["To"],  # Ensure 'To' is included if it's available
        "Subject": summary_output.split('\n')[0],  # Use the first line of the summary as the subject
        "Summarized-Body": summary_output.split('\n')[2]  # The entire summary as the body
    }

    # Insert the new document into the api_response collection
    api_response_collection.insert_one(new_response)

    # Print the new response document for each processed email
    print(f"New response added to 'api_response': {new_response}")


Message-ID: 	 <18782981.1075855378110.JavaMail.evans@thyme>
From: 	 phillip.allen@enron.com
Subject: 	 Mime-Version: 1.0
Date: 	 Mon, 14 May 2001 16:39:00 -0700 (PDT)
Body: 	 Here is our forecast
New response added to 'api_response': {'Message-ID': '<18782981.1075855378110.JavaMail.evans@thyme>', 'From': 'phillip.allen@enron.com', 'To': 'tim.belden@enron.com', 'Subject': '**Enron Forecast Submitted**', 'Summarized-Body': 'Phillip Allen shares forecast on May 14, 2001.', '_id': ObjectId('670cea7e3b9c33f4ab7b0739')}

Message-ID: 	 <15464986.1075855378456.JavaMail.evans@thyme>
From: 	 phillip.allen@enron.com
Subject: 	 Re:
Date: 	 Fri, 4 May 2001 13:51:00 -0700 (PDT)
Body: 	 Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or nec

In [33]:
# Print the total number of new responses added
print(f"{num_documents_to_process} new responses added to 'api_response'.")

3 new responses added to 'api_response'.
