In [1]:
# Import necessary libraries
import json
import os
import re

from bson import ObjectId  # Import ObjectId
from mistralai import Mistral
from pymongo import MongoClient

# Initialize the Mistral client.
# The API key is read from the MISTRAL_API_KEY environment variable so that no
# credential is stored in the notebook, its outputs, or its version history.
# Any key that was ever embedded in this file must be considered compromised
# and rotated.
_mistral_api_key = os.environ.get("MISTRAL_API_KEY")
if not _mistral_api_key:
    # Fail early with an actionable message instead of an opaque auth error
    # on the first API call.
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; export it before running this notebook."
    )
mistral_client = Mistral(api_key=_mistral_api_key)
model = "mistral-large-latest"  # Model name sent with every chat completion request

# Function to summarize and classify text
def summarize_and_classify_text(input_text):
    """Ask the chat model for a heading, a short summary and a classification.

    The instruction prompt requests a 20-word summary, a new heading and one of
    the labels 'Spam', 'Important' or 'Other', with the parts separated by blank
    lines. The reply is returned as-is; parsing is left to the caller.

    Args:
        input_text: Text to analyse. It is appended verbatim after the
            instructions.

    Returns:
        The message content of the first choice in the chat response.

    Note:
        Uses the module-level ``mistral_client`` and ``model``. Errors raised by
        the client call are not caught here.
    """
    # Adjacent string literals are joined with no separator, so each sentence
    # carries its own trailing space; without it the model receives run-together
    # text such as "headingClassify" and "it is.Reply".
    prompt = (
        "Please summarize the following text into 20 words, provide a new heading. "
        "Classify the content as 'Spam', 'Important', or 'Other' by analyzing in detail mark important only if it is. "
        "Reply in plain text; do not reply in bold text. "
        "Separate each part by adding two new lines and do not give any sub-headings like Subject, Summary or Classification, just separate subject, summary and classification by two new lines:\n\n"
        f"{input_text}"
    )

    chat_response = mistral_client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ]
    )

    return chat_response.choices[0].message.content

# Connect to the MongoDB database
client = MongoClient('mongodb://localhost:27017/')
db = client['emails']
collection = db['sample_of_20_mails']
api_response_collection = db['for-bulk-with-cs']

# Specify the Object-ID to process
object_id_to_process = "6708a9fc2a972ccfae483e93"  # Replace with the actual Object-ID you want to process

# Matches one "Key: value" header line. Only spaces/tabs are skipped after the
# colon and the value stops at the end of the line, so a header with an empty
# value cannot swallow the following line as its value.
_HEADER_PATTERN = re.compile(r'^([A-Za-z\-]+):[ \t]*(.*)$', re.MULTILINE)


def _parse_email_message(message):
    """Split a raw email into header fields plus its body.

    Headers are read only from the text before the first blank line, so
    "Key: value" lines inside the body (for example quoted headers of a
    forwarded mail) do not overwrite or duplicate the real headers. When the
    message has no blank line, the whole text is treated as headers and the
    body is empty.

    Returns:
        dict mapping each header name to its value (or to a list of values when
        the header is repeated), plus 'Unstructured-Text' holding the body.
    """
    header_text, separator, body_text = message.partition('\n\n')

    parsed = {}
    for match in _HEADER_PATTERN.finditer(header_text):
        key = match.group(1).strip()
        value = match.group(2).strip()
        if key not in parsed:
            parsed[key] = value
        elif isinstance(parsed[key], list):
            parsed[key].append(value)
        else:
            # Second occurrence of a header: promote the single value to a list
            parsed[key] = [parsed[key], value]

    parsed['Unstructured-Text'] = body_text.strip() if separator else ""
    return parsed


def _parse_summary_output(summary_output):
    """Split the model reply into (subject, summarized_body, classification).

    The reply is expected to contain three parts separated by blank lines. If
    fewer than three blank-line-separated parts are found, non-empty lines are
    used instead. The first part is the subject, the last the classification,
    and everything in between is joined into the summary.

    Returns:
        A 3-tuple of strings, or None when fewer than three parts are present.
    """
    text = (summary_output or "").strip()
    parts = [part.strip() for part in re.split(r'\n\s*\n', text) if part.strip()]
    if len(parts) < 3:
        parts = [line.strip() for line in text.splitlines() if line.strip()]
    if len(parts) < 3:
        return None
    return parts[0], " ".join(parts[1:-1]), parts[-1]


# Retrieve the document by Object-ID
try:
    document = collection.find_one({"_id": ObjectId(object_id_to_process)})
except Exception as e:
    # Best effort: an invalid ID or a database failure is reported and the
    # cell continues as if the document had not been found.
    print(f"Error retrieving document: {e}")
    document = None

# Tracks whether a document was actually written, so the final status line
# does not claim success when the model reply could not be parsed.
response_added = False

if document:
    # Round-trip through JSON so non-JSON values (e.g. the ObjectId) become strings
    document_detail = json.dumps(document, default=str)
    data = json.loads(document_detail)

    # Extract the 'message' field; a missing or null field is treated as empty
    message = data.get("message") or ""

    # Parse headers and body out of the raw message
    parsed_data = _parse_email_message(message)

    # Include other fields from the main document if necessary
    parsed_data['_id'] = data.get('_id', "")
    parsed_data['file'] = data.get('file', "")

    # Prepare the result string for summarization
    result_string = (
        f"\nMessage-ID: \t {parsed_data.get('Message-ID', '')}"
        f"\nFrom: \t {parsed_data.get('From', '')}"
        f"\nSubject: \t {parsed_data.get('Subject', '')}"
        f"\nDate: \t {parsed_data.get('Date', '')}"
        f"\nBody: \t {parsed_data['Unstructured-Text']}"
    )

    # Get the summary and classification of the message
    summary_output = summarize_and_classify_text(result_string)
    print(summary_output)

    # Split the output into subject, summarized body and classification
    parsed_summary = _parse_summary_output(summary_output)

    if parsed_summary is not None:
        subject, summarized_body, classification = parsed_summary

        # Prepare the new document to be inserted into the api_response collection
        new_response = {
            "Message-ID": parsed_data.get("Message-ID", ""),
            "From": parsed_data.get("From", ""),
            "To": parsed_data.get("To", ""),  # Ensure 'To' is included if it's available
            "Subject": subject,  # Use the new subject
            "Summarized-Body": summarized_body,  # The summarized body
            "Classification": classification  # Add classification
        }

        # Insert the new document into the api_response collection
        api_response_collection.insert_one(new_response)
        response_added = True

    else:
        print("Error: API response did not return the expected format.")
else:
    print(f"Document with Object-ID {object_id_to_process} not found.")

# Report whether a new response was actually stored
print("\nOne new response added to 'for-bulk-with-cs'." if response_added else "No response added.")


Catching Up

Phillip Allen asks John to catch up.

Other

One new response added to 'for-bulk-with-cs'.


In [None]:
# Lists of document ObjectIds grouped under three classification labels.
# The triple-quoted string is a bare expression: it runs no logic, and no code
# visible in this notebook reads these IDs.
# NOTE(review): presumably these record the label obtained for each email in
# 'sample_of_20_mails' — confirm where the lists came from.
# NOTE(review): 6708a9fc2a972ccfae483e80 appears under both "Important Emails"
# and "Other Emails" — confirm which label is correct.
'''
Important Emails
6708a9fc2a972ccfae483e78, 6708a9fc2a972ccfae483e80, 6708a9fc2a972ccfae483e81, 6708a9fc2a972ccfae483e84, 6708a9fc2a972ccfae483e89, 6708a9fc2a972ccfae483e90

Spam Emails
6708a9fc2a972ccfae483e79, 6708a9fc2a972ccfae483e82, 6708a9fc2a972ccfae483e85, 6708a9fc2a972ccfae483e86, 6708a9fc2a972ccfae483e88, 6708a9fc2a972ccfae483e92

Other Emails
6708a9fc2a972ccfae483e80, 6708a9fc2a972ccfae483e83, 6708a9fc2a972ccfae483e87, 6708a9fc2a972ccfae483e91, 6708a9fc2a972ccfae483e93
'''