In [21]:
import json
import os
import re

from bson import ObjectId
from mistralai import Mistral
from pymongo import MongoClient


In [41]:
# Initialize the Mistral client.
# The API key is read from the MISTRAL_API_KEY environment variable so that no
# secret is stored in the notebook. Any key that was previously committed in
# this cell should be treated as leaked and revoked.
_mistral_api_key = os.environ.get("MISTRAL_API_KEY")
if not _mistral_api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; export it before running this notebook."
    )
mistral_client = Mistral(api_key=_mistral_api_key)
model = "mistral-large-latest"  # Model name passed to every chat completion call

# Connect to MongoDB. MONGODB_URI overrides the default local instance.
client = MongoClient(os.environ.get("MONGODB_URI", "mongodb://localhost:27017/"))
db = client['emails']
collection = db['sample_of_20_mails']  # source emails read by get_document_by_id
api_response_collection = db['for-individual-with-cs']  # summarized/classified results


In [43]:
def summarize_and_classify_text(input_text):
    """Ask the Mistral chat model for a heading, summary and classification.

    Args:
        input_text: Text of the email (selected headers plus body) to summarize.

    Returns:
        The content of the first choice of the chat response. The prompt asks
        for three parts (heading, roughly 20-word summary, and one of 'Spam',
        'Important' or 'Other') separated by blank lines, but the reply is
        returned as-is without validating that the model complied.
    """
    # Every fragment ends with a space: adjacent string literals are joined
    # verbatim, so a missing space would fuse two sentences ("heading.Classify").
    prompt = (
        "Please summarize the following text into 20 words, provide a new heading. "
        "Classify the content as 'Spam', 'Important', or 'Other' by analyzing in detail. Mark important only if it is. "
        "Reply in plain text; do not reply in bold text. "
        "Separate each part by adding two new lines and do not give any sub-headings like Subject, Summary, or Classification. "
        f"Separate subject, summary, and classification by two new lines:\n\n{input_text}"
    )

    chat_response = mistral_client.chat.complete(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return chat_response.choices[0].message.content


In [45]:
# Retrieve document by ObjectId
def get_document_by_id(object_id):
    """Fetch one document from `collection` by its ObjectId and print it.

    Args:
        object_id: Value accepted by `ObjectId(...)`, e.g. a 24-character hex
            string.

    Returns:
        The matching document, or None when no document matches or when the
        lookup raises (the error is printed instead of being propagated).
    """
    try:
        document = collection.find_one({"_id": ObjectId(object_id)})
    except Exception as e:
        # Best-effort lookup: report invalid ids / connection problems and
        # let the caller treat the document as unavailable.
        print(f"Error retrieving document: {e}")
        return None

    if document is None:
        # find_one returns None when nothing matches; say so explicitly rather
        # than failing later while subscripting None.
        print(f"No document found for ObjectId {object_id}")
        return None

    print("\nFetched Document:\n")
    # Print the content in plain text format. Optional fields are read with
    # .get so that a document lacking them is still returned to the caller.
    print(f"ID: {document['_id']}")
    print(f"File: {document.get('file', '')}")
    print("Message:")
    message = document.get('message') or ''
    # Expand literal backslash-n sequences into real newlines, for display only.
    print(str(message).replace('\\n', '\n'))
    print("\n------------------------------------------------------------------------------------------------------------------------------")
    return document


In [47]:
# Parse message content using regex
def parse_message(message):
    """Parse the header section of a raw email into a dict and print it.

    Only the text before the first blank line is parsed, so body lines that
    happen to look like "Word: text" are not mistaken for headers. When the
    message contains no blank line, the whole message is scanned.

    Args:
        message: Raw email text ("Key: value" header lines, blank line, body).

    Returns:
        dict mapping header name to its stripped value. A header that occurs
        more than once maps to a list of its values in order of appearance;
        a header with no value maps to ''. When a 'From' header is present, an
        extra 'Organization' key holds the first label of the sender's domain
        (e.g. 'enron' for 'someone@enron.com').
    """
    separator = re.search(r'\r?\n\r?\n', message)
    header_text = message[:separator.start()] if separator else message

    # [ \t]* instead of \s*: \s also matches newlines, which let an empty
    # header such as "X-cc:" swallow the following line as its value.
    pattern = re.compile(r'^([A-Za-z\-]+):[ \t]*(.*)$', re.MULTILINE)

    parsed_data = {}
    for match in pattern.finditer(header_text):
        key = match.group(1).strip()
        value = match.group(2).strip()

        if key in parsed_data:
            # Repeated header: promote the stored value to a list and append.
            if not isinstance(parsed_data[key], list):
                parsed_data[key] = [parsed_data[key]]
            parsed_data[key].append(value)
        else:
            parsed_data[key] = value

    # Extract domain (mail server) from the "From" field
    if 'From' in parsed_data:
        sender = parsed_data['From']
        if isinstance(sender, list):
            # Repeated From headers are stored as a list; use the first one.
            sender = sender[0]
        domain = sender.split('@')[-1]  # part after the last '@'
        parsed_data['Organization'] = domain.split('.')[0]  # e.g. google from google.com

    print("\n\n\nParsed Message Data:\n")
    for key, value in parsed_data.items():
        print(f"{key}: {value}")
    print("\n------------------------------------------------------------------------------------------------------------------------------")

    return parsed_data


In [49]:
# Extract unstructured text (body) from message
def extract_body(message):
    """Return the body of a raw email, i.e. the text after the first blank line.

    The header/body separator is recognised with either LF or CRLF line
    endings. The extracted body is printed as a side effect.

    Args:
        message: Raw email text.

    Returns:
        The body with surrounding whitespace stripped, or '' when the message
        contains no blank line.
    """
    # The first empty line ends the header section.
    separator = re.search(r'\r?\n\r?\n', message)
    body = message[separator.end():].strip() if separator else ""
    print("\n\n\nExtracted Body Text:\n")
    print(body)
    print("\n------------------------------------------------------------------------------------------------------------------------------")
    return body


In [51]:
# Build result string for summarization
def build_result_string(parsed_data):
    """Format selected headers and the body as the text block sent to the API.

    Args:
        parsed_data: Mapping of parsed headers. 'Message-ID', 'From',
            'Subject' and 'Date' are optional (rendered as '' when absent);
            'Unstructured-Text' is required.

    Returns:
        One string with a line per field, each in the form
        "<label>: <tab> <value>" and preceded by a newline.

    Raises:
        KeyError: if 'Unstructured-Text' is not in parsed_data.
    """
    optional_fields = ('Message-ID', 'From', 'Subject', 'Date')
    header_part = "".join(
        f"\n{field}: \t {parsed_data.get(field, '')}" for field in optional_fields
    )
    # The body is mandatory: a direct lookup keeps the KeyError for callers.
    body_part = f"\nBody: \t {parsed_data['Unstructured-Text']}"
    result_string = header_part + body_part

    print("\n\n\nResult String (to be sent to API):\n")
    print(result_string)
    print("\n------------------------------------------------------------------------------------------------------------------------------")
    return result_string


In [53]:
# Insert summarized and classified response into MongoDB
def _split_summary_output(summary_output):
    """Split the model reply into (subject, summary, classification), or None."""
    # Blank lines are only separators, so drop them instead of relying on the
    # reply using exactly one blank line between parts.
    parts = [line.strip() for line in summary_output.splitlines() if line.strip()]
    if len(parts) < 3:
        return None
    # First part is the heading, last is the classification; anything in
    # between (a summary wrapped over several lines) is joined into one string.
    return parts[0], " ".join(parts[1:-1]), parts[-1]


def insert_summarized_response(parsed_data, summary_output):
    """Store the summarized and classified email in `api_response_collection`.

    Args:
        parsed_data: Parsed headers; 'Message-ID', 'From', 'To' and
            'Organization' are copied when present, otherwise stored as ''.
        summary_output: Text returned by the model, expected to contain a
            heading, a summary and a classification on separate lines.

    Returns:
        None. The inserted record is printed; if the reply has fewer than
        three non-empty lines an error is printed and nothing is inserted.
    """
    split_output = _split_summary_output(summary_output)
    if split_output is None:
        print("Error: API response did not return the expected format.")
        return

    subject, summarized_body, classification = split_output

    new_response = {
        "Message-ID": parsed_data.get("Message-ID", ""),
        "From": parsed_data.get("From", ""),
        "To": parsed_data.get("To", ""),
        "Subject": subject,
        "Summarized-Body": summarized_body,
        "Organization": parsed_data.get("Organization", ""),
        "Classification": classification
    }

    api_response_collection.insert_one(new_response)
    # Report the collection actually written to rather than a hard-coded name.
    print(f"\n\n\nNew Response Added to '{api_response_collection.name}':\n")
    for key, value in new_response.items():
        print(f"{key}: {value}")
    print("\n------------------------------------------------------------------------------------------------------------------------------")


In [55]:
def process_document_by_id(object_id):
    """Fetch, parse, summarize and store a single email document.

    Calls, in order: get_document_by_id, parse_message, extract_body,
    build_result_string, summarize_and_classify_text and
    insert_summarized_response. Prints a notice and stops when the document
    cannot be fetched.

    Args:
        object_id: Identifier passed through to get_document_by_id.
    """
    # Step 1: load the document; bail out early when nothing usable came back.
    document = get_document_by_id(object_id)
    if not document:
        print(f"Document with Object-ID {object_id} not found.")
        return

    # Step 2: split the raw message into headers ...
    raw_message = document.get("message", "")
    parsed_data = parse_message(raw_message)

    # Step 3: ... and keep the body text alongside them.
    parsed_data['Unstructured-Text'] = extract_body(raw_message)

    # Step 4: assemble the text handed to the model.
    api_input = build_result_string(parsed_data)

    # Step 5: ask the API for heading, summary and classification.
    summary_output = summarize_and_classify_text(api_input)
    print("\n\n\nSummary Output from API:\n")
    print(summary_output)
    print("\n------------------------------------------------------------------------------------------------------------------------------")

    # Step 6: persist the result in MongoDB.
    insert_summarized_response(parsed_data, summary_output)


In [61]:
# Example usage: process a specific document by Object-ID.
# The id is given as a hex string; get_document_by_id converts it with ObjectId().
# Running this cell calls the Mistral API and inserts one record into MongoDB.
object_id_to_process = "6708a9fc2a972ccfae483e87"  # Replace with the actual Object-ID
process_document_by_id(object_id_to_process)



Fetched Document:

ID: 6708a9fc2a972ccfae483e87
File: allen-p/_sent_mail/110.
Message:
Message-ID: <90123456.8899001122.JavaMail.evans@thyme>  
Date: Wed, 14 Feb 2001 11:30:00 -0700 (PDT)  
From: phillip.allen@enron.com  
To: lily.james@enron.com  
Subject: Coffee Break?  
Mime-Version: 1.0  
Content-Type: text/plain; charset=us-ascii  
Content-Transfer-Encoding: 7bit  
X-From: Phillip K Allen  
X-To: Lily James  
X-cc:  
X-bcc:  
X-Folder: \Phillip_Allen_Feb2001\'Sent Mail  
X-Origin: Allen-P  
X-FileName: pallen.nsf  

Hello Lily,

Just wondering if you'd like to grab a coffee later today. Let me know if you're free!

------------------------------------------------------------------------------------------------------------------------------



Parsed Message Data:

Message-ID: <90123456.8899001122.JavaMail.evans@thyme>
Date: Wed, 14 Feb 2001 11:30:00 -0700 (PDT)
From: phillip.allen@enron.com
To: lily.james@enron.com
Subject: Coffee Break?
Mime-Version: 1.0
Content-Type: text/plain

In [None]:
'''
Important Emails
6708a9fc2a972ccfae483e78, 6708a9fc2a972ccfae483e80, 6708a9fc2a972ccfae483e81, 6708a9fc2a972ccfae483e84, 6708a9fc2a972ccfae483e89, 6708a9fc2a972ccfae483e90

Spam Emails
6708a9fc2a972ccfae483e79, 6708a9fc2a972ccfae483e82, 6708a9fc2a972ccfae483e85, 6708a9fc2a972ccfae483e86, 6708a9fc2a972ccfae483e88, 6708a9fc2a972ccfae483e92

Other Emails
6708a9fc2a972ccfae483e80, 6708a9fc2a972ccfae483e83, 6708a9fc2a972ccfae483e87, 6708a9fc2a972ccfae483e91, 6708a9fc2a972ccfae483e93
'''