In [118]:
# Import necessary libraries
from pymongo import MongoClient
from bson.objectid import ObjectId
import json
import re

In [120]:
# Connect to the MongoDB database
client = MongoClient('mongodb://localhost:27017/')
db = client['emails']
collection = db['complete_details']

In [545]:
# Specify which document to retrieve
n = 5  # Change this number to retrieve different documents

# Retrieve the document
document = collection.find().skip(n - 1).limit(1)

In [547]:
# Initialize a variable to hold document details
document_detail = ""

In [549]:
# Iterate through the document cursor
for docs in document:
    # Convert the document to a JSON string
    document_detail = json.dumps(docs, default=str)

In [551]:
print(document_detail)

{"_id": "6708a9fc2a972ccfae483e79", "file": "allen-p/_sent_mail/1001.", "message": "Message-ID: <30922949.1075863688243.JavaMail.evans@thyme>\nDate: Thu, 31 Aug 2000 05:07:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: greg.piper@enron.com\nSubject: Re: Hello\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Greg Piper\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Dec2000\\Notes Folders\\'sent mail\nX-Origin: Allen-P\nX-FileName: pallen.nsf\n\nLet's shoot for Tuesday at 11:45.  "}


In [553]:
# Load JSON
data = json.loads(document_detail)

# Print the content in plain text format
print(f"ID: {data['_id']}")
print(f"File: {data['file']}")
print("Message:")
print(data['message'].replace('\\n', '\n'))

ID: 6708a9fc2a972ccfae483e79
File: allen-p/_sent_mail/1001.
Message:
Message-ID: <30922949.1075863688243.JavaMail.evans@thyme>
Date: Thu, 31 Aug 2000 05:07:00 -0700 (PDT)
From: phillip.allen@enron.com
To: greg.piper@enron.com
Subject: Re: Hello
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Greg Piper
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail
X-Origin: Allen-P
X-FileName: pallen.nsf

Let's shoot for Tuesday at 11:45.  


In [555]:
# Convert the JSON string to a dictionary
data = json.loads(document_detail)

# Extract the 'message' field
message = data.get("message", "")


In [557]:
# Initialize a dictionary to hold the parsed fields
parsed_data = {}

# Define a regex pattern to find key-value pairs in the message
pattern = re.compile(r'^([A-Za-z\-]+):\s*(.+)', re.MULTILINE)


In [559]:
# Find all matches and add them to the parsed_data dictionary
for match in pattern.finditer(message):
    key = match.group(1).strip()  # Extract the key
    value = match.group(2).strip()  # Extract the value

    # Add the key-value pair to the dictionary
    if key in parsed_data:
        # If the key already exists, convert it to a list if not already
        if not isinstance(parsed_data[key], list):
            parsed_data[key] = [parsed_data[key]]
        parsed_data[key].append(value)  # Append the new value
    else:
        parsed_data[key] = value  # Add new key-value pair


In [561]:
# Include other fields from the main document if necessary
parsed_data['_id'] = data.get('_id', "")
parsed_data['file'] = data.get('file', "")

# Find the index of the first blank line to determine where the body starts
body_start_index = message.find('\n\n')  # Look for the first occurrence of double newline

if body_start_index != -1:
    # Extract the body of the email, starting after the blank line
    body = message[body_start_index + 2:].strip()  # Skip the blank line
    parsed_data['Unstructured-Text'] = body  # Add the body as unstructured text
else:
    # If no blank line is found, set unstructured text as empty
    parsed_data['Unstructured-Text'] = ""


In [563]:
# Print the parsed data as a dictionary
print(parsed_data)


{'Message-ID': '<30922949.1075863688243.JavaMail.evans@thyme>', 'Date': 'Thu, 31 Aug 2000 05:07:00 -0700 (PDT)', 'From': 'phillip.allen@enron.com', 'To': 'greg.piper@enron.com', 'Subject': 'Re: Hello', 'Mime-Version': '1.0', 'Content-Type': 'text/plain; charset=us-ascii', 'Content-Transfer-Encoding': '7bit', 'X-From': 'Phillip K Allen', 'X-To': 'Greg Piper', 'X-cc': 'X-bcc:', 'X-Folder': "\\Phillip_Allen_Dec2000\\Notes Folders\\'sent mail", 'X-Origin': 'Allen-P', 'X-FileName': 'pallen.nsf', '_id': '6708a9fc2a972ccfae483e79', 'file': 'allen-p/_sent_mail/1001.', 'Unstructured-Text': "Let's shoot for Tuesday at 11:45."}


In [565]:
result_string = (
    f"\nFrom: \t {parsed_data['From']}"
    f"\nSubject: \t {parsed_data['Subject']}"
    f"\nDate: \t {parsed_data['Date']}"
    f"\nBody: \t {parsed_data['Unstructured-Text']}"
)

In [567]:
print(result_string)


From: 	 phillip.allen@enron.com
Subject: 	 Re: Hello
Date: 	 Thu, 31 Aug 2000 05:07:00 -0700 (PDT)
Body: 	 Let's shoot for Tuesday at 11:45.
