In [None]:
import re
import pandas as pd
import os
from datetime import datetime

def parse_date(date_string):
    """
    Parses a date string with various formats.

    Args:
        date_string (str): The date string to parse.

    Returns:
        datetime.date or None: The parsed date, or None if parsing fails.
    """
    date_formats = [
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%m/%d/%Y",
        "%d/%m/%Y",
        "%d %b %Y",
        "%d %B %Y",
        "%dth %b %Y",
        "%dth %B %Y",
        "%d %b, %Y",
        "%d %B, %Y",
        "%dth %b, %Y",
        "%dth %B, %Y",
        "%b %d, %YYYY",
        "%B %d, %Y",
        "%b %d %Y",
        "%B %d %Y",
    ]

    for fmt in date_formats:
        try:
            return datetime.strptime(date_string, fmt).date()
        except ValueError:
            pass  # Try the next format

    return None

def extract_purchase_info(email_content):
    """
    Extracts purchase order information from email content, handling variations.

    Args:
        email_content (str): The content of the email.

    Returns:
        list: A list of dictionaries, where each dictionary represents a purchase order item.
    """

    items = []
    lines = email_content.split('\n')
    delivery_date = None

    # Extract delivery date if present
    for line in lines:
        date_match = re.search(r'(?:delivery date|delivery|date required|required date|ship date|expected delivery|arrive by|by)\s*(?:before|on|:)?\s*([a-zA-Z0-9\s,-]+)', line, re.IGNORECASE)
        if date_match:
            print(date_match.group())
            delivery_date = parse_date(date_match.group(1).strip())
            break

    for line in lines:
        # Refined regex to handle variations in item code formatting
        match = re.search(r'^\s*-?\s*(\d+)\s+units?\s+of\s+([\w\s-]+?)(?:\s*\(?Item Code:?\s*([-\w\d]+)\)?)?$', line, re.IGNORECASE)
        if match:
            units, product_name, item_code = match.groups()
            items.append({
                'units': int(units),
                'product_name': product_name.strip(),
                'item_code': item_code.strip() if item_code else None,  # Handle missing item codes
                'delivery_date': delivery_date
            })
        else:
            match = re.search(r'^(\d+)\s+([\w\s-]+?)(?:,\s*Item Code:?\s*([-\w\d]+))?$', line, re.IGNORECASE)
            if match:
                units, product_name, item_code = match.groups()
                items.append({
                    'units': int(units),
                    'product_name': product_name.strip(),
                    'item_code': item_code.strip() if item_code else None,
                    'delivery_date': delivery_date
                })

        #Handles erroneous duplicate orders.
        match_duplicate = re.search(r'^\s*-?\s*(\d+)\s+\d+\s+units?\s+of\s+([\w\s-]+?)(?:\s*\(?Item Code:?\s*([-\w\d]+)\)?)?$', line, re.IGNORECASE)
        if match_duplicate:
            units, product_name, item_code = match_duplicate.groups()
            items.append({
                'units': int(units),
                'product_name': product_name.strip(),
                'item_code': item_code.strip() if item_code else None,
                'delivery_date': delivery_date
            })

    return items

def process_emails_from_directory(directory):
    """
    Processes all email files in the given directory and creates a DataFrame.

    Args:
        directory (str): The path to the directory containing email files.

    Returns:
        pandas.DataFrame: A DataFrame containing purchase order information.
    """

    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt") and "_PO_" in filename:
            file_path = os.path.join(directory, filename)
            try:
                with open(file_path, 'r') as file:
                    email_content = file.read()

                vendor_name = filename.split('_PO_')[0]
                po_number = filename.split('_PO_')[1].split('.')[0]

                items = extract_purchase_info(email_content)
                for item in items:
                    item['vendor_name'] = vendor_name
                    item['po_number'] = po_number
                    data.append(item)
            except FileNotFoundError:
                print(f"File not found: {file_path}")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    return pd.DataFrame(data)

# Example Usage (replace 'your_directory' with the actual directory path)
directory = 'mail/'  # Current directory, change if needed
df = process_emails_from_directory(directory)
df

by February 09, 2025 is required
by February 03, 2025
by February 20, 2025 is required
by January 29, 2025
February 18, 2025
by February 12, 2025 is required
February 09, 2025
January 31, 2025
by February 20, 2025 is required
by February 20, 2025
February 09, 2025
February 09, 2025
by February 03, 2025
by February 18, 2025 is required
by January 26, 2025 is required
by January 26, 2025 is required
February 06, 2025
February 05, 2025
by February 04, 2025
by January 25, 2025
by February 04, 2025
by January 26, 2025
by January 26, 2025
February 02, 2025
February 11, 2025
February 03, 2025
by February 17, 2025 is required
by February 17, 2025 is required
by February 12, 2025
by February 12, 2025


Unnamed: 0,units,product_name,item_code,delivery_date,vendor_name,po_number
0,197,Pneumatic Cylinder,MA-2200,,Allen_Consulting,12
1,322,Compressor Unit,MA-2200,,Allen_Consulting,12
2,298,Hydraulic Pump,CU-5643,,Allen_Consulting,12
3,321,Hydraulic Pump,MA-2200,,Allen_Consulting,27
4,270,Pneumatic Cylinder,PC-1122,,Anderson_Industries,09
...,...,...,...,...,...,...
111,30,Gear Reducer,CB-3300,,Young_Group,44
112,419,Hydraulic Pump,MA-2200,,Young_Group,44
113,484,Motor Assembly,BS-7890,,Young_Group,44
114,470,Conveyor Belt,CU-5643,,Young_Group,57


In [22]:
df[df.delivery_date.notnull()]

Unnamed: 0,units,product_name,item_code,delivery_date,vendor_name,po_number


In [16]:
df[df.vendor_name == 'Brown_Inc']

Unnamed: 0,units,product_name,item_code,delivery_date,vendor_name,po_number
10,299,Heat Exchanger,CV-6677,,Brown_Inc,11
11,167,Heat Exchanger,PC-1122,,Brown_Inc,11
12,230,Motor Assembly,HP-1001,,Brown_Inc,11
13,299,Heat Exchanger,CV-6677,,Brown_Inc,13
14,167,Heat Exchanger,PC-1122,,Brown_Inc,13
15,230,Motor Assembly,HP-1001,,Brown_Inc,13
16,264,Motor Assembly,CV-6677,,Brown_Inc,50
17,355,Bearing Set,EP-9012,,Brown_Inc,55
18,135,Bearing Set,CV-6677,,Brown_Inc,58
19,304,Bearing Set,HE-2345,,Brown_Inc,58
