## Extract Data from Emails
Extract information from emails using regex or other applicable methods.

### A. Win32
Extract directly from Outlook.

#### 1. Import Libraries

In [145]:
import pandas as pd
import numpy as np
import os
import shutil
import win32com.client
import re
import time
import dateutil.parser
from dateutil import parser
from dateutil import relativedelta
from datetime import datetime
from pandas import ExcelWriter
from pandas import ExcelFile

#### 2. Initiate

In [2]:
# Open a MAPI session against the local Outlook profile and collect the
# items held in default folder 6 (the main inbox).
outlook_application = win32com.client.Dispatch("Outlook.Application")
outlook = outlook_application.GetNamespace("MAPI")
inbox = outlook.GetDefaultFolder(6)
messages = inbox.Items

# Accumulators filled by the message loop: a list of submission records and
# one (initially empty) dataframe per report type.
submission = list()
df_preval, df_error, df_cert = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

#### 3. Loop and Set
Loop through and identify D2-related messages and set parameters and values for:
- File submissions
- Pre-validation errors
- D2 error reports
- D2 certifications

In [3]:
# Sender display names and subject lines used to recognise D2-related messages.
BI_TEAM_NAME = 'OCFO - FMMI BI TEAM'
BI_TEAM_SENDERS = (BI_TEAM_NAME, 'FMMIBITEAM@cfo.usda.gov')
# Single definition of the forwarder's display name so that every check agrees
# (the certification check previously carried a double space and never matched).
FORWARDER_NAME = 'Doe, John - OCFO, Washington, DC'
SUBMISSION_SUBJECT = 'Data Act File Submission'
PREVAL_SUBJECT = 'DATA ACT D2 FILE SUBMISSION - PRE-VALIDATION ERROR'
ERROR_REPORT_TITLE = 'Data Act D2 Error Report'
CERT_REPORT_TITLE = 'DATA ACT D2 CERTIFICATION Report'

# Agency code (second underscore-delimited token of the file name) -> agency name.
AGENCY_NAMES = {
    'AM00': 'Agricultural Marketing Service',
    'AO00': 'Office of Advocacy and Outreach',
    'AP00': 'Animal and Plant Health Inspection Service',
    'AP02': 'Federal Shared Service Provider',
    'AR00': 'Agricultural Research Service',
    'EC00': 'Office of the Chief Economist',
    'ER00': 'Economic Research Service',
    'FA00': 'Farm Service Agency',
    'FI00': 'Food Safety and Inspection Service',
    'FN00': 'Food and Nutrition Service',
    'FS00': 'Forest Service',
    'FX00': 'Foreign Agricultural Service',
    'NI00': 'National Institute of Food and Agriculture',
    # NOTE(review): trailing space kept from the original value - confirm whether it is wanted.
    'NR00': 'Natural Resources Conservation Service ',
    'RD00': 'Rural Development',
    'RM00': 'Risk Management Agency',
}


def _save_first_attachment(message):
    """Save the first attachment of ``message`` into the current directory.

    Returns a ``(file name, full path)`` tuple for the saved file.
    """
    attachment = message.Attachments.Item(1)
    attachment_name = attachment.FileName
    attachment_path = os.path.join(os.getcwd(), attachment_name)
    attachment.SaveAsFile(attachment_path)
    return attachment_name, attachment_path


def _is_report_message(message, title):
    """Tell whether ``message`` is a report with ``title`` in its subject.

    A message qualifies when it comes straight from the BI team, or when it was
    forwarded ('FW: ' prefix) by the forwarder and its body holds exactly one
    'From:' line, which filters out replies to forwarded messages.
    """
    subject = message.Subject
    if title in subject and message.SenderName == BI_TEAM_NAME:
        return True
    return ('FW: ' + title in subject
            and message.SenderName == FORWARDER_NAME
            and len(re.findall(r'From:', message.Body)) == 1)


def _source_file_counts(message, sheet, count_column, period_column):
    """Build a per-file record count dataframe from a report email.

    The reporting period is read from the text following 'period ' in the body;
    the first attachment is read from worksheet ``sheet`` and its
    'Source File Name' column is tallied. The temporary copy of the attachment
    is removed even when reading it fails.
    """
    period_text = re.search(r'period (.*?)\.', message.Body).group(1)
    period = datetime.strptime(period_text.replace('-', ''), '%Y%m%d%H%M%S')

    _, workbook_path = _save_first_attachment(message)
    try:
        # 'sheet_name' replaces the 'sheetname' keyword dropped by pandas.
        workbook = pd.read_excel(workbook_path, sheet_name=sheet)
    finally:
        os.remove(workbook_path)

    counts = workbook.loc[:, 'Source File Name'].value_counts()
    return pd.DataFrame({
        'File Name': counts.index.tolist(),
        count_column: counts.values.tolist(),
        # Same reporting period for every file listed in this report.
        period_column: [period] * len(counts),
    })


# Loop through each message and filter accordingly.
for m in messages:
    # Extract the creation date, round-tripped through text to second precision.
    # NOTE: the name 'date' is also read by the folder clean-up cell, so it is kept.
    date = m.CreationTime.strftime('%Y-%m-%d %H:%M:%S')
    date = dateutil.parser.parse(date)
    subject = m.Subject

    # Retrieve file submission messages.
    if subject == SUBMISSION_SUBJECT and m.SenderName in BI_TEAM_SENDERS:
        submission_file_name = re.search(r'file (.*?) has', m.Body).group(1)
        agency_code = ''
        agency = ''

        # Only D2_/AC_ files carry an agency code. Look for it in the file name
        # itself (not the whole body) and tolerate names without a code.
        if ('D2_' in submission_file_name) or ('AC_' in submission_file_name):
            code_match = re.search(r'_(.*?)_', submission_file_name)
            if code_match is not None:
                agency_code = code_match.group(1)
                agency = AGENCY_NAMES.get(agency_code, '')

        # Append the record to the submission list.
        submission.append({'File Submission Notification Date': date, 'File Name': submission_file_name, 'Agency Code': agency_code, 'Agency Name': agency})

    # Retrieve pre-validation error messages (received directly or forwarded).
    direct_preval = subject == PREVAL_SUBJECT and m.SenderName == 'BEREMOTE'
    forwarded_preval = subject == 'FW: ' + PREVAL_SUBJECT and m.SenderName == FORWARDER_NAME
    if direct_preval or forwarded_preval:
        if direct_preval:
            date_preval = date
        else:
            # Extract the sent date from the body of the forwarded email.
            date_preval_text = re.search(r'day, (.*?)\n', m.Body).group(1)
            # Split at "(" for date formats containing (UTC)...
            date_preval_text = date_preval_text.split(' (')[0]
            date_preval = parser.parse(date_preval_text)

        # Save the attachment, read it as utf8 to prevent encoding errors,
        # and always delete the temporary copy afterwards.
        preval_attachment, preval_path = _save_first_attachment(m)
        try:
            with open(preval_path, 'rt', encoding='utf8') as txt_file:
                preval_content = txt_file.read()
        finally:
            os.remove(preval_path)

        # The file name is the attachment name minus its last 10 characters.
        preval_file_name = preval_attachment[:-10]
        # Count instances of 'Error on line:' in the text file.
        preval_count = len(re.findall(r'Error on line:', preval_content, re.IGNORECASE))

        # index=[0] prevents the "scalar values" error for a one-row frame.
        preval_report = pd.DataFrame({'File Name': preval_file_name, 'Pre-Validation Error Count': preval_count, 'Pre-Validation Notification Date': date_preval}, index=[0])
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df_preval = pd.concat([df_preval, preval_report])

    # Retrieve D2 error report messages.
    if _is_report_message(m, ERROR_REPORT_TITLE):
        error_report = _source_file_counts(m, 'D2_Error_Records', 'D2 Error Count', 'D2 Error Reporting Period')
        df_error = pd.concat([df_error, error_report])

    # Retrieve D2 certification messages.
    if _is_report_message(m, CERT_REPORT_TITLE):
        cert_report = _source_file_counts(m, 'DATA ACT D2 CERTIFICATION REPOR', 'D2 Certification Count', 'D2 Cert Reporting Period')
        df_cert = pd.concat([df_cert, cert_report])

#### 4. Dataframe

In [4]:
# Expected columns of each dataframe produced by the message loop.
SUBMISSION_COLUMNS = ['File Submission Notification Date', 'Agency Code', 'Agency Name', 'File Name']
PREVAL_COLUMNS = ['File Name', 'Pre-Validation Error Count', 'Pre-Validation Notification Date']
ERROR_COLUMNS = ['File Name', 'D2 Error Count', 'D2 Error Reporting Period']
CERT_COLUMNS = ['File Name', 'D2 Certification Count', 'D2 Cert Reporting Period']


def _ensure_columns(frame, columns):
    """Return a copy of ``frame`` in which every name in ``columns`` exists.

    When the inbox held no message of a given type its dataframe has no columns
    at all, and selecting, sorting or merging on them would raise KeyError.
    Missing columns are added empty (object dtype); existing ones are untouched.
    """
    frame = frame.copy()
    for name in columns:
        if name not in frame.columns:
            frame[name] = pd.Series(dtype=object)
    return frame


def _latest_per_file(frame, date_column):
    """Sort by ``date_column`` (newest first) and keep the first row per file name."""
    ordered = frame.sort_values(by=[date_column], ascending=False)
    return ordered.drop_duplicates(subset=['File Name'], keep='first')


# Convert submission list to dataframe and make sure all frames are well-formed.
df_submission = _ensure_columns(pd.DataFrame(submission), SUBMISSION_COLUMNS)
df_preval = _ensure_columns(df_preval, PREVAL_COLUMNS)
df_error = _ensure_columns(df_error, ERROR_COLUMNS)
df_cert = _ensure_columns(df_cert, CERT_COLUMNS)

# Rearrange columns.
df_submission = df_submission[SUBMISSION_COLUMNS]
df_cert = df_cert[CERT_COLUMNS]

# Sort by descending date, then drop duplicates (based on filename) keeping the
# first instance, i.e. the most recent one. Indices are reset before
# de-duplication for every frame except the submissions, as before.
df_submission = _latest_per_file(df_submission, 'File Submission Notification Date')
df_preval = _latest_per_file(df_preval.reset_index(drop=True), 'Pre-Validation Notification Date')
df_error = _latest_per_file(df_error.reset_index(drop=True), 'D2 Error Reporting Period')
df_cert = _latest_per_file(df_cert.reset_index(drop=True), 'D2 Cert Reporting Period')

# Merge dataframes on filename; blanks replace missing values after each step.
df_combined1 = pd.merge(df_submission, df_preval, how='outer', on='File Name').fillna('')
df_combined2 = pd.merge(df_combined1, df_error, how='outer', on='File Name').fillna('')
df_combined_final = pd.merge(df_combined2, df_cert, how='outer', on='File Name').fillna('')

#### 5. CSV File

In [6]:
# Target directory for the tracking report (network share).
dest_path = r'\\wdcnetapp01\\CFOData$\\Data\\TARD\\DATA Act\\DATA Act D2 Error-Certs Files\\'
# Today's date (month_day_year) is embedded in the report's file name.
timestr = time.strftime('%m_%d_%Y')
file_name = ''.join(['D2_Files_Tracking_Report_', timestr, '.csv'])
# Write the combined dataframe as csv, without the index column.
report_path = dest_path + file_name
df_combined_final.to_csv(report_path, index=False)

#### 6. Folder Clean Up

In [159]:
# Delete report files in the destination folder that are older than three months.
# Use datetime.today() rather than the 'date' variable left over from the
# message loop, which is undefined when the inbox is empty.
today = datetime.today()
for entry in os.listdir(dest_path):
    # Extract the date stamp from the file's name; skip files that do not
    # follow the 'Report_<stamp>.csv' naming scheme.
    stamp_match = re.search(r'Report_(.*?)\.csv', entry)
    if stamp_match is None:
        continue

    # Convert the stamp to a datetime; skip files whose stamp is not a date.
    try:
        date_file = datetime.strptime(stamp_match.group(1), '%m_%d_%Y')
    except ValueError:
        continue

    # Age of the file. relativedelta reports years separately from months
    # (months is always 0-11), so fold the years in before comparing.
    age = relativedelta.relativedelta(today, date_file)
    months_old = age.years * 12 + age.months

    # Older than three months: more than 3 whole months, or 3 months plus at least one day.
    if months_old > 3 or (months_old == 3 and age.days > 0):
        os.unlink(os.path.join(dest_path, entry))

#### 7. Notes

In [7]:
# Reference note only: the cell is a bare string literal, so none of the text
# inside it is executed. The copyfile line uses placeholder wording, not real names.
'''
# To create a copy of the file in a different directory:
shutil.copyfile(source path + file name, destination path + file name)
'''

In [8]:
# Reference note only: the cell is a bare string literal, so none of the text
# inside it is executed.
# NOTE(review): recent pandas releases may no longer provide ExcelWriter.save();
# confirm against the installed version before reusing this snippet.
'''
# To create an Excel file:
writer = pd.ExcelWriter('file name.xlsx', engine='xlsxwriter')
df_combined_final.to_excel(writer, 'Sheet1', index=False)
writer.save()
'''

To run this code on a schedule, export the notebook as a `.py` script and run that script with Windows Task Scheduler.

### B. ExtractMsg

Extract information from emails saved to a drive/folder. See https://github.com/mattgwwalker/msg-extractor.

In [9]:
# Reference note only: the cell is a bare string literal, so none of the text
# inside it is executed. It sketches reading a saved .msg file and printing its
# sender, date and body.
'''
import ExtractMsg
msg = ExtractMsg.Message('message_subject.msg')
print(msg.sender)
print(msg.date)
print(msg.body)
'''