#### <font color="#000086"> This script predicts the <b>topic category</b> and <b>sentiment</b> in <font color="red">real time</font> for the latest email message received in S3. 
#### <font color="#000086"> The script uses the trained custom classification model <i>TopicClassifierSept7</i> for the predictions. 
#### <font color="#000086"> Following are the steps involved in detail: 
##### <font color="#000086"> &emsp;Step 1: Retrieve the latest message posted in S3 in "pro-rds-emails" bucket under Inquiry/ THOR/ or thor/ folders. 
##### <font color="#000086"> &emsp;Step 2: Extract body, subject and fromEmailId and strip the html tags off the body. 
##### <font color="#000086"> &emsp;Step 3: Find out if the email is one of the types: (a) Out Of Office (OOO), (b) Undeliverable, (c) Volunteer messages. <br> &emsp;&emsp;&emsp;&emsp;&nbsp; If so, update the email_type as 'OOO', 'VOLUNTEER' or 'UNDELIVERABLE', and email_status as 'P' in the PostgreSQL database table and <br> &emsp;&emsp;&emsp;&emsp;&nbsp; exit the loop. 
##### <font color="#000086"> &emsp;Step 4: Run cleanup functions to remove names, signatures, organizations, locations, phone numbers, URLs, punctuation, salutations, etc. 

In [None]:
!pip install spacy

In [None]:
!pip install pyap

In [None]:
!pip install psycopg2-binary

#### <b><font color=blue>Customization steps for pyap</font></b>  
1)  Login into Terminal  
2)  Install vim
    
    a)  apt-get update 
    
    b)  apt-get install vim  
3)  vi data.py in /opt/conda/lib/<b>python3.10</b>/site-packages/pyap/source_US  
4)  Delete all contents (`:1,$d`) and read in the replacement file (`:r /root/scripts/data.py`), then save and quit (`:wq`)
    

!python -m spacy download en_core_web_trf

In [None]:
import re
import boto3
import pyap
import spacy
import json
from bs4 import BeautifulSoup as bs
import psycopg2

In [None]:
# Load the spaCy pipeline once for the whole notebook; the listed pipeline
# components are disabled at load time. final_cleanup() only reads the
# entities (doc.ents with PERSON / GPE labels) produced by this pipeline.
NER = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

#### Define Regular Expressions

In [None]:
#----------------------------------- Define Regular Expressions -----------------------------------#

# Matches runs of non-ASCII characters (anything outside \x00-\x7F). Despite the
# name, ASCII control characters are NOT matched by this pattern.
REMOVE_CTRL_CHAR = re.compile('[^\x00-\x7F]+')

# Matches URLs that start with http://, https://, ftp:// or "www.": optional
# user-info, then either a public IPv4 address (private/loopback/link-local
# ranges are excluded by the negative look-aheads), a dotted host name with a
# TLD of at least two letters, or "localhost"; followed by an optional port
# and path.
#
# The host-name character classes use the regex escape \u00a1-\uffff so that
# non-ASCII (internationalised) host names are accepted. These are raw strings,
# so the escape must be written with a single backslash: a doubled backslash
# would be read by the regex engine as a literal "\" followed by an accidental
# "1-\" character range.
URL_REGEX = re.compile(
    r"(?:^|(?<![\w\/\.]))"
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r"|" r"(?:(localhost))" r")"
    r"(?::\d{2,5})?"
    r"(?:\/[^\)\]\}\s]*)?",
    flags=re.UNICODE | re.IGNORECASE | re.MULTILINE
)

# Stricter alternative kept for reference (currently unused):
# email_pattern = re.compile(r"([a-z0-9!#$%&'*+\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)", flags=re.UNICODE | re.IGNORECASE | re.MULTILINE)

# Loose e-mail address matcher: word chars, dots or hyphens around "@", ending
# in a dot followed by word characters.
email_pattern = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+", flags=re.UNICODE | re.IGNORECASE | re.MULTILINE)

# "suite <1-5 digits>" at the end of a line (MULTILINE makes "$" match per line).
suite_pattern = re.compile(r"(?:suite\s*\d{1,5}$)", flags=re.UNICODE | re.IGNORECASE | re.MULTILINE)


#### Define Match Words

In [None]:
#----------------------------------- Define Match Words -------------------------------------#

# Substrings that mark an out-of-office auto reply. The main routine tests them
# with a case-sensitive `in` against the message text (only the subject part of
# that text is lower-cased there).
ooo_matches = ("will be on pto", "m out of office", "m out of the office", "ll be out of office", "ll be out of the office", 
               "m currently out of office", "m currently out of the office")

# Substrings that mark a bounce / undeliverable notification (case-sensitive `in`).
undelivered_matches = ("was undeliverable.", "Subject: Undeliverable:")

# Substrings that mark volunteer-recruitment mailings (case-sensitive `in`).
volunteer_matches = ("volunteers needed! ", "rds listening session – ira", "RDS listening session – ira ")

# Prefixes of a lower-cased line that are treated as an opening salutation.
sal_st_matches = ("rds:", "hello", "good ", "dear", "hi", "to whom it may concern")

# Prefixes of a lower-cased line that are treated as a closing; a short line
# starting with one of these ends the message in both find_* functions.
sal_end_matches = ("sincerely,", "thanks", "thank you", "thanks in advance", "regards", "best", "kind regards", 
                   "kindest regards,", "kindly,", "blessings,", "sent from ")

# Suffixes of a lower-cased line that suggest a street-address line.
addr_end_matches = (" st.", " street", " road", " plaza", " place", " dr.", " dr", " blvd", " blvd.", " floor", " lane", " ave", 
                    " avenue", " ct.", " parkway", " highway", " boulevard", " park")

# Suffixes of a short lower-cased line that suggest a signature/contact line.
# NOTE: despite the name these are tested with str.endswith, not startswith.
sig_start_matches = ("office", "ph:", "fax", "mobile", "work:")

# Case-sensitive line prefixes skipped by find_supp_ref_eom().
suppref_matches  = ("PersonEmailAddress:", "PersonName:", "PersonUserName:", "PersonID:", "PlanSponsorID:", "ReferenceNumber:", 
                     "ApplicationID:", "Priority:", "URL:", "SLATime:")

# Prefixes of a lower-cased line that mark a P.O. box line (line is skipped).
po_box_matches = ("p.o. box", "p.o box", "po box")

# Prefixes of a lower-cased line that cause the whole line to be skipped.
ignore_matches = ("name: ", "your name", "application number:", "email:", "url:", "sent:", "to:", "cc:", "phone", 
                  "m ", "organization name: ", "plan sponsor id: ", "applicationid: ", "plan year start date:", 
                  "plan year end date:", "please note: this email", "please refer to the rds user", 
                  "if you need further assistance with this", "important: when a user enters login")

# NOTE(review): not referenced anywhere in the visible part of this notebook.
matches = ["original message", "delete this line", "external sender", "external mail", "content is safe", "unless you recognize"]

# Substrings replaced with a single space in the lower-cased text by final_cleanup().
replace_list = [ "cms' rds center", " csd ", " re ", " fw ", "ph: ", "fax", "(mobile)", "(work)", "(office)", "(fax)", "samrat", 
                " rick", "leidos", " est", " 's", " the ", "american bureau of shipping", "microsoft", "mozilla", "apple",
                "nokia of america", "nokia", "chubb ina holdings inc"]

# NOTE(review): not referenced anywhere in the visible part of this notebook.
nonEnglish_matches = ("avis :", "title avis :", "anmerkung:", "upozornení:") 

# Field labels removed from support-reference messages (by find_supp_ref_eom()
# and, for email_type "SUPP_REF", again by final_cleanup()).
suppref_replace_list = [ "QuestionSubCategoryDesc:", "TopicCategoryDesc:", "UserQuestionText:" ]


#### Define Non Breaking Elements for bs4

In [None]:
# Tags that the main routine removes from the parsed HTML together with their
# contents (via element.extract()) before the text is read.
strip_tags = ['style', 'script', 'code']
# Tag names that do NOT get a newline appended by the main routine; every
# other element has '\n' appended before get_text() is called, so its text
# ends with a line break in the extracted body.
NON_BREAKING_ELEMENTS = ['a', 'abbr', 'acronym', 'audio', 'b', 'bdi', 'bdo', 'big', 'button',
    'canvas', 'cite', 'code', 'data', 'datalist', 'del', 'dfn', 'em', 'embed', 'i', 'iframe',
    'img', 'input', 'ins', 'kbd', 'label', 'map', 'mark', 'meter', 'noscript', 'object', 'output',
    'picture', 'progress', 'q', 'ruby', 's', 'samp', 'script', 'select', 'slot', 'small', 'span',
    'strong', 'sub', 'sup', 'svg', 'template', 'textarea', 'time', 'u', 'tt', 'var', 'video', 'wbr']

#### PostgreSQL functions

In [None]:
#------------------------------ Function to print errors from PostgreSQL ------------------------------#

def print_psycopg2_exception(err):
    """Print diagnostic details for an exception raised by a database call.

    Args:
        err: The caught exception. psycopg2 errors carry ``diag``, ``pgerror``
            and ``pgcode``; for any other exception type those fields are
            printed as ``None`` instead of raising ``AttributeError``.

    Returns:
        None. After printing, waits for Enter when an interactive stdin is
        available so the operator can read the message.
    """
    # Use the traceback attached to the exception itself; this avoids relying
    # on the `sys` module, which this notebook never imports.
    tb = err.__traceback__
    line_num = tb.tb_lineno if tb is not None else None
    print ("\npsycopg2 ERROR:", err, "on line number:", line_num)
    print ("psycopg2 traceback:", tb, "-- type:", type(err))
    print ("\nextensions.Diagnostics:", getattr(err, "diag", None))
    print ("pgerror:", getattr(err, "pgerror", None))
    print ("pgcode:", getattr(err, "pgcode", None), "\n")
    try:
        input("Press Enter to continue...")
    except (EOFError, OSError):
        # No usable stdin (batch / scheduled run): the pause is only a
        # convenience, so carry on instead of failing the error handler.
        pass
    

In [None]:
#------------------------------ Update PostgreSQL for the email_key -------------------------------------#

def upd_postgres_record(email_key, extract_status, ignore_status, response_status, email_type):
    """Update the ml_extract_track row for one email that is still in status 'E'.

    Uses the module-level ``cursor`` / ``conn`` objects. Values are passed as
    query parameters rather than concatenated into the SQL text, so quotes or
    other special characters in a value cannot break or alter the statement.

    Args:
        email_key: Value matched against the ``email_key`` column.
        extract_status: New ``extract_status`` value.
        ignore_status: New ``ignore_status`` value.
        response_status: New ``response_status`` value.
        email_type: New ``email_type`` value.

    Returns:
        None. On success the transaction is committed and the affected row
        count is printed; on failure the transaction is rolled back and the
        error is reported through print_psycopg2_exception().
    """
    upd_query = (
        "UPDATE ml_extract_track SET "
        "extract_status = %s, email_type = %s, "
        "ignore_status = %s, response_status = %s, "
        "last_update_date = now() WHERE email_key = %s AND extract_status = 'E';"
    )
    params = (extract_status, email_type, ignore_status, response_status, email_key)

    try:
        cursor.execute(upd_query, params)
        conn.commit()
        count = cursor.rowcount
        print("   ", count, " record updated successfully\n")

    except Exception as err:
        # Roll back first so the connection is usable again even if the
        # reporting step below fails or blocks waiting for input.
        conn.rollback()
        print_psycopg2_exception(err)
        

#### <font color=brown>Support Reference Email Cleanup function

In [None]:
def find_supp_ref_eom(trimmedTxt):
    """Keep the user-written lines of a support-reference email.

    The field labels in ``suppref_replace_list`` are removed from the text,
    then each line is examined in order. Lines starting with a
    ``suppref_matches`` label, salutation lines, address / signature / P.O. box
    lines and ``ignore_matches`` lines are dropped. Processing stops at the
    first short closing line (``sal_end_matches``) after line 2.

    Args:
        trimmedTxt: Message text, one logical line per ``\\n``.

    Returns:
        tuple: ``(finalBody, sal_end_found)`` where ``finalBody`` is the kept
        text (each kept line terminated by ``\\n``) and ``sal_end_found`` is
        True when a closing line ended the scan.
    """
    kept = []
    sal_st_found = False
    sal_end_found = False

    for w in suppref_replace_list:
        trimmedTxt = trimmedTxt.replace(w, "")

    # Line numbers count every physical line, including skipped ones.
    for lno, line in enumerate(trimmedTxt.splitlines(), start=1):
        if line.startswith(suppref_matches):
            continue

        lineLower = line.lower()
        word_cnt = len(line.split())

        if not sal_st_found:
            # A very short line ending in punctuation is taken as a salutation.
            if line.endswith((",", ":", "-", "!")) and word_cnt < 4:
                sal_st_found = True
                continue

            if lineLower.startswith(sal_st_matches):
                sal_st_found = True
                if word_cnt < 6:
                    continue
                # Longer line: keep only what follows the first comma, colon
                # or hyphen (looked up in that order).
                comma_loc = lineLower.find(",")
                if comma_loc == -1:
                    comma_loc = lineLower.find(":")
                if comma_loc == -1:
                    comma_loc = lineLower.find("-")
                if comma_loc != -1:
                    # Terminate with a newline so the remainder is not fused
                    # with the first word of the following line.
                    kept.append(line[comma_loc + 1:] + "\n")
                    continue

        # "Sincerely", "Thanks" etc. on a short line ends the message.
        if lineLower.startswith(sal_end_matches) and word_cnt < 6 and lno > 2:
            sal_end_found = True
            break

        if lineLower.endswith(addr_end_matches) and word_cnt < 8:
            continue

        if lineLower.endswith(sig_start_matches) and word_cnt < 5:
            continue

        if lineLower.startswith(po_box_matches):
            continue

        if word_cnt < 15 and ("p.o. box" in lineLower or "p.o box" in lineLower or "po box" in lineLower):
            continue

        if lineLower.startswith(ignore_matches):
            continue

        kept.append(line + "\n")

    return "".join(kept), sal_end_found

#### <font color=brown>Find end of message for all types of emails

In [None]:
#------------------------------- Find End Of Message for all types email messages ------------------------------#

def find_end_of_message(omhStripped):
    """Keep the sender's own text from an email and classify reply/forward.

    The first line is treated as the subject line: a ``re: `` subject that
    contains a digit marks a RESPONSE, a ``fw: `` subject marks a FORWARD. The
    remaining lines are scanned in order; salutations, address / signature /
    P.O. box lines and ``ignore_matches`` lines are dropped, and the scan
    stops at a closing line, an address line, a quoted-message ``from:``
    header or a long dashed separator.

    Args:
        omhStripped: Message text whose first line is the subject line, one
            logical line per ``\\n``.

    Returns:
        tuple: ``(finalBody, email_type, response_status, sal_end_found)``.
        ``email_type`` is ``"RESPONSE"``, ``"FORWARD"`` or ``""``;
        ``response_status`` is ``'R'``, ``'F'`` or ``""``; ``sal_end_found``
        is True when a closing line ended the scan.
    """
    kept = []
    email_type = ""
    response_status = ""
    sal_st_found = False
    sal_end_found = False
    fw_pattern = False
    from_count = 0  # number of "from:" headers seen in a forwarded message

    for lno, line in enumerate(omhStripped.splitlines(), start=1):
        if lno == 1:
            if line.startswith("re: ") and re.search(r'\d', line):
                email_type = "RESPONSE"
                response_status = 'R'
                # Keep what follows the last "re: " / "fw: " prefix.
                loc = line.rfind("re: ")
                locf = line.rfind("fw: ")
                kept.append(line[max(loc, locf) + 3:] + "\n")
                continue

            if line.startswith("fw: "):
                # Forwarded mail: the first "from:" header is skipped, the
                # second one ends the message (see below).
                fw_pattern = True
                email_type = "FORWARD"
                response_status = 'F'

            if line.startswith("=?utf-8?"):
                continue

            kept.append(line + ". ")
            continue

        lineLower = line.lower()
        word_cnt = len(line.split())

        # Looks for salutations like "hello", "good ", "dear " or "hi" on
        # lines 2 and 3 only.
        if lno < 4 and not sal_st_found:
            if line.endswith((",", ":", "-", "!")) and word_cnt < 4:
                sal_st_found = True
                continue
            if lineLower.startswith(sal_st_matches):
                sal_st_found = True
                if word_cnt < 6:
                    continue
                # Longer line: keep only what follows the first comma, colon,
                # hyphen or exclamation mark (looked up in that order).
                comma_loc = lineLower.find(",")
                if comma_loc == -1:
                    comma_loc = lineLower.find(":")
                if comma_loc == -1:
                    comma_loc = lineLower.find("-")
                if comma_loc == -1:
                    comma_loc = lineLower.find("!")
                if comma_loc != -1:
                    # Terminate with a newline so the remainder is not fused
                    # with the first word of the following line.
                    kept.append(line[comma_loc + 1:] + "\n")
                    continue

        if lno < 4 and line.endswith((",", ":", "-", "!")) and word_cnt < 3:
            sal_st_found = True
            continue

        # "Sincerely", "Thanks" etc. on a short line ends the message.
        if lineLower.startswith(sal_end_matches) and word_cnt < 6 and lno > 2:
            sal_end_found = True
            break

        # A street-address line ends the message.
        if lineLower.endswith(addr_end_matches) and word_cnt < 8:
            break

        if lineLower.endswith(sig_start_matches) and word_cnt < 5:
            continue

        if lineLower.startswith("from:"):
            if fw_pattern and from_count == 0:
                from_count += 1
                continue
            break

        if lineLower.startswith("on ") and lineLower.endswith("wrote:"):
            continue

        # This line is followed by "--Original Message--"
        if lineLower.startswith("---------------------------------------------"):
            break

        if lineLower.startswith("subject:"):
            # Keep the subject text; newline keeps it separate from the
            # following line.
            kept.append(line[8:] + "\n")
            continue

        if lineLower.startswith(po_box_matches):
            continue

        if word_cnt < 15 and ("p.o. box" in lineLower or "p.o box" in lineLower or "po box" in lineLower):
            continue

        if lineLower.startswith(ignore_matches):
            continue

        if lineLower.startswith("www.") and word_cnt < 3:
            continue

        kept.append(line + "\n")

    return "".join(kept), email_type, response_status, sal_end_found
            

#### <font color=brown>Final Cleanup function

In [None]:
#-------------------------------- Final Cleanup before writing to file ----------------------------------#

def final_cleanup(finalBody, email_type, sal_end_found):
    """Strip identifying details and noise from the extracted message text.

    Steps, in order: remove e-mail addresses; (SUPP_REF only) remove the
    support-reference field labels; remove or cut at PERSON entities; remove
    or cut at US addresses found by pyap (falling back to a "suite <n>"
    line); remove GPE entities; remove URLs and non-ASCII characters;
    lower-case; remove the ``replace_list`` phrases, digits, punctuation
    (apostrophes are kept) and one-character tokens; collapse whitespace.

    Args:
        finalBody: Text produced by find_end_of_message() or
            find_supp_ref_eom().
        email_type: Message type; the value "SUPP_REF" switches name/address
            handling from "cut the text here" to "remove in place".
        sal_end_found: True when the caller already cut the text at a closing
            line; names/addresses are then removed in place instead of being
            used as a cut-off point.

    Returns:
        str: The cleaned, lower-cased, single-space-separated text.
    """

    org_list = ""  # only referenced by the commented-out ORG handling below
    emailStripped = re.sub(email_pattern, ' ', finalBody)
    trimmedTxt = re.sub(r' {2,}' , ' ', emailStripped)
    
    cleanedText = trimmedTxt
    if email_type == "SUPP_REF":
        for w in suppref_replace_list:
            cleanedText = cleanedText.replace(w, " ")
            
    name_found = False
    nameStripped = cleanedText
    # Entities are computed once on cleanedText; the replacements below are
    # applied to nameStripped / addrStripped by entity text, not by offset.
    text1= NER(cleanedText)
    for word in text1.ents:
        if word.label_ == "PERSON":
            if sal_end_found is True or email_type == "SUPP_REF":
                # End of message already known: blank out every occurrence.
                altered_text = nameStripped.replace(word.text, " ")
                nameStripped = altered_text
                continue
            else:
                # A name at the start of a line is treated as the start of a
                # signature when that line is short (< 9 words) or is the
                # last line: everything from there on is dropped.
                loc = nameStripped.find("\n" + word.text)
                if loc != -1:
                    word_cnt = 100 # some big number
                    nloc = nameStripped.find("\n", loc+1)
                    if nloc != -1:
                        line = nameStripped[loc+1:nloc]
                        word_cnt = len(line.split())
                    if nloc == -1 or word_cnt < 9:
                        altered_text = nameStripped[0:loc]
                        nameStripped = altered_text
                        name_found = True
                        continue     
                    # NOTE(review): a name that starts a long line (>= 9 words)
                    # is left in the text untouched - confirm this is intended.
                else:
                    altered_text = nameStripped.replace(word.text, " ")
                    nameStripped = altered_text
                    continue
    
    # Limit the message at the address line
    addrStripped = nameStripped
    addresses = pyap.parse(nameStripped, country='US')
    if addresses:
        # print("found addresses")
        for address in addresses:
            str_addr = str(address)
            if sal_end_found is True or name_found is True or email_type == "SUPP_REF":
                # End of message already known: remove the address in place.
                altered_text = addrStripped.replace(str_addr, " ")
                addrStripped = altered_text
                continue
            else:
                # An address at the start of a line ends the message.
                loc = addrStripped.find("\n"+ str_addr)
                if loc != -1:
                    altered_text = addrStripped[0:loc]
                    addrStripped = altered_text
                    break
                else:
                    altered_text = addrStripped.replace(str_addr, " ")
                    addrStripped = altered_text
                    continue 
    else:
        if sal_end_found is False and name_found is False and email_type != "SUPP_REF":
            # print("No address found.. trying for suite")
            # Look for lines ending in "Suite #"
            match = re.search(suite_pattern, addrStripped)
            if match != None:
                # print("Match at index %s, %s" % (match.start(), match.end()))
                match_str = addrStripped[match.start():match.end()]
                # NOTE(review): find() returns the first occurrence of the
                # matched text, which may be earlier than match.start() if the
                # same text also appears mid-line - confirm.
                loc = addrStripped.find(match_str)
                altered_text = addrStripped[0:loc]
                # print("match_str: ", match_str, "altered_text:", altered_text)
                # Drop the (partial) line that contains the suite as well.
                # NOTE(review): when no newline precedes the match, rfind()
                # returns -1 and the slice below removes only the last
                # character - confirm this is intended.
                nloc = altered_text.rfind("\n")
                addrStripped = altered_text[0:nloc]
                # print("Suite Found:", addrStripped, "\n")
                
    # Remove geopolitical entities (GPE labels) wherever they occur.
    for word in text1.ents:
        if word.label_ == "GPE":
            altered_text = addrStripped.replace(word.text, " ")
            addrStripped = altered_text
            continue
        # if word.label_ == "ORG":
            # org_list = org_list + word.text + "\n"
    
    # Normalise the typographic apostrophe before non-ASCII characters are
    # replaced by spaces, so contractions survive.
    emailStripped = addrStripped.replace("’", "'")
    urlStripped = re.sub(URL_REGEX, ' ', emailStripped)
    urlStripped = REMOVE_CTRL_CHAR.sub(' ', urlStripped)
    
    replacedText = urlStripped.lower()
    for w in replace_list:
        replacedText = replacedText.replace(w, " ")
            
    noDigitTxt = ''.join((x for x in replacedText if not x.isdigit()))
    
    # Replace everything except word characters, whitespace and apostrophes.
    noPuncTxt = re.sub(r"[^\w\s\'\’]", ' ', noDigitTxt)
    # split()/join also turns newlines and repeated blanks into single spaces.
    noOneCharTxt = ' '.join([word for word in noPuncTxt.split() if len(word)>1 ])
    cleanedText = noOneCharTxt
    
    cleanedTxt = re.sub(r' {2,}' , ' ', cleanedText)
    # print("Cleaned txt:\n", cleanedTxt, "\n", "ignore_status:", ignore_status, "response_status:", response_status, "email_type:", email_type)
    # input("Enter to continue..")
    
    # return cleanedTxt, org_list
    return cleanedTxt

In [None]:
# S3 handles: `s3` / `my_bucket` are used by the main routine to iterate the
# objects under `s3_prefix`; `s3_client` is used by get_most_recent_s3_object().
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
dl_bucket = "pro-ads-datalake"
my_bucket = s3.Bucket(dl_bucket)
s3_prefix = "ml-testing-files/"

# Amazon Comprehend client plus related ARNs.
# NOTE(review): the two ARN variables below are not referenced in the visible
# part of this notebook, and the classifier name here ("TopicClassiferAug4")
# differs from the one named in the notebook introduction - confirm which
# model is current.
client = boto3.client('comprehend', region_name='us-east-1')
data_access_role_arn = "arn:aws:iam::XXXXXXXXXX:role/service-role/AmazonSageMaker-ExecutionRole-20230418T155781"
doc_classifier_arn = "arn:aws:comprehend:us-east-1:XXXXXXXXXX:document-classifier/TopicClassiferAug4"

#### <font color=brown>Function to get the latest email written to S3 

In [None]:
def get_most_recent_s3_object(bucket_name, prefix):
    """Return the listing entry with the newest ``LastModified`` under a prefix.

    Walks every page of a ``list_objects_v2`` listing obtained through the
    module-level ``s3_client``.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Key prefix passed to the listing call.

    Returns:
        The ``Contents`` entry (a dict) with the greatest ``LastModified``
        value, or ``None`` when no page contains a ``Contents`` key.
    """
    pages = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket_name, Prefix=prefix
    )
    newest = None
    for listing in pages:
        if "Contents" not in listing:
            continue
        # Newest entry of this page, then compare against the best so far.
        candidate = max(listing['Contents'], key=lambda entry: entry['LastModified'])
        if newest is None or candidate['LastModified'] > newest['LastModified']:
            newest = candidate
    return newest

# Look up the newest object under "Inquiry/" in the e-mail bucket.
latest = get_most_recent_s3_object("pro-rds-emails", "Inquiry/")
print(latest)
# get_most_recent_s3_object() returns None when nothing is listed under the
# prefix; subscripting None would raise TypeError.
if latest is not None:
    print(latest['Key'])

In [None]:
import boto3
from operator import itemgetter

# Print the ten most recently modified objects under "Inquiry/" in the
# e-mail bucket, newest first.
s3 = boto3.resource('s3')
myBucket = s3.Bucket('pro-rds-emails')

# One small dict per object: its key and last-modified timestamp.
objects = [
    {'key': summary.key, 'last_modified': summary.last_modified}
    for summary in myBucket.objects.filter(Prefix="Inquiry/")
]

sorted_objects = sorted(objects, key=itemgetter('last_modified'), reverse=True)

# cnt ends up one greater than the number of lines printed.
cnt = 1
for obj in sorted_objects[:10]:
    print(obj['key'], obj['last_modified'])
    cnt = cnt + 1

In [None]:
#------------------------------- Connect to PostgreSQL and read records ---------------------------------#

# Close handles left over from a previous run of this cell. On the first run
# neither name exists yet, which would otherwise stop the cell with NameError.
try:
    cursor.close()
    conn.close()
except NameError:
    pass

conn = psycopg2.connect(database="YOUR_DATABASE", user='<YOUR_USER>', password='<YOUR_PASSWORD>', 
                        host='<DATABASE_HOST>', port= '5432')
cursor = conn.cursor()

#### <font color=blue> Main Routine starts here

In [None]:
#--------------------------------------- Main Program Starts Here ----------------------------------------#

# Counts the messages that get past the "CMS SENT" check.
cnt = 0

for s3_object in my_bucket.objects.filter(Prefix=s3_prefix):

    # Keys ending in "/" are folder placeholders, not message files.
    if s3_object.key[-1] == "/":
        continue

    out_file_name = s3_object.key.split('/')[-1]
    email_key = out_file_name
    print(s3_object.key)

    # Extract Subject and Body from JSON
    fileContent = s3_object.get()['Body'].read().decode('utf-8')
    jsonContent = json.loads(fileContent)
    jsonSubject = jsonContent['subject'].lower()
    jsonBody = jsonContent['body']
    fromEmail = jsonContent['fromEmailId']
    sender = fromEmail.lower()

    # Reset the per-message state BEFORE the first database update so that no
    # value from the previous message (or an undefined name on the very first
    # message) is written for this one.
    email_type = ""
    ignore_status = ""
    response_status = ""
    sal_end_found = False
    eomText = ""
    cleanedTxt = ""

    if "cms.hhs.gov" in sender:
        email_type = "CMS SENT"
        ignore_status = 'Y'
        upd_postgres_record(email_key, "P", ignore_status, response_status, email_type)
        continue

    # Remove html tags: drop style/script/code elements entirely, then append
    # a newline to every element that is not in NON_BREAKING_ELEMENTS so that
    # get_text() yields one logical line per such element.
    soup_plain = bs(jsonBody, features="html.parser")
    for element in soup_plain(strip_tags):
        element.extract()
    for element in soup_plain.find_all():
        if element.name not in NON_BREAKING_ELEMENTS:
            element.append('\n')
    emailBody = soup_plain.get_text()
    # Drop blank lines, then put the (lower-cased) subject on the first line.
    emailBody = "".join([s for s in emailBody.strip().splitlines(True) if s.strip()])
    emailBody = jsonSubject + "\n" + emailBody
    print(emailBody, "\n from:", fromEmail)

    cnt += 1

    # Auto-generated / bulk messages are marked as processed and skipped.
    # The checks run in this order and the first match wins.
    auto_type = next(
        (label
         for label, phrases in (("OOO", ooo_matches),
                                ("VOLUNTEER", volunteer_matches),
                                ("UNDELIVERABLE", undelivered_matches))
         if any(phrase in emailBody for phrase in phrases)),
        None,
    )
    if auto_type is not None:
        email_type = auto_type
        ignore_status = "Y"
        upd_postgres_record(email_key, "P", ignore_status, response_status, email_type)
        continue

    cidStripped = re.sub(r'\[cid.*?\]', ' ', emailBody, flags=re.MULTILINE|re.IGNORECASE)

    # Strip each line and remove multiple Spaces for address match to work with "re"
    trimmedTxt = '\n'.join([line.strip() for line in cidStripped.splitlines()])
    trimmedTxt = re.sub(r' {2,}' , ' ', trimmedTxt)

    # Index defensively: an empty message has no lines, and a two-line message
    # does have a second line.
    text_lines = trimmedTxt.splitlines()
    first_line = text_lines[0] if text_lines else ""
    second_line = text_lines[1] if len(text_lines) > 1 else ""

    if first_line.startswith("test") or second_line.startswith("PersonName:TestJ") or "test" in sender:
        email_type = "TEST"
        ignore_status = 'Y'
        upd_postgres_record(email_key, "P", ignore_status, response_status, email_type)
        continue

    if second_line.startswith("PersonName:"):
        email_type = "SUPP_REF"
        eomText, sal_end_found = find_supp_ref_eom(trimmedTxt)
        cleanedTxt = final_cleanup(eomText, email_type, sal_end_found)
        # NOTE(review): SUPP_REF messages are cleaned but never printed or
        # marked "C" in PostgreSQL (the update below only runs for other
        # types) - confirm whether that is intended.

    if first_line.startswith("rds support request ref") and not second_line.startswith("PersonName:"):
        # NOTE(review): this value is overwritten by the email_type returned
        # from find_end_of_message() below - confirm whether "RE_SUPP_REF"
        # should be preserved.
        email_type = "RE_SUPP_REF"

    if email_type != "SUPP_REF":
        # Cut at a trailing disclaimer. Both spellings are handled; splitting
        # on a marker that is absent leaves the text unchanged.
        disclaimStripped = trimmedTxt
        for marker in ("\nDisclaimer", "\nDISCLAIMER"):
            disclaimStripped = disclaimStripped.split(marker, 1)[0]

        omhStripped = disclaimStripped.split("Original Message Header", 1)[0]
        omhStripped = omhStripped.split("Original message header", 1)[0]

        eomText, email_type, response_status, sal_end_found = find_end_of_message(omhStripped)
        cleanedTxt = final_cleanup(eomText, email_type, sal_end_found)

        print(cleanedTxt)
        upd_postgres_record(email_key, "C", ignore_status, response_status, email_type)

In [None]:
import boto3

client = boto3.client('comprehend', region_name='us-east-1')

# Sample cleaned messages for trying out the classifier by hand. Each
# assignment to `cleanedTxt` overwrites the previous one, so only the last
# active assignment is sent. The names `cleanedTxtx` and `cleanedtxt` are
# different variables and therefore do not affect `cleanedTxt`.
cleanedTxt = "response file plan year rds support please review this issue again submitted an upload on received response file back today and all zeros are still appearing on response file continues to be missing reason codes and eligibility time frames please let me know why this error is still occurring as we need to get this solved to complete reopening by deadline thank you in advance for your assistance"
cleanedTxt = "unable to access site am unable to access site to complete my reconciliation it appears as though site is down"
cleanedTxt = "help thank you for your inquiry we're very sorry you're experiencing difficulties registering as account manager to begin your registration click account manager registration page or copy and paste following link into address bar of your browser refer to account manager registration for guidance on this process including step by step instructions"
cleanedTxtx = "change vendor for cost reporting thank you for your inquiry application id is currently in an incomplete status plan sponsors can only complete payment setup when an application is in any of following statuses approved reconciliation initiated reconciliation cost reporting opened reconciliation cost reporting closed reconciliation request completed as such you cannot complete payment setup for this application at this time after this application has been submitted and approved then you can complete payment setup"
cleanedTxt = "withdraw application rds application thank you for this information will update application status for application id to withdrawn an email notification will be sent once this action is complete"
cleanedTxt = "reconciliation issue with app id evansville please instruct me on why am unable to enter costs for this reconciliation manage final costs option is greyed out and not selectable even though finalize covered retirees step is completed"
cleanedtxt = "request we would like to receive so we can process our overpayment"
cleanedTxt = "re critical information about reconciliation deadline of your application in it had been determined there's no action needed by group health plan since there are no retirees for plan year we worked with from to close out cy application please refer to two emails for reference please let me know if you have any questions"
# cleanedTxt = "unable to request payment am unable to request payment at this time it doesn't allow me to confirm cost even after have reviewed it please advise me on how to proceed thank you school business executive cincinnatus csd opt opt"
# cleanedtxt = "plan id application cost reporter designee to whom can be of assistance our cost reporter for calendar year application is although he was previously assigned as designee st attached he stated he is unable to access application to report cost tried to remove his privilege and re enter him but website won't save and accept his designations to report costs and view send receive files attachment please review and advise as troubleshooting steps within guide has been unsuccessful"
# cleanedTxt = "upport request ref reporting interim costs cost reporting wanted to make sure taht vendor id is now properly identified on our application so that costs can be uploaded apparently costs had previoulsy been rejected believe that have fixed issue but wanted to confirm before next upload thank you for any assistance you can provide"
# Sample input sent to the classifier endpoint. The text is one long string;
# it is written as adjacent string literals inside parentheses because a
# single-quoted Python string cannot contain a raw line break. Each piece
# keeps its trailing space so the concatenation reads as continuous text.
cleanedTxt = (
    "rds reopening request approved this notice is to inform that retiree drug subsidy rds center has approved following reopening appeal tracking number determination that is subject of reopening request reconciliation final payment date reopening request was received by person who requested reopening new reconciliation deadline will allow plan sponsor to submit cost data beyond month timeframe set forth in and reopen pursuant to if plan sponsor wishes to increase amount of its subsidy for this application as stated in its reopening request plan sponsor must recomplete reconciliation by reconciliation deadline specified previously in this email in manner that reflects data and or subsidy amount plan sponsor specified in its request for reopening however plan sponsor is not required to recomplete reconciliation by that deadline if plan sponsor fails to or chooses not to will reinstate previous payment determination and amount of subsidy for application will not change if plan sponsor wishes to recomplete reconciliation read following for important information and steps plan sponsor must take for this application completing reconciliation changed status of this application to reconciliation initiated reconciliation step request list of covered retirees is marked with blue arrow to indicate that it is next step to be completed you are required to submit retiree list as described in to prior to requesting covered retiree list in reconciliation step this retiree list should contain at least one row for each beneficiary for which you are requesting subsidy please complete reconciliation step through step by reconciliation deadline specified in this email for more information go to reconciliation covered retirees after your retiree list has been processed and you have received retiree response file pursuant to matches names and identifying information for individuals submitted as qualifying covered retirees with database to determine which retirees are part "
    "eligible individuals who are not enrolled in part plan provides information concerning results of search such as names and other identifying information if necessary to sponsor or to designee please request covered retiree list in reconciliation step covered retiree list should be available within approximately minutes from time of request please download and evaluate covered retirees contained in this list if retiree record that was expected to be on covered retiree list crl is absent or if any of subsidy periods do not match plan sponsor's internal records discrepancy must be resolved before proceeding with remaining steps of reconciliation for more information on resolving discrepancies in covered retiree list go to request covered retiree lists the plan sponsor is required to review and agree to covered retiree list in reconciliation step consistent with sponsor will receive subsidy payment for each qualifying covered retiree enrolled with sponsor of qualified retiree prescription drug plan in plan year plan sponsors may only submit cost data for qualifying covered retirees benefit options and subsidy periods listed in covered retiree list that was downloaded from cost reporting cost data will be duplicated if same report is sent by both mainframe and data entry methods if this occurs one of reports must be deleted zeroed out to resolve duplication before reconciliation is submitted to after all corrected cost reports are received consistent with plan sponsor must close cost reporting and complete remaining steps of reconciliation by reconciliation deadline mainframe cost reports if revised mainframe cost report is submitted by same source that previously submitted cost report then revised cost report is replaced by old cost report if there are mainframe cost reports that plan sponsor wishes to retain plan sponsor does not have to do anything status remains mainframe submitted for more information go to reconciliation data entry cost reports the status of data "
    "entry cost reports has changed to data entry update entry required all data entry cost reports must be re saved or re submitted consistent with an account manager or designee may perform this action if they have appropriate privileges account managers with report costs privileges assigned data entry reporting method may access final cost data by selecting either enter update costs action within reconciliation step manage submission of final cost reports or final costs action on application list page designees with report costs privileges assigned data entry reporting method may access cost reports by selecting final costs on application list page for more information go to submit final cost data if further action is required on behalf of plan sponsor to effectuate this decision plan sponsor will be notified by if you need more information contact"
)
# Classify the sample text with the deployed endpoint and show the raw response.
response = client.classify_document(
    Text=cleanedTxt, 
    EndpointArn='arn:aws:comprehend:us-east-1:XXXXXXXXXX:document-classifier-endpoint/rdsEmailClassifier'
)
print(response)