In [None]:
# imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import re
import pickle
import email
from tqdm import tqdm
import datetime
from dateutil import parser
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

!pip install -q gpt-2-simple
import gpt_2_simple as gpt2
import tensorflow as tf
import re
!pip install --upgrade --no-cache-dir gdown

import nltk.translate.bleu_score as bleu


### Helper Functions

In [2]:
def remove_extensions(text):
    '''
    We removed attachments while extracting body but not the name of these attachments
    removing attachment_names based on what i encountered in subject and body
    '''
    ext_patterns = ["\S+\.doc","\S+\.jpeg","\S+\.jpg","\S+\.gif","\S+\.csv","\S+\.ppt","\S+\.dat","\S+\.xml","\S+\.xls","\S+\.sql","\S+\.nsf","\S+\.jar","\S+\.bin","\S+\.txt"]
    pattern = '|'.join(ext_patterns)
    text = re.sub(pattern,'',text)
    return text

def remove_personal_name(text):
    '''
    Helper function to Filter out names using NER
    '''
    s = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    for ele in s:
        if isinstance(ele, nltk.Tree):
            if ele.label()=='PERSON':
                for word,pos_tag in ele:
                    try:     # words containing a special character will raise an error so handling it, these words weren't a name so we can safely skip it
                        val = re.sub(word,'',text)
                        text = val
                    except:
                        continue
    return text

def decontracted(phrase):
    """
    Returns decontracted phrases
    """
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"ain\'t", "am not", phrase)
    phrase = re.sub(r"let\'s", "let us", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def remove_timestamps(text):
    '''
    Remove all types of 'text' data from timestamps
    '''
    text = text.replace('AM','')
    text = text.replace('PM','')
    text = text.replace('A.M.','')
    text = text.replace('P.M.','')
    text = text.replace('a.m.','')
    text = text.replace('p.m.','')
    text = re.sub(r"\bam\b",'',text)
    text = re.sub(r"\bpm\b",'',text)
    return text

def final_transform(text):
    '''
    We clean the full text/body using regex and other cleaning functions
    '''
    # remove URL's
    remove_url = r'(www|http)\S+'     # https://stackoverflow.com/a/40823105
    remove_phone = '(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'   # ONLY US numbers for now --> https://stackoverflow.com/a/16699507

    #remove ANY emails
    remove_email = r'\S+@\S+'  # https://stackoverflow.com/a/64036475


    pattern_list_1 = [remove_url,remove_phone,remove_email]

    for pattern in pattern_list_1:
        text = re.sub(pattern,'',text)

    # remove attachment_names
    text = remove_extensions(text)

    # remove any word with digit
    text = re.sub(r'\w*\d\w*', '', text)

    # remove any digit
    text = re.sub('\d','',text)

    # remove text between <>,()
    remove_tags = r'<.*>'
    remove_brackets = r'\(.*\)'
    remove_special_1 = r'\\|-'  # remove raw backslash or '-'
    remove_colon = r'\b[\w]+:' # removes 'something:'

    pattern_list_2 = [remove_tags,remove_brackets,remove_special_1,remove_colon]
    for pattern in pattern_list_2:
        text = re.sub(pattern,'',text)

    # remove anything which is not a character,apostrophy ; remember to give a space on replacing with this
    remove_nonchars = r'[^A-Za-z\']'
    text = re.sub(remove_nonchars,' ',text)

    # remove AM/PM as we have a lot of timestamps in emails
    text = remove_timestamps(text)

    # remove personal names using named entity recognition
    text = remove_personal_name(text)

    # takes care of \t & \n ; remember to give a space on replacing with this
    remove_space = r'\s+'
    text = re.sub(remove_space,' ',text)

    # take care of apostrophies
    text = decontracted(text)

    # remove other junk
    text = text.replace("IMAGE",'')
    text = re.sub(r"\bth\b",'',text)

    return text.strip()


### Main Functions

In [3]:
# model is to be initialized globally outside any function
gpt2.mount_gdrive()
gpt2.copy_checkpoint_from_gdrive(run_name='run1')
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='run1')

def final_function_1(sent):
    '''
    * take inputs
    * preprocess it
    * make predictions from model & return
    '''    

    # check length of sentence
    MAX_LEN = 30
    sent = ' '.join(sent.strip().split()[:MAX_LEN])
    # PREPROCESS
    sent = final_transform(sent)
    # inference
    prefix="<|startoftext|> "+sent
    p = gpt2.generate(sess,
                prefix=prefix,
                truncate="<|endoftext|>",
                length=MAX_LEN,
                run_name='run1',
                temperature=0.7,
                include_prefix=True,    
                return_as_list=True
                )[0]
                
    p = p[len(prefix):]
    return p.strip()

def final_function_2(inp,tar):
    '''
    * take inputs & targets
    * predict output for input sentence
    * calc bleu-score for (model-output & target) and return it
    '''
    out = final_function_1(inp)
    hypothesis = out.split()
    reference_tar = [tar.strip().split()]
    return bleu.sentence_bleu(reference_tar,hypothesis)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading checkpoint checkpoint/run1/model-2500
INFO:tensorflow:Restoring parameters from checkpoint/run1/model-2500


In [6]:
final_function_1("Hey Rita, did you get")

'a chance to look at the Credit Agreement and review the list of issues I put together in our meeting today'

In [5]:
final_function_2("Hey Rita, did you get","the previous email on the application")

2.0732986305800918e-232

# END