In [1]:
import boto3
import string
import sys
import numpy as np
from hashlib import md5
from sagemaker.mxnet.model import MXNetPredictor

In [9]:
if sys.version_info < (3,):
    maketrans = string.maketrans
else:
    maketrans = str.maketrans
    
def vectorize_sequences(sequences, vocabulary_length):
    results = np.zeros((len(sequences), vocabulary_length))
    for i, sequence in enumerate(sequences):
       results[i, sequence] = 1. 
    return results

def one_hot_encode(messages, vocabulary_length):
    data = []
    for msg in messages:
        temp = one_hot(msg, vocabulary_length)
        data.append(temp)
    return data

def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of words (or tokens).
    # Arguments
        text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.
    # Returns
        A list of words (or tokens).
    """
    if lower:
        text = text.lower()

    if sys.version_info < (3,):
        if isinstance(text, unicode):
            translate_map = dict((ord(c), unicode(split)) for c in filters)
            text = text.translate(translate_map)
        elif len(split) == 1:
            translate_map = maketrans(filters, split * len(filters))
            text = text.translate(translate_map)
        else:
            for c in filters:
                text = text.replace(c, split)
    else:
        translate_dict = dict((c, split) for c in filters)
        translate_map = maketrans(translate_dict)
        text = text.translate(translate_map)

    seq = text.split(split)
    return [i for i in seq if i]

def one_hot(text, n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
    """One-hot encodes a text into a list of word indexes of size n.
    This is a wrapper to the `hashing_trick` function using `hash` as the
    hashing function; unicity of word to index mapping non-guaranteed.
    # Arguments
        text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
    # Returns
        List of integers in [1, n]. Each integer encodes a word
        (unicity non-guaranteed).
    """
    return hashing_trick(text, n,
                         hash_function='md5',
                         filters=filters,
                         lower=lower,
                         split=split)


def hashing_trick(text, n,
                  hash_function=None,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                  lower=True,
                  split=' '):
    """Converts a text to a sequence of indexes in a fixed-size hashing space.
    # Arguments
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to python `hash` function, can be 'md5' or
            any function that takes in input a string and returns a int.
            Note that 'hash' is not a stable hashing function, so
            it is not consistent across different runs, while 'md5'
            is a stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
    # Returns
        A list of integer word indices (unicity non-guaranteed).
    `0` is a reserved index that won't be assigned to any word.
    Two or more words may be assigned to the same index, due to possible
    collisions by the hashing function.
    The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
    of a collision is in relation to the dimension of the hashing space and
    the number of distinct objects.
    """
    if hash_function is None:
        hash_function = hash
    elif hash_function == 'md5':
        hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)

    seq = text_to_word_sequence(text,
                                filters=filters,
                                lower=lower,
                                split=split)
    return [int(hash_function(w) % (n - 1) + 1) for w in seq]

In [10]:
vocabulary_length = 9013
model = MXNetPredictor('sms-spam-classifier-mxnet-2022-11-21-22-42-04-356')

In [11]:
test_messages = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP/ mnth inc 3hrs 16 stop?txtStop"]
one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)

result = model.predict(encoded_test_messages)
print(result)

{'predicted_probability': [[0.9998780488967896]], 'predicted_label': [[1.0]]}


In [14]:
s3 = boto3.resource('s3')

In [63]:
obj = s3.Object('spamemail-email', '6jpcf86git708tir8e2rof14jluc9lgp24gb5301')
value = obj.get()['Body'].read().decode('utf-8')

In [65]:
import mailparser

In [66]:
mail = mailparser.parse_from_string(value)

In [68]:
[mail.body]

'dfasdfasdfasdfasdf\r\n-- \r\nRegards,\r\n\r\n\r\n[image: AdityaSidharta]\r\n\r\n* Aditya Kelvianto Sidharta <http://adityasidharta.github.io> * • He/Him\r\n<http://adityasidharta.github.io> • Data Scientist\r\n<http://adityasidharta.github.io>\r\n\r\nContact: (917)-667-6838 <aditya.sdrt@gmail.com>\r\n\r\nEmail: aditya.sdrt@gmail.com\r\n\r\nLocation: New York City, New York <https://goo.gl/maps/mznG2BAVpzYT6GLZ9>\r\n\r\n[image: LinkedIn] <https://www.linkedin.com/in/adityasidharta/> [image:\r\nGithub] <https://github.com/AdityaSidharta> [image: Kaggle]\r\n<https://github.com/AdityaSidharta> [image: PersonalWebsite]\r\n<http://adityasidharta.github.io>\r\n\n--- mail_boundary ---\n<div dir="ltr"><div><br clear="all"></div>dfasdfasdfasdfasdf<br><div>-- <br><div dir="ltr" class="gmail_signature" data-smartmail="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div dir="ltr"><div><div dir="ltr">Regards,</div><div dir="ltr"><br></div><div dir="ltr">\r\n<br>\r\n \r\n<table style="marg

In [62]:
[x.strip() for x in value.split('\n')][24]

'O1Bg=='

In [38]:
test_messages = [body_]
one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)

result = model.predict(encoded_test_messages)
print(result)

{'predicted_probability': [[0.030335022136569023]], 'predicted_label': [[0.0]]}


In [39]:
date_ = value[0].split("Date:")[1].strip()
to_ = value[1].split("To:")[1].strip()
from_ = value[2].split("From:")[1].strip()
subject_ = value[3].split("Subject:")[1].strip()
body_ = []
for line in value[4:]:
    line = line.strip()
    if line != '':
        body_.append(line)
body_ = ' '.join(body_)

IndexError: list index out of range

In [None]:
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event))
    try:
        bucket = event['Records'][0]['s3']['bucket']['name']
        key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
        today = datetime.now()

        header = s3.head_object(Bucket=bucket, Key=key)
        if 'x-amz-meta-customlabels' in header['ResponseMetadata']['HTTPHeaders']:
            custom_labels = header['ResponseMetadata']['HTTPHeaders']['x-amz-meta-customlabels'].split(',')
            print("detected custom labels : {}".format(custom_labels))
        else:
            custom_labels = []


        response = rekognition.detect_labels(Image={'S3Object':{'Bucket':bucket,'Name':key}},
            MaxLabels=10)
        print("rekognition response : {}".format(response))
        if 'Labels' in response:
            labels = [x['Name'] for x in response['Labels']]
            print("detected labels : {}".format(labels))
        else:
            labels = []

        total_labels = custom_labels + labels
        clean_labels = []
        if total_labels:
            for label in total_labels:
                clean_label = ''.join(x for x in label if x.isalpha()).lower()
                clean_labels.append(clean_label)

            opensearch_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
            document = {
                    'label': ",".join(clean_labels), 
                    's3_path': "https://{}.s3.amazonaws.com/{}".format(bucket, key)
                }
            response = opensearch.index(
                index = index_name,
                body = document,
                id = opensearch_id,
                refresh = True
            )
            print("Adding to OpenSearch : index : {}, body : {}, id : {}".format(index_name, document, opensearch_id))

        return {
            "objectKey": key,
            "bucket": bucket,
            'createdTimestamp': today.isoformat(),
            "labels": total_labels,
        }
    except Exception as e:
        print(e)
        print('Error processing object {} from bucket {}.'.format(key, bucket))
        raise e