## Import Necessary packages

In [1]:
import pickle
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
import io
import boto3

## Pull in pickled models created in previous script

In [3]:
# Load the artifacts pickled by the previous script. Each file is opened in a
# `with` block so the handle is closed as soon as the object is deserialized.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
with open("cv.pkl", "rb") as cv_file:
    loaded_cv = pickle.load(cv_file)
with open("bad_words.pkl", "rb") as bad_words_file:
    my_dict = pickle.load(bad_words_file)

# Characters stripped from each word before the blacklist lookup in predict().
# Raw string: the backslash is a literal character, not an (invalid) escape.
punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

# Push Model, CV and Bad Word Dictionary to cloud storage (S3)

# All pickle files are stored in the "byu-capstone-appropriate-checker" bucket.

### Define bucket name and key, push pickled model to S3 using the 'model' key. 

In [6]:
# Destination S3 bucket, and the object key the pickled model is uploaded under.
bucket = "byu-capstone-appropriate-checker"
key = "model"

In [7]:
# Serialize the model to bytes in memory and upload them to s3://<bucket>/<key>.
s3_resource = boto3.resource('s3')
pickle_byte_obj = pickle.dumps(loaded_model)

# Last expression of the cell: the S3 PutObject response is displayed as output.
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'RG6E5VVDJ5A1ZH8E',
  'HostId': 'KXJoQLUSdtVflxLmDnC77MK6ErEUhMFXiiY0FOfqRt3ZY1J1Njn/fWfzXXNePw0IsKk67+4ZMwc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'KXJoQLUSdtVflxLmDnC77MK6ErEUhMFXiiY0FOfqRt3ZY1J1Njn/fWfzXXNePw0IsKk67+4ZMwc=',
   'x-amz-request-id': 'RG6E5VVDJ5A1ZH8E',
   'date': 'Thu, 07 Apr 2022 16:00:52 GMT',
   'etag': '"177333cd821482a81c8bc66d82457226"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"177333cd821482a81c8bc66d82457226"'}

### Define bucket name and key, push pickled count vectorizer to S3 using the 'count_vectorizer' key. 

In [8]:
# Object key the pickled count vectorizer is uploaded under.
key = "count_vectorizer"

In [9]:
# Serialize the count vectorizer to bytes in memory and upload them to
# s3://<bucket>/<key>.
s3_resource = boto3.resource('s3')
pickle_byte_obj = pickle.dumps(loaded_cv)

# Last expression of the cell: the S3 PutObject response is displayed as output.
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'DWPSTKHXBKPA66DH',
  'HostId': 'ASNPAeh756rLuXK1hToKeNwCKjMuiuD7vHAgZaI51XjMZ9C29TdJptvkIp4DPTPePzKnS+fCJOE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ASNPAeh756rLuXK1hToKeNwCKjMuiuD7vHAgZaI51XjMZ9C29TdJptvkIp4DPTPePzKnS+fCJOE=',
   'x-amz-request-id': 'DWPSTKHXBKPA66DH',
   'date': 'Thu, 07 Apr 2022 16:01:14 GMT',
   'etag': '"8dc4293e591793c0de1a445cf31bd076"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"8dc4293e591793c0de1a445cf31bd076"'}

### Define bucket name and key, push pickled blacklist dictionary to S3 using the 'blacklist_dictionary' key. 

In [10]:
# Object key the pickled blacklist dictionary is uploaded under.
key = "blacklist_dictionary"

In [11]:
# Serialize the blacklist dictionary to bytes in memory and upload them to
# s3://<bucket>/<key>.
s3_resource = boto3.resource('s3')
pickle_byte_obj = pickle.dumps(my_dict)

# Last expression of the cell: the S3 PutObject response is displayed as output.
s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

{'ResponseMetadata': {'RequestId': 'Q5KR43SRHWB8K2P1',
  'HostId': 'JRGAKcNUJsALh6V1YCI7vCT2hCSTd0Bfa8ddfD897mbTwTnncuZDHtqLK4nt+2TEo1TjfqjwTPY=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JRGAKcNUJsALh6V1YCI7vCT2hCSTd0Bfa8ddfD897mbTwTnncuZDHtqLK4nt+2TEo1TjfqjwTPY=',
   'x-amz-request-id': 'Q5KR43SRHWB8K2P1',
   'date': 'Thu, 07 Apr 2022 16:01:22 GMT',
   'etag': '"5972924d3e4aa204485d1fe1821eb8a9"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"5972924d3e4aa204485d1fe1821eb8a9"'}

# Retrieve all the previously pushed pickle files from S3 to ensure they work correctly

## Retrieve model from S3 using previously created key

In [12]:
# Download the object stored under the "model" key and deserialize it.
# NOTE: unpickling can execute arbitrary code -- only do this for a bucket
# whose contents you control.
s3 = boto3.client("s3")
bucket, key = "byu-capstone-appropriate-checker", "model"
response = s3.get_object(Bucket=bucket, Key=key)
model_bytes = response["Body"].read()
model = pickle.loads(model_bytes)

## Retrieve CV from S3 using previously created key

In [13]:
# Download the object stored under the "count_vectorizer" key and deserialize it.
# NOTE: unpickling can execute arbitrary code -- only do this for a bucket
# whose contents you control.
s3 = boto3.client("s3")
bucket, key = "byu-capstone-appropriate-checker", "count_vectorizer"
response = s3.get_object(Bucket=bucket, Key=key)
cv_bytes = response["Body"].read()
cv = pickle.loads(cv_bytes)

## Retrieve dictionary from S3 using previously created key

In [14]:
# Download the object stored under the "blacklist_dictionary" key and
# deserialize it. This rebinds `my_dict`, replacing the copy loaded from disk.
# NOTE: unpickling can execute arbitrary code -- only do this for a bucket
# whose contents you control.
s3 = boto3.client("s3")
bucket, key = "byu-capstone-appropriate-checker", "blacklist_dictionary"
response = s3.get_object(Bucket=bucket, Key=key)
dict_bytes = response["Body"].read()
my_dict = pickle.loads(dict_bytes)

# Test pulled files with example prediction

## Create function to clean the data for input into the predict function below

In [15]:
# Shared resources used by clean_data() below.
sw = stopwords.words('english')  # English stop words dropped during cleaning
wn = WordNetLemmatizer()  # lemmatizer applied to every surviving token
# NOTE(review): sw_special is not referenced by any code visible in this
# notebook -- confirm whether "rt" was meant to be filtered out as well.
sw_special = ["rt"]

def clean_data(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is passed through ``emoji.demojize``, lower-cased and split with
    ``wordpunct_tokenize``. Tokens that are not purely alphanumeric, or that
    appear in the stop-word list ``sw``, are discarded; the remaining tokens
    are lemmatized with ``wn`` and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw message to clean.

    Returns
    -------
    str
        The cleaned tokens joined by spaces (empty string if none survive).
    """
    lowered = emoji.demojize(text).lower()

    kept = []
    for token in wordpunct_tokenize(lowered):
        if not token.isalnum():
            # punctuation, and any token containing a non-alphanumeric character
            continue
        if token in sw:
            # stop word
            continue
        kept.append(wn.lemmatize(token))  # reduce to base form

    return " ".join(kept)

## Create a function to take input, and predict using the previously instantiated model, vectorizer and dictionary

In [16]:
def predict(message, model, cv, blacklist=None, punctuation=None):
    """Score a message with the blacklist first, then with the trained model.

    Each whitespace-separated word is stripped of punctuation characters and
    lower-cased; if it is found in the blacklist the function returns 2
    immediately. Otherwise the message is cleaned with ``clean_data``,
    vectorized with ``cv`` and passed to ``model.predict``.

    Parameters
    ----------
    message : str
        Raw user text to score.
    model : object
        Fitted classifier exposing ``predict``.
    cv : object
        Fitted vectorizer exposing ``transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``.
    blacklist : container of str, optional
        Words that immediately mark a message as inappropriate. Defaults to
        the notebook-level ``my_dict``.
    punctuation : container of str, optional
        Characters removed from each word before the blacklist lookup.
        Defaults to the notebook-level ``punc``.

    Returns
    -------
    2 if any word is blacklisted, otherwise the first element of
    ``model.predict`` (per the notebook's description: 0 = appropriate,
    1 = potentially inappropriate).
    """
    # Fall back to the notebook globals only when the caller gave no override.
    if blacklist is None:
        blacklist = my_dict
    if punctuation is None:
        punctuation = punc

    # Check if any of the words automatically imply inappropriate.
    for word in message.split():
        stripped = "".join(ch for ch in word if ch not in punctuation)
        if stripped.lower() in blacklist:
            return 2

    sample_cv = cv.transform([clean_data(message)])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older versions.
    get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    sample_df = pd.DataFrame(sample_cv.toarray(), columns=get_names())

    # Predict on the single sample message.
    return model.predict(sample_df)[0]

## Test the predict function with user input, created model, and created count vectorizer - 0 indicates appropriate, 1 potentially inappropriate, and 2 blatantly inappropriate.

In [17]:
predict("i hate you", model, cv)

1