In [6]:
import tarfile
import os

import numpy as np
import boto3
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [None]:
spencer = boto3.session.Session(profile_name='spencer')

In [None]:
spencer.client('sts').get_caller_identity().get('Account')

In [None]:
boto3.client('sts').get_caller_identity().get('Account')

In [None]:
spencer_s3 = spencer.client("s3")

In [None]:
spencer_s3.download_file('sagemaker-us-east-1-652081700929', "final-distilbert-regression-800k-2020-06-15-07-04-51-437/output/model.tar.gz", "model.tar.gz")

In [None]:
with tarfile.open("model.tar.gz", "r:gz") as tar:
    tar.extractall(path="model")

In [None]:
# os.remove("model.tar.gz")

In [None]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
model = TFAutoModelForSequenceClassification.from_pretrained("model")

In [None]:
text = "this place is amazing!"

In [None]:
def predict(text):
    return model(tokenizer.encode(text, return_tensors='tf', max_length=512))[0].numpy()[0][0]

def scale(pred):
    return (pred - 1) * .25

def squish(scaled):
    return np.clip(scaled, 0, 1)

def pred_scale_and_squish(text):
    pred = predict(text)
    scaled = scale(pred)
    squished = squish(scaled)
    
    return pred, scaled, squished

In [None]:
predict(text)

In [None]:
bad_text = "this place is the worst!!!"

In [None]:
predict(bad_text)

In [None]:
%%timeit
predict(text)

In [None]:
s3 = boto3.client("s3")

In [None]:
s3.upload_file(Filename='model.tar.gz', Bucket='yelpsense', Key='models/sentiment/distilbert/regression/model.tar.gz')

In [None]:
pred_scale_and_squish(text)

In [None]:
pred_scale_and_squish(bad_text)

In [None]:
text = "this place is kinda good"

In [None]:
pred_scale_and_squish(text)

In [None]:
squish(3.8772535)

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased', use_fast=True)
model2 = TFAutoModelForSequenceClassification.from_pretrained(
    "spentaur/yelp")


In [None]:
def predict2(text):
    return model2(tokenizer2.encode(text, return_tensors='tf', max_length=512))[0].numpy()[0][0]

In [None]:
%%timeit
predict2(text)

In [None]:
%%timeit
predict2(text)

In [None]:
%%timeit
predict(text)

In [None]:
squish(predict2(text))

In [None]:
pred_scale_and_squish("alright")

In [4]:
review = """This restaurant is one of the bests in Champaign. Coming from the NYC/Philly/Princeton area, Ive found few places that really mimic the food quality and scene. There chicken sandwiches are great, especially the Nashville hot. The sauces are great too, I really like the honey chili sauce. On Tuesdays they have Tacos as specials and there chicken taco is amazing! There appetizers are also tasty, my favorite is the Brussels sprouts. I wish they would add it to there regular menu. Its so fresh and very flavorful. As a plus, they also have non-alcoholic mock tails for everyone."""

In [1]:
from transformers import pipeline

In [2]:
ts2 = pipeline("summarization")

In [3]:
ts = pipeline("summarization", model="t5-large", tokenizer="t5-large", framework="tf")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [4]:
ts(review, min_length=42, clean_up_tokenization_spaces=True, max_length=1024)

Your max_length is set to 1024, but you input_length is only 140. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


[{'summary_text': 'this restaurant is one of the bests in champaign. the chicken sandwiches are great, especially the Nashville hot. on tuesdays they have tacos as specials and their chicken taco is amazing.'}]

In [5]:
import tensorflow as tf

In [12]:
def old_summarize(ts, text, **generate_kwargs):
    # Add prefix to text
    prefix = ts.model.config.prefix if ts.model.config.prefix is not None else ""
    documents = (prefix + text,)

    # tokenize
    inputs = ts.tokenizer.encode_plus(
        *documents,
        return_tensors='tf',
        max_length=ts.tokenizer.max_len
    )
    
    print(type(inputs["input_ids"].dim()))

    summaries = ts.model.generate(
        inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
    )
    results = []
    for summary in summaries:
        record = {}
        record["summary_text"] = ts.tokenizer.decode(
            summary, skip_special_tokens=True, clean_up_tokenization_spaces=True,
        )

        results.append(record)
    return results

In [13]:
old_summarize(ts, review)

AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'dim'

In [18]:
ts(review, clean_up_tokenization_spaces=True)

Your max_length is set to 200, but you input_length is only 140. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


[{'summary_text': 'this restaurant is one of the bests in champaign. the chicken sandwiches are great, especially the Nashville hot. on tuesdays they have tacos as specials and their chicken taco is amazing.'}]

In [19]:
ts2(review, clean_up_tokenization_spaces=True)

Your max_length is set to 142, but you input_length is only 130. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


[{'summary_text': 'Coming from the NYC/Philly/Princeton area, Ive found few places that really mimic the food quality and scene. There chicken sandwiches are great, especially the Nashville hot. The sauces are great too, I really like the honey chili sauce. On Tuesdays they have Tacos as specials and there chicken taco is amazing!'}]

In [27]:
ts2.framework

'pt'

In [2]:
from transformers import pipeline

In [3]:
sentiment = pipeline("sentiment-analysis")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




In [5]:
sentiment("this place is great!")[0]

{'label': 'POSITIVE', 'score': 0.9998815655708313}

In [4]:
from collections import namedtuple

In [6]:
Request = namedtuple('request', 'form')

In [16]:
request = Request(form={})

In [17]:
required = ['text', 'model']
missing = [field for field in required if field not in request.form.keys()]

In [18]:
missing

['text', 'model']

In [19]:
if not ['asdf']:
    print("asdf")

In [20]:
err = {field: f"the {field} field is required" for field in missing}

In [21]:
err

{'text': 'the text field is required', 'model': 'the model field is required'}

In [22]:
not_true = False

In [24]:
if not_true:
    print("asdf")