In [1]:
!lspci | grep -i nvidia

00:1e.0 3D controller: NVIDIA Corporation GV100 [Tesla V100 SXM2] (rev a1)


In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";

In [1]:
import os

import tensorflow as tf
import ktrain
from ktrain import text
import boto3
import sagemaker
import pandas as pd
from sklearn.model_selection import train_test_split

# Log the device each TensorFlow op is placed on. This is what produces the
# "Executing op ... in device ..." lines in the training and prediction outputs.
# NOTE(review): very verbose -- consider turning this off once GPU placement has been confirmed.
tf.debugging.set_log_device_placement(True)

In [2]:
# S3 locations used by this notebook.
bucket = 'yelp-dataset-pt-9'
# NOTE(review): not referenced in any of the visible cells, and it names "fasttext"
# while the model trained here is DistilBERT -- confirm whether this is stale.
model_prefix = 'spencer/models/ktrain/fasttext_regression'
# Key prefix under which the cleaned CSVs live; train.csv is read from here.
data_prefix = 'spencer/data/csv/cleaned'

In [3]:
# Open a streaming handle on the cleaned training CSV stored in S3.
s3 = boto3.client("s3")

# Use the `bucket` setting from the configuration cell instead of repeating the
# literal name, so the bucket only has to be changed in one place.
data = s3.get_object(Bucket=bucket, Key=f'{data_prefix}/train.csv')['Body']

In [4]:
df = pd.read_csv(data, chunksize=1_000_000)

In [5]:
df = next(df)

In [6]:
df.head()

Unnamed: 0,stars,text
0,5,"scenic views were an a+, the very cold 29 degr..."
1,4,i came here with some business partners during...
2,5,fantastic value for the price. i have wanted ...
3,1,this review is for the breakfast. i totally l...
4,5,i travel to different citys on my job an i got...


In [7]:
df['text'].isna().sum()

0

In [8]:
df.shape

(1000000, 2)

In [9]:
# Keep only rows that actually have review text. Take an explicit copy so that
# later modifications of `not_na` operate on an independent frame and do not
# trigger pandas' SettingWithCopyWarning for writing into a slice of `df`.
not_na = df[df['text'].notna()].copy()

In [10]:
len(df) - len(not_na)

0

In [11]:
# Shift the star rating down by 3 so the regression target is centred on zero
# (e.g. 5 -> 2, 1 -> -2).
# The frame is rebuilt from `df` on every run, so executing this cell twice does
# not subtract 3 twice, and `.assign` returns a new frame instead of writing
# into a filtered slice (no SettingWithCopyWarning).
not_na = df[df['text'].notna()].assign(stars=lambda frame: frame['stars'] - 3)

In [12]:
not_na.head()

Unnamed: 0,stars,text
0,2,"scenic views were an a+, the very cold 29 degr..."
1,1,i came here with some business partners during...
2,2,fantastic value for the price. i have wanted ...
3,-2,this review is for the breakfast. i totally l...
4,2,i travel to different citys on my job an i got...


In [15]:
# Stratified 95/5 split of the whole frame, with a fixed seed for repeatability.
train, test = train_test_split(
    not_na,
    stratify=not_na['stars'].values,
    test_size=0.05,
    random_state=42,
)

In [16]:
test.shape

(50000, 2)

In [17]:
# Persist the held-out rows. Create the target directory first so the write does
# not fail when "models/data" does not exist yet (e.g. on a fresh checkout).
os.makedirs("models/data", exist_ok=True)
test.to_csv("models/data/test.csv", index=False)

In [13]:
# Stratified 95/5 split over the raw text and label arrays (same seed as the frame split).
review_texts = not_na['text'].values
star_labels = not_na['stars'].values
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    star_labels,
    test_size=0.05,
    random_state=42,
    stratify=star_labels,
)

In [14]:
X_train.shape, X_test.shape

((950000,), (50000,))

In [16]:
X_train[:5]

array(['my wife and i stayed here for our honeymoon. most of the employees we encountered were fairly friendly, but there were several who were not very friendly. there was a different front desk person everyday, and the concierge was really just attempting to sell tickets to shows and not very helpful. the room was spacious and comfortable, but could use some updates. the huge jet tub was filthy and had visible mold under some knobs and the faucet which was a huge disappointment to my wife who loves a bath. the shuttle system to the strip is  a little confusing, considering all the shuttles look the same for this property and it\'s "sister" properties. i  saw guests turned away from full shuttles at the pick up spots( tropicana and the mirage), knowing that they would have to wait an hour for the next shuttle. overall but not sure i would return',
       "stay away.  getting a window replaced is an absolute disaster from dishonesty from multiple safelite employees, to incompetence in 

In [17]:
y_train[:5]

array([ 0, -2, -1,  2, -1])

In [None]:
# Preprocess both splits for DistilBERT with maxlen=500.
# No class_names argument is passed, so ktrain reports this as a text regression task.
trn, val, preproc = text.texts_from_array(
    X_train,
    y_train,
    x_test=X_test,
    y_test=y_test,
    maxlen=500,
    preprocess_mode='distilbert',
    lang='en',
)

task: text regression (supply class_names argument if this is supposed to be classification task)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


preprocessing train...
language: en
train sequence lengths:
	mean : 106
	95percentile : 295
	99percentile : 500


  'If this is incorrect, supply class_names argument.')


preprocessing test...
language: en
test sequence lengths:
	mean : 106
	95percentile : 296
	99percentile : 502


In [23]:
type(trn), type(val), type(preproc)

(ktrain.text.preprocessor.TransformerDataset,
 ktrain.text.preprocessor.TransformerDataset,
 ktrain.text.preprocessor.DistilBertPreprocessor)

In [22]:
print("adf")

adf


In [24]:
import pickle

# Cache the preprocessed datasets and the preprocessor locally so they can be uploaded.
# Each file is opened in a `with` block so it is flushed and closed as soon as its
# dump finishes; the original left all three handles open.
# protocol=4 is kept unchanged from the original calls.
for obj, filename in ((trn, "trn.p"), (val, "val.p"), (preproc, "preproc.p")):
    with open(filename, "wb") as fh:
        pickle.dump(obj, fh, protocol=4)

In [25]:
import sagemaker

In [26]:
sess = sagemaker.Session()

In [27]:
# Upload the pickled training set; the bucket comes from the `bucket` setting
# defined in the configuration cell instead of a repeated literal.
sess.upload_data(path="trn.p", bucket=bucket, key_prefix="spencer/data/ktrain/distilbert/1m")

's3://yelp-dataset-pt-9/spencer/data/ktrain/distilbert/1m/trn.p'

In [28]:
# Upload the pickled validation set; the bucket comes from the `bucket` setting
# defined in the configuration cell instead of a repeated literal.
sess.upload_data(path="val.p", bucket=bucket, key_prefix="spencer/data/ktrain/distilbert/1m")

's3://yelp-dataset-pt-9/spencer/data/ktrain/distilbert/1m/val.p'

In [29]:
# Upload the pickled preprocessor; the bucket comes from the `bucket` setting
# defined in the configuration cell instead of a repeated literal.
sess.upload_data(path="preproc.p", bucket=bucket, key_prefix="spencer/data/ktrain/distilbert/1m")

's3://yelp-dataset-pt-9/spencer/data/ktrain/distilbert/1m/preproc.p'

In [None]:
# Build the DistilBERT text-regression model from the preprocessed training data.
model = text.text_regression_model(
    'distilbert',
    train_data=trn,
    preproc=preproc,
)

# Wrap the model with its training and validation data in a ktrain learner.
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=6,
)

In [None]:
# Run ktrain's learning-rate finder and plot its result.
# NOTE(review): this cell has no execution count, so the 2e-5 passed to
# fit_onecycle was presumably not picked from this plot -- confirm.
learner.lr_find()
learner.lr_plot()

In [41]:
# Train with the one-cycle policy: max learning rate 2e-5, 2 epochs, with
# checkpoints written to 'more_epochs_checkpoints'.
# NOTE(review): the recorded output shows "Epoch 1/3" and "Train for 20000 steps",
# which does not match this call (2 epochs) or the 950,000-row training split at
# batch_size=6 -- the output looks like it came from a different run; re-execute to confirm.
learner.fit_onecycle(2e-5, 2, checkpoint_folder='more_epochs_checkpoints')



begin training using onecycle policy with max lr of 2e-05...
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Train for 20000 steps, validate for 938 steps
Epoch 1/3
Executing op OptimizeDataset in device /job:

<tensorflow.python.keras.callbacks.History at 0x7f1f2074ad30>

In [42]:
predictor = ktrain.get_predictor(learner.model, preproc=preproc)

In [43]:
# Save the predictor (model plus preprocessor) to a local folder.
# NOTE(review): the folder name says "4_epochs" but the fit_onecycle call in this
# notebook requests 2 epochs, and the parent folder is 'model/' while the test CSV
# is written under 'models/' -- confirm both are intended.
predictor.save('model/yelp_distilbert_regression_4_epochs')

In [44]:
learner.view_top_losses(n=10, preproc=preproc)

Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_distributed_function_732317 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mean in device /job:localhost/replica:0/task:0/device:GPU:0
----------
id:27951 | loss:16.26 | true:2.0 | pred:-2.03)

----------
id:28410 | loss:16.16 | true:2.0 | pred:-2.02)

----------
id:25571 | loss:15.84 | true:2.0 | pred:-1.98)

----------
id:16763 | loss:15.75 | true:-2.0 | pred:1.97)

----------
id:258 | loss:15.57 | true:2.0 | pred:-1.95)

----------
id:3305 | loss:

In [55]:
print(y_test[27951])
X_test[27951]

2


"took my nissan versa to a dealership since the air conditioner was not blowing cold. the service people didn't seem to listen to what i had to say about the way the air would work intermittently. i left the car, they kept it 3 days. when i picked it up the bill was $593.66. at that time i was in the process of moving and didn't check out the car for about a month only to find out there was still the same problem with the a/c. i contacted nissan u.s. customer service but they were unable to get their head out of their ass to fix it. took it to japanese car service, they found it had been overcharged with freon, disconnected switch control cable, etc. just a horrible job done by nissan. they fixed all the problems they found that nissan screwed up and it now works better than when it was new. i have no plans to ever take my cars anywhere but to kevin and the gang."

In [58]:
print(y_test[9153])
X_test[9153]

-2


'chum is going to be taking it in the ass from bubba very soon.\nlol  hoss'

In [56]:
print(y_test[16763])
X_test[16763]

-2


'short wait. wonderful service. and people are communicating with one another. such friendly people, manners are being used. gives us new hope of young and old being able to communicate with each other. food is great.'

In [57]:
print(y_test[3305])
X_test[3305]

-2


'i used to love this store... but after vanessa and sherry left the customer service and grooming went down hill! i found a much better dog store (where sherry now works! woof gang next to smiths!) and not only are their prices better, the owners are very knowledgable and friendly! i now take all my business there and am so much happier! :)'

In [45]:
predictor.predict('This food is seriously the best!')

  'If this is incorrect, supply class_names argument.')


Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


2.010037

In [46]:
watsons_review = '''I'll give props to the ambience and decor but the food was subpar. I went for brunch and ordered the chicken club. The chicken was soggy, had no crunch, and tasted like yesterdays stale cereal. The fries were soft and a bit cold. I got my chicken Nashville dipped but didn't taste a difference from non-dipped. I asked the waitress if the chicken is usually soft she said no, walked away, and didn't come back till I was done with my meal asking for the ticket. Yet another place in Downton champaign I won't be going back to.'''

In [47]:
predictor.predict(watsons_review)

  'If this is incorrect, supply class_names argument.')


Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


-1.3867602

In [48]:
watsons_neutral_review = '''This review is more for our waiter than the food. He was great!  We ordered the chicken- both fried & roasted. The taste was actually very good. The fried was almost burnt looking. Considering the cost for two pieces I felt it was overpriced. The pieces were quite small. You have to order the other items on the side. We also got the bowl of rice & shredded chicken. Spicy no!  We like spicy- this was burn your moth hot. We had to send it back!  
Probably would not come back or recommend. Like I said our waiter was great - wish I knew his name!'''

In [49]:
predictor.predict(watsons_neutral_review)

  'If this is incorrect, supply class_names argument.')


Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


-0.33333328

In [50]:
watsons_4 = '''I've come to Watson shack on numerous occasions and would like to highlight that this place never disappoints. I've had their burger, chicken and waffle and I cannot remember a time when I had an issue with their food. Their chicken and waffle is my favorite so do not hesitate to order it. Their adult slushee is really good and satisfying. Every time I came here with friends or my boyfriend for brunch we were quickly seated and served. I recommend this place'''

In [51]:
predictor.predict(watsons_4)

  'If this is incorrect, supply class_names argument.')


Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


1.9210092

In [52]:
watsons_5 = '''We came on a Saturday for our first visit. We had heard a lot of great things so we were excited. It definitely lived up to the talk! I got the Nashville chicken sandwich and it was cooked perfectly and not too much breading. Great flavor and it wasn't completely coated in sauce where you can't taste the chicken at all. Our meal got dropped in the kitchen so they had to re make it but they brought us out a snack to eat while we waited. Great service!'''

In [53]:
predictor.predict(watsons_5)

  'If this is incorrect, supply class_names argument.')


Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


1.8544829