In [17]:
import pandas as pd
import sagemaker

In [3]:
df = pd.read_csv("s3://yelp-dataset-pt-9/spencer/data/sentiment/en/test_small.csv")

In [4]:
df.shape

(200000, 5)

In [5]:
df.head()

Unnamed: 0,text,stars,pos_neg_neu,pos_neg_3_is_pos,pos_neg_3_is_neg
0,this club is really pretty and played good mus...,4.0,POSITIVE,POSITIVE,POSITIVE
1,Took my staff out for Lunch. We had a great t...,5.0,POSITIVE,POSITIVE,POSITIVE
2,"I like this place! Tasty food, delicious drink...",5.0,POSITIVE,POSITIVE,POSITIVE
3,Best chain ever for fresh squeezed juices and ...,5.0,POSITIVE,POSITIVE,POSITIVE
4,Amazing place!! Definitely super busy during w...,5.0,POSITIVE,POSITIVE,POSITIVE


In [7]:
mean_str_len = df['text'].str.len().mean()

In [8]:
median_str_len = df['text'].str.len().median()

In [9]:
mean_str_len, median_str_len

(590.133825, 417.0)

In [16]:
df['text'].str.len().describe()

count    200000.000000
mean        590.133825
std         557.094094
min          10.000000
25%         233.000000
50%         417.000000
75%         746.000000
max        5000.000000
Name: text, dtype: float64

In [20]:
sagemaker.s3.S3Downloader.download("s3://yelp-dataset-pt-9/spencer/data/sentiment/en/fasttext/models/fasttext-pos-neg-3-is-pos-same-as-bal-copy-06-13/output/model.tar.gz", "./")

In [21]:
!tar -xf model.tar.gz

In [22]:
import fasttext

In [23]:
model = fasttext.load_model("model.bin")



In [26]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [30]:
import nltk

In [31]:
def predict(text):
    """Classify one piece of text with the loaded fastText ``model``.

    The text is tokenized with NLTK and the tokens are re-joined with single
    spaces before being passed to ``model.predict``.

    Args:
        text: The string to classify.

    Returns:
        Whatever ``model.predict`` returns for the normalized text; in this
        notebook that is a ``(labels, probabilities)`` pair.
    """
    tokens = nltk.word_tokenize(text)
    normalized = " ".join(tokens)
    return model.predict(normalized)

In [32]:
preds = df['text'].parallel_apply(predict)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=50000), Label(value='0 / 50000')))…

In [33]:
preds

0         ((__label__POSITIVE,), [0.9996795654296875])
1         ((__label__POSITIVE,), [0.9999239444732666])
2         ((__label__POSITIVE,), [0.9999966621398926])
3         ((__label__POSITIVE,), [0.9996669292449951])
4         ((__label__POSITIVE,), [0.9992591738700867])
                              ...                     
199995    ((__label__POSITIVE,), [0.9997562766075134])
199996    ((__label__POSITIVE,), [0.9999953508377075])
199997    ((__label__POSITIVE,), [0.9993935823440552])
199998    ((__label__NEGATIVE,), [0.9990699887275696])
199999    ((__label__POSITIVE,), [0.7241793274879456])
Name: text, Length: 200000, dtype: object

In [46]:
preds.str[0].str[0].str.split("__").str[-1]

0         POSITIVE
1         POSITIVE
2         POSITIVE
3         POSITIVE
4         POSITIVE
            ...   
199995    POSITIVE
199996    POSITIVE
199997    POSITIVE
199998    NEGATIVE
199999    POSITIVE
Name: text, Length: 200000, dtype: object

In [47]:
# Each prediction is a (labels, probabilities) pair: take the first label and keep
# only the text after the last "__" (e.g. "__label__POSITIVE" -> "POSITIVE").
pred_labels = preds.str[0].str[0].str.split("__").str[-1]

In [74]:
preds.str[1].str[0]

0         0.999680
1         0.999924
2         0.999997
3         0.999667
4         0.999259
            ...   
199995    0.999756
199996    0.999995
199997    0.999394
199998    0.999070
199999    0.724179
Name: text, Length: 200000, dtype: float64

In [75]:
# First probability of each prediction, i.e. the score paired with the first label.
pred_prob = preds.str[1].str[0]

In [48]:
df['pred'] = pred_labels

In [76]:
df['pred_prob'] = pred_prob

In [77]:
df.head()

Unnamed: 0,text,stars,pos_neg_neu,pos_neg_3_is_pos,pos_neg_3_is_neg,pred,pred_prob
0,this club is really pretty and played good mus...,4.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.99968
1,Took my staff out for Lunch. We had a great t...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.999924
2,"I like this place! Tasty food, delicious drink...",5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.999997
3,Best chain ever for fresh squeezed juices and ...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.999667
4,Amazing place!! Definitely super busy during w...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.999259


In [50]:
# Number of rows whose prediction matches the label. The vectorized Series.sum()
# avoids iterating over all rows at Python level as the builtin sum() does.
(df['pos_neg_3_is_pos'] == df['pred']).sum()

186554

In [51]:
# Accuracy on the full test set: the mean of a boolean Series is the fraction of
# True values, computed vectorized instead of builtin sum() / len().
(df['pos_neg_3_is_pos'] == df['pred']).mean()

0.93277

In [66]:
# Rows where the predicted label disagrees with the ground-truth label.
is_misclassified = df['pred'].ne(df['pos_neg_3_is_pos'])
false_preds = df.loc[is_misclassified]

In [67]:
false_preds[['text', 'pos_neg_3_is_pos', 'pred']].head()

Unnamed: 0,text,pos_neg_3_is_pos,pred
37,Well. We came back again. The margaritas are g...,NEGATIVE,POSITIVE
49,Reading some other reviews reminded me that I ...,NEGATIVE,POSITIVE
53,"Fantastic gluten free buffalo wings, though th...",POSITIVE,NEGATIVE
75,It's okay in terms of selection. I find cute c...,POSITIVE,NEGATIVE
76,I drove down from Chapel Hill to meet some old...,NEGATIVE,POSITIVE


In [68]:
len(false_preds)

13446

In [59]:
from ktrain import text
ts = text.TransformerSummarizer()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1625270765.0, style=ProgressStyle(descr…




In [61]:
# Rows where the predicted label agrees with the ground-truth label.
is_correct = df['pred'].eq(df['pos_neg_3_is_pos'])
true_preds = df.loc[is_correct]

In [63]:
true_preds['text'].str.len().describe()

count    186554.000000
mean        585.296826
std         553.222508
min          13.000000
25%         232.000000
50%         414.000000
75%         739.000000
max        5000.000000
Name: text, dtype: float64

In [64]:
# Text-length distribution of the misclassified reviews. The frame is named
# `false_preds`; the previous `false_pred` is undefined on a fresh kernel.
false_preds['text'].str.len().describe()

count    13446.000000
mean       657.243864
std        604.312512
min         10.000000
25%        254.000000
50%        476.000000
75%        851.000000
max       4993.000000
Name: text, dtype: float64

In [69]:
true_preds['text'].str.len().median(), false_preds['text'].str.len().median()

(414.0, 476.0)

In [70]:
true_preds['text'].str.len().mean(), false_preds['text'].str.len().mean()

(585.2968255840132, 657.243864346274)

In [82]:
false_preds['pos_neg_neu'].value_counts(normalize=True)

NEGATIVE    0.490778
NEUTRAL     0.349621
POSITIVE    0.159601
Name: pos_neg_neu, dtype: float64

In [84]:
false_preds['stars'].value_counts(normalize=True).sort_index()

1.0    0.152536
2.0    0.338242
3.0    0.349621
4.0    0.078165
5.0    0.081437
Name: stars, dtype: float64

In [60]:
def summarize(text):
    """Return the summary that the ktrain summarizer ``ts`` produces for ``text``.

    Args:
        text: The string to summarize.

    Returns:
        The result of ``ts.summarize(text)``.
    """
    summary = ts.summarize(text)
    return summary

In [78]:
df['str_len'] = df['text'].str.len()

In [79]:
df.sort_values("pred_prob").head(20)

Unnamed: 0,text,stars,pos_neg_neu,pos_neg_3_is_pos,pos_neg_3_is_neg,pred,pred_prob,str_len
76930,With new owners --QuadReal Property Management...,3.0,NEUTRAL,POSITIVE,NEGATIVE,POSITIVE,0.500061,519
2583,"Man... people freak out over this place, but I...",3.0,NEUTRAL,POSITIVE,NEGATIVE,NEGATIVE,0.500076,770
77199,Truly disappointed since Chef Nunez left. I h...,2.0,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,0.500078,936
37989,This club is close to home but it's time to gi...,2.0,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,0.500093,329
143029,The most boring bar I been to in Mississauga p...,1.0,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,0.500099,130
73874,No amount of research could've prepared me for...,4.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.5001,1708
114337,"NO! Do Not Bother!\n\nOrdered 2 pizzas, they w...",1.0,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,0.500107,898
132527,Don't bother with dollar margarita Tuesdays. T...,2.0,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,0.500118,370
57762,"I felt that the performers were very good, esp...",4.0,POSITIVE,POSITIVE,POSITIVE,NEGATIVE,0.500121,662
163432,Excellent food and service. Have been here 4 t...,4.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,0.500127,150


In [91]:
# 77199 is the index *label* of a low-confidence row shown in the sorted table,
# so look it up by label (.loc). Positional .iloc only returns the same row while
# df keeps its default 0..n-1 index and original order.
single = df.loc[77199]
print(single['stars'])
print(single['text'])

2.0
Truly disappointed since Chef Nunez left.  I have been back many times & it's not even close to the beginning when there was impeccable service.  

Bartenders seem uninterested in serving you, unless you are a regular on video poker & most have no clue about wine.  Not sure why this restaurant was changed from Kennedy's to Presidio to Whist... when it was Kennedy's the menu (tuna chips), bartenders & wine cellar & service was fantastic.  

Brunch at Whist used to be delicious w/ a unique Bloody Mary bar.  The Lobster Roll at lunch was my all time fav.  Once it came out, Chef Nunez saw it & ran it back to the kitchen, said that it was on the wrong bread.  Quite the service!  After he left, it's been downhill.  I had the lobster roll twice & both times it came out on different bread so to say the least the service is inconsistent from the kitchen to sitting at the bar.  

May come back when they change it to whatever's next.


In [92]:
summary = ts.summarize(single['text'])
print(summary)

Brunch at Whist used to be delicious w/ a unique Bloody Mary bar. The Lobster Roll at lunch was my all time fav. After he left, it's been downhill. I had the lobster roll twice & both times it came out on different bread. May come back when they change it to whatever's next.


In [93]:
# Use the shared predict() helper so the summary is preprocessed exactly like the
# full-dataset predictions instead of duplicating the tokenize/join logic here.
predict(summary)

(('__label__POSITIVE',), array([0.96482998]))

In [94]:
# Use the shared predict() helper so the original review is preprocessed exactly
# like the full-dataset predictions instead of duplicating the logic here.
predict(single['text'])

(('__label__POSITIVE',), array([0.5000785]))

In [95]:
false_samples = false_preds.sample(100, random_state=42)

In [97]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [98]:
false_samples_summaries = false_samples['text'].progress_apply(summarize)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [99]:
false_samples_summaries.head()

169993    The portions of pasta could feed a small natio...
177078    The restuarant did have a vibrant feel to it a...
87157     The Rainforest Cafe was a place my kids liked ...
98481     The croissant was ok. A bit too crispy and fla...
15037     The pizza should have been remade. They forgot...
Name: text, dtype: object

In [100]:
false_samples['summaries'] = false_samples_summaries

In [119]:
def print_text_and_summary(row):
    """Print a row's star rating, full text and summary, separated by rules.

    The output is the value of ``row['stars']``, then ``row['text']``, then
    ``row['summaries']``. Each of the last two is preceded by a blank gap, a
    dashed rule and another blank gap.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'stars'``, ``'text'`` and ``'summaries'``.
    """
    rule = "---------------------------------------------------------------------------------------------------------------------------"
    gap = "\n"

    print(row['stars'])
    # Each key is looked up only when its section is reached, so anything before
    # a missing key is still printed.
    for column in ('text', 'summaries'):
        print(gap)
        print(rule)
        print(gap)
        print(row[column])

In [120]:
false_samples_iter = false_samples.iterrows()

In [123]:
print_text_and_summary(next(false_samples_iter)[1])

2.0


---------------------------------------------------------------------------------------------------------------------------


We took the kids to Las Vegas and stayed at MGM. The Rainforest Cafe was a place my kids liked to hang out. We had breakfast there and we were NOT impressed. The service wasn't great and the food was so-so. However, the atmosphere was great and kept the kids entertained.


---------------------------------------------------------------------------------------------------------------------------


The Rainforest Cafe was a place my kids liked to hang out. We had breakfast there and we were NOT impressed. The service wasn't great and the food was so-so. However, the atmosphere was great and kept the kids entertained. We took the kids to Las Vegas and stayed at MGM.


In [124]:
summary_pred = false_samples['summaries'].parallel_apply(predict)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25), Label(value='0 / 25'))), HBox…

In [125]:
summary_pred

169993    ((__label__POSITIVE,), [0.5720888376235962])
177078    ((__label__POSITIVE,), [0.9473980069160461])
87157     ((__label__POSITIVE,), [0.9924427270889282])
98481     ((__label__POSITIVE,), [0.9750025868415833])
15037     ((__label__NEGATIVE,), [0.8823544979095459])
                              ...                     
121583    ((__label__POSITIVE,), [0.6000519394874573])
6997      ((__label__POSITIVE,), [0.5208702087402344])
8107      ((__label__NEGATIVE,), [0.8519806861877441])
136615    ((__label__POSITIVE,), [0.7053841948509216])
134024    ((__label__POSITIVE,), [0.5641056299209595])
Name: summaries, Length: 100, dtype: object

In [126]:
summary_pred_labels = summary_pred.str[0].str[0].str.split("__").str[-1]

In [127]:
summary_pred_labels

169993    POSITIVE
177078    POSITIVE
87157     POSITIVE
98481     POSITIVE
15037     NEGATIVE
            ...   
121583    POSITIVE
6997      POSITIVE
8107      NEGATIVE
136615    POSITIVE
134024    POSITIVE
Name: summaries, Length: 100, dtype: object

In [128]:
false_samples['summary_pred_label'] = summary_pred_labels

In [130]:
false_samples.head()

Unnamed: 0,text,stars,pos_neg_neu,pos_neg_3_is_pos,pos_neg_3_is_neg,pred,summaries,summary_pred_label
169993,"So this meal was (for me) the biggest ""dud"" of...",3.0,NEUTRAL,POSITIVE,NEGATIVE,NEGATIVE,The portions of pasta could feed a small natio...,POSITIVE
177078,I was a bit disappointed with our overall lunc...,3.0,NEUTRAL,POSITIVE,NEGATIVE,NEGATIVE,The restuarant did have a vibrant feel to it a...,POSITIVE
87157,We took the kids to Las Vegas and stayed at MG...,2.0,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,The Rainforest Cafe was a place my kids liked ...,POSITIVE
98481,Only giving a review on the croissant. I went ...,2.0,NEGATIVE,NEGATIVE,NEGATIVE,POSITIVE,The croissant was ok. A bit too crispy and fla...,POSITIVE
15037,"Basic Pizza Hut pizza. Not good, not horrible....",3.0,NEUTRAL,POSITIVE,NEGATIVE,NEGATIVE,The pizza should have been remade. They forgot...,NEGATIVE


In [129]:
# Accuracy of predictions made on the summaries of the misclassified sample
# (mean of the boolean match Series = fraction of matches).
(false_samples['pos_neg_3_is_pos'] == false_samples['summary_pred_label']).mean()

0.26

In [132]:
# Accuracy of the original full-text predictions on the misclassified sample
# (mean of the boolean match Series = fraction of matches).
(false_samples['pos_neg_3_is_pos'] == false_samples['pred']).mean()

0.0

In [133]:
true_samples = true_preds.sample(100, random_state=42)

In [134]:
true_samples_summaries = true_samples['text'].progress_apply(summarize)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [137]:
true_samples_summaries.head()

164009    The climate control of each exhibit was done p...
132178    We had a coupon for 50% off that was printed i...
77497     The J.W. Marriot is the only one I would retur...
13682     Bring wine, your person and some snacks and en...
46718     This is the 1lb cod sandwich on whole wheat. F...
Name: text, dtype: object

In [138]:
true_samples['summaries'] = true_samples_summaries

In [139]:
true_samples_iter = true_samples.iterrows()

In [140]:
print_text_and_summary(next(true_samples_iter)[1])

2.0


---------------------------------------------------------------------------------------------------------------------------


Ok, I admit I did expected more from Biodome but then I looked at the price again and realized that what you pay for is what you get.

I had student discount offer even though I was just a visitor of Montreal. The climate control of each exhibit was done pretty well. I was sweating like crazy when I was in the rainforest. I liked how the animals were in plain view and it wasn't hard to track them down. Each exhibits were short and before you knew it, it was over. The entire Biodome can be done under an hour which isn't so exciting and a lot of the interactive monitors were either broken or weren't on. So, there were some animals which I have no idea what they were since the info booth was off.

I don't think Biodome was worth the full adult price but if you can get it at a discount you'll be much more happier. I was definitely disappointed, makes me wonder

In [141]:
summary_pred = true_samples['summaries'].parallel_apply(predict)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25), Label(value='0 / 25'))), HBox…

In [142]:
summary_pred_labels = summary_pred.str[0].str[0].str.split("__").str[-1]

In [143]:
true_samples['summary_pred_label'] = summary_pred_labels

In [144]:
true_samples.head()

Unnamed: 0,text,stars,pos_neg_neu,pos_neg_3_is_pos,pos_neg_3_is_neg,pred,summaries,summary_pred_label
164009,"Ok, I admit I did expected more from Biodome b...",2.0,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,The climate control of each exhibit was done p...,NEGATIVE
132178,Poor customer service. We had a coupon for 50%...,1.0,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,We had a coupon for 50% off that was printed i...,NEGATIVE
77497,This is a definite recommendation from me. Las...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,The J.W. Marriot is the only one I would retur...,POSITIVE
13682,Free movies under the stars during the summer....,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,"Bring wine, your person and some snacks and en...",POSITIVE
46718,Fabulous cod sandwich and amazing seafood shop...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,This is the 1lb cod sandwich on whole wheat. F...,POSITIVE


In [145]:
# Accuracy of predictions made on the summaries of the correctly classified sample
# (mean of the boolean match Series = fraction of matches).
(true_samples['pos_neg_3_is_pos'] == true_samples['summary_pred_label']).mean()

0.96

In [146]:
# Accuracy of the original full-text predictions on the correctly classified sample
# (mean of the boolean match Series = fraction of matches).
(true_samples['pos_neg_3_is_pos'] == true_samples['pred']).mean()

1.0

In [147]:
samples = pd.concat([true_samples, false_samples])

In [148]:
samples.head()

Unnamed: 0,text,stars,pos_neg_neu,pos_neg_3_is_pos,pos_neg_3_is_neg,pred,summaries,summary_pred_label
164009,"Ok, I admit I did expected more from Biodome b...",2.0,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,The climate control of each exhibit was done p...,NEGATIVE
132178,Poor customer service. We had a coupon for 50%...,1.0,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE,We had a coupon for 50% off that was printed i...,NEGATIVE
77497,This is a definite recommendation from me. Las...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,The J.W. Marriot is the only one I would retur...,POSITIVE
13682,Free movies under the stars during the summer....,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,"Bring wine, your person and some snacks and en...",POSITIVE
46718,Fabulous cod sandwich and amazing seafood shop...,5.0,POSITIVE,POSITIVE,POSITIVE,POSITIVE,This is the 1lb cod sandwich on whole wheat. F...,POSITIVE


In [149]:
# Accuracy of summary-based predictions on the combined sample. NOTE: `samples`
# concatenates true_samples and false_samples, so it is not a random draw from df.
(samples['pos_neg_3_is_pos'] == samples['summary_pred_label']).mean()

0.61

In [150]:
# Accuracy of full-text predictions on the combined sample. NOTE: `samples`
# concatenates true_samples and false_samples, so it is not a random draw from df.
(samples['pos_neg_3_is_pos'] == samples['pred']).mean()

0.5

In [151]:
random_samples = df.sample(200, random_state=42)

In [152]:
random_samples_summaries = random_samples['text'].progress_apply(summarize)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [153]:
random_samples['summaries'] = random_samples_summaries

In [154]:
summary_pred = random_samples['summaries'].parallel_apply(predict)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=50), Label(value='0 / 50'))), HBox…

In [155]:
summary_pred_labels = summary_pred.str[0].str[0].str.split("__").str[-1]

In [156]:
random_samples['summary_pred_label'] = summary_pred_labels

In [158]:
# Accuracy of summary-based predictions on the random sample of df
# (mean of the boolean match Series = fraction of matches).
(random_samples['pos_neg_3_is_pos'] == random_samples['summary_pred_label']).mean()

0.92

In [159]:
# Accuracy of full-text predictions on the same random sample, for comparison
# with the summary-based accuracy.
(random_samples['pos_neg_3_is_pos'] == random_samples['pred']).mean()

0.935