A very good Transformers NLP tutorial: https://www.youtube.com/watch?v=szczpgOEdXs

In [1]:
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Report the library versions so results can be tied to a known environment.
for label, module in (('Pandas version number:', pd),
                      ('Seaborn version number:', sns)):
    print(label, module.__version__)

# Render matplotlib figures at 300 dpi.
plt.rcParams.update({'figure.dpi': 300})

Pandas version number: 1.5.3
Seaborn version number: 0.12.2


In [2]:
def _parse_reviews(soup):
  """Return one {'Date', 'Review', 'Rating'} record per complete review in a parsed page."""
  rating_class = re.compile('^five-stars.*')
  records = []

  # NOTE(review): these class names look auto-generated by the site's front-end
  # build and will presumably change over time; verify them if nothing is found.
  items = soup.find_all('li', class_='margin-b5__09f24__pTvws border-color--default__09f24__NPAKY')

  for item in items:
    rating = item.find('div', class_=rating_class)
    date = item.find('span', class_="css-chan6m")
    review = item.find('p', class_="comment__09f24__D0cxf css-qgunke")

    # Skip list items that are not complete reviews instead of failing on None.text.
    if rating is None or date is None or review is None:
      continue

    # The star count is read from the first character of the aria-label.
    label = rating.get('aria-label', '')
    if not label[:1].isdigit():
      continue

    records.append(
        {
            'Date': date.text,
            'Review': review.text,
            'Rating': int(label[0]),
        }
    )

  return records


def y_scraper(base_url='https://www.yelp.com/biz/nobu-malibu-malibu-4',
              n_pages=6, page_size=10, delay=2, timeout=30):
  """Scrape review pages of a Yelp business, sorted newest first.

  Called without arguments it requests the same six pages
  (start=0, 10, ..., 50) as before.

  Args:
    base_url: Business page URL without query string.
    n_pages: Number of result pages to request.
    page_size: Step added to the `start` query parameter for each page.
    delay: Seconds to sleep after each request.
    timeout: Seconds to wait for a response before requests raises a timeout
      error, so a stalled connection cannot hang the notebook.

  Returns:
    A list of dicts with the keys 'Date' (str), 'Review' (str) and
    'Rating' (int), in the order the reviews appear on the pages.
  """
  rlist = []

  for page in range(0, n_pages * page_size, page_size):

    url = f'{base_url}?start={page}&sort_by=date_desc'

    print(f"Scraping page {page}.")

    response = requests.get(url, allow_redirects=True, timeout=timeout)

    if response.ok:
      rlist.extend(_parse_reviews(BeautifulSoup(response.content, 'lxml')))
    else:
      # Make a blocked or failed request visible instead of silently yielding no rows.
      print(f"Page {page} returned HTTP {response.status_code}; skipping it.")

    time.sleep(delay) # pause between requests

  return rlist

In [3]:
# Run scraper function

# Naming the columns keeps the expected schema even when nothing was scraped.
yelp_df = pd.DataFrame(data=y_scraper(), columns=['Date', 'Review', 'Rating'])

# Fail here with a clear message rather than with an indexing error further down.
if yelp_df.empty:
  raise ValueError('No reviews were scraped; the requests may have been blocked '
                   'or the page markup may have changed.')

Scraping page 0.
Scraping page 10.
Scraping page 20.
Scraping page 30.
Scraping page 40.
Scraping page 50.


In [4]:
# Optional: preview the first three scraped reviews

yelp_df.head(3)

Unnamed: 0,Date,Review,Rating
0,7/10/2023,I did not think that such a place would let my...,5
1,7/9/2023,Troy is the rudest person who works here we we...,1
2,7/5/2023,First time here at Nobu Malibu for lunch and m...,5


In [5]:
# Convert the scraped date strings (e.g. '7/10/2023') into pandas datetimes.
yelp_df['Date'] = pd.to_datetime(yelp_df['Date'])

In [6]:
# Index the reviews by date. The guard keeps this cell re-runnable: once 'Date'
# has moved into the index it is no longer a column, and a second
# set_index('Date') would raise KeyError.
if 'Date' in yelp_df.columns:
  yelp_df = yelp_df.set_index('Date')

In [7]:
# Show the three most recent reviews (index sorted in descending date order).
yelp_df.sort_index(ascending=False).head(3)

Unnamed: 0_level_0,Review,Rating
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-07-10,I did not think that such a place would let my...,5
2023-07-09,Troy is the rudest person who works here we we...,1
2023-07-05,First time here at Nobu Malibu for lunch and m...,5


In [8]:
# optional: check the dataframe shape as (rows, columns)
yelp_df.shape

(60, 2)

In [9]:
# optional: check the data type of each dataframe column
yelp_df.dtypes

Review    object
Rating     int64
dtype: object

In [10]:
# optional: check dataframe Review contents

# .iloc selects the second row by position. The index holds dates, so a bare
# integer key would rely on pandas' deprecated positional fallback.
yelp_df.Review.iloc[1]

"Troy is the rudest person who works here we were just trying to take a photo and he wouldn't let us. He is a very, very terrible person with your wireless communication, skills and customer service Ever !!!"

In [11]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was run with.
%pip install transformers==4.30.2

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/7.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/7.2 MB[0m [31m29.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m69.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0

In [12]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [13]:
import torch
# Record the installed PyTorch version.
print(torch.__version__)

2.0.1+cu118


In [14]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [15]:
# optional: test pipeline on one review
# .iloc picks the second row by position (the index holds dates, not integers).
# truncation=True cuts the text at the tokenizer's maximum length, so a long
# review cannot exceed the model's input limit.
tokens = tokenizer.encode(yelp_df.Review.iloc[1], return_tensors='pt', truncation=True)
output = model(tokens)

In [None]:
# optional: inspect the raw logits returned by the model
output.logits

tensor([[-2.7059, -2.5633, -1.2025,  1.6710,  3.8036]],
       grad_fn=<AddmmBackward0>)

In [None]:
# optional: check how the model maps class indices to star labels
model.config.id2label

{0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}

In [None]:
# optional: turn the logits into probabilities with a softmax over the last dimension
output_sm = output.logits.softmax(dim=-1)
print(output_sm)

tensor([[0.0013, 0.0015, 0.0059, 0.1050, 0.8862]], grad_fn=<SoftmaxBackward0>)


In [None]:
# argmax gives the index of the largest logit; adding 1 turns the 0-based index into a star score.
output.logits.argmax().item() + 1

5

In [18]:
sentiment_scores = []

# Iterate over the review texts themselves: the frame is indexed by date, so
# looking rows up with integer keys would rely on pandas' deprecated positional
# fallback.
for review_text in yelp_df['Review']:
  tokens = tokenizer.encode(review_text, return_tensors='pt', truncation=True)
  # Inference only: no_grad avoids building an autograd graph for every review.
  with torch.no_grad():
    output = model(tokens)
  # argmax is the 0-based class index; +1 converts it to a star score.
  sentiment_score = int(torch.argmax(output.logits))+1
  sentiment_scores.append(sentiment_score)

yelp_df['Sentiment score'] = sentiment_scores

In [19]:
# Preview the frame with the new 'Sentiment score' column.
yelp_df.head(3)

Unnamed: 0_level_0,Review,Rating,Sentiment score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-07-10,I did not think that such a place would let my...,5,5
2023-07-09,Troy is the rudest person who works here we we...,1,1
2023-07-05,First time here at Nobu Malibu for lunch and m...,5,4


In [20]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [29]:
rating_avg = yelp_df['Rating'].mean()
sentiment_avg = yelp_df['Sentiment score'].mean()


def _count_histogram(values, color):
  """Build a horizontal histogram trace whose bars are labelled with their counts."""
  return go.Histogram(
      y=values,
      textposition='inside',
      orientation='h',
      name="count",
      texttemplate="%{x}",
      showlegend=False,
      marker_color=color)


# create subplots: two side-by-side panels sharing the y axis, each titled with its mean
fig = make_subplots(rows=1, cols=2,
                    specs=[[{}, {}]],
                    shared_yaxes=True,
                    shared_xaxes=True,
                    horizontal_spacing=0.01,
                    vertical_spacing=0,
                    column_titles=[f'Customer ratings <br> (avg = {rating_avg:.2f})',
                                   f'BERT sentiment scores <br> (avg = {sentiment_avg:.2f})']
                    )

# add_trace(row=, col=) replaces the deprecated append_trace
fig.add_trace(_count_histogram(yelp_df['Rating'], '#4472c4'),
              row=1, col=1) # row 1, column 1 in the plot grid

fig.add_trace(_count_histogram(yelp_df['Sentiment score'], 'orange'),
              row=1, col=2) # row 1, column 2 in the plot grid

# Reverse the left panel's x axis so its bars grow leftwards, mirroring the right panel.
fig.update_xaxes(showticklabels=False,
                 row=1, col=1,
                 autorange='reversed'
                 )

fig.update_xaxes(showticklabels=False,
                 row=1, col=2)

fig.update_yaxes(linecolor='darkblue',
                 row=1, col=1,
                 gridcolor='lightgray',
                 griddash='dot',
                 )

fig.update_layout(
                  # title_text="Distribution of customer ratings & BERT sentiment scores",
                  width=700,
                  height=400,
                  title_x=0.5,
                  bargap=0.2,
                  xaxis1={'side': 'top'},
                  xaxis2={'side': 'top'},
                  yaxis_title="Rating/score",
                  plot_bgcolor="#FFF"
    )

fig.show()

In [22]:
# Absolute gap between the customer's rating and the model's sentiment score.
rating_gap = yelp_df['Rating'] - yelp_df['Sentiment score']
yelp_df['Discrepancy'] = rating_gap.abs()

In [23]:
# Preview the frame with the new 'Discrepancy' column.
yelp_df.head(2)

Unnamed: 0_level_0,Review,Rating,Sentiment score,Discrepancy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-07-10,I did not think that such a place would let my...,5,5,0
2023-07-09,Troy is the rudest person who works here we we...,1,1,0


In [37]:
# Histogram of the absolute rating/score gap, with the count printed on each bar.
hist1 = px.histogram(
    yelp_df,
    x="Discrepancy",
    template="simple_white",
    text_auto=True,
    opacity=1,
    width=400,
    height=400,
)

# Centred title reporting how many reviews are in the sample.
hist_title = dict(
    text="Sample size = {number}".format(number = len(yelp_df)),
    x=0.5,
    y=0.9,
    xanchor='center',
    yanchor='top',
)
hist1.update_layout(bargap=0.1, title=hist_title)

hist1.update_xaxes(title_text='Rating/score difference', tickvals=list(range(6)))

hist1.show()

In [25]:
# Ratings over time with an OLS trendline; points are coloured by their rating.
scat1 = px.scatter(
    data_frame=yelp_df,
    x=yelp_df.index,
    y=yelp_df['Rating'],
    color=yelp_df['Rating'],
    trendline="ols",
    template="simple_white",
    opacity=0.8,
    width=600,
    height=400,
)

# Untitled x axis with slanted, black tick labels and a visible axis line.
x_tick_font = dict(family='Arial', color='black', size=14)
scat1.update_xaxes(
    title_text='',
    tickangle=-25,
    tickfont=x_tick_font,
    showline=True,
    linewidth=1,
    linecolor='black',
)

# Centred figure title.
scat_title = dict(
    text="Rating trendline",
    x=0.5,
    y=0.9,
    xanchor='center',
    yanchor='top',
)
scat1.update_layout(title=scat_title)

scat1.show()