<a href="https://colab.research.google.com/github/Charles1A/Scrape-and-sentiment-analysis/blob/main/Yelp_reviews_sentiment_analysis_with_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A very good Transformers NLP tutorial: https://www.youtube.com/watch?v=szczpgOEdXs

In [None]:
# from bs4 import BeautifulSoup
# import requests
# import re

from datetime import datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Record the library versions used for this run
print(f'Pandas version number: {pd.__version__}')
print(f'Seaborn version number: {sns.__version__}')

# Raise the matplotlib figure resolution to 300 dpi
plt.rcParams['figure.dpi'] = 300

Pandas version number: 2.0.3
Seaborn version number: 0.13.1


In [None]:
# Read the scraped-review CSV into a dataframe; the file's first column
# becomes the row index (index_col=0)

yelp_df = pd.read_csv(
    filepath_or_buffer='/content/yelp_scrape_05-01-2024.csv',
    index_col=0,
)

In [None]:
# Convert the 'Date' column to pandas datetimes
yelp_df = yelp_df.assign(Date=pd.to_datetime(yelp_df['Date']))

In [None]:
# Check dataframe: preview the first three rows

yelp_df.head(3)

Unnamed: 0,Date,Star_rating,Commentary
0,2024-04-30,4,"Great location, Friendly staffs 5 stars but fo..."
1,2024-04-30,4,"Ambience is great, wait time is awful, (Servic..."
2,2024-04-29,1,Worst service of all time. Even with a reserva...


In [None]:
# Use the review date as the row index.
# The guard keeps this cell safe to re-run: once 'Date' has moved into the
# index it is no longer a column, and a second set_index('Date') would raise
# KeyError. Rebinding (instead of inplace=True) also avoids mutating a frame
# that an earlier cell has already displayed.
if 'Date' in yelp_df.columns:
    yelp_df = yelp_df.set_index('Date')

In [None]:
# Show the three rows that come first when the index is sorted in descending order
yelp_df.sort_index(ascending=False).iloc[:3]

Unnamed: 0_level_0,Unnamed: 0,Star_rating,Commentary
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-04-30,0,4,"Great location, Friendly staffs 5 stars but fo..."
2024-04-30,1,4,"Ambience is great, wait time is awful, (Servic..."
2024-04-29,2,1,Worst service of all time. Even with a reserva...


In [None]:
# Optional: report the dataframe's dimensions and per-column data types
shape_report = ('The number of rows & number of columns in the dataframe: \n', yelp_df.shape)
dtype_report = ('The column names and data types in the dataframe: \n', yelp_df.dtypes)

print(*shape_report)
print('\n')
print(*dtype_report)

The number of rows & number of columns in the dataframe: 
 (140, 3)


The column names and data types in the dataframe: 
 Unnamed: 0      int64
Star_rating     int64
Commentary     object
dtype: object


In [None]:
# Optional: inspect the text of a single review (the second row).
# The frame is indexed by date at this point, so a bare integer key such as
# Commentary[1] is a label lookup that only works through pandas' deprecated
# positional fallback. .iloc states the intent (select by position) explicitly.

yelp_df['Commentary'].iloc[1]

'Ambience is great, wait time is awful, (Service) is OK, food was not that good this time'

In [None]:
!pip3 install transformers



In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
import torch
# Print the installed PyTorch version for reproducibility
print(torch.__version__)

2.2.1+cu121


In [None]:
# bert-base-multilingual-uncased-sentiment is a pre-trained model finetuned
# for sentiment analysis on product reviews
MODEL_ID = 'nlptown/bert-base-multilingual-uncased-sentiment'

# A tokenizer is responsible for preprocessing text into an array of numbers
# as inputs to a model; tokenizer and model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [None]:
# Optional: test the pipeline on one review (the second row).
# .iloc selects by position; the frame is indexed by date, so a bare
# Commentary[1] would depend on pandas' deprecated positional fallback.
sample_review = yelp_df['Commentary'].iloc[1]

# truncation=True matches the call used when scoring every review
tokens = tokenizer.encode(sample_review, return_tensors='pt', truncation=True)
output = model(tokens)

In [None]:
# Optional: check the model's mapping from output class index to label
model.config.id2label

{0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}

In [None]:
# The model returns its raw (pre-softmax) class scores in the `logits` attribute
output.logits

tensor([[-0.4516,  1.0402,  1.6779,  0.0875, -1.9504]],
       grad_fn=<AddmmBackward0>)

In [None]:
# Turn the logits into class probabilities by applying softmax over the last dimension
output_sm = torch.softmax(output.logits, dim=-1)
print(output_sm)

tensor([[0.0633, 0.2815, 0.5325, 0.1086, 0.0141]], grad_fn=<SoftmaxBackward0>)


In [None]:
# argmax gives the 0-based index of the largest logit; adding 1 converts it
# to the model's 1-5 star label (index 0 -> '1 star', ..., index 4 -> '5 stars')
output.logits.argmax().item() + 1

3

In [None]:
# Score every review with the model and store the predicted 1-5 star value.
sentiment_scores = []

# Inference only: no_grad skips building the autograd graph for each forward pass.
with torch.no_grad():
    # Iterate over the column's values directly. The frame is indexed by date,
    # so Commentary[i] with an integer i would depend on pandas' deprecated
    # positional fallback for integer keys.
    for review_text in yelp_df['Commentary']:
        # truncation=True asks the tokenizer to cut over-long reviews down to
        # its maximum input length
        tokens = tokenizer.encode(review_text, return_tensors='pt', truncation=True)
        output = model(tokens)
        # argmax is the 0-based class index; +1 maps it onto the 1-5 star labels
        sentiment_score = int(torch.argmax(output.logits)) + 1
        sentiment_scores.append(sentiment_score)

# One score per row, in row order
yelp_df['Sentiment score'] = sentiment_scores

In [None]:
# Preview the first three rows, now including the 'Sentiment score' column
yelp_df.head(3)

Unnamed: 0_level_0,Unnamed: 0,Star_rating,Commentary,Sentiment score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-04-30,0,4,"Great location, Friendly staffs 5 stars but fo...",4
2024-04-30,1,4,"Ambience is great, wait time is awful, (Servic...",3
2024-04-29,2,1,Worst service of all time. Even with a reserva...,1


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Mean of each measure, reported in the panel titles
rating_avg = yelp_df['Star_rating'].mean()
sentiment_avg = yelp_df['Sentiment score'].mean()

# Two side-by-side panels sharing the rating/score (y) axis
fig = make_subplots(rows=1, cols=2,
                    specs=[[{}, {}]],
                    shared_yaxes=True,
                    shared_xaxes=True,
                    horizontal_spacing=0.01,
                    vertical_spacing=0,
                    column_titles=[f'Customer ratings <br> (avg = {rating_avg:.2f})',
                                   f'BERT sentiment scores <br> (avg = {sentiment_avg:.2f})']
                    )

# One horizontal histogram per panel: (dataframe column, bar colour, subplot column)
panels = [('Star_rating', '#4472c4', 1),
          ('Sentiment score', 'orange', 2)]

for column_name, bar_color, subplot_col in panels:
    # add_trace with row/col replaces append_trace, which plotly has deprecated
    fig.add_trace(go.Histogram(
                      y=yelp_df[column_name],
                      orientation='h',
                      name="count",
                      texttemplate="%{x}",      # label each bar with its x value (the count)
                      textposition='inside',
                      showlegend=False,
                      marker_color=bar_color),
                  row=1, col=subplot_col)

# Left panel: hide tick labels and reverse the count axis so its bars grow
# leftward, mirroring the right panel
fig.update_xaxes(showticklabels=False,
                 row=1, col=1,
                 autorange='reversed',
                 )

# Right panel: hide tick labels only
fig.update_xaxes(showticklabels=False,
                 row=1, col=2)

# Style the y-axis of the left panel
fig.update_yaxes(linecolor='darkblue',
                 row=1, col=1,
                 gridcolor='lightgray',
                 griddash='dot',
                 )

fig.update_layout(
                  width=700,
                  height=400,
                  title_x=0.5,
                  bargap=0.2,
                  xaxis1={'side': 'top'},
                  xaxis2={'side': 'top'},
                  yaxis_title="Rating/score",
                  plot_bgcolor="#FFF"
    )

fig.show()

In [None]:
# Absolute difference between the customer's star rating and the model's score
yelp_df['Discrepancy'] = (yelp_df['Star_rating'] - yelp_df['Sentiment score']).abs()

In [None]:
# Preview the first two rows, now including the 'Discrepancy' column
yelp_df.head(2)

Unnamed: 0_level_0,Unnamed: 0,Star_rating,Commentary,Sentiment score,Discrepancy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-04-30,0,4,"Great location, Friendly staffs 5 stars but fo...",4,0
2024-04-30,1,4,"Ambience is great, wait time is awful, (Servic...",3,1


In [None]:
# Histogram of the 'Discrepancy' column, with the count printed on each bar
hist1 = px.histogram(
    yelp_df,
    x="Discrepancy",
    text_auto=True,
    opacity=1,
    template="simple_white",
    width=400,
    height=400,
)

# Centred title reporting the number of rows in the dataframe
hist1.update_layout(
    bargap=0.1,
    title=dict(
        text="Sample size = {number}".format(number = len(yelp_df)),
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    ),
)

# Label the x-axis and place ticks at the integers 0 through 5
hist1.update_xaxes(
    title_text='Star_rating/sentiment score difference',
    tickvals=list(range(6)),
)

hist1.show()

In [None]:
# Star rating over time with an ordinary-least-squares trendline.
# y and color are given as column names: with data_frame supplied, px looks the
# columns up itself, so axis/legend labels do not depend on how px resolves a
# re-passed Series object. The index is not a column, so it is passed directly.
# NOTE(review): px's "ols" trendline relies on statsmodels being installed - confirm.
scat1 = px.scatter(data_frame=yelp_df,
                   x=yelp_df.index,
                   y='Star_rating',
                   color='Star_rating',
                   trendline="ols",
                   opacity=0.8,
                   template="simple_white",
                   width=500, height=400)

# Untitled x-axis with slanted tick labels and a visible axis line
scat1.update_xaxes(title_text='', tickangle=-25,
                   tickfont=dict(family='Arial', color='black', size=14),
                   showline=True, linewidth=1, linecolor='black')

# Centre the plot title
scat1.update_layout(
    title={
        'text': "Star_rating trendline",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

scat1.show()

In [None]:
# Model sentiment score over time with an ordinary-least-squares trendline.
# y and color are given as column names: with data_frame supplied, px looks the
# columns up itself, so axis/legend labels do not depend on how px resolves a
# re-passed Series object. The index is not a column, so it is passed directly.
# NOTE(review): px's "ols" trendline relies on statsmodels being installed - confirm.
scat2 = px.scatter(data_frame=yelp_df,
                   x=yelp_df.index,
                   y='Sentiment score',
                   color='Sentiment score',
                   trendline="ols",
                   opacity=0.8,
                   template="simple_white",
                   width=600, height=400)

# Untitled x-axis with slanted tick labels and a visible axis line
scat2.update_xaxes(title_text='', tickangle=-25,
                   tickfont=dict(family='Arial', color='black', size=14),
                   showline=True, linewidth=1, linecolor='black')

# Centre the plot title
scat2.update_layout(
    title={
        'text': "Sentiment score trendline",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

scat2.show()