In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from pprint import pprint
from tqdm.notebook import tqdm

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

tqdm.pandas()

In [2]:
# perform only once
# nltk.download('all')

In [2]:
reviews = pd.read_csv('../data/processed/cleaned_combined_reviews_data.csv')
reviews.drop_duplicates(inplace=True)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_id            1266 non-null   object 
 1   rating               1266 non-null   float64
 2   likes                1266 non-null   int64  
 3   date_review_scraped  1266 non-null   object 
 4   review_date          1266 non-null   object 
 5   source               1266 non-null   object 
 6   review_text          848 non-null    object 
 7   owner_response_text  1209 non-null   object 
 8   location_id          1266 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 89.1+ KB


In [3]:
print(reviews.iloc[1265, 6])
print(type(reviews.iloc[1265, 6]))

print(reviews.iloc[0, 6])
print(type(reviews.iloc[0,6]))

nan
<class 'float'>
i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
<class 'str'>


In [4]:
# slight data cleaning (will have to move this to appropriate notebook)

# contains nan values so I need to change them to default values and correct column data types
reviews['review_text'] = reviews['review_text'].fillna('')
reviews['owner_response_text'] = reviews['owner_response_text'].fillna('')

reviews = reviews.astype({
    'review_id': str,
    'source': str,
    'review_text': str,
    'owner_response_text': str
    })

reviews['date_review_scraped'] = pd.to_datetime(reviews['date_review_scraped'])
reviews['review_date'] = pd.to_datetime(reviews['review_date'])


In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   review_id            1266 non-null   object        
 1   rating               1266 non-null   float64       
 2   likes                1266 non-null   int64         
 3   date_review_scraped  1266 non-null   datetime64[ns]
 4   review_date          1266 non-null   datetime64[ns]
 5   source               1266 non-null   object        
 6   review_text          1266 non-null   object        
 7   owner_response_text  1266 non-null   object        
 8   location_id          1266 non-null   int64         
dtypes: datetime64[ns](2), float64(1), int64(2), object(4)
memory usage: 89.1+ KB


In [6]:
# this review data was scraped from the web so it is likely that there are remenant HTML or special characters still in the text so lets check
# NOTE: move any data cleaning to the proper notebook file

html_specialChar_pattern = r'<[^>]+>|\n|\r|\t|&[a-z]+;'
problem_rows = reviews[ reviews['review_text'].str.contains(html_specialChar_pattern, regex=True, na=False) ]
print(problem_rows)

Empty DataFrame
Columns: [review_id, rating, likes, date_review_scraped, review_date, source, review_text, owner_response_text, location_id]
Index: []


In [7]:
example = reviews['review_text'][0]
pprint(example)

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')


In [8]:
# NLTK library functionalities

tokens = nltk.word_tokenize(example)
print(tokens[:10], '\n')

tagged = nltk.pos_tag(tokens) # pos => 'part of speech'
print(tagged[:10], '\n')

['i', 'brought', 'my', 'son', 'to', 'this', 'location', 'today', 'for', 'a'] 

[('i', 'NN'), ('brought', 'VBD'), ('my', 'PRP$'), ('son', 'NN'), ('to', 'TO'), ('this', 'DT'), ('location', 'NN'), ('today', 'NN'), ('for', 'IN'), ('a', 'DT')] 



In [9]:
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

(S
  i/NN
  brought/VBD
  my/PRP$
  son/NN
  to/TO
  this/DT
  location/NN
  today/NN
  for/IN
  a/DT
  back-to-school/JJ
  haircut/NN
  ./.
  there/EX
  is/VBZ
  no/DT
  proper/JJ
  welcoming/NN
  into/IN
  the/DT
  store/NN
  too/RB
  they/PRP
  are/VBP
  not/RB
  showing/VBG
  any/DT
  respect/NN
  towards/IN
  customers…/NN
  i/NN
  clearly/RB
  showed/VBD
  reference/NN
  pictures/NNS
  of/IN
  the/DT
  style/NN
  we/PRP
  wanted/VBD
  ./.
  the/DT
  stylist/NN
  (/(
  ben/NN
  )/)
  did/VBD
  not/RB
  listen/VB
  patiently/RB
  ,/,
  rushed/VBD
  through/IN
  the/DT
  process/NN
  ,/,
  and/CC
  completely/RB
  ignored/VBD
  the/DT
  details/NNS
  we/PRP
  explained/VBD
  ./.
  the/DT
  haircut/NN
  looked/VBD
  nothing/NN
  like/IN
  what/WP
  we/PRP
  asked/VBD
  for/IN
  ./.
  my/PRP$
  kid/NN
  was/VBD
  so/RB
  upset/JJ
  with/IN
  the/DT
  result/NN
  that/IN
  he/PRP
  cried/VBD
  the/DT
  entire/JJ
  time/NN
  while/IN
  doing/VBG
  ../PDT
  the/DT
  experience/NN
  upset

# VADER Sentiment Scoring (Classical Method)

In [10]:
# VADER = Valence Aware Dictionary and sEntiment Reasoner --> ('bag of words approach' to sentiment analysis)
#
#       -> this method does not account for relationships between words

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

pprint(example)
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')
{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [11]:
# Run polarity scoring on all reveiw text
vader_sentiment = reviews['review_text'].progress_apply(lambda text: sia.polarity_scores(text))
reviews[['vader_negative', 'vader_neutral', 'vader_positive', 'vader_compound']] = vader_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [13]:
reviews.head()

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174


In [14]:
fig1 = go.Figure()

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())


for i, rating in enumerate(ratings):
    data = reviews[ reviews['rating'] == rating]['vader_compound' ]

    fig1.add_trace(go.Box(
        y=data,
        name=f'{rating} Rating',
        marker=dict(color=colors[i]),
        boxmean=True
    ))

fig1.update_layout(
    width=1000, height=600,
    title={'text':'VADER Compound Sentiment Score by Rating', 'x': 0.5},
    xaxis=dict(
        title='Rating',
        ticklabelstandoff=10
    ),
    yaxis=dict(
        title='Compound Score',
        ticklabelstandoff=10
    ),
    margin=dict(t=50, b=50, l=50, r=50)
)

fig1.show()

In [15]:
fig2 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('VADER Positive', 'VADER Neutral', 'VADER Negative')
    )   

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())

# Custom hover template with box statistics
hover_text = (
    '<b>%{fullData.name}</b><br>'
    'Min: %{customdata[0]:.3f}<br>'
    'Q1: %{customdata[1]:.3f}<br>'
    'Median: %{customdata[2]:.3f}<br>'
    'Mean: %{customdata[3]:.3f}<br>'
    'Q3: %{customdata[4]:.3f}<br>'
    'Max: %{customdata[5]:.3f}<br>'
    '<extra></extra>'
)

sentiments = [
    ('vader_positive', 1, 'Positive Score'),
    ('vader_neutral', 2, 'Neutral Score'),
    ('vader_negative', 3, 'Negative Score')
]

for col_name, col_idx, y_axis_title in sentiments:
    for i, rating in enumerate(ratings):
        data = reviews[ reviews['rating'] == rating ][col_name]
        stats = [data.min(), data.quantile(0.25), data.median(),
                 data.mean(), data.quantile(0.75), data.max()]
    
        fig2.add_trace(go.Box(
                y=data,
                name=f'{rating} Rating',
                marker=dict(color=colors[i]),
                boxmean=True,
                legendgroup=str(rating),
                showlegend=(col_idx==1), # only show legend for first column (grouped)
                hovertemplate=hover_text,
                customdata=[stats] * len(data)),
            row=1, col=col_idx
        )
    
    fig2.update_yaxes(title_text=y_axis_title, ticklabelstandoff=10,
                      row=1, col=col_idx)

fig2.update_layout(
    width=2000, height=750,
    title={'text': 'VADER Sentiment Scores Decomposition by Rating', 'x': 0.5},
    showlegend=True,
    margin=dict(t=100, b=50, l=50, r=50)
)

fig2.update_xaxes(ticklabelstandoff=10)

fig2.show()

# ROBERTA Sentiment Scoring (Pre-Trained Transformer Model --Hugging Face)

In [16]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import torch

In [17]:
# # Chandar Research Lab's NeoBERT model
# NeoBERT = 'chandar-lab/NeoBERT'
# NeoBERT_tokenizer = AutoTokenizer.from_pretrained(NeoBERT, trust_remote_code=True)
# NeoBERT_model = AutoModelForSequenceClassification.from_pretrained(NeoBERT, trust_remote_code=True)

# NOTE: cannot run this model on macOS (revist this later)

In [17]:
pprint(example)
print()
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')

{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [18]:
# META AI Lab's RoBERTa-base model trained on twitter comments release in 2019 [~125M parameters] (loading the model)
roberta = f"cardiffnlp/twitter-roberta-base-sentiment"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta)

print(f'Number of parameters: {roberta_model.num_parameters()}')
print(f'Model Name: {roberta_model.config.model_type}')

Number of parameters: 124647939
Model Name: roberta


In [19]:
# Example of running the model on text
encoded_text = roberta_tokenizer(example, return_tensors='pt')
output = roberta_model(**encoded_text)
score = output[0][0].detach().numpy()
scores = softmax(score)
scores_dict = {
    f'roberta_negative': float(scores[0]),
    'roberta_neutral': float(scores[1]),
    'roberta_positive' : float(scores[2])
}
pprint(scores_dict)

{'roberta_negative': 0.9759707450866699,
 'roberta_neutral': 0.02143993228673935,
 'roberta_positive': 0.0025893172714859247}


In [20]:
# CALCULATE SENTIMENT SCORES (v1.0)

def sentiment_scores_roberta(text, model, tokenizer):
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }

    return scores_dict

In [21]:
roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative', 'roberta_neutral', 'roberta_positive']] = roberta_sentiment.apply(pd.Series)

# ISSUE: RoBERTa-base has a maximum token length of 512 but some review text is longer than that

  0%|          | 0/1266 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (597) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 597].  Tensor sizes: [1, 514]

In [43]:
# CALCULATE SENTIMENT SCORES VIA TOKEN CHUNKING (v1.1)

def sentiment_scores_roberta_chunked(text, model, tokenizer, chunk_size=512):
    '''
    Process long review texts by chunking and using a weighted-average by unique tokens
    per chunk for sentiment score
    '''

    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    token_ids = encoded_text['input_ids'][0]
    total_tokens = len(token_ids)

    # If text short enough to fit within one chunk process normally, else chunk it
    if total_tokens <= chunk_size:
        encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_text)
        scores = softmax(output[0][0].detach().numpy())
    else:
        chunk_scores = []
        chunk_weights = []
        overlap = 50   # setting the token overlap between chunks

        chunk_starts = list(range(0, total_tokens, chunk_size - overlap))

        for idx, i in enumerate(chunk_starts):
            chunk_end = min(i + chunk_size, total_tokens)
            chunk_ids = token_ids[i:chunk_end].unsqueeze(0)

            output = model(input_ids=chunk_ids)
            chunk_score = softmax(output[0][0].detach().numpy())
            chunk_scores.append(chunk_score)

            chunk_length = chunk_end - i

            if idx == 0: # first chunk
                unique_tokens = chunk_length
            else:        # other chunks
                unique_tokens = chunk_length - overlap
            
            chunk_weights.append(unique_tokens / total_tokens)

            # --- debugging ---
            # pprint(text)
            # print(f"total tokens: {total_tokens}")
            # print(f"chunk_starts: {chunk_starts}")
            # print(f"chunk_end: {chunk_end}")
            # print(f"chunk_length: {chunk_length}")
            # print(f"chunk_weights: {chunk_weights}")
    
        # Calculate weighted average
        chunk_scores = np.array(chunk_scores)
        chunk_weights = np.array(chunk_weights)
        scores = np.average(chunk_scores, axis=0, weights=chunk_weights)

    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    return scores_dict

In [None]:
# using chunked version of the application function

roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_chunked(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [45]:
reviews.head()

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_trunc,roberta_neutral_trunc,roberta_positive_trunc
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738,0.965808,0.030455,0.003738


In [46]:
# CALCULATE SENTIMENT SCORE VIA TOKEN TRUNCATION (v1.2)

def sentiment_scores_roberta_trunc(text, model, tokenizer):
    encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    
    return scores_dict

In [47]:
roberta_sentiment_trunc = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_trunc(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_trunc', 'roberta_neutral_trunc', 'roberta_positive_trunc']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [48]:
display(reviews.head())

# NOTE: we will probably only see differences in reviews with more than 512 tokens (this only occurs once in the reviews dataset; the first and only occurence of this is at row 999)
display(reviews.iloc[999,:])

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_trunc,roberta_neutral_trunc,roberta_positive_trunc
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738,0.965808,0.030455,0.003738


review_id                                ChdDSUhNMG9nS0VJQ0FnSURjMGV5VjJBRRAB
rating                                                                    1.0
likes                                                                       0
date_review_scraped                                       2025-12-16 00:00:00
review_date                                               2020-12-17 00:00:00
source                                                            Google Maps
review_text                 this must’ve been the worst customer service e...
owner_response_text         we’re sorry to see this. if you would like a f...
location_id                                                                 3
vader_negative                                                          0.134
vader_neutral                                                           0.745
vader_positive                                                          0.122
vader_compound                                                  

In [49]:
def token_count(text, tokenizer):
    """Count tokens in text"""
    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    return len(encoded_text['input_ids'][0])

In [50]:
reviews['review_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, roberta_tokenizer))
reviews.head()

  0%|          | 0/1266 [00:00<?, ?it/s]

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_trunc,roberta_neutral_trunc,roberta_positive_trunc,review_token_count
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589,0.975971,0.02144,0.002589,157
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058,0.755748,0.209195,0.035058,111
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077,0.913692,0.076231,0.010077,80
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688,0.005103,0.038017,0.95688,65
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738,0.965808,0.030455,0.003738,225


In [51]:
print('total token count of all reviews:', reviews['review_token_count'].sum())

total token count of all reviews: 39201


# FIXME (COMPLETE THIS FIRST)

In [None]:
# FIXME: visualize the difference between the chunking + attention-mask method vs. truncating method we use



In [29]:
fig1.write_html('../reports/figures/VADERCompoundSentimentByRating.html')
fig2.write_html('../reports/figures/VADERSentimentScoreDecompByRating.html')

# LLM Model Exploration + Application

### Uses of Summarization Feature (Feature Development)

    ~[1] (LLM-data-cleaning)
        Upon reading a random sample of review text, it was clear that many review may have been written in a rush or without proofreading. Using a LLM model, we will clean the `review_text` column for better grammar, clarity, and sentence structure. Doing this may result in potentially more accurate scoring by our sentiment-scoring models (VADER/roBERTa). Also, this overall will just help clean up the review text for better readability if one desires to just plain read the reviews left by customers.

    ~[2] (Reviews Summarizer)
        Creating this feature may also just help us understand the common things, specifically weaknesses or strengths, customers are saying about the store's service. Knowing this can be extremely useful, helping the business improve customer service, customer satisfaction, and store/brand image in the local area. 

### Model exploration

In [41]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv('../.env')
HF_TOKEN = os.getenv('HUGGINGFACE_ACCESS_TOKEN')

login(token=HF_TOKEN) # access to models available on HuggingFace

In [42]:
# REMOVE ME (not enough fine grained control over the model parameters and outputs)
from transformers import pipeline

review_summary_pipe = pipeline('text-generation',
                               model='google/gemma-3-270m-it')

Device set to use mps:0


In [44]:
output = review_summary_pipe(example)
print(output[0]['generated_text'])

i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
This store is a great place to shop for any haircut or styling services. The stylists are knowledgeable and passionate about their work. The store is clean and well-maintained.
I was looking for a hair stylist and this store is very well-mai

In [45]:
output = review_summary_pipe(example,
                             min_length=100,
                             num_beams=5,
                             temperature=0.75,
                             repetition_penalty=1.2,
                             do_sample=True)
pprint(output[0]['generated_text'])

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.\n'
 'i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed

Using the pipeline method seemingly does not give us good outputs and probably indicates that we need more find grain control and prompting to the model

In [46]:
# Load model directly (more fine-grain control with the model -- WILL USE THIS METHOD)
from transformers import AutoTokenizer, AutoModelForCausalLM

gemma3_270M_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
gemma3_270M_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")
messages = [
    {"role": "user", 
     "content": f"Revise this review text for proper grammar and clarity: {example}"},
]
inputs = gemma3_270M_tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(gemma3_270M_model.device)

outputs = gemma3_270M_model.generate(**inputs, 
                         max_new_tokens=1024,
                         temperature=0.85)

print(gemma3_270M_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Here's a revised version of the review text, focusing on grammar, clarity, and conciseness:

"I brought my son to this location today for a back-to-school haircut. There is no proper welcoming into the store, and the stylists are not showing any respect for customers. I clearly showed reference pictures of the style I desired. The stylist (ben) did not listen patiently, rushed through the process, and completely ignored the details we explained. The haircut looked nothing like what we asked for. My child was so upset with the result, and he cried the entire time. The experience was very disappointing, and I would not recommend this stylist or this location. I hope management addresses this issue so other customers don't experience similar frustration."
<end_of_turn>


In [47]:
# Trying the google/gemma-3-1b-it (1 billion parameter version)

gemma3_1B_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
gemma3_1B_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")

In [None]:
messages = [
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

print(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Okay, here are a few revisions of your review, aiming for improved grammar, clarity, and a more professional tone. I’ve provided a few options with slightly different focuses:

**Option 1 (More Formal & Detailed):**

“Today, I brought my son to this location for a back-to-school haircut. Unfortunately, the experience was significantly disappointing. There was no welcoming atmosphere, and the staff did not appear to respect customers. I clearly presented reference pictures of the desired style, and the stylist, Ben, did not listen attentively. He rushed through the process, completely disregarding the details we provided. As a result, the haircut did not match our expectations at all. My son was understandably upset with the final result, crying throughout the process. This experience caused him significant distress, and he now refuses to go to school tomorrow. I would not recommend this stylist or this location, and I hope management will address this issue to prevent similar frustrati

In [36]:
# Stricter prompting for the model

messages = [
    {"role": "system",
     "content": "You are an expert copy editor. Your task is only to correct grammar and improve clarity. You only output the revised text, nothing else."},
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

pprint(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

('I brought my son to this location today for a back-to-school haircut. There '
 'was no welcoming atmosphere, and the staff did not show respect to '
 'customers. I clearly showed reference pictures of the desired style. The '
 'stylist, Ben, did not listen patiently, rushed through the process, and '
 'completely ignored the details we explained. The haircut did not match our '
 'request at all. My child was extremely upset with the result, crying '
 'throughout the process. This experience deeply upset him, and he now refuses '
 'to go to school tomorrow. This was a very disappointing experience, and I '
 'would not recommend this stylist or this location. I hope management '
 'addresses this to prevent other customers from experiencing similar '
 'frustration.<end_of_turn>')


In [37]:
example2 = reviews['review_text'][1]
pprint(example2)

('9/16/25 9:30am got my hair cut at the parmer and mopac location. got home '
 'and found i was bleeding in the back of my neck and had razer burnes above '
 'my right eye and two places on my left ear. i will never use great clips '
 'again. this is the man that did it all. i went back and showed him what he '
 'had done. he apologised and refunded my money. i asked to. see the manager '
 'and he said he was the manager, very unlikey.')


In [38]:
messages = [
    {"role": "system",
     "content": "You are an expert copy editor. Your only task is to correct grammar and improve clarity. You only output the revised text, nothing else."},
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example2}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

pprint(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

('9/16/25, 9:30am, I got my hair cut at the Parker & Mopac location. I '
 'returned home and discovered I was bleeding in my neck, with Razer burns '
 'above my right eye and two places on my left ear. I will never use Great '
 'Clips again. This man did all of this. I returned to show him what he had '
 'done. He apologized and refunded my money. I asked for it. I then asked to '
 'speak to the manager, who stated he was the manager, which seemed '
 'unlikely.<end_of_turn>')


### Model selection reasoning

Abstractive Summarization Feature GOAL: find a model that is able to take a large sum of review text (partitioned by location) and summarize what the reviewers are most commonly saying about the location and its service AND is able to clean up text

1) First idea was to use summarization tasked models (e.g. facebook/bart-large-cnn) but after doing some light usage testing, I found that these models did not meet my requirements for text summarization in the context of google reviews. I discovered that these models were the 'Extractive Summarization' models rather than the 'Abstractive Summarization' model which I did not particularly need for achieving the goal of summarization of vast amounts review text. Also these models have relatively small input token limits

2) Second idea was to find abstractive summarization models (e.g. google/pegasus-xsum) but again after doing some light usage testing on only a small portion of example review text, I found that these models were not substantive enough providing very short summaries even after some parameter tuning on model outputs as well as prone to hallucination as demonstrated by other users of the model. Also these models had very small token input size limits (~500 tokens) and small context windows, which would not satisfy my needs to potentially thousands of input tokens

3) Upon more research, I compared different task categories: "extractive summarization", "abstractive summarization", AND "text generation". I found that text generation had the capabilites of more abstractive summarization with larger input token limits but similarly to abstractive summarization, text generation tasked models are prone to hallucinations in model outputs. Even though hallucination are a likely possiblity using text generation model, I think the risk is very well worth the potential ability to complete the goal to a high degree and based on my trialing of these models, they seem to perform well with user prompting (which I think may lower hallucination rates or getting undesirable outputs from the model)For the model, I am selecting the Google Gemma 3 270M parameter model since it is lightweight (<1B parameters) and has an input/output size limit of 32K tokens which checks all the requirements I have for completing this summarization feature

### PART 1 (LLM-data-cleaning feature): Using this model to clean my review text data and re-run my sentiment analysis

In [None]:
# finding rows with no empty text strings
no_text_mask = reviews['review_text'] == ''
display(reviews.loc[no_text_mask, ['review_text', 'rating', 'roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']])


# using the model to clean the data
def clean_review_text(text, tokenizer, model):
    """Clean review text using Text Generation Model for grammar and clarity"""
    if not text or text.strip() == '':
        return text
    
    messages = [
        {"role": "system",
         "content": "You are an expert copy editor. Your task is only to correct grammar and improve clarity. You only output the revised text, nothing else."},
        {"role": "user",
         "content": f"Revise this review text for proper grammar and clarity: {text}"},
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors='pt'
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=1024, temperature=0.85)
    cleaned_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]).strip()
    
    return cleaned_text

print(f"Cleaning review text with '{gemma3_1B_model.config.model_type}'...")
reviews['review_text_cleaned'] = reviews['review_text'].progress_apply(
    lambda text: clean_review_text(text, gemma3_1B_tokenizer, gemma3_1B_model)
)

Unnamed: 0,review_text,rating,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked
390,,5.0,0.258294,0.451272,0.290433
420,,5.0,0.258294,0.451272,0.290433
423,,4.0,0.258294,0.451272,0.290433
424,,1.0,0.258294,0.451272,0.290433
425,,4.0,0.258294,0.451272,0.290433
...,...,...,...,...,...
1261,,3.0,0.258294,0.451272,0.290433
1262,,5.0,0.258294,0.451272,0.290433
1263,,4.0,0.258294,0.451272,0.290433
1264,,1.0,0.258294,0.451272,0.290433


Cleaning review text with gemma3_text...


  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

In [None]:
# removing <end_of_turn> token from the model output string (Data post-processing)

print('outputs with <end_of_turn> toke:', reviews['review_text_cleaned'].str.contains('<end_of_turn').sum())
reviews['review_text_cleaned'] = reviews['review_text_cleaned'].str.replace('<end_of_turn>', '', regex=False)
print('string cleaning applied...')
print('outputs with <end_of_turn> token:', reviews['review_text_cleaned'].str.contains('<end_of_turn').sum())


outputs with <end_of_turn> toke: 848
string cleaning applied...
outputs with <end_of_turn> toke: 0


In [71]:
vader_sentiment = reviews['review_text_cleaned'].progress_apply(lambda text: sia.polarity_scores(text))
reviews[['vader_negative_cleaned_text', 'vader_neutral_cleaned_text', 'vader_positive_cleaned_text', 'vader_compound_cleaned_text']] = vader_sentiment.apply(pd.Series)

roberta_sentiment = reviews['review_text_cleaned'].progress_apply(lambda text: sentiment_scores_roberta_chunked(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_chunked_cleaned_text', 'roberta_neutral_chunked_cleaned_text', 'roberta_positive_chunked_cleaned_text']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

In [76]:
display(reviews[['review_text', 'review_text_cleaned', 
                 'vader_negative','vader_neutral', 'vader_positive', 'vader_compound',
                 'vader_negative_cleaned_text','vader_neutral_cleaned_text', 'vader_positive_cleaned_text', 'vader_compound_cleaned_text']])
display(reviews[['review_text', 'review_text_cleaned',
                 'roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked',
                 'roberta_negative_chunked_cleaned_text', 'roberta_neutral_chunked_cleaned_text', 'roberta_positive_chunked_cleaned_text']])

Unnamed: 0,review_text,review_text_cleaned,vader_negative,vader_neutral,vader_positive,vader_compound,vader_negative_cleaned_text,vader_neutral_cleaned_text,vader_positive_cleaned_text,vader_compound_cleaned_text
0,i brought my son to this location today for a ...,I brought my son to this location today for a ...,0.191,0.742,0.068,-0.9493,0.184,0.728,0.089,-0.9108
1,9/16/25 9:30am got my hair cut at the parmer a...,"9/16/25 at 9:30am, I received my hair cut at t...",0.063,0.921,0.016,-0.6116,0.061,0.913,0.026,-0.4756
2,worst great clips i’ve ever been to. older man...,The worst haircuts I’ve ever received. An olde...,0.152,0.749,0.098,-0.4389,0.151,0.777,0.072,-0.6808
3,myra did my long hair straight cut since glenn...,"Myra recently had a long, straight haircut, wh...",0.070,0.751,0.179,0.7574,0.034,0.767,0.200,0.8858
4,terrible experience at great clips i had the w...,"At Great Clips, my experience was terrible. I ...",0.178,0.717,0.105,-0.9174,0.194,0.719,0.087,-0.9644
...,...,...,...,...,...,...,...,...,...,...
1261,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000
1262,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000
1263,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000
1264,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000


Unnamed: 0,review_text,review_text_cleaned,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_chunked_cleaned_text,roberta_neutral_chunked_cleaned_text,roberta_positive_chunked_cleaned_text
0,i brought my son to this location today for a ...,I brought my son to this location today for a ...,0.975971,0.021440,0.002589,0.971884,0.025221,0.002894
1,9/16/25 9:30am got my hair cut at the parmer a...,"9/16/25 at 9:30am, I received my hair cut at t...",0.755748,0.209195,0.035058,0.743154,0.225105,0.031741
2,worst great clips i’ve ever been to. older man...,The worst haircuts I’ve ever received. An olde...,0.913692,0.076231,0.010077,0.933166,0.060491,0.006344
3,myra did my long hair straight cut since glenn...,"Myra recently had a long, straight haircut, wh...",0.005103,0.038017,0.956880,0.002622,0.023674,0.973704
4,terrible experience at great clips i had the w...,"At Great Clips, my experience was terrible. I ...",0.965808,0.030455,0.003738,0.967306,0.029201,0.003492
...,...,...,...,...,...,...,...,...
1261,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433
1262,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433
1263,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433
1264,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433


In [84]:
# lets see the differences in the model outputs between the raw and cleaned text

diff = pd.DataFrame([], columns=['diff_abs_vader_compound', 'diff_vader_compound', 'diff_abs_roberta_negative_chunked', 'diff_roberta_negative_chunked',
                                 'diff_abs_roberta_neutral_chunked', 'diff_roberta_neutral_chunked', 'diff_abs_roberta_positive_chunked', 'diff_roberta_positive_chunked',])
diff['diff_abs_vader_compound'] = abs(reviews['vader_compound'] - reviews['vader_compound_cleaned_text'])
diff['diff_vader_compound'] = reviews['vader_compound'] - reviews['vader_compound_cleaned_text']

diff['diff_abs_roberta_negative_chunked'] = abs(reviews['roberta_negative_chunked'] - reviews['roberta_negative_chunked_cleaned_text'])
diff['diff_roberta_negative_chunked'] = reviews['roberta_negative_chunked'] - reviews['roberta_negative_chunked_cleaned_text']

diff['diff_abs_roberta_neutral_chunked'] = abs(reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_chunked_cleaned_text'])
diff['diff_roberta_neutral_chunked'] = reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_chunked_cleaned_text']

diff['diff_abs_roberta_positive_chunked'] = abs(reviews['roberta_positive_chunked'] - reviews['roberta_positive_chunked_cleaned_text'])
diff['diff_roberta_positive_chunked'] = reviews['roberta_positive_chunked'] - reviews['roberta_positive_chunked_cleaned_text']

display(diff)
diff.describe()

Unnamed: 0,diff_abs_vader_compound,diff_vader_compound,diff_abs_roberta_negative_chunked,diff_roberta_negative_chunked,diff_abs_roberta_neutral_chunked,diff_roberta_neutral_chunked,diff_abs_roberta_positive_chunked,diff_roberta_positive_chunked
0,0.0385,-0.0385,0.004086,0.004086,0.003782,-0.003782,0.000305,-0.000305
1,0.1360,-0.1360,0.012594,0.012594,0.015911,-0.015911,0.003317,0.003317
2,0.2419,0.2419,0.019474,-0.019474,0.015740,0.015740,0.003733,0.003733
3,0.1284,-0.1284,0.002481,0.002481,0.014343,0.014343,0.016824,-0.016824
4,0.0470,0.0470,0.001499,-0.001499,0.001254,0.001254,0.000245,0.000245
...,...,...,...,...,...,...,...,...
1261,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1262,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1263,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1264,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Unnamed: 0,diff_abs_vader_compound,diff_vader_compound,diff_abs_roberta_negative_chunked,diff_roberta_negative_chunked,diff_abs_roberta_neutral_chunked,diff_roberta_neutral_chunked,diff_abs_roberta_positive_chunked,diff_roberta_positive_chunked
count,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0
mean,0.127974,-0.019363,0.027687,-0.000506,0.036674,0.014152,0.03071,-0.013646
std,0.253706,0.283517,0.070064,0.075339,0.069088,0.076934,0.080863,0.085418
min,0.0,-1.4749,0.0,-0.516046,0.0,-0.590573,0.0,-0.901197
25%,0.0,-0.007575,0.0,-0.00011,0.0,-0.000433,0.0,-0.004789
50%,0.0,0.0,0.001611,0.0,0.006435,0.0,0.001812,0.0
75%,0.13255,0.0,0.017532,0.002974,0.038992,0.016618,0.019333,0.000587
max,1.7909,1.7909,0.939395,0.939395,0.590573,0.47679,0.901197,0.650155


In [151]:
# lets visualize the differences
import plotly.figure_factory as ff

fig3 = ff.create_distplot(
    [diff[col] for col in diff.columns],
    [col for col in diff.columns],
    show_hist=False,
    show_rug=True,
    show_curve=True
)

fig3.update_layout(
    width=1500, height=800,
    title={'text': 'Distribution of the Differences on Sentiment Model Scoring on Original vs. Cleaned text', 'x': 0.5},
    xaxis=dict(title='Differences in model scoring', ticklabelstandoff=10),
    yaxis=dict(title='Density', ticklabelstandoff=10),
    margin=dict(t=75,b=175,l=50,r=50)
)

fig3.add_annotation(
    text="1.) VADER compound sentiment scoring is a value normalized from -1 to 1 and does not represent a percentage while roBERTa sentiment scoring represents a percentage, so the<br>differences are respective of each model's scoring method"
         " (differences in normalized scoring for VADER and differences in confidence percentage for roBERTa)"
         "<br><br>2.) These distributions show the differences in sentiment scores between LLM-cleaned and original review text",
    showarrow=False,
    xref='paper', yref='paper',
    align='left',
    x=0, y=-0.25
)

fig3.show()

In [None]:
# need to explore the data where we find the largest differences (FIXME)

In [153]:
fig3.write_html('../reports/figures/DistributionOfDiffInScoringOriginalVsCleanText.html')

### PART 2 (Reviews Summarizer feature): Using a model to summarize all reviews and point out key insights