In [3]:
import pandas as pd
import numpy as np
import nltk
from pprint import pprint
from tqdm.notebook import tqdm

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

tqdm.pandas()

In [2]:
# perform only once
# nltk.download('all')

In [4]:
reviews = pd.read_csv('../data/processed/cleaned_combined_reviews_data.csv')
reviews.drop_duplicates(inplace=True)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_id            1266 non-null   object 
 1   rating               1266 non-null   float64
 2   likes                1266 non-null   int64  
 3   date_review_scraped  1266 non-null   object 
 4   review_date          1266 non-null   object 
 5   source               1266 non-null   object 
 6   review_text          848 non-null    object 
 7   owner_response_text  1209 non-null   object 
 8   location_id          1266 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 89.1+ KB


In [5]:
print(reviews.iloc[1265, 6])
print(type(reviews.iloc[1265, 6]))

print(reviews.iloc[0, 6])
print(type(reviews.iloc[0,6]))

nan
<class 'float'>
i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
<class 'str'>


In [6]:
# slight data cleaning (will have to move this to appropriate notebook)

# contains nan values so I need to change them to default values and correct column data types
reviews['review_text'] = reviews['review_text'].fillna('')
reviews['owner_response_text'] = reviews['owner_response_text'].fillna('')

reviews = reviews.astype({
    'review_id': str,
    'source': str,
    'review_text': str,
    'owner_response_text': str
    })

reviews['date_review_scraped'] = pd.to_datetime(reviews['date_review_scraped'])
reviews['review_date'] = pd.to_datetime(reviews['review_date'])


In [7]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   review_id            1266 non-null   object        
 1   rating               1266 non-null   float64       
 2   likes                1266 non-null   int64         
 3   date_review_scraped  1266 non-null   datetime64[ns]
 4   review_date          1266 non-null   datetime64[ns]
 5   source               1266 non-null   object        
 6   review_text          1266 non-null   object        
 7   owner_response_text  1266 non-null   object        
 8   location_id          1266 non-null   int64         
dtypes: datetime64[ns](2), float64(1), int64(2), object(4)
memory usage: 89.1+ KB


In [8]:
# this review data was scraped from the web so it is likely that there are remenant HTML or special characters still in the text so lets check
# NOTE: move any data cleaning to the proper notebook file

html_specialChar_pattern = r'<[^>]+>|\n|\r|\t|&[a-z]+;'
problem_rows = reviews[ reviews['review_text'].str.contains(html_specialChar_pattern, regex=True, na=False) ]
print(problem_rows)

Empty DataFrame
Columns: [review_id, rating, likes, date_review_scraped, review_date, source, review_text, owner_response_text, location_id]
Index: []


In [9]:
example = reviews['review_text'][0]
pprint(example)

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')


In [10]:
# NLTK library functionalities

tokens = nltk.word_tokenize(example)
print(tokens[:10], '\n')

tagged = nltk.pos_tag(tokens) # pos => 'part of speech'
print(tagged[:10], '\n')

['i', 'brought', 'my', 'son', 'to', 'this', 'location', 'today', 'for', 'a'] 

[('i', 'NN'), ('brought', 'VBD'), ('my', 'PRP$'), ('son', 'NN'), ('to', 'TO'), ('this', 'DT'), ('location', 'NN'), ('today', 'NN'), ('for', 'IN'), ('a', 'DT')] 



In [11]:
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

(S
  i/NN
  brought/VBD
  my/PRP$
  son/NN
  to/TO
  this/DT
  location/NN
  today/NN
  for/IN
  a/DT
  back-to-school/JJ
  haircut/NN
  ./.
  there/EX
  is/VBZ
  no/DT
  proper/JJ
  welcoming/NN
  into/IN
  the/DT
  store/NN
  too/RB
  they/PRP
  are/VBP
  not/RB
  showing/VBG
  any/DT
  respect/NN
  towards/IN
  customers…/NN
  i/NN
  clearly/RB
  showed/VBD
  reference/NN
  pictures/NNS
  of/IN
  the/DT
  style/NN
  we/PRP
  wanted/VBD
  ./.
  the/DT
  stylist/NN
  (/(
  ben/NN
  )/)
  did/VBD
  not/RB
  listen/VB
  patiently/RB
  ,/,
  rushed/VBD
  through/IN
  the/DT
  process/NN
  ,/,
  and/CC
  completely/RB
  ignored/VBD
  the/DT
  details/NNS
  we/PRP
  explained/VBD
  ./.
  the/DT
  haircut/NN
  looked/VBD
  nothing/NN
  like/IN
  what/WP
  we/PRP
  asked/VBD
  for/IN
  ./.
  my/PRP$
  kid/NN
  was/VBD
  so/RB
  upset/JJ
  with/IN
  the/DT
  result/NN
  that/IN
  he/PRP
  cried/VBD
  the/DT
  entire/JJ
  time/NN
  while/IN
  doing/VBG
  ../PDT
  the/DT
  experience/NN
  upset

# VADER Sentiment Scoring (Classical Method)

In [12]:
# VADER = Valence Aware Dictionary and sEntiment Reasoner --> ('bag of words approach' to sentiment analysis)
#
#       -> this method does not account for relationships between words

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

pprint(example)
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')
{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [13]:
# Run polarity scoring on all reveiw text
vader_sentiment = reviews['review_text'].progress_apply(lambda text: sia.polarity_scores(text))
reviews[['vader_negative', 'vader_neutral', 'vader_positive', 'vader_compound']] = vader_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [14]:
reviews.head()

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174


In [15]:
fig1 = go.Figure()

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())


for i, rating in enumerate(ratings):
    data = reviews[ reviews['rating'] == rating]['vader_compound' ]

    fig1.add_trace(go.Box(
        y=data,
        name=f'{rating} Rating',
        marker=dict(color=colors[i]),
        boxmean=True
    ))

fig1.update_layout(
    width=1000, height=600,
    title={'text':'VADER Compound Sentiment Score by Rating', 'x': 0.5},
    xaxis=dict(
        title='Rating',
        ticklabelstandoff=10
    ),
    yaxis=dict(
        title='Compound Score',
        ticklabelstandoff=10
    ),
    margin=dict(t=50, b=50, l=50, r=50)
)

fig1.show()

In [16]:
fig2 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('VADER Positive', 'VADER Neutral', 'VADER Negative')
    )   

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())

# Custom hover template with box statistics
hover_text = (
    '<b>%{fullData.name}</b><br>'
    'Min: %{customdata[0]:.3f}<br>'
    'Q1: %{customdata[1]:.3f}<br>'
    'Median: %{customdata[2]:.3f}<br>'
    'Mean: %{customdata[3]:.3f}<br>'
    'Q3: %{customdata[4]:.3f}<br>'
    'Max: %{customdata[5]:.3f}<br>'
    '<extra></extra>'
)

sentiments = [
    ('vader_positive', 1, 'Positive Score'),
    ('vader_neutral', 2, 'Neutral Score'),
    ('vader_negative', 3, 'Negative Score')
]

for col_name, col_idx, y_axis_title in sentiments:
    for i, rating in enumerate(ratings):
        data = reviews[ reviews['rating'] == rating ][col_name]
        stats = [data.min(), data.quantile(0.25), data.median(),
                 data.mean(), data.quantile(0.75), data.max()]
    
        fig2.add_trace(go.Box(
                y=data,
                name=f'{rating} Rating',
                marker=dict(color=colors[i]),
                boxmean=True,
                legendgroup=str(rating),
                showlegend=(col_idx==1), # only show legend for first column (grouped)
                hovertemplate=hover_text,
                customdata=[stats] * len(data)),
            row=1, col=col_idx
        )
    
    fig2.update_yaxes(title_text=y_axis_title, ticklabelstandoff=10,
                      row=1, col=col_idx)

fig2.update_layout(
    width=2000, height=750,
    title={'text': 'VADER Sentiment Scores Decomposition by Rating', 'x': 0.5},
    showlegend=True,
    margin=dict(t=100, b=50, l=50, r=50)
)

fig2.update_xaxes(ticklabelstandoff=10)

fig2.show()

# ROBERTA Sentiment Scoring (Pre-Trained Transformer Model --Hugging Face)

In [17]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import torch

In [None]:
# # Chandar Research Lab's NeoBERT model
# NeoBERT = 'chandar-lab/NeoBERT'
# NeoBERT_tokenizer = AutoTokenizer.from_pretrained(NeoBERT, trust_remote_code=True)
# NeoBERT_model = AutoModelForSequenceClassification.from_pretrained(NeoBERT, trust_remote_code=True)

# NOTE: cannot run this model on macOS (revist this later: FIXME)

In [18]:
pprint(example)
print()
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')

{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [19]:
# META AI Lab's RoBERTa-base model trained on twitter comments release in 2019 [~125M parameters] (loading the model)
roberta = f"cardiffnlp/twitter-roberta-base-sentiment"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta)

print(f'Number of parameters: {roberta_model.num_parameters()}')
print(f'Model Name: {roberta_model.config.model_type}')

Number of parameters: 124647939
Model Name: roberta


In [20]:
# Example of running the model on text
encoded_text = roberta_tokenizer(example, return_tensors='pt')
output = roberta_model(**encoded_text)
score = output[0][0].detach().numpy()
scores = softmax(score)
scores_dict = {
    f'roberta_negative': float(scores[0]),
    'roberta_neutral': float(scores[1]),
    'roberta_positive' : float(scores[2])
}
pprint(scores_dict)

{'roberta_negative': 0.9759707450866699,
 'roberta_neutral': 0.02143993228673935,
 'roberta_positive': 0.0025893172714859247}


In [21]:
# CALCULATE SENTIMENT SCORES (v1.0)

def sentiment_scores_roberta(text, model, tokenizer):
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }

    return scores_dict

In [21]:
roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative', 'roberta_neutral', 'roberta_positive']] = roberta_sentiment.apply(pd.Series)

# ISSUE: RoBERTa-base has a maximum token length of 512 but some review text is longer than that

  0%|          | 0/1266 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (597) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 597].  Tensor sizes: [1, 514]

In [22]:
# CALCULATE SENTIMENT SCORES VIA TOKEN CHUNKING (v1.1)

def sentiment_scores_roberta_chunked(text, model, tokenizer, chunk_size=512):
    '''
    Process long review texts by chunking and using a weighted-average by unique tokens
    per chunk for sentiment score
    '''

    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    token_ids = encoded_text['input_ids'][0]
    total_tokens = len(token_ids)

    # If text short enough to fit within one chunk process normally, else chunk it
    if total_tokens <= chunk_size:
        encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_text)
        scores = softmax(output[0][0].detach().numpy())
    else:
        chunk_scores = []
        chunk_weights = []
        overlap = 50   # setting the token overlap between chunks

        chunk_starts = list(range(0, total_tokens, chunk_size - overlap))

        for idx, i in enumerate(chunk_starts):
            chunk_end = min(i + chunk_size, total_tokens)
            chunk_ids = token_ids[i:chunk_end].unsqueeze(0)

            output = model(input_ids=chunk_ids)
            chunk_score = softmax(output[0][0].detach().numpy())
            chunk_scores.append(chunk_score)

            chunk_length = chunk_end - i

            if idx == 0: # first chunk
                unique_tokens = chunk_length
            else:        # other chunks
                unique_tokens = chunk_length - overlap
            
            chunk_weights.append(unique_tokens / total_tokens)

            # --- debugging ---
            # pprint(text)
            # print(f"total tokens: {total_tokens}")
            # print(f"chunk_starts: {chunk_starts}")
            # print(f"chunk_end: {chunk_end}")
            # print(f"chunk_length: {chunk_length}")
            # print(f"chunk_weights: {chunk_weights}")
    
        # Calculate weighted average
        chunk_scores = np.array(chunk_scores)
        chunk_weights = np.array(chunk_weights)
        scores = np.average(chunk_scores, axis=0, weights=chunk_weights)

    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    return scores_dict

In [23]:
# using chunked version of the application function

roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_chunked(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [25]:
reviews.head()

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738


In [26]:
# CALCULATE SENTIMENT SCORE VIA TOKEN TRUNCATION (v1.2)

def sentiment_scores_roberta_trunc(text, model, tokenizer):
    encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    
    return scores_dict

In [27]:
roberta_sentiment_trunc = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_trunc(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_trunc', 'roberta_neutral_trunc', 'roberta_positive_trunc']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [28]:
display(reviews.head())

# NOTE: we will probably only see differences in reviews with more than 512 tokens (this only occurs once in the reviews dataset; the first and only occurence of this is at row 999)
display(reviews.iloc[999,:])

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_trunc,roberta_neutral_trunc,roberta_positive_trunc
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738,0.965808,0.030455,0.003738


review_id                                ChdDSUhNMG9nS0VJQ0FnSURjMGV5VjJBRRAB
rating                                                                    1.0
likes                                                                       0
date_review_scraped                                       2025-12-16 00:00:00
review_date                                               2020-12-17 00:00:00
source                                                            Google Maps
review_text                 this must’ve been the worst customer service e...
owner_response_text         we’re sorry to see this. if you would like a f...
location_id                                                                 3
vader_negative                                                          0.134
vader_neutral                                                           0.745
vader_positive                                                          0.122
vader_compound                                                  

In [29]:
# display the difference between the chunking vs. truncating methods for token processing with roBERTa

diff_token_processing = pd.DataFrame([], columns=['diff_roberta_positive', 'diff_roberta_neutral', 'diff_roberta_negative'])

diff_token_processing['diff_roberta_positive'] = reviews['roberta_positive_chunked'] - reviews['roberta_positive_trunc']
diff_token_processing['diff_roberta_neutral'] = reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_trunc']
diff_token_processing['diff_roberta_negative'] = reviews['roberta_negative_chunked'] - reviews['roberta_negative_trunc']


display(diff_token_processing)
diff_token_processing.describe()

Unnamed: 0,diff_roberta_positive,diff_roberta_neutral,diff_roberta_negative
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
...,...,...,...
1261,0.0,0.0,0.0
1262,0.0,0.0,0.0
1263,0.0,0.0,0.0
1264,0.0,0.0,0.0


Unnamed: 0,diff_roberta_positive,diff_roberta_neutral,diff_roberta_negative
count,1266.0,1266.0,1266.0
mean,0.0,0.0,0.0
std,0.0,0.0,0.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,0.0,0.0,0.0


In [29]:
fig1.write_html('../reports/figures/VADERCompoundSentimentByRating.html')
fig2.write_html('../reports/figures/VADERSentimentScoreDecompByRating.html')

# LLM Model Exploration + Application

### Uses of Summarization Feature (Feature Development)

    ~[1] (LLM-data-cleaning)
        Upon reading a random sample of review text, it was clear that many review may have been written in a rush or without proofreading. Using a LLM model, we will clean the `review_text` column for better grammar, clarity, and sentence structure. Doing this may result in potentially more accurate scoring by our sentiment-scoring models (VADER/roBERTa). Also, this overall will just help clean up the review text for better readability if one desires to just plain read the reviews left by customers.

    ~[2] (Reviews Summarizer)
        Creating this feature may also just help us understand the common things, specifically weaknesses or strengths, customers are saying about the store's service. Knowing this can be extremely useful, helping the business improve customer service, customer satisfaction, and store/brand image in the local area. 

### Model exploration

In [30]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv('../.env')
HF_TOKEN = os.getenv('HUGGINGFACE_ACCESS_TOKEN')

login(token=HF_TOKEN) # access to models available on HuggingFace

In [31]:
# REMOVE ME (not enough fine grained control over the model parameters and outputs)
from transformers import pipeline

review_summary_pipe = pipeline('text-generation',
                               model='google/gemma-3-270m-it')

Device set to use mps:0


In [62]:
output = review_summary_pipe(f"clean the text for proper grammar and clarity: {example}")
print(output[0]['generated_text'])

clean the text for proper grammar and clarity: i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
This is a very disappointing experience. I brought my son to this location today for a back-to-school haircut. There is no proper welcoming into the store too they are not showing any respect to

In [63]:
output = review_summary_pipe(example,
                             min_length=100,
                             num_beams=5,
                             temperature=0.75,
                             repetition_penalty=1.2,
                             do_sample=True)
pprint(output[0]['generated_text'])

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.\n'
 'i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed

Using the pipeline method seemingly does not give us good outputs and indicates that we need more find grain control and prompting to the model

In [34]:
# Load model directly (more fine-grain control with the model -- WILL USE THIS METHOD)
from transformers import AutoTokenizer, AutoModelForCausalLM

gemma3_270M_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
gemma3_270M_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")

In [35]:
messages = [
    {"role": "user", 
     "content": f"Revise this review text for proper grammar and clarity: {example}"},
]
inputs = gemma3_270M_tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(gemma3_270M_model.device)

outputs = gemma3_270M_model.generate(**inputs, 
                         max_new_tokens=1024,
                         temperature=0.85)

print(gemma3_270M_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Here's a revised version of the review text, aiming for better grammar and clarity:

"I brought my son to this location today for a back-to-school haircut. There is no proper welcoming into the store, and they are not showing any respect for customers. I clearly showed reference pictures of the style I wanted. The stylist (ben) did not listen patiently, rushed through the process, and completely ignored the details we explained. The haircut looked nothing like what we asked for. My child was so upset with the result, and he cried throughout the experience. The experience was extremely disappointing, and I would not recommend this stylist or this location. I hope management addresses this issue so other customers do not experience similar frustration."<end_of_turn>


In [36]:
# Trying the google/gemma-3-1b-it (1 billion parameter version)

gemma3_1B_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
gemma3_1B_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")

In [37]:
messages = [
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

print(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Okay, here are a few revisions of your review, aiming for improved grammar, clarity, and a more professional tone. I've provided a few options, ranging from slightly more formal to a bit more direct, depending on the overall tone you're aiming for.

**Option 1 (More Formal & Detailed):**

“Today, I brought my son to this location for a back-to-school haircut. Unfortunately, the experience was deeply disappointing. There was no welcoming atmosphere, and the staff didn't seem to value our time. I clearly showed the stylist, Ben, reference pictures of the desired style. However, he didn’t listen attentively to our requests, rushed through the process, and completely ignored the details we provided. The resulting haircut was significantly different from what we expected. My son was understandably upset, crying throughout the service and refusing to go to school tomorrow. This was a very frustrating and disheartening experience, and I wouldn’t recommend this stylist or this location to othe

In [38]:
# Stricter prompting for the model

messages = [
    {"role": "system",
     "content": "You are an expert copy editor. Your task is only to correct grammar and improve clarity. You only output the revised text, nothing else."},
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

pprint(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

('I brought my son to this location today for a back-to-school haircut. There '
 'was no welcoming atmosphere; the staff did not show respect to customers. I '
 'clearly showed reference pictures of the desired style. The stylist, Ben, '
 'did not listen patiently, rushed through the process, and completely ignored '
 'the details we explained. The haircut did not match our request at all. My '
 'child was extremely upset with the result, crying throughout the process, '
 'and now he refuses to go to school tomorrow. This was a very disappointing '
 'experience, and I would not recommend this stylist or this location. I hope '
 'management addresses this so other customers don’t experience the same '
 'frustration.<end_of_turn>')


In [39]:
example2 = reviews['review_text'][1]
pprint(example2)

('9/16/25 9:30am got my hair cut at the parmer and mopac location. got home '
 'and found i was bleeding in the back of my neck and had razer burnes above '
 'my right eye and two places on my left ear. i will never use great clips '
 'again. this is the man that did it all. i went back and showed him what he '
 'had done. he apologised and refunded my money. i asked to. see the manager '
 'and he said he was the manager, very unlikey.')


In [40]:
messages = [
    {"role": "system",
     "content": "You are an expert copy editor. Your only task is to correct grammar and improve clarity. You only output the revised text, nothing else."},
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example2}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

pprint(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

('9/16/25, 9:30 am – I got my hair cut at the Parker & Mopac location. Upon '
 'returning home, I discovered I was bleeding in my back of the neck and had '
 'razer burns above my right eye, and two places on my left ear. I will never '
 'use Great Clips again. This man did everything. I returned to show him what '
 'he had done. He apologized and refunded my money. I asked for it. I then '
 'asked to see the manager, who stated he was the manager, which seemed '
 'unlikely.<end_of_turn>')


### Model selection reasoning

Abstractive Summarization Feature GOAL: find a model that is able to take a large sum of review text (partitioned by location) and summarize what the reviewers are most commonly saying about the location and its service AND is able to clean up text

1) First idea was to use summarization tasked models (e.g. facebook/bart-large-cnn) but after doing some light usage testing, I found that these models did not meet my requirements for text summarization in the context of google reviews. I discovered that these models were the 'Extractive Summarization' models rather than the 'Abstractive Summarization' model which I did not particularly need for achieving the goal of summarization of vast amounts review text. Also these models have relatively small input token limits

2) Second idea was to find abstractive summarization models (e.g. google/pegasus-xsum) but again after doing some light usage testing on only a small portion of example review text, I found that these models were not substantive enough providing very short summaries even after some parameter tuning on model outputs as well as prone to hallucination as demonstrated by other users of the model. Also these models had very small token input size limits (~500 tokens) and small context windows, which would not satisfy my needs to potentially thousands of input tokens

3) Upon more research, I compared different task categories: "extractive summarization", "abstractive summarization", AND "text generation". I found that text generation had the capabilites of more abstractive summarization with larger input token limits but similarly to abstractive summarization, text generation tasked models are prone to hallucinations in model outputs. Even though hallucination are a likely possiblity using text generation model, I think the risk is very well worth the potential ability to complete the goal to a high degree and based on my trialing of these models, they seem to perform well with user prompting (which I think may lower hallucination rates or getting undesirable outputs from the model)For the model, I am selecting the Google Gemma 3 270M parameter model since it is lightweight (<1B parameters) and has an input/output size limit of 32K tokens which checks all the requirements I have for completing this summarization feature

### PART 1 (LLM-data-cleaning feature): Using this model to clean my review text data and re-run my sentiment analysis

In [None]:
# finding rows with no empty text strings
no_text_mask = reviews['review_text'] == ''
display(reviews.loc[no_text_mask, ['review_text', 'rating', 'roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']])


# using the model to clean the data
def clean_review_text(text, tokenizer, model):
    """Clean review text using Text Generation Model for grammar and clarity"""
    if not text or text.strip() == '':
        return text
    
    messages = [
        {"role": "system",
         "content": "You are an expert copy editor. Your task is only to correct grammar and improve clarity. You only output the revised text, nothing else."},
        {"role": "user",
         "content": f"Revise this review text for proper grammar and clarity: {text}"},
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors='pt'
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=1024, temperature=0.85)
    cleaned_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]).strip()
    
    return cleaned_text

print(f"Cleaning review text with '{gemma3_1B_model.config.model_type}'...")
reviews['review_text_cleaned'] = reviews['review_text'].progress_apply(
    lambda text: clean_review_text(text, gemma3_1B_tokenizer, gemma3_1B_model)
)

Unnamed: 0,review_text,rating,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked
390,,5.0,0.258294,0.451272,0.290433
420,,5.0,0.258294,0.451272,0.290433
423,,4.0,0.258294,0.451272,0.290433
424,,1.0,0.258294,0.451272,0.290433
425,,4.0,0.258294,0.451272,0.290433
...,...,...,...,...,...
1261,,3.0,0.258294,0.451272,0.290433
1262,,5.0,0.258294,0.451272,0.290433
1263,,4.0,0.258294,0.451272,0.290433
1264,,1.0,0.258294,0.451272,0.290433


Cleaning review text with 'gemma3_text'...


  0%|          | 0/1266 [00:00<?, ?it/s]

In [69]:
# removing <end_of_turn> token from the model output string (Data post-processing)

print('outputs with <end_of_turn> token:', reviews['review_text_cleaned'].str.contains('<end_of_turn').sum())
reviews['review_text_cleaned'] = reviews['review_text_cleaned'].str.replace('<end_of_turn>', '', regex=False)
print('string cleaning applied...')
print('outputs with <end_of_turn> token:', reviews['review_text_cleaned'].str.contains('<end_of_turn').sum())


outputs with <end_of_turn> token: 848
string cleaning applied...
outputs with <end_of_turn> token: 0


In [70]:
vader_sentiment = reviews['review_text_cleaned'].progress_apply(lambda text: sia.polarity_scores(text))
reviews[['vader_negative_cleaned_text', 'vader_neutral_cleaned_text', 'vader_positive_cleaned_text', 'vader_compound_cleaned_text']] = vader_sentiment.apply(pd.Series)

roberta_sentiment = reviews['review_text_cleaned'].progress_apply(lambda text: sentiment_scores_roberta_chunked(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_chunked_cleaned_text', 'roberta_neutral_chunked_cleaned_text', 'roberta_positive_chunked_cleaned_text']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

In [71]:
display(reviews[['review_text', 'review_text_cleaned', 
                 'vader_negative','vader_neutral', 'vader_positive', 'vader_compound',
                 'vader_negative_cleaned_text','vader_neutral_cleaned_text', 'vader_positive_cleaned_text', 'vader_compound_cleaned_text']])
display(reviews[['review_text', 'review_text_cleaned',
                 'roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked',
                 'roberta_negative_chunked_cleaned_text', 'roberta_neutral_chunked_cleaned_text', 'roberta_positive_chunked_cleaned_text']])

Unnamed: 0,review_text,review_text_cleaned,vader_negative,vader_neutral,vader_positive,vader_compound,vader_negative_cleaned_text,vader_neutral_cleaned_text,vader_positive_cleaned_text,vader_compound_cleaned_text
0,i brought my son to this location today for a ...,I brought my son to this location today for a ...,0.191,0.742,0.068,-0.9493,0.184,0.739,0.077,-0.9305
1,9/16/25 9:30am got my hair cut at the parmer a...,"9/16/25, 9:30am – I got my hair cut at the Par...",0.063,0.921,0.016,-0.6116,0.070,0.900,0.030,-0.4756
2,worst great clips i’ve ever been to. older man...,The worst haircuts I’ve ever received. An olde...,0.152,0.749,0.098,-0.4389,0.183,0.706,0.111,-0.6369
3,myra did my long hair straight cut since glenn...,"Myra recently had a long hair straight cut, wh...",0.070,0.751,0.179,0.7574,0.064,0.743,0.194,0.8221
4,terrible experience at great clips i had the w...,My experience at Great Clips was truly terribl...,0.178,0.717,0.105,-0.9174,0.196,0.707,0.097,-0.9585
...,...,...,...,...,...,...,...,...,...,...
1261,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000
1262,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000
1263,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000
1264,,,0.000,0.000,0.000,0.0000,0.000,0.000,0.000,0.0000


Unnamed: 0,review_text,review_text_cleaned,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_chunked_cleaned_text,roberta_neutral_chunked_cleaned_text,roberta_positive_chunked_cleaned_text
0,i brought my son to this location today for a ...,I brought my son to this location today for a ...,0.975971,0.021440,0.002589,0.972349,0.024828,0.002823
1,9/16/25 9:30am got my hair cut at the parmer a...,"9/16/25, 9:30am – I got my hair cut at the Par...",0.755748,0.209195,0.035058,0.613978,0.320026,0.065996
2,worst great clips i’ve ever been to. older man...,The worst haircuts I’ve ever received. An olde...,0.913692,0.076231,0.010077,0.928978,0.064322,0.006700
3,myra did my long hair straight cut since glenn...,"Myra recently had a long hair straight cut, wh...",0.005103,0.038017,0.956880,0.001351,0.020897,0.977751
4,terrible experience at great clips i had the w...,My experience at Great Clips was truly terribl...,0.965808,0.030455,0.003738,0.968991,0.027726,0.003284
...,...,...,...,...,...,...,...,...
1261,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433
1262,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433
1263,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433
1264,,,0.258294,0.451272,0.290433,0.258294,0.451272,0.290433


In [73]:
# lets see the differences in the model outputs between the raw and cleaned text

diff = pd.DataFrame([], columns=['diff_abs_vader_compound', 'diff_vader_compound', 'diff_abs_roberta_negative_chunked', 'diff_roberta_negative_chunked',
                                 'diff_abs_roberta_neutral_chunked', 'diff_roberta_neutral_chunked', 'diff_abs_roberta_positive_chunked', 'diff_roberta_positive_chunked',])
diff['diff_abs_vader_compound'] = abs(reviews['vader_compound'] - reviews['vader_compound_cleaned_text'])
diff['diff_vader_compound'] = reviews['vader_compound'] - reviews['vader_compound_cleaned_text']

diff['diff_abs_roberta_negative_chunked'] = abs(reviews['roberta_negative_chunked'] - reviews['roberta_negative_chunked_cleaned_text'])
diff['diff_roberta_negative_chunked'] = reviews['roberta_negative_chunked'] - reviews['roberta_negative_chunked_cleaned_text']

diff['diff_abs_roberta_neutral_chunked'] = abs(reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_chunked_cleaned_text'])
diff['diff_roberta_neutral_chunked'] = reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_chunked_cleaned_text']

diff['diff_abs_roberta_positive_chunked'] = abs(reviews['roberta_positive_chunked'] - reviews['roberta_positive_chunked_cleaned_text'])
diff['diff_roberta_positive_chunked'] = reviews['roberta_positive_chunked'] - reviews['roberta_positive_chunked_cleaned_text']

display(diff)
diff.describe()

Unnamed: 0,diff_abs_vader_compound,diff_vader_compound,diff_abs_roberta_negative_chunked,diff_roberta_negative_chunked,diff_abs_roberta_neutral_chunked,diff_roberta_neutral_chunked,diff_abs_roberta_positive_chunked,diff_roberta_positive_chunked
0,0.0188,-0.0188,0.003622,0.003622,0.003388,-0.003388,0.000234,-0.000234
1,0.1360,-0.1360,0.141769,0.141769,0.110831,-0.110831,0.030938,-0.030938
2,0.1980,0.1980,0.015286,-0.015286,0.011909,0.011909,0.003377,0.003377
3,0.0647,-0.0647,0.003751,0.003751,0.017120,0.017120,0.020871,-0.020871
4,0.0411,0.0411,0.003183,-0.003183,0.002729,0.002729,0.000454,0.000454
...,...,...,...,...,...,...,...,...
1261,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1262,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1263,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1264,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Unnamed: 0,diff_abs_vader_compound,diff_vader_compound,diff_abs_roberta_negative_chunked,diff_roberta_negative_chunked,diff_abs_roberta_neutral_chunked,diff_roberta_neutral_chunked,diff_abs_roberta_positive_chunked,diff_roberta_positive_chunked
count,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0
mean,0.125091,-0.019028,0.028147,0.000962,0.039189,0.012643,0.033029,-0.013605
std,0.247785,0.276939,0.072037,0.077338,0.076098,0.084664,0.084506,0.08971
min,0.0,-1.8236,0.0,-0.602351,0.0,-0.594459,0.0,-0.794062
25%,0.0,-0.006275,0.0,-0.000201,0.0,-0.000444,0.0,-0.004354
50%,0.0,0.0,0.001557,0.0,0.006381,0.0,0.001714,0.0
75%,0.1369,0.0,0.017736,0.002913,0.041954,0.016852,0.020439,0.000517
max,1.8236,1.7553,0.793643,0.793643,0.594459,0.50014,0.794062,0.658506


In [76]:
# lets visualize the differences
import plotly.figure_factory as ff

fig3 = ff.create_distplot(
    [diff[col] for col in diff.columns],
    [col for col in diff.columns],
    show_hist=False,
    show_rug=True,
    show_curve=True
)

fig3.update_layout(
    width=1500, height=800,
    title={'text': 'Distribution of the Differences on Sentiment Model Scoring on Original vs. Cleaned text', 'x': 0.5},
    xaxis=dict(title='Differences in model scoring', ticklabelstandoff=10),
    yaxis=dict(title='Density', ticklabelstandoff=10),
    margin=dict(t=75,b=175,l=50,r=50)
)

fig3.add_annotation(
    text="1.) VADER compound sentiment scoring is a value normalized from -1 to 1 and does not represent a percentage while roBERTa sentiment scoring represents a percentage, so the<br>differences are respective of each model's scoring method"
         " (differences in normalized scoring for VADER and differences in confidence percentage for roBERTa)"
         "<br><br>2.) These distributions show the differences in sentiment scoring between LLM-cleaned and original review text",
    showarrow=False,
    xref='paper', yref='paper',
    align='left',
    x=0, y=-0.25
)

fig3.show()

In [153]:
fig3.write_html('../reports/figures/DistributionOfDiffInScoringOriginalVsCleanText.html')

Lets explore the review data where the differences between sentiment score between the original text and LLM-cleaned text is quite large

In [96]:
# need to explore the data where we find the largest differences (FIXME)
pd.set_option('display.max_colwidth', None)

# VADER differences
vader_mask = diff['diff_abs_vader_compound'] > 0.25
vader_index = diff[vader_mask].index
print(f"Number of large differences in VADER compound sentiment scoring: {len(vader_index)}")
print("Large changes in VADER compound sentiment scoring:")
display(reviews.loc[vader_index, ['review_text', 'review_text_cleaned','rating', 'vader_compound', 'vader_compound_cleaned_text']].head(10))


Number of large differences in VADER compound sentiment scoring: 211
Large changes in VADER compound sentiment scoring:


Unnamed: 0,review_text,review_text_cleaned,rating,vader_compound,vader_compound_cleaned_text
8,"the haircut was not good. there are so many other barbers around, i honestly don’t know how this place is still open. the owner was yelling and swearing when the internet went out, which felt really unprofessional. this is supposed to be a family-friendly place, and i expected better. on top of that, they had no way to take payment except cash. we had to leave and find an atm just to pay. it felt like they had no backup plan and no idea how to handle it. the price was not discounted for inconvenience. definitely not going back. one star all day. save your time and go somewhere else.","The haircut was unsatisfactory. With so many other barbers nearby, I genuinely don’t know how this establishment remains open. The owner was yelling and swearing when the internet went out, which felt incredibly unprofessional. It’s supposed to be a family-friendly place, and I expected a better experience. Furthermore, they only accepted cash payments, requiring us to leave and find an ATM to pay. This indicated a lack of contingency plans and a failure to handle the situation effectively. The price was not discounted for the inconvenience. I will not be returning. A one-star rating for the entire day. Save your time and choose a different option.",1.0,0.4335,-0.5173
10,"stay away from the hairstylist that always wears a mask. i dont know her name but she is extremely rude and racist. i was top on the wait list and this caucasian gentleman walks in and he is 2nd on the list. she asked him for a haircut first, even though he explicitly said that he was 3nd on the list and i was before him. she didn't care and proceeded to do him first. she did me twice since then and she is hands down the most rude hairstylist i've ever had. now i'm at the point that if she isb the only one in the shop, i walk out of there. only reason i'm giving 2 stars is because areli and the asian guy is great 👍","Stay away from the hairstylist who consistently wears a mask. I don’t know her name, but she is extremely rude and racist. I was at the top of the wait list and a Caucasian gentleman walked in, he was second on the list. She asked him for a haircut first, even though he explicitly stated he was second on the list, and I was before him. She didn’t care and proceeded to do me twice, and she is undoubtedly the most rude hairstylist I’ve ever had. Now, I’m at the point where if she’s the only one in the shop, I’ll walk out. Only reason I’m giving two stars is because the stylist and the Asian gentleman are great 👍",2.0,-0.8506,-0.1342
12,"i went in for a simple haircut and left with a disaster. not only was the cut uneven and rushed, but they actually left a visible bald spot on the side of my head. this is beyond unprofessional. i now have to wear a hat everywhere until it grows back. if you care at all about how you look, do not go to great clips.","I went in for a simple haircut and left with a disaster. The cut was uneven and rushed, and they left a noticeable bald spot on my side of the head. This is beyond unprofessional. I now have to wear a hat everywhere until it grows back. If you care about your appearance, please don’t go to great clips.",1.0,-0.8267,0.0258
16,"without getting into it too much and really show my disproval of my haircut, because i still want to come back to this location, just never go to the hairstylist that wears a face mask. if she’s not wearing a face mask, then her name starts with an a. never ever making that mistake again. sorry not sorry about it, tipped more than i should have if at all. edited to say: the haircut felt rushed, despite it feeling like she was efficient enough and knew what she was doing. she confirmed how i wanted the haircut and proceeded to do the opposite. felt like egg on my face walking out of that shop.","The haircut felt rushed, despite it appearing efficient and knowing what to do. She confirmed my desired style and proceeded to do the opposite. It felt like an insult to my request, and I tipped more than I should have. Sorry for the oversight, but I tipped generously.",2.0,-0.5408,0.465
17,"the vietnamese guy is terrible!!! i wanted to get the hair cut done with the lady but this man literally rushed to get my haircut done, he rushed through the haircut and did a shoddy job. i am so sad the lady is still much better, the asian guy just wants to quickly cut more hair and doesnt care about the quality. avoid!!","The Vietnamese man was terrible. I wanted to get a haircut, but this man rushed through the process, doing a shoddy job. I’m so disappointed; the lady is significantly better, and he seems solely focused on quickly cutting more hair without regard for quality. Avoid!",1.0,-0.936,-0.6284
19,"went in oct 9th to have some layers put in. (i had grown my hair out to almost one length.) i showed the asian gentleman how short i wanted the shortest layer, to my shoulders. he chopped off my hair like it was a viral trend. he combed my hair straight back, i thought he was just checking for length. the next thing i felt was the scissors through all of my hair at once. instant dread/fear/regret. he then proceeded to show me all my hair in his fist. he did a few more little snips and in less than 3 minutes my hair was ""done"". this was also done dry. not even a spritz of water. the bulk at end of my hair is gone. the layers/cut is uneven. the shortest layer is at my ears. i have never in my adult life, cried after a haircut. the woman working with him said not to charge me and apologized. i've been having layers put in my hair for over 15 years and never has someone just chopped my hair off at once. i'm too scared to even have someone try to fix this because it just means cutting more hair off in my already thin cut. fwiw i think the man's name was than.","I went in October 9th to have layers cut. (I had grown my hair out to almost one length.) I showed the Asian gentleman how short I wanted the shortest layer to be, ending at my shoulders. He chopped off my hair like it was a viral trend. He combed my hair straight back, and I thought he was just checking for length. The next thing I felt was the scissors through all of my hair at once. An instant wave of dread, fear, and regret washed over me. He then proceeded to show me all my hair in his fist. He did a few more little snips, and in less than 3 minutes, my hair was “done.” It was dry, not even a spritz of water. The bulk at the end of my hair is gone. The layers/cut were uneven, and the shortest layer was at my ears. I’ve never in my adult life cried after a haircut. The woman working with him said not to charge me, and apologized. I’ve been having layers cut for over 15 years and have never had someone just chop my hair off at once. I’m too scared to even have someone try to fix this, because it just means cutting more hair off in my already thin cut. For the record, I think the man’s name was than.",1.0,-0.34,-0.9274
20,"an **absolutely disastrous** experience. had a haircut with ben (vietnamese guy) today, and i didn’t like his service, so i chose not to tip him. instead of accepting that, he **literally yelled at me**, stepped closer in an intimidating way, and **basically forced me to tip him**—even though he ruined my haircut. i’ve always tipped my hairdressers, and the owners can check my payment history to confirm that. but apparently, with ben, tipping isn’t about good service—it’s just **mandatory**, even if you leave with a bad haircut. i’d strongly urge the owners to check the cameras and see for themselves how **disgustingly** i was treated.","An absolutely disastrous experience. I had a haircut with Ben (a Vietnamese guy) today, and I didn’t like his service; therefore, I didn’t tip him. Instead of accepting that, he literally yelled at me, stepped closer in an intimidating manner, and essentially forced me to tip him—even though he ruined my haircut. I’ve always tipped my hairdressers, and the owners can verify my payment history to confirm that. However, with Ben, tipping isn’t about good service; it’s simply mandatory, even if I leave with a bad haircut. I strongly urge the owners to check the cameras and see for themselves how disgustingly I was treated.",1.0,-0.25,-0.908
27,"come here if you want to pay to finish your haircut at home. it amazes me how someone can mess up a fade and leave chunks of hair while looking like a staircase. the lady who cut my hair felt like she was doing it in stabbing motions instead of going up, you know like clippers are designed to. the guy that tended to my son left half of the sideburns and long chunks everywhere. again i asked for simple fades. left looking like we got our haircut by hobos with rusty garage scissors. don't respond with your bs customer service email that will only say ""sorry, here's $5 off your next visit."" wouldn't even bring my dog here.","Come here if you want to pay for a home haircut. It’s astonishing how someone can ruin a fade and leave uneven chunks of hair, looking like a staircase. The stylist felt like she was performing the cut with stabbing motions, not up, like clippers are designed for. The man who tended to my son left significant portions of his sideburns and long strands scattered everywhere. Again, I requested simple fades. The result was a haircut that resembled a hastily constructed salon by those with rusty garage scissors. I won’t respond to your nonsensical customer service email – it will offer nothing but a paltry $5 off my next visit. Wouldn’t even bring my dog here.",1.0,0.8074,0.1406
30,usually does good haircut for budget price. this time service person did a terrible job for haircut hence giving 1 star. never going to this saloon again.,"Usually, the haircuts are good for a budget price. However, this time the service person did a terrible job, resulting in a one-star rating. I will never return to this salon again.",1.0,0.296,-0.0516
32,"i had a terrible experience at great clips. the stylist didn't listen to a single thing i said. i had to explain how i wanted my haircut, but she kept interrupting me. there were no notes from my previous visit two weeks ago, so i had to start from scratch. i was very disappointed with the service.","I had a terrible experience at Great Clips. The stylist didn’t listen to my instructions, and I had to repeatedly explain my desired haircut. She frequently interrupted me, and there were no notes from my previous visit two weeks ago, so I had to start from scratch. I was very disappointed with the service.",1.0,-0.8654,-0.5709


In [None]:
# roBERTa drastic positive confidence changes
roberta_positive_diff_mask = diff['diff_abs_roberta_positive_chunked'] > 0.20   # if the differences between confidence scoring is greater than 20%
roberta_positive_diff_index = diff[roberta_positive_diff_mask].index
print(f"Number of large differences in roBERTa positive sentiment confidence: {len(roberta_positive_diff_index)}")
print("Large changes in confidence for roBERTa positive sentiment:")
display(reviews.loc[roberta_positive_diff_index, ['review_text', 'review_text_cleaned', 'rating', 'roberta_positive_chunked', 'roberta_positive_chunked_cleaned_text']].head(10))

# roBERTa drastic negative confidence changes
roberta_negative_diff_mask = diff['diff_abs_roberta_negative_chunked'] > 0.20
roberta_negative_diff_index = diff[roberta_negative_diff_mask].index
print(f"Number of large differences in roBERTa negative sentiment confidence: {len(roberta_negative_diff_index)}")
print("Large changes in confidence for roBERTa negative sentiment:")
display(reviews.loc[roberta_negative_diff_index, ['review_text', 'review_text_cleaned', 'rating', 'roberta_negative_chunked', 'roberta_negative_chunked_cleaned_text']].head(10))

Number of large differences in roBERTa positive sentiment confidence: 55
Large changes in confidence for roBERTa positive sentiment:


Unnamed: 0,review_text,review_text_cleaned,rating,roberta_positive_chunked,roberta_positive_chunked_cleaned_text
43,"i usually really don't like getting a haircut and i have let my hair grow for a long time because of how bad the past one was. t the man who cut my hair made me feel welcome and he did a stupendous job with my hair, he took me through each step in the mirror before during and after the whole process. i 100% recommend this establishment 5 star review has been changed to a 3 star review 🥲 10 months later and my haircut here was so rushed, my bangs were cut way too short and when i asked him if he could fix it it just got worse. unfortunately i dont know if i will come back to this location considering how awesome the service was last time. not the same stylist, was most likely my biggest mistake.","I usually don’t enjoy haircuts, and I’ve let my hair grow for a long time because of the previous one. The man who cut my hair made me feel welcome, and he did an exceptional job with my hair – he walked me through each step in the mirror before and during the process. I 100% recommend this establishment; the 5-star review has been changed to a 3-star review 🥲 10 months later, my haircut was rushed, my bangs were cut too short, and when I asked him to fix it, it worsened. Unfortunately, I don’t know if I’ll return, considering how amazing the service was last time. It wasn’t the same stylist, and it was likely my biggest mistake.",3.0,0.078734,0.413408
56,this is works great clips of all with very worst staff ever. they keep the customers waiting without checking them in with no proper orders for haircuts. never again going at this location.,"This salon is excellent – the service is outstanding, and the staff is consistently very poor. They frequently keep customers waiting without checking them in, and there’s no proper order taken for haircuts. I will never return to this location.",1.0,0.003965,0.395665
79,men's hair cut and beard trim. stylist took me from looking like i might live in my parents basement to looking like a businessman.,The men’s haircut and beard trim transformed me from looking like I might live in my parents’ basement to looking like a businessman.,5.0,0.118934,0.507246
129,the slaughter and manchaca store let the cutter go that did the best job and cut the way i like it. they have not been the same since. guess i'll try the one up north near my office near parmer lane. and this is sam not debbie.......,"The Slaughter and Manchaca store did an excellent job, and I particularly liked the way it cut. They’ve been noticeably different since then. I’ll try the one up north near my office, near Parmer Lane. This is Sam, not Debbie…",2.0,0.156924,0.950987
130,"i show up first. then some idiot signs in online, shows up 10 min after i'm already there and gets served first. last time i go here or any other ""great clips"" again. not when there's 4 other haircut places who understand first come first serve.","I arrive first. Then, some online signs indicate that I’m served before me, ten minutes after I arrive. Last time I’ve been here or any other “great clips,” not when there are four other haircut places that prioritize first-come, first-served service.",1.0,0.062172,0.307209
152,"i do not pay much attention to my hair, and as such i don't like to spend much to get it cut. great clips has a wonderful deal on tuesdays, and they are always able to get me in and out in hurry.","I don't pay much attention to my hair, so I don't like to spend much on haircuts. Great Clips offers a wonderful deal on Tuesdays, and they’re always able to get me in and out quickly.",4.0,0.45939,0.748229
232,typical syndicated place to get a quick trim. every cut looks the same.,"A typical, quick trim location. Every cut appears strikingly similar.",3.0,0.06537,0.576324
241,"decent cut, but the lady rushed it because they were closing.","The cut was decent, but the staff rushed it due to closing time.",3.0,0.112223,0.354725
242,quick cheap haircuts you pay for what you get here,Quick and affordable haircuts – you get what you pay for here.,3.0,0.053409,0.294199
244,almost zero wait time when checked in prior to visit,There was almost no wait time when checking in prior to our visit.,5.0,0.245835,0.63181


Number of large differences in roBERTa negative sentiment confidence: 49
Large changes in confidence for roBERTa negative sentiment:


Unnamed: 0,review_text,review_text_cleaned,rating,roberta_negative_chunked,roberta_negative_chunked_cleaned_text
43,"i usually really don't like getting a haircut and i have let my hair grow for a long time because of how bad the past one was. t the man who cut my hair made me feel welcome and he did a stupendous job with my hair, he took me through each step in the mirror before during and after the whole process. i 100% recommend this establishment 5 star review has been changed to a 3 star review 🥲 10 months later and my haircut here was so rushed, my bangs were cut way too short and when i asked him if he could fix it it just got worse. unfortunately i dont know if i will come back to this location considering how awesome the service was last time. not the same stylist, was most likely my biggest mistake.","I usually don’t enjoy haircuts, and I’ve let my hair grow for a long time because of the previous one. The man who cut my hair made me feel welcome, and he did an exceptional job with my hair – he walked me through each step in the mirror before and during the process. I 100% recommend this establishment; the 5-star review has been changed to a 3-star review 🥲 10 months later, my haircut was rushed, my bangs were cut too short, and when I asked him to fix it, it worsened. Unfortunately, I don’t know if I’ll return, considering how amazing the service was last time. It wasn’t the same stylist, and it was likely my biggest mistake.",3.0,0.699611,0.252675
49,"great haircut but i would really appreciate if the staff here didn't talk too much. this girl lisa that was working the last shift of the day here starts asking me a million questions and she doesn't even know me to begin with. for the record, i don't know lisa and i don't really know why she's asking me a lot of random questions. lisa just keeps asking me about a lot of generic stuff. i would really appreciate it if she had left me alone and just do her job of cutting my hair. i'm getting really annoyed with her asking me too many questions and she really needs to stop. some of us really get annoyed with that. i just got off work when i came into the store and i don't feeling talking to anyone and i was very tired that night. the people here should really start being very mindful of the customers before they engage in a conversation. some customers don't want you talking to them and want to be left alone. i'm not trying to cause any problems but i just want the staff here to be very mindful before starting a conversation and to respect the customers.","The haircut was great, but I’d appreciate it if the staff didn’t engage in lengthy conversations. Specifically, the employee, Lisa, who worked the last shift, frequently asked me a lot of questions, even though I didn’t know her. She seemed uninterested in me and kept asking generic questions. I’d prefer if she just focused on her job – cutting my hair. I’m finding the frequent questioning frustrating, and I’d appreciate it if she respected my need for quiet time. I’m feeling quite tired after work, and it’s important for staff to be mindful of customers before initiating conversation. Some customers prefer to be left alone, and I’m simply hoping for a more respectful and considerate approach from everyone.",3.0,0.842449,0.49551
56,this is works great clips of all with very worst staff ever. they keep the customers waiting without checking them in with no proper orders for haircuts. never again going at this location.,"This salon is excellent – the service is outstanding, and the staff is consistently very poor. They frequently keep customers waiting without checking them in, and there’s no proper order taken for haircuts. I will never return to this location.",1.0,0.964414,0.318918
72,the haircut was good but every time i go the online coupons that bring me in never work at the end of the haircut. please send me coupons that actually work. you'll get me in for $9:99 or $12:99 and then charge me $19 bucks for a haircut before tip at the end..geesh,"The haircut was good, but every time I go online, the coupons never work at the end of the service. Please send me working coupons – I’d prefer $9.99 or $12.99, and then charge me $19 for the service before tip. Seriously!",3.0,0.45984,0.677983
79,men's hair cut and beard trim. stylist took me from looking like i might live in my parents basement to looking like a businessman.,The men’s haircut and beard trim transformed me from looking like I might live in my parents’ basement to looking like a businessman.,5.0,0.245333,0.028909
84,"this used to be my to go spot for a quick, cheap, just take a tiny bit off my long straight all one length hair. prior to my last two visits, various women have cut my hair and have all done a great job. this visit and the previous, two different men did the cutting and they didn't do a very good job. one used clippers and my hair was all chewed up on the bottom. and today another man there tried to use clippers and i insisted on scissors for the cutting. he whipped through cutting my hair in about 30 seconds and left me with one side of the hair on my head noticeable longer. so my advice, stay away from the men there!","This used to be my go-to spot for a quick, affordable haircut – a simple, straight-length style. Prior to my last two visits, various women have successfully cut my hair, and each has done an excellent job. This visit and the previous two men did the cutting, but they didn't do a very good job. One used clippers, and my hair was severely chewed at the bottom. Today, another man tried to use clippers, and I insisted on scissors for the cutting. He quickly and efficiently trimmed my hair in about 30 seconds, leaving me with one side noticeably longer. My advice: avoid these men.",1.0,0.530658,0.287705
107,"ok service, and quick. messed however up my colluch. does not take great clips coupons (?) i had a 8.99 coupon they didn't even take a dollar off. won't be coming here again. complete ripoff am surprised great clips accepts them in their franchise.","The service was quick, and the order was processed efficiently. However, I had a problem with my coupon – it didn’t take a dollar off. I was given an 8.99 coupon, but they didn’t even offer a discount. I won’t be returning. It was a complete rip-off. I was surprised that Great Clips accepts these coupons at their franchise location.",1.0,0.83455,0.524553
127,"men's hair isn't even that complicated but 3 separate times i've come here and all 3 times i left with a bad haircut. this pic is even *after* i asked them to fix this line. they literally cannot cut hair even. let me save you at least $16: if you're a lady or a dude looking for a haircut that's even, you have to go somewhere else.","Men’s hair is surprisingly simple, but I’ve experienced this repeatedly – three times I’ve come here and left with a bad haircut. This photo is even worse after I requested a correction. They literally can’t cut hair, and it’s frustrating. Let me save you at least $16. If you’re looking for a haircut that’s even remotely decent, you should go elsewhere.",1.0,0.691518,0.943309
129,the slaughter and manchaca store let the cutter go that did the best job and cut the way i like it. they have not been the same since. guess i'll try the one up north near my office near parmer lane. and this is sam not debbie.......,"The Slaughter and Manchaca store did an excellent job, and I particularly liked the way it cut. They’ve been noticeably different since then. I’ll try the one up north near my office, near Parmer Lane. This is Sam, not Debbie…",2.0,0.360455,0.004789
130,"i show up first. then some idiot signs in online, shows up 10 min after i'm already there and gets served first. last time i go here or any other ""great clips"" again. not when there's 4 other haircut places who understand first come first serve.","I arrive first. Then, some online signs indicate that I’m served before me, ten minutes after I arrive. Last time I’ve been here or any other “great clips,” not when there are four other haircut places that prioritize first-come, first-served service.",1.0,0.643112,0.077117


#### Conclusion on LLM-cleaned data
This subset of original and cleaned reviews demonstrates both the strengths and weaknesses of LLMs in cleaning natural language data. While the `Distribution of Differences in Sentiment Model Scoring on Original vs. Cleaned Text` plot shows that most sentiment scores changed only slightly for both RoBERTa and VADER methods, the outliers reveal key advantages and disadvantages of generative models like gemma3-1B-it. The VADER method shows 211 instances where LLM-cleaned text drastically changed compound sentiment scores, while RoBERTa shows ~50 instances each for positive and negative sentiment where confidence changed significantly. Closer inspection reveals that LLM-cleaned text sometimes accurately captures the reviewer's intent pushing the sentiment scoring in the "correct" direction but other times misinterprets the original meaning (e.g. cases where the reviewer is very sarcastic) or hallucinates content, leading the scoring for sentiment to be inaccurate relative to the content of the review. This largely explains the large sentiment score differences when using the different versions of the review text. 

To conclude, LLMs have the potential to help clean and/or extract signals from raw data, but in this setup we should exercise extreme caution. Preserving the raw text is likely preferable to relying on this model to remove noise, since its cleaning sometimes distorts sentiment and meaning. A stronger model or a task-specific fine-tune might make LLM-assisted cleaning more reliable, but without those improvements, downstream analysis is better grounded on the original data.

### PART 2 (Reviews Summarizer feature): Using a model to summarize all reviews and point out key insights

#### Goal: To replicate well-known features like Amazon's generative AI product reviews summaries for our use-case

In [100]:
reviews.rename(columns={'review_token_count':'roberta_token_count'}, inplace=True)
reviews.columns

Index(['review_id', 'rating', 'likes', 'date_review_scraped', 'review_date',
       'source', 'review_text', 'owner_response_text', 'location_id',
       'vader_negative', 'vader_neutral', 'vader_positive', 'vader_compound',
       'roberta_negative_chunked', 'roberta_neutral_chunked',
       'roberta_positive_chunked', 'roberta_negative_trunc',
       'roberta_neutral_trunc', 'roberta_positive_trunc',
       'roberta_token_count', 'review_text_cleaned',
       'vader_negative_cleaned_text', 'vader_neutral_cleaned_text',
       'vader_positive_cleaned_text', 'vader_compound_cleaned_text',
       'roberta_negative_chunked_cleaned_text',
       'roberta_neutral_chunked_cleaned_text',
       'roberta_positive_chunked_cleaned_text'],
      dtype='object')

In [None]:
def token_count(text, tokenizer):
    """Count tokens in text"""
    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    return len(encoded_text['input_ids'][0])

In [103]:
# generating the token count using roBERTa/gemma tokenizers for both original and LLM-cleaned text

reviews['roberta_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, roberta_tokenizer))
reviews['roberta_token_count_clean_text'] = reviews['review_text_cleaned'].progress_apply(lambda text: token_count(text, roberta_tokenizer))
reviews['gemma_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, gemma3_1B_tokenizer))
reviews['gemma_token_count_clean_text'] = reviews['review_text_cleaned'].progress_apply(lambda text: token_count(text, gemma3_1B_tokenizer))

display(reviews[['roberta_token_count', 'roberta_token_count_clean_text', 'gemma_token_count', 'gemma_token_count_clean_text']])

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

Unnamed: 0,roberta_token_count,roberta_token_count_clean_text,gemma_token_count,gemma_token_count_clean_text
0,157,149,152,145
1,111,106,112,107
2,80,84,76,79
3,65,73,65,71
4,225,230,223,218
...,...,...,...,...
1261,2,2,1,1
1262,2,2,1,1
1263,2,2,1,1
1264,2,2,1,1


In [1]:
# grab only the reviews that have text (i.e. reviews with just ratings won't tell us anything because the customer did not say anything)
text_mask = reviews['review_text'] != ''
reviews[text_mask]['roberta_token_count'].sum()

print(f"Total Token Count (roberta tokenizer | original text) : {reviews[text_mask]['roberta_token_count'].sum()}")
print(f"Total Token Count (roberta tokenizer | cleaned text) : {reviews[text_mask]['roberta_token_count_clean_text'].sum()}")
print(f"Total Token Count (gemma tokenizer | original text) : {reviews[text_mask]['gemma_token_count'].sum()}")
print(f"Total Token Count (gemma tokenizer | cleaned text) : {reviews[text_mask]['gemma_token_count_clean_text'].sum()}")

NameError: name 'reviews' is not defined

The gemma3-1B model has a max input size of 32K tokens. We will likely need a more capable model to handle more reviews and thus more tokens

In [None]:
# try using both the 1B and 4B parameter gemma-3 model for this feature 

gemma3_4B_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
gemma3_4B_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it")

# NOTE: cannot run this model on macOS; not enough local memory (revist this later: FIXME)

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# checking the text token counts with the 4B tokenizer

reviews['gemma_4B_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, gemma3_4B_tokenizer))
reviews['gemma_4B_token_count_clean_text'] = reviews['review_text_cleaned'].progress_apply(lambda text: token_count(text, gemma3_4B_tokenizer))

print(f"Total Token Count (gemma 4B tokenizer | original text) : {reviews[text_mask]['gemma_4B_token_count'].sum()}")
print(f"Total Token Count (gemma 4B tokenizer | cleaned text) : {reviews[text_mask]['gemma_4B_token_count_clean_text'].sum()}")
display(reviews[['gemma_4B_token_count', 'gemma_4B_token_count_clean_text']])