In [2]:
# general python modules
import psutil
import gc
import os
from dotenv import load_dotenv
from pprint import pprint

# vizualization modules
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# special modules
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from transformers import pipeline
from scipy.special import softmax
from huggingface_hub import login



load_dotenv('../.env')
HF_TOKEN = os.getenv('HUGGINGFACE_ACCESS_TOKEN')
login(token=HF_TOKEN) # access to models available on HuggingFace
tqdm.pandas()

In [2]:
# perform only once
# nltk.download('all')

In [4]:
reviews = pd.read_csv('../data/processed/cleaned_combined_reviews_data.csv')
reviews.drop_duplicates(inplace=True)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_id            1266 non-null   object 
 1   rating               1266 non-null   float64
 2   likes                1266 non-null   int64  
 3   date_review_scraped  1266 non-null   object 
 4   review_date          1266 non-null   object 
 5   source               1266 non-null   object 
 6   review_text          848 non-null    object 
 7   owner_response_text  1209 non-null   object 
 8   location_id          1266 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 89.1+ KB


In [5]:
print(reviews.iloc[1265, 6])
print(type(reviews.iloc[1265, 6]))

print(reviews.iloc[0, 6])
print(type(reviews.iloc[0,6]))

nan
<class 'float'>
i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
<class 'str'>


In [6]:
# slight data cleaning (will have to move this to appropriate notebook)

# contains nan values so I need to change them to default values and correct column data types
reviews['review_text'] = reviews['review_text'].fillna('')
reviews['owner_response_text'] = reviews['owner_response_text'].fillna('')

reviews = reviews.astype({
    'review_id': str,
    'source': str,
    'review_text': str,
    'owner_response_text': str
    })

reviews['date_review_scraped'] = pd.to_datetime(reviews['date_review_scraped'])
reviews['review_date'] = pd.to_datetime(reviews['review_date'])


In [7]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   review_id            1266 non-null   object        
 1   rating               1266 non-null   float64       
 2   likes                1266 non-null   int64         
 3   date_review_scraped  1266 non-null   datetime64[ns]
 4   review_date          1266 non-null   datetime64[ns]
 5   source               1266 non-null   object        
 6   review_text          1266 non-null   object        
 7   owner_response_text  1266 non-null   object        
 8   location_id          1266 non-null   int64         
dtypes: datetime64[ns](2), float64(1), int64(2), object(4)
memory usage: 89.1+ KB


In [8]:
# this review data was scraped from the web so it is likely that there are remenant HTML or special characters still in the text so lets check
# NOTE: move any data cleaning to the proper notebook file

html_specialChar_pattern = r'<[^>]+>|\n|\r|\t|&[a-z]+;'
problem_rows = reviews[ reviews['review_text'].str.contains(html_specialChar_pattern, regex=True, na=False) ]
print(problem_rows)

Empty DataFrame
Columns: [review_id, rating, likes, date_review_scraped, review_date, source, review_text, owner_response_text, location_id]
Index: []


In [9]:
example = reviews['review_text'][0]
pprint(example)

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')


In [10]:
# NLTK library functionalities

tokens = nltk.word_tokenize(example)
print(tokens[:10], '\n')

tagged = nltk.pos_tag(tokens) # pos => 'part of speech'
print(tagged[:10], '\n')

['i', 'brought', 'my', 'son', 'to', 'this', 'location', 'today', 'for', 'a'] 

[('i', 'NN'), ('brought', 'VBD'), ('my', 'PRP$'), ('son', 'NN'), ('to', 'TO'), ('this', 'DT'), ('location', 'NN'), ('today', 'NN'), ('for', 'IN'), ('a', 'DT')] 



In [11]:
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

(S
  i/NN
  brought/VBD
  my/PRP$
  son/NN
  to/TO
  this/DT
  location/NN
  today/NN
  for/IN
  a/DT
  back-to-school/JJ
  haircut/NN
  ./.
  there/EX
  is/VBZ
  no/DT
  proper/JJ
  welcoming/NN
  into/IN
  the/DT
  store/NN
  too/RB
  they/PRP
  are/VBP
  not/RB
  showing/VBG
  any/DT
  respect/NN
  towards/IN
  customers…/NN
  i/NN
  clearly/RB
  showed/VBD
  reference/NN
  pictures/NNS
  of/IN
  the/DT
  style/NN
  we/PRP
  wanted/VBD
  ./.
  the/DT
  stylist/NN
  (/(
  ben/NN
  )/)
  did/VBD
  not/RB
  listen/VB
  patiently/RB
  ,/,
  rushed/VBD
  through/IN
  the/DT
  process/NN
  ,/,
  and/CC
  completely/RB
  ignored/VBD
  the/DT
  details/NNS
  we/PRP
  explained/VBD
  ./.
  the/DT
  haircut/NN
  looked/VBD
  nothing/NN
  like/IN
  what/WP
  we/PRP
  asked/VBD
  for/IN
  ./.
  my/PRP$
  kid/NN
  was/VBD
  so/RB
  upset/JJ
  with/IN
  the/DT
  result/NN
  that/IN
  he/PRP
  cried/VBD
  the/DT
  entire/JJ
  time/NN
  while/IN
  doing/VBG
  ../PDT
  the/DT
  experience/NN
  upset

# VADER Sentiment Scoring (Classical Method)

In [None]:
# VADER = Valence Aware Dictionary and sEntiment Reasoner --> ('bag of words approach' to sentiment analysis)
#
#       -> this method does not account for relationships between words

sia = SentimentIntensityAnalyzer()

pprint(example)
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')
{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [13]:
# Run polarity scoring on all reveiw text
vader_sentiment = reviews['review_text'].progress_apply(lambda text: sia.polarity_scores(text))
reviews[['vader_negative', 'vader_neutral', 'vader_positive', 'vader_compound']] = vader_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [14]:
reviews.head()

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174


In [15]:
fig1 = go.Figure()

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())


for i, rating in enumerate(ratings):
    data = reviews[ reviews['rating'] == rating]['vader_compound' ]

    fig1.add_trace(go.Box(
        y=data,
        name=f'{rating} Rating',
        marker=dict(color=colors[i]),
        boxmean=True
    ))

fig1.update_layout(
    width=1000, height=600,
    title={'text':'VADER Compound Sentiment Score by Rating', 'x': 0.5},
    xaxis=dict(
        title='Rating',
        ticklabelstandoff=10
    ),
    yaxis=dict(
        title='Compound Score',
        ticklabelstandoff=10
    ),
    margin=dict(t=50, b=50, l=50, r=50)
)

fig1.show()

In [16]:
fig2 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('VADER Positive', 'VADER Neutral', 'VADER Negative')
    )   

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())

# Custom hover template with box statistics
hover_text = (
    '<b>%{fullData.name}</b><br>'
    'Min: %{customdata[0]:.3f}<br>'
    'Q1: %{customdata[1]:.3f}<br>'
    'Median: %{customdata[2]:.3f}<br>'
    'Mean: %{customdata[3]:.3f}<br>'
    'Q3: %{customdata[4]:.3f}<br>'
    'Max: %{customdata[5]:.3f}<br>'
    '<extra></extra>'
)

sentiments = [
    ('vader_positive', 1, 'Positive Score'),
    ('vader_neutral', 2, 'Neutral Score'),
    ('vader_negative', 3, 'Negative Score')
]

for col_name, col_idx, y_axis_title in sentiments:
    for i, rating in enumerate(ratings):
        data = reviews[ reviews['rating'] == rating ][col_name]
        stats = [data.min(), data.quantile(0.25), data.median(),
                 data.mean(), data.quantile(0.75), data.max()]
    
        fig2.add_trace(go.Box(
                y=data,
                name=f'{rating} Rating',
                marker=dict(color=colors[i]),
                boxmean=True,
                legendgroup=str(rating),
                showlegend=(col_idx==1), # only show legend for first column (grouped)
                hovertemplate=hover_text,
                customdata=[stats] * len(data)),
            row=1, col=col_idx
        )
    
    fig2.update_yaxes(title_text=y_axis_title, ticklabelstandoff=10,
                      row=1, col=col_idx)

fig2.update_layout(
    width=2000, height=750,
    title={'text': 'VADER Sentiment Scores Decomposition by Rating', 'x': 0.5},
    showlegend=True,
    margin=dict(t=100, b=50, l=50, r=50)
)

fig2.update_xaxes(ticklabelstandoff=10)

fig2.show()

# ROBERTA Sentiment Scoring (Pre-Trained Transformer Model --Hugging Face)

In [18]:
pprint(example)
print()
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')

{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [19]:
# META AI Lab's RoBERTa-base model trained on twitter comments release in 2019 [~125M parameters] (loading the model)
roberta = f"cardiffnlp/twitter-roberta-base-sentiment"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta)

print(f'Number of parameters: {roberta_model.num_parameters()}')
print(f'Model Name: {roberta_model.config.model_type}')

Number of parameters: 124647939
Model Name: roberta


In [20]:
# Example of running the model on text
encoded_text = roberta_tokenizer(example, return_tensors='pt')
output = roberta_model(**encoded_text)
score = output[0][0].detach().numpy()
scores = softmax(score)
scores_dict = {
    f'roberta_negative': float(scores[0]),
    'roberta_neutral': float(scores[1]),
    'roberta_positive' : float(scores[2])
}
pprint(scores_dict)

{'roberta_negative': 0.9759707450866699,
 'roberta_neutral': 0.02143993228673935,
 'roberta_positive': 0.0025893172714859247}


In [21]:
# CALCULATE SENTIMENT SCORES (v1.0)

def sentiment_scores_roberta(text, model, tokenizer):
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }

    return scores_dict

In [21]:
roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative', 'roberta_neutral', 'roberta_positive']] = roberta_sentiment.apply(pd.Series)

# ISSUE: RoBERTa-base has a maximum token length of 512 but some review text is longer than that

  0%|          | 0/1266 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (597) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 597].  Tensor sizes: [1, 514]

In [22]:
# CALCULATE SENTIMENT SCORES VIA TOKEN CHUNKING (v1.1)

def sentiment_scores_roberta_chunked(text, model, tokenizer, chunk_size=512):
    '''
    Process long review texts by chunking and using a weighted-average by unique tokens
    per chunk for sentiment score
    '''

    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    token_ids = encoded_text['input_ids'][0]
    total_tokens = len(token_ids)

    # If text short enough to fit within one chunk process normally, else chunk it
    if total_tokens <= chunk_size:
        encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_text)
        scores = softmax(output[0][0].detach().numpy())
    else:
        chunk_scores = []
        chunk_weights = []
        overlap = 50   # setting the token overlap between chunks

        chunk_starts = list(range(0, total_tokens, chunk_size - overlap))

        for idx, i in enumerate(chunk_starts):
            chunk_end = min(i + chunk_size, total_tokens)
            chunk_ids = token_ids[i:chunk_end].unsqueeze(0)

            output = model(input_ids=chunk_ids)
            chunk_score = softmax(output[0][0].detach().numpy())
            chunk_scores.append(chunk_score)

            chunk_length = chunk_end - i

            if idx == 0: # first chunk
                unique_tokens = chunk_length
            else:        # other chunks
                unique_tokens = chunk_length - overlap
            
            chunk_weights.append(unique_tokens / total_tokens)

            # --- debugging ---
            # pprint(text)
            # print(f"total tokens: {total_tokens}")
            # print(f"chunk_starts: {chunk_starts}")
            # print(f"chunk_end: {chunk_end}")
            # print(f"chunk_length: {chunk_length}")
            # print(f"chunk_weights: {chunk_weights}")
    
        # Calculate weighted average
        chunk_scores = np.array(chunk_scores)
        chunk_weights = np.array(chunk_weights)
        scores = np.average(chunk_scores, axis=0, weights=chunk_weights)

    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    return scores_dict

In [23]:
# using chunked version of the application function

roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_chunked(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [25]:
reviews.head()

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738


In [26]:
# CALCULATE SENTIMENT SCORE VIA TOKEN TRUNCATION (v1.2)

def sentiment_scores_roberta_trunc(text, model, tokenizer):
    encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    
    return scores_dict

In [27]:
roberta_sentiment_trunc = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_trunc(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_trunc', 'roberta_neutral_trunc', 'roberta_positive_trunc']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [28]:
display(reviews.head())

# NOTE: we will probably only see differences in reviews with more than 512 tokens (this only occurs once in the reviews dataset; the first and only occurence of this is at row 999)
display(reviews.iloc[999,:])

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_trunc,roberta_neutral_trunc,roberta_positive_trunc
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738,0.965808,0.030455,0.003738


review_id                                ChdDSUhNMG9nS0VJQ0FnSURjMGV5VjJBRRAB
rating                                                                    1.0
likes                                                                       0
date_review_scraped                                       2025-12-16 00:00:00
review_date                                               2020-12-17 00:00:00
source                                                            Google Maps
review_text                 this must’ve been the worst customer service e...
owner_response_text         we’re sorry to see this. if you would like a f...
location_id                                                                 3
vader_negative                                                          0.134
vader_neutral                                                           0.745
vader_positive                                                          0.122
vader_compound                                                  

In [29]:
# display the difference between the chunking vs. truncating methods for token processing with roBERTa

diff_token_processing = pd.DataFrame([], columns=['diff_roberta_positive', 'diff_roberta_neutral', 'diff_roberta_negative'])

diff_token_processing['diff_roberta_positive'] = reviews['roberta_positive_chunked'] - reviews['roberta_positive_trunc']
diff_token_processing['diff_roberta_neutral'] = reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_trunc']
diff_token_processing['diff_roberta_negative'] = reviews['roberta_negative_chunked'] - reviews['roberta_negative_trunc']


display(diff_token_processing)
diff_token_processing.describe()

Unnamed: 0,diff_roberta_positive,diff_roberta_neutral,diff_roberta_negative
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
...,...,...,...
1261,0.0,0.0,0.0
1262,0.0,0.0,0.0
1263,0.0,0.0,0.0
1264,0.0,0.0,0.0


Unnamed: 0,diff_roberta_positive,diff_roberta_neutral,diff_roberta_negative
count,1266.0,1266.0,1266.0
mean,0.0,0.0,0.0
std,0.0,0.0,0.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,0.0,0.0,0.0


In [29]:
fig1.write_html('../reports/figures/VADERCompoundSentimentByRating.html')
fig2.write_html('../reports/figures/VADERSentimentScoreDecompByRating.html')

# LLM Model Exploration + Application

### Uses of Summarization Feature (Feature Development)

    ~[1] (LLM-data-cleaning)
        Upon reading a random sample of review text, it was clear that many review may have been written in a rush or without proofreading. Using a LLM model, we will clean the `review_text` column for better grammar, clarity, and sentence structure. Doing this may result in potentially more accurate scoring by our sentiment-scoring models (VADER/roBERTa). Also, this overall will just help clean up the review text for better readability if one desires to just plain read the reviews left by customers.

    ~[2] (Reviews Summarizer)
        Creating this feature may also just help us understand the common things, specifically weaknesses or strengths, customers are saying about the store's service. Knowing this can be extremely useful, helping the business improve customer service, customer satisfaction, and store/brand image in the local area. 

### Model exploration

In [None]:
# REMOVE ME (not enough fine grained control over the model parameters and outputs)

review_summary_pipe = pipeline('text-generation',
                               model='google/gemma-3-270m-it')

Device set to use mps:0


In [62]:
output = review_summary_pipe(f"clean the text for proper grammar and clarity: {example}")
print(output[0]['generated_text'])

clean the text for proper grammar and clarity: i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
This is a very disappointing experience. I brought my son to this location today for a back-to-school haircut. There is no proper welcoming into the store too they are not showing any respect to

In [63]:
output = review_summary_pipe(example,
                             min_length=100,
                             num_beams=5,
                             temperature=0.75,
                             repetition_penalty=1.2,
                             do_sample=True)
pprint(output[0]['generated_text'])

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.\n'
 'i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed

Using the pipeline method seemingly does not give us good outputs and indicates that we need more find grain control and prompting to the model

In [None]:
# Load model directly (more fine-grain control with the model -- WILL USE THIS METHOD)

gemma3_270M_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
gemma3_270M_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")

In [35]:
messages = [
    {"role": "user", 
     "content": f"Revise this review text for proper grammar and clarity: {example}"},
]
inputs = gemma3_270M_tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(gemma3_270M_model.device)

outputs = gemma3_270M_model.generate(**inputs, 
                         max_new_tokens=1024,
                         temperature=0.85)

print(gemma3_270M_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Here's a revised version of the review text, aiming for better grammar and clarity:

"I brought my son to this location today for a back-to-school haircut. There is no proper welcoming into the store, and they are not showing any respect for customers. I clearly showed reference pictures of the style I wanted. The stylist (ben) did not listen patiently, rushed through the process, and completely ignored the details we explained. The haircut looked nothing like what we asked for. My child was so upset with the result, and he cried throughout the experience. The experience was extremely disappointing, and I would not recommend this stylist or this location. I hope management addresses this issue so other customers do not experience similar frustration."<end_of_turn>


In [36]:
# Trying the google/gemma-3-1b-it (1 billion parameter version)

gemma3_1B_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
gemma3_1B_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")

In [37]:
messages = [
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

print(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Okay, here are a few revisions of your review, aiming for improved grammar, clarity, and a more professional tone. I've provided a few options, ranging from slightly more formal to a bit more direct, depending on the overall tone you're aiming for.

**Option 1 (More Formal & Detailed):**

“Today, I brought my son to this location for a back-to-school haircut. Unfortunately, the experience was deeply disappointing. There was no welcoming atmosphere, and the staff didn't seem to value our time. I clearly showed the stylist, Ben, reference pictures of the desired style. However, he didn’t listen attentively to our requests, rushed through the process, and completely ignored the details we provided. The resulting haircut was significantly different from what we expected. My son was understandably upset, crying throughout the service and refusing to go to school tomorrow. This was a very frustrating and disheartening experience, and I wouldn’t recommend this stylist or this location to othe

In [38]:
# Stricter prompting for the model

messages = [
    {"role": "system",
     "content": "You are an expert copy editor. Your task is only to correct grammar and improve clarity. You only output the revised text, nothing else."},
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

pprint(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

('I brought my son to this location today for a back-to-school haircut. There '
 'was no welcoming atmosphere; the staff did not show respect to customers. I '
 'clearly showed reference pictures of the desired style. The stylist, Ben, '
 'did not listen patiently, rushed through the process, and completely ignored '
 'the details we explained. The haircut did not match our request at all. My '
 'child was extremely upset with the result, crying throughout the process, '
 'and now he refuses to go to school tomorrow. This was a very disappointing '
 'experience, and I would not recommend this stylist or this location. I hope '
 'management addresses this so other customers don’t experience the same '
 'frustration.<end_of_turn>')


In [39]:
example2 = reviews['review_text'][1]
pprint(example2)

('9/16/25 9:30am got my hair cut at the parmer and mopac location. got home '
 'and found i was bleeding in the back of my neck and had razer burnes above '
 'my right eye and two places on my left ear. i will never use great clips '
 'again. this is the man that did it all. i went back and showed him what he '
 'had done. he apologised and refunded my money. i asked to. see the manager '
 'and he said he was the manager, very unlikey.')


In [40]:
messages = [
    {"role": "system",
     "content": "You are an expert copy editor. Your only task is to correct grammar and improve clarity. You only output the revised text, nothing else."},
    {"role": "user",
     "content": f"Revise this review text I have provided for proper grammar and clarity: {example2}"},
]

inputs = gemma3_1B_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt'
).to(gemma3_1B_model.device)

outputs = gemma3_1B_model.generate(**inputs,
                                   max_new_tokens=1024,
                                   temperature=0.85)

pprint(gemma3_1B_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

('9/16/25, 9:30 am – I got my hair cut at the Parker & Mopac location. Upon '
 'returning home, I discovered I was bleeding in my back of the neck and had '
 'razer burns above my right eye, and two places on my left ear. I will never '
 'use Great Clips again. This man did everything. I returned to show him what '
 'he had done. He apologized and refunded my money. I asked for it. I then '
 'asked to see the manager, who stated he was the manager, which seemed '
 'unlikely.<end_of_turn>')


### Model selection reasoning

Abstractive Summarization Feature GOAL: find a model that is able to take a large sum of review text (partitioned by location) and summarize what the reviewers are most commonly saying about the location and its service AND is able to clean up text

1) First idea was to use summarization tasked models (e.g. facebook/bart-large-cnn) but after doing some light usage testing, I found that these models did not meet my requirements for text summarization in the context of google reviews. I discovered that these models were the 'Extractive Summarization' models rather than the 'Abstractive Summarization' model which I did not particularly need for achieving the goal of summarization of vast amounts review text. Also these models have relatively small input token limits

2) Second idea was to find abstractive summarization models (e.g. google/pegasus-xsum) but again after doing some light usage testing on only a small portion of example review text, I found that these models were not substantive enough providing very short summaries even after some parameter tuning on model outputs as well as prone to hallucination as demonstrated by other users of the model. Also these models had very small token input size limits (~500 tokens) and small context windows, which would not satisfy my needs to potentially thousands of input tokens

3) Upon more research, I compared different task categories: "extractive summarization", "abstractive summarization", AND "text generation". I found that text generation had the capabilites of more abstractive summarization with larger input token limits but similarly to abstractive summarization, text generation tasked models are prone to hallucinations in model outputs. Even though hallucination are a likely possiblity using text generation model, I think the risk is very well worth the potential ability to complete the goal to a high degree and based on my trialing of these models, they seem to perform well with user prompting (which I think may lower hallucination rates or getting undesirable outputs from the model)For the model, I am selecting the Google Gemma 3 270M parameter model since it is lightweight (<1B parameters) and has an input/output size limit of 32K tokens which checks all the requirements I have for completing this summarization feature

### PART 1 (LLM-data-cleaning feature): Using this model to clean my review text data and re-run my sentiment analysis

In [63]:
# finding rows with no empty text strings
no_text_mask = reviews['review_text'] == ''
display(reviews.loc[no_text_mask, ['review_text', 'rating', 'roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']])


# using the model to clean the data
def clean_review_text(text, tokenizer, model):
    """Clean review text using Text Generation Model for grammar and clarity"""
    if not text or text.strip() == '':
        return text
    
    messages = [
        {"role": "system",
         "content": "You are an expert copy editor. Your task is only to correct grammar and improve clarity. You only output the revised text, nothing else."},
        {"role": "user",
         "content": f"Revise this review text for proper grammar and clarity: {text}"},
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors='pt'
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=1024, temperature=0.85)
    cleaned_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]).strip()
    
    return cleaned_text

print(f"Cleaning review text with '{gemma3_1B_model.config.model_type}'...")
reviews['review_text_cleaned'] = reviews['review_text'].progress_apply(
    lambda text: clean_review_text(text, gemma3_1B_tokenizer, gemma3_1B_model)
)

Unnamed: 0,review_text,rating,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked
390,,5.0,0.258294,0.451272,0.290433
420,,5.0,0.258294,0.451272,0.290433
423,,4.0,0.258294,0.451272,0.290433
424,,1.0,0.258294,0.451272,0.290433
425,,4.0,0.258294,0.451272,0.290433
...,...,...,...,...,...
1261,,3.0,0.258294,0.451272,0.290433
1262,,5.0,0.258294,0.451272,0.290433
1263,,4.0,0.258294,0.451272,0.290433
1264,,1.0,0.258294,0.451272,0.290433


Cleaning review text with 'gemma3_text'...


  0%|          | 0/1266 [00:00<?, ?it/s]

In [None]:
# removing <end_of_turn> token from the model output string (Data post-processing)

print('outputs with <end_of_turn> token:', reviews['review_text_cleaned'].str.contains('<end_of_turn').sum())
print('outputs with special characters:', reviews['review_text_cleaned'].apply(lambda x: not x.isprintable()).sum())

reviews['review_text_cleaned'] = (
    reviews['review_text_cleaned']
    .str.replace('<end_of_turn>', '', regex=False)  # remove <end_of_turn> token from model output
    .str.replace('[\n\r\t]', '', regex=True)        # actual newlines, tabs, carriage returns
    .str.replace(r' +', ' ', regex=True)            # multiple spaces to single space
    .str.strip()
)
reviews['review_text_cleaned'] = reviews['review_text_cleaned'].apply(
    lambda x: "".join(char for char in x if char.isprintable()) # remove any character that is 'non-printable'
)
print('string cleaning applied...')

print('outputs with <end_of_turn> token:', reviews['review_text_cleaned'].str.contains('<end_of_turn').sum())
print('outputs with special characters:', reviews['review_text_cleaned'].apply(lambda x: not x.isprintable()).sum())


outputs with <end_of_turn> token: 0
outputs with special characters: 4
string cleaning applied...
outputs with <end_of_turn> token: 0
outputs with special characters: 0


In [94]:
vader_sentiment = reviews['review_text_cleaned'].progress_apply(lambda text: sia.polarity_scores(text))
reviews[['vader_negative_cleaned_text', 'vader_neutral_cleaned_text', 'vader_positive_cleaned_text', 'vader_compound_cleaned_text']] = vader_sentiment.apply(pd.Series)

roberta_sentiment = reviews['review_text_cleaned'].progress_apply(lambda text: sentiment_scores_roberta_chunked(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative_chunked_cleaned_text', 'roberta_neutral_chunked_cleaned_text', 'roberta_positive_chunked_cleaned_text']] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

In [100]:
display(reviews[['review_text', 'review_text_cleaned', 
                 'vader_negative','vader_neutral', 'vader_positive', 'vader_compound',
                 'vader_negative_cleaned_text','vader_neutral_cleaned_text', 'vader_positive_cleaned_text', 'vader_compound_cleaned_text']].head())
display(reviews[['review_text', 'review_text_cleaned',
                 'roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked',
                 'roberta_negative_chunked_cleaned_text', 'roberta_neutral_chunked_cleaned_text', 'roberta_positive_chunked_cleaned_text']].head())

Unnamed: 0,review_text,review_text_cleaned,vader_negative,vader_neutral,vader_positive,vader_compound,vader_negative_cleaned_text,vader_neutral_cleaned_text,vader_positive_cleaned_text,vader_compound_cleaned_text
0,i brought my son to this location today for a ...,I brought my son to this location today for a ...,0.191,0.742,0.068,-0.9493,0.171,0.738,0.091,-0.8914
1,9/16/25 9:30am got my hair cut at the parmer a...,"9/16/25 at 9:30am, I got my hair cut at the Pa...",0.063,0.921,0.016,-0.6116,0.062,0.912,0.026,-0.4756
2,worst great clips i’ve ever been to. older man...,The worst haircuts I’ve ever received. An olde...,0.152,0.749,0.098,-0.4389,0.185,0.747,0.068,-0.807
3,myra did my long hair straight cut since glenn...,"Myra recently had a long hair straight cut, as...",0.07,0.751,0.179,0.7574,0.068,0.675,0.257,0.8955
4,terrible experience at great clips i had the w...,“I had a terrible experience at Great Clips. I...,0.178,0.717,0.105,-0.9174,0.199,0.713,0.088,-0.9658


Unnamed: 0,review_text,review_text_cleaned,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_chunked_cleaned_text,roberta_neutral_chunked_cleaned_text,roberta_positive_chunked_cleaned_text
0,i brought my son to this location today for a ...,I brought my son to this location today for a ...,0.975971,0.02144,0.002589,0.97094,0.026178,0.002882
1,9/16/25 9:30am got my hair cut at the parmer a...,"9/16/25 at 9:30am, I got my hair cut at the Pa...",0.755748,0.209195,0.035058,0.688777,0.27146,0.039763
2,worst great clips i’ve ever been to. older man...,The worst haircuts I’ve ever received. An olde...,0.913692,0.076231,0.010077,0.927497,0.065636,0.006867
3,myra did my long hair straight cut since glenn...,"Myra recently had a long hair straight cut, as...",0.005103,0.038017,0.95688,0.002748,0.029469,0.967783
4,terrible experience at great clips i had the w...,“I had a terrible experience at Great Clips. I...,0.965808,0.030455,0.003738,0.965295,0.031047,0.003658


In [101]:
# lets see the differences in the model outputs between the raw and cleaned text

diff = pd.DataFrame([], columns=['diff_abs_vader_compound', 'diff_vader_compound', 'diff_abs_roberta_negative_chunked', 'diff_roberta_negative_chunked',
                                 'diff_abs_roberta_neutral_chunked', 'diff_roberta_neutral_chunked', 'diff_abs_roberta_positive_chunked', 'diff_roberta_positive_chunked',])
diff['diff_abs_vader_compound'] = abs(reviews['vader_compound'] - reviews['vader_compound_cleaned_text'])
diff['diff_vader_compound'] = reviews['vader_compound'] - reviews['vader_compound_cleaned_text']

diff['diff_abs_roberta_negative_chunked'] = abs(reviews['roberta_negative_chunked'] - reviews['roberta_negative_chunked_cleaned_text'])
diff['diff_roberta_negative_chunked'] = reviews['roberta_negative_chunked'] - reviews['roberta_negative_chunked_cleaned_text']

diff['diff_abs_roberta_neutral_chunked'] = abs(reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_chunked_cleaned_text'])
diff['diff_roberta_neutral_chunked'] = reviews['roberta_neutral_chunked'] - reviews['roberta_neutral_chunked_cleaned_text']

diff['diff_abs_roberta_positive_chunked'] = abs(reviews['roberta_positive_chunked'] - reviews['roberta_positive_chunked_cleaned_text'])
diff['diff_roberta_positive_chunked'] = reviews['roberta_positive_chunked'] - reviews['roberta_positive_chunked_cleaned_text']

display(diff)
diff.describe()

Unnamed: 0,diff_abs_vader_compound,diff_vader_compound,diff_abs_roberta_negative_chunked,diff_roberta_negative_chunked,diff_abs_roberta_neutral_chunked,diff_roberta_neutral_chunked,diff_abs_roberta_positive_chunked,diff_roberta_positive_chunked
0,0.0579,-0.0579,0.005031,0.005031,0.004738,-0.004738,0.000293,-0.000293
1,0.1360,-0.1360,0.066971,0.066971,0.062265,-0.062265,0.004706,-0.004706
2,0.3681,0.3681,0.013805,-0.013805,0.010595,0.010595,0.003210,0.003210
3,0.1381,-0.1381,0.002354,0.002354,0.008548,0.008548,0.010903,-0.010903
4,0.0484,0.0484,0.000513,0.000513,0.000593,-0.000593,0.000079,0.000079
...,...,...,...,...,...,...,...,...
1261,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1262,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1263,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1264,0.0000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Unnamed: 0,diff_abs_vader_compound,diff_vader_compound,diff_abs_roberta_negative_chunked,diff_roberta_negative_chunked,diff_abs_roberta_neutral_chunked,diff_roberta_neutral_chunked,diff_abs_roberta_positive_chunked,diff_roberta_positive_chunked
count,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0,1266.0
mean,0.127282,-0.011434,0.028144,-2.2e-05,0.039805,0.013507,0.034401,-0.013485
std,0.250355,0.280643,0.070953,0.076335,0.076737,0.085392,0.088974,0.094439
min,0.0,-1.6372,0.0,-0.506202,0.0,-0.594459,0.0,-0.780964
25%,0.0,-0.0044,0.0,-0.000109,0.0,-0.000693,0.0,-0.004371
50%,0.0,0.0,0.001822,0.0,0.006525,0.0,0.001665,0.0
75%,0.148,0.0,0.01759,0.003192,0.043199,0.017222,0.019969,0.000624
max,1.8811,1.8811,0.677367,0.677367,0.594459,0.50014,0.780964,0.658506


In [102]:
# lets visualize the differences
import plotly.figure_factory as ff

fig3 = ff.create_distplot(
    [diff[col] for col in diff.columns],
    [col for col in diff.columns],
    show_hist=False,
    show_rug=True,
    show_curve=True
)

fig3.update_layout(
    width=1500, height=800,
    title={'text': 'Distribution of the Differences on Sentiment Model Scoring on Original vs. Cleaned text', 'x': 0.5},
    xaxis=dict(title='Differences in model scoring', ticklabelstandoff=10),
    yaxis=dict(title='Density', ticklabelstandoff=10),
    margin=dict(t=75,b=175,l=50,r=50)
)

fig3.add_annotation(
    text="1.) VADER compound sentiment scoring is a value normalized from -1 to 1 and does not represent a percentage while roBERTa sentiment scoring represents a percentage, so the<br>differences are respective of each model's scoring method"
         " (differences in normalized scoring for VADER and differences in confidence percentage for roBERTa)"
         "<br><br>2.) These distributions show the differences in sentiment scoring between LLM-cleaned and original review text",
    showarrow=False,
    xref='paper', yref='paper',
    align='left',
    x=0, y=-0.25
)

fig3.show()

In [103]:
fig3.write_html('../reports/figures/DistributionOfDiffInScoringOriginalVsCleanText.html')

Lets explore the review data where the differences between sentiment score between the original text and LLM-cleaned text is quite large

In [104]:
# need to explore the data where we find the largest differences (FIXME)
pd.set_option('display.max_colwidth', None)

# VADER differences
vader_mask = diff['diff_abs_vader_compound'] > 0.25
vader_index = diff[vader_mask].index
print(f"Number of large differences in VADER compound sentiment scoring: {len(vader_index)}")
print("Large changes in VADER compound sentiment scoring:")
display(reviews.loc[vader_index, ['review_text', 'review_text_cleaned','rating', 'vader_compound', 'vader_compound_cleaned_text']].head())


Number of large differences in VADER compound sentiment scoring: 219
Large changes in VADER compound sentiment scoring:


Unnamed: 0,review_text,review_text_cleaned,rating,vader_compound,vader_compound_cleaned_text
2,"worst great clips i’ve ever been to. older man in a walker cut my hair. zero interest in his job, the clippers were either low on battery or old because they kept catching on my hair. didn’t even bother blending, asked me what kind of trim on the back. for what? he did what he wanted. stay away from this location!","The worst haircuts I’ve ever received. An older man in a walker cut my hair with little interest in his job. The clippers were either low on battery or outdated, frequently catching on my hair. He didn’t even bother blending, and asked me what kind of trim I wanted. For what? He simply did what he wanted. I strongly advise avoiding this location.",1.0,-0.4389,-0.807
8,"the haircut was not good. there are so many other barbers around, i honestly don’t know how this place is still open. the owner was yelling and swearing when the internet went out, which felt really unprofessional. this is supposed to be a family-friendly place, and i expected better. on top of that, they had no way to take payment except cash. we had to leave and find an atm just to pay. it felt like they had no backup plan and no idea how to handle it. the price was not discounted for inconvenience. definitely not going back. one star all day. save your time and go somewhere else.","The haircut was unsatisfactory. With so many other barbers in the area, I honestly don’t know how this establishment remains open. The owner was yelling and swearing when the internet went out, which felt highly unprofessional. It was supposed to be a family-friendly place, and I expected better. Furthermore, they offered no alternative payment methods beyond cash, requiring us to leave and find an ATM to pay. This demonstrated a lack of preparedness and no clear plan for handling such an interruption. The price was not discounted for the inconvenience. I will not be returning. One star – save your time and choose another option.",1.0,0.4335,-0.6532
10,"stay away from the hairstylist that always wears a mask. i dont know her name but she is extremely rude and racist. i was top on the wait list and this caucasian gentleman walks in and he is 2nd on the list. she asked him for a haircut first, even though he explicitly said that he was 3nd on the list and i was before him. she didn't care and proceeded to do him first. she did me twice since then and she is hands down the most rude hairstylist i've ever had. now i'm at the point that if she isb the only one in the shop, i walk out of there. only reason i'm giving 2 stars is because areli and the asian guy is great 👍","Stay away from the hairstylist who consistently wears a mask. I don’t know her name, but she is extremely rude and racist. I was at the top of the wait list and a Caucasian gentleman walked in, and he was second in line. She immediately asked him for a haircut, even though he explicitly stated he was second on the list. She didn’t care and proceeded to do me twice, and she is undoubtedly the rudest hairstylist I’ve ever had. Now, I’m at the point where if she’s the only one in the shop, I will leave. Only reason I’m giving two stars is because the stylist and the Asian gentleman are great 👍",2.0,-0.8506,-0.2812
19,"went in oct 9th to have some layers put in. (i had grown my hair out to almost one length.) i showed the asian gentleman how short i wanted the shortest layer, to my shoulders. he chopped off my hair like it was a viral trend. he combed my hair straight back, i thought he was just checking for length. the next thing i felt was the scissors through all of my hair at once. instant dread/fear/regret. he then proceeded to show me all my hair in his fist. he did a few more little snips and in less than 3 minutes my hair was ""done"". this was also done dry. not even a spritz of water. the bulk at end of my hair is gone. the layers/cut is uneven. the shortest layer is at my ears. i have never in my adult life, cried after a haircut. the woman working with him said not to charge me and apologized. i've been having layers put in my hair for over 15 years and never has someone just chopped my hair off at once. i'm too scared to even have someone try to fix this because it just means cutting more hair off in my already thin cut. fwiw i think the man's name was than.","I went in October 9th to have layers cut. (I had grown my hair out to almost one length.) I showed the Asian gentleman how short I wanted the shortest layer to be, down to my shoulders. He chopped off my hair like it was a viral trend. He combed my hair straight back, and I thought he was just checking for length. The next thing I felt was the scissors through all my hair at once. An instant wave of dread, fear, and regret washed over me. He then proceeded to show me all my hair in his fist. He did a few more little snips, and in less than 3 minutes, my hair was “done.” It was also dry, without a spritz of water. The bulk at the end of my hair is gone. The layers/cut are uneven. The shortest layer is at my ears. I’ve never in my adult life, cried after a haircut. The woman working with him said not to charge me, and apologized. I’ve been having layers cut for over 15 years, and never has someone just chopped my hair off at once. I’m too scared to even consider having someone try to fix this, because it just means cutting more hair off in my already thin cut. For my own information, I think the man’s name was than.",1.0,-0.34,-0.9274
20,"an **absolutely disastrous** experience. had a haircut with ben (vietnamese guy) today, and i didn’t like his service, so i chose not to tip him. instead of accepting that, he **literally yelled at me**, stepped closer in an intimidating way, and **basically forced me to tip him**—even though he ruined my haircut. i’ve always tipped my hairdressers, and the owners can check my payment history to confirm that. but apparently, with ben, tipping isn’t about good service—it’s just **mandatory**, even if you leave with a bad haircut. i’d strongly urge the owners to check the cameras and see for themselves how **disgustingly** i was treated.","An absolutely disastrous experience. I had a haircut today with Ben, a Vietnamese stylist, and I didn’t like his service; therefore, I didn’t tip him. Instead of accepting that, he literally yelled at me, approached me in an intimidating manner, and essentially forced me to tip him—even though he ruined my haircut. I’ve always tipped my hairdressers, and the owners can verify my payment history to confirm this. However, with Ben, tipping isn’t about good service; it’s simply mandatory, even if I leave with a bad haircut. I strongly urge the owners to check the cameras and see for themselves how disgustingly I was treated.",1.0,-0.25,-0.908


In [105]:
# roBERTa drastic positive confidence changes
roberta_positive_diff_mask = diff['diff_abs_roberta_positive_chunked'] > 0.20   # if the differences between confidence scoring is greater than 20%
roberta_positive_diff_index = diff[roberta_positive_diff_mask].index
print(f"Number of large differences in roBERTa positive sentiment confidence: {len(roberta_positive_diff_index)}")
print("Large changes in confidence for roBERTa positive sentiment:")
display(reviews.loc[roberta_positive_diff_index, ['review_text', 'review_text_cleaned', 'rating', 'roberta_positive_chunked', 'roberta_positive_chunked_cleaned_text']].head())

# roBERTa drastic negative confidence changes
roberta_negative_diff_mask = diff['diff_abs_roberta_negative_chunked'] > 0.20
roberta_negative_diff_index = diff[roberta_negative_diff_mask].index
print(f"Number of large differences in roBERTa negative sentiment confidence: {len(roberta_negative_diff_index)}")
print("Large changes in confidence for roBERTa negative sentiment:")
display(reviews.loc[roberta_negative_diff_index, ['review_text', 'review_text_cleaned', 'rating', 'roberta_negative_chunked', 'roberta_negative_chunked_cleaned_text']].head())

Number of large differences in roBERTa positive sentiment confidence: 59
Large changes in confidence for roBERTa positive sentiment:


Unnamed: 0,review_text,review_text_cleaned,rating,roberta_positive_chunked,roberta_positive_chunked_cleaned_text
79,men's hair cut and beard trim. stylist took me from looking like i might live in my parents basement to looking like a businessman.,The men’s haircut and beard trim significantly transformed me from looking like I might live in my parents’ basement to appearing as a successful businessman.,5.0,0.118934,0.729204
104,used to take my son to sport clips but i feel like they have no idea what they are doing and they are over priced. husband and my 2 year old son got their hair cut here last week. it was cheaper than sports clip and their hair cuts were much better.,"“I used to take my son to sports clips, but I’ve recently felt like they’re not very knowledgeable about what they offer, and the prices are quite high. My husband and two-year-old son received their haircuts last week, and they were significantly cheaper than sports clips, and the cuts were much better.”",5.0,0.380611,0.659679
107,"ok service, and quick. messed however up my colluch. does not take great clips coupons (?) i had a 8.99 coupon they didn't even take a dollar off. won't be coming here again. complete ripoff am surprised great clips accepts them in their franchise.","The service was quick, and the wait times were efficient. However, I had a problem with my coupon – it didn’t accept a dollar off. I received an 8.99 coupon, but they didn’t even take a dollar off my bill. I won’t be returning, and it felt like a complete rip-off. I was surprised that Great Clips accepts these coupons, especially given their franchise location.",1.0,0.027417,0.245129
129,the slaughter and manchaca store let the cutter go that did the best job and cut the way i like it. they have not been the same since. guess i'll try the one up north near my office near parmer lane. and this is sam not debbie.......,"The Slaughter & Manchaca store exceeded expectations, delivering a cut that I particularly enjoyed. They have noticeably changed since then. I’ll try the one up north near my office, near Parmer Lane. This is Sam, not Debbie…",2.0,0.156924,0.910249
130,"i show up first. then some idiot signs in online, shows up 10 min after i'm already there and gets served first. last time i go here or any other ""great clips"" again. not when there's 4 other haircut places who understand first come first serve.","I arrived first. Then, some online signs indicated I was served before me, ten minutes after I’d already been there. This is the first time I’ve returned, or any other “great clips,” and it doesn’t happen when there are four other barbershops prioritizing first-come, first-served service.",1.0,0.062172,0.350517


Number of large differences in roBERTa negative sentiment confidence: 45
Large changes in confidence for roBERTa negative sentiment:


Unnamed: 0,review_text,review_text_cleaned,rating,roberta_negative_chunked,roberta_negative_chunked_cleaned_text
43,"i usually really don't like getting a haircut and i have let my hair grow for a long time because of how bad the past one was. t the man who cut my hair made me feel welcome and he did a stupendous job with my hair, he took me through each step in the mirror before during and after the whole process. i 100% recommend this establishment 5 star review has been changed to a 3 star review 🥲 10 months later and my haircut here was so rushed, my bangs were cut way too short and when i asked him if he could fix it it just got worse. unfortunately i dont know if i will come back to this location considering how awesome the service was last time. not the same stylist, was most likely my biggest mistake.","I usually don’t enjoy haircuts, and I’ve let my hair grow for a long time because of the previous one. The man who cut my hair made me feel welcome, and he did an outstanding job with my hair – he guided me through each step in the mirror before and during the process. I give this establishment a 5-star review, but it has been changed to a 3-star review 🥲 10 months later, my haircut was rushed, my bangs were cut too short, and when I asked him to fix it, it worsened. Unfortunately, I don’t know if I’ll return, considering how fantastic the service was last time. It’s not the same stylist, and it was likely my biggest mistake.",3.0,0.699611,0.433908
72,the haircut was good but every time i go the online coupons that bring me in never work at the end of the haircut. please send me coupons that actually work. you'll get me in for $9:99 or $12:99 and then charge me $19 bucks for a haircut before tip at the end..geesh,"The haircut was good, but every time I go online, the coupons never work, ending up in the salon. Please send me working coupons – $9.99 or $12.99, and then charge me $19 for the haircut before tip. Seriously.",3.0,0.45984,0.764338
79,men's hair cut and beard trim. stylist took me from looking like i might live in my parents basement to looking like a businessman.,The men’s haircut and beard trim significantly transformed me from looking like I might live in my parents’ basement to appearing as a successful businessman.,5.0,0.245333,0.01138
107,"ok service, and quick. messed however up my colluch. does not take great clips coupons (?) i had a 8.99 coupon they didn't even take a dollar off. won't be coming here again. complete ripoff am surprised great clips accepts them in their franchise.","The service was quick, and the wait times were efficient. However, I had a problem with my coupon – it didn’t accept a dollar off. I received an 8.99 coupon, but they didn’t even take a dollar off my bill. I won’t be returning, and it felt like a complete rip-off. I was surprised that Great Clips accepts these coupons, especially given their franchise location.",1.0,0.83455,0.374531
128,i’ve had my cut many of times here but today i didn’t because of barber choose to send me to another location they have a black person there that cut my hair,"I’ve had my haircut here many times, but today I didn’t because the barber chose to send me to another location. They have a Black person there, and they cut my hair.",1.0,0.36715,0.579876


#### DATA CHECKPOINT

In [None]:
# saving current data after part 1 (DATA CHECKPOINT)
pd.reset_option('display.max_colwidth')
col_order = [
    'review_id','rating','likes','date_review_scraped','approx_review_date',
    'source','location_id','review_text','owner_response_text','review_text_cleaned',
    'vader_negative', 'vader_neutral', 'vader_positive', 'vader_compound',
    'roberta_negative_chunked', 'roberta_neutral_chunked','roberta_positive_chunked',
    'roberta_negative_trunc','roberta_neutral_trunc', 'roberta_positive_trunc',
    'vader_negative_cleaned_text','vader_neutral_cleaned_text', 'vader_positive_cleaned_text','vader_compound_cleaned_text', 
    'roberta_negative_chunked_cleaned_text','roberta_neutral_chunked_cleaned_text','roberta_positive_chunked_cleaned_text'
]

reviews.rename(columns={'review_date': 'approx_review_date'}, inplace=True)
reviews.to_csv('../data/processed/reviews_checkpointPart1.csv', columns=col_order, index=False)

#### Conclusion on LLM-cleaned data
This subset of original and cleaned reviews demonstrates both the strengths and weaknesses of LLMs in cleaning natural language data. While the `Distribution of Differences in Sentiment Model Scoring on Original vs. Cleaned Text` plot shows that most sentiment scores changed only slightly for both RoBERTa and VADER methods, the outliers reveal key advantages and disadvantages of generative models like gemma3-1B-it. The VADER method shows ~200 instances where LLM-cleaned text drastically changed compound sentiment scores, while RoBERTa shows ~50 instances each for positive and negative sentiment where confidence changed significantly. Closer inspection reveals that LLM-cleaned text sometimes accurately captures the reviewer's intent pushing the sentiment scoring in the "correct" direction but other times misinterprets the original meaning (e.g. cases where the reviewer is very sarcastic) or hallucinates content, leading the scoring for sentiment to be inaccurate relative to the content of the review. This largely explains the large sentiment score differences when using the different versions of the review text. 

To conclude, LLMs have the potential to help clean and/or extract signals from raw data, but in this setup we should exercise extreme caution. Preserving the raw text is likely preferable to relying on this model to remove noise, since its cleaning sometimes distorts sentiment and meaning. A stronger model or a task-specific fine-tune might make LLM-assisted cleaning more reliable, but without those improvements, downstream analysis is better grounded on the original data.

### PART 2 (Reviews Summarizer feature): Using a model to summarize all reviews and point out key insights

#### Goal: To replicate well-known features like Amazon's generative AI product reviews summaries for our use-case

In [29]:
def check_memory():
    """Monitor RAM usage"""

    ram = psutil.virtual_memory()

    print(f"Memory Avaiable: {ram.available / (1024**3):.2f}GB | Memory Used: {ram.used / (1024**3):.2f}GB | Memory Free: {ram.free / (1024**3):.2f}GB | Memory Total: {ram.total/ (1024**3):.2f}GB ")
    print(f"RAM: {ram.percent}% used | ({ram.used / (1024**3):.2f}GB of {ram.total / (1024**3):.2f}GB)")

    # Display Neural Architecture Status
    print('\nArchitecture Status:')
    if torch.backends.mps.is_available():
        # MPS doesn't expose memory stats easily, but we can estimate
        print("MPS (Apple Silicon GPU) is active")
    else:
        print("MPS (Applie Silicon GPU) not active")
    
    if torch.cuda.is_available():
        print("NVIDIA CUDA is active")
    else:
        print("NVIDIA CUDA not active")


def clear_memory():
    """Aggressively clear memeory between models"""

    gc.collect()

    # Clear MPS cache (for Mac)
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

    # Clear CUDA cache (for NVIDIA GPU)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    print("Memory cleared")

check_memory()
print('\n--------------')
clear_memory()
print('--------------\n')
check_memory()

Memory Avaiable: 3.95GB | Memory Used: 5.84GB | Memory Free: 0.06GB | Memory Total: 16.00GB 
RAM: 75.3% used | (5.84GB of 16.00GB)

Architecture Status:
MPS (Apple Silicon GPU) is active
NVIDIA CUDA not active

--------------
Memory cleared
--------------

Memory Avaiable: 3.95GB | Memory Used: 5.84GB | Memory Free: 0.06GB | Memory Total: 16.00GB 
RAM: 75.3% used | (5.84GB of 16.00GB)

Architecture Status:
MPS (Apple Silicon GPU) is active
NVIDIA CUDA not active


In [16]:
# loading the data from part 1 data checkpoint and recasting all columns to correct data type
reviews = pd.read_csv('../data/processed/reviews_checkpointPart1.csv')

reviews['review_text'] = reviews['review_text'].fillna('')
reviews['review_text_cleaned'] = reviews['review_text'].fillna('')
reviews['owner_response_text'] = reviews['owner_response_text'].fillna('')

reviews = reviews.astype({
    'review_id': str,
    'source': str,
    'review_text': str,
    'review_text_cleaned': str,
    'owner_response_text': str
    })

reviews['date_review_scraped'] = pd.to_datetime(reviews['date_review_scraped'])
reviews['approx_review_date'] = pd.to_datetime(reviews['approx_review_date'])
print(reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   review_id                              1266 non-null   object        
 1   rating                                 1266 non-null   float64       
 2   likes                                  1266 non-null   int64         
 3   date_review_scraped                    1266 non-null   datetime64[ns]
 4   approx_review_date                     1266 non-null   datetime64[ns]
 5   source                                 1266 non-null   object        
 6   location_id                            1266 non-null   int64         
 7   review_text                            1266 non-null   object        
 8   owner_response_text                    1266 non-null   object        
 9   review_text_cleaned                    1266 non-null   object  

In [17]:
def token_count(text, tokenizer):
    """Count tokens in text"""
    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    return len(encoded_text['input_ids'][0])

In [18]:
# generating the token count using roBERTa/gemma tokenizers for both original and LLM-cleaned text

reviews['roberta_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, roberta_tokenizer))
reviews['roberta_token_count_clean_text'] = reviews['review_text_cleaned'].progress_apply(lambda text: token_count(text, roberta_tokenizer))
reviews['gemma_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, gemma3_1B_tokenizer))
reviews['gemma_token_count_clean_text'] = reviews['review_text_cleaned'].progress_apply(lambda text: token_count(text, gemma3_1B_tokenizer))

display(reviews[['roberta_token_count', 'roberta_token_count_clean_text', 'gemma_token_count', 'gemma_token_count_clean_text']])

  0%|          | 0/1266 [00:00<?, ?it/s]

NameError: name 'roberta_tokenizer' is not defined

In [138]:
# grab only the reviews that have text (i.e. reviews with just ratings won't tell us anything because the customer did not say anything)
# and reviews within the past couple of years
mask = (reviews['review_text'] != '') & (reviews['approx_review_date'] >= pd.Timestamp('2024-01-01') )
reviews[mask]['roberta_token_count'].sum()

print(f"Total Token Count (roberta tokenizer | original text) : {reviews[mask]['roberta_token_count'].sum()}")
print(f"Total Token Count (roberta tokenizer | cleaned text) : {reviews[mask]['roberta_token_count_clean_text'].sum()}")
print(f"Total Token Count (gemma tokenizer | original text) : {reviews[mask]['gemma_token_count'].sum()}")
print(f"Total Token Count (gemma tokenizer | cleaned text) : {reviews[mask]['gemma_token_count_clean_text'].sum()}")

Total Token Count (roberta tokenizer | original text) : 10068
Total Token Count (roberta tokenizer | cleaned text) : 10068
Total Token Count (gemma tokenizer | original text) : 9713
Total Token Count (gemma tokenizer | cleaned text) : 9713


The gemma-3-1B-it model has a max input size of 32K tokens. We will likely need a stronger model with a larger context window to help with processing more reviews if we want to increase the window in which we want to capture the summary of reviews, or defer to batching methods to process the reviews using lightweight models (<= ~3B parameters for resource case).

In this part, we will explore multiple models to complete this summarization task and consider the benefits and drawbacks of implementing this feature in this use-case and similar use-cases

* Proposed models to explore (all running on local machine with 16GB RAM):
    1) gemma-3-1B-it (1 billion parameters) `[curently works]`
    2) gemma-3-4B-it (4 billion parameters) `[does not work; not enough system memory]`
    2) qwen2.5-1.5B-Instruct (2 billion parameters)
    3) qwen2.5-3B-Instruct (3 billion parameters)
    4) Llama-3.2-1B-Instruct (1 billion parameters)
    5) Llama-3.2-3B-Instruct (1 billion parameters)


In [None]:
# Qwen2.5 1B model
qwen2dot5_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
qwen2dot5_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

In [None]:
# checking the text token counts with the 4B tokenizer

reviews['gemma_4B_token_count'] = reviews['review_text'].progress_apply(lambda text: token_count(text, gemma3_4B_tokenizer))
reviews['gemma_4B_token_count_clean_text'] = reviews['review_text_cleaned'].progress_apply(lambda text: token_count(text, gemma3_4B_tokenizer))

print(f"Total Token Count (gemma 4B tokenizer | original text) : {reviews[mask]['gemma_4B_token_count'].sum()}")
print(f"Total Token Count (gemma 4B tokenizer | cleaned text) : {reviews[mask]['gemma_4B_token_count_clean_text'].sum()}")
display(reviews[['gemma_4B_token_count', 'gemma_4B_token_count_clean_text']])