In [102]:
import pandas as pd
import numpy as np
import nltk
import re
from pprint import pprint
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [71]:
# perform only once
# nltk.download('all')

In [103]:
# Load the cleaned, combined review data (path is relative to this notebook's directory).
reviews = pd.read_csv('../data/processed/cleaned_combined_reviews_data.csv')

In [104]:
# Inspect column dtypes and non-null counts of the freshly loaded data.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_id            1266 non-null   object 
 1   rating               1266 non-null   float64
 2   likes                1266 non-null   int64  
 3   date_review_scraped  1266 non-null   object 
 4   review_date          1266 non-null   object 
 5   source               1266 non-null   object 
 6   review_text          848 non-null    object 
 7   owner_response_text  1209 non-null   object 
 8   location_id          1266 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 89.1+ KB


In [105]:
# slight data cleaning (will have to move this to appropriate notebook)
#
# The text columns contain null values. They must be replaced with the default
# empty string BEFORE casting to str: `astype(str)` turns NaN into the literal
# string 'nan', after which `fillna('')` has nothing left to fill and every
# review without text would later be scored as the word "nan".
text_columns = ['review_text', 'owner_response_text']
reviews[text_columns] = reviews[text_columns].fillna('')

reviews = reviews.astype({
    'review_id': str,
    'source': str,
    'review_text': str,
    'owner_response_text': str
    })

# Parse the date columns into real datetimes.
reviews['date_review_scraped'] = pd.to_datetime(reviews['date_review_scraped'])
reviews['review_date'] = pd.to_datetime(reviews['review_date'])


In [106]:
# Re-check dtypes and non-null counts after the cleanup cell.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   review_id            1266 non-null   object        
 1   rating               1266 non-null   float64       
 2   likes                1266 non-null   int64         
 3   date_review_scraped  1266 non-null   datetime64[ns]
 4   review_date          1266 non-null   datetime64[ns]
 5   source               1266 non-null   object        
 6   review_text          1266 non-null   object        
 7   owner_response_text  1266 non-null   object        
 8   location_id          1266 non-null   int64         
dtypes: datetime64[ns](2), float64(1), int64(2), object(4)
memory usage: 89.1+ KB


In [107]:
# Report the row count, then preview the first 10 rows.
print(f'Reviews DataFrame Length: {len(reviews)}')
reviews.head(10)

Reviews DataFrame Length: 1266


Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1
5,ChdDSUhNMG9nS0VJQ0FnTURvckxmQTVBRRAB,5.0,0,2025-12-16,2025-05-20,Google Maps,i got come here for the first time without che...,thank you for the great review! we’re glad you...,1
6,Ci9DQUlRQUNvZENodHljRjlvT21WSFRITjFSV1JHYTJKcF...,1.0,0,2025-12-16,2025-09-17,Google Maps,"i rarely leave negative reviews, but this expe...","hi siva prasad, thank you for sharing your fee...",1
7,Ci9DQUlRQUNvZENodHljRjlvT25GQ01rTTViVFUxTm5kM1...,1.0,0,2025-12-16,2025-09-17,Google Maps,possible the worst hair cut i have ever gotten...,"hi gregg, thank you for sharing your feedback ...",1
8,Ci9DQUlRQUNvZENodHljRjlvT21OeFJuUTJjalpTYldkbW...,1.0,0,2025-12-16,2025-07-19,Google Maps,the haircut was not good. there are so many ot...,"hi kinzington, thank you for sharing your feed...",1
9,ChZDSUhNMG9nS0VJQ0FnSUNQNEx2RkFnEAE,1.0,0,2025-12-16,2024-12-16,Google Maps,i just hair cut at great clip. one vietnamese ...,"we're sorry to hear this, cathy. if you would ...",1


In [108]:
# The review data was scraped from the web, so check whether any review text
# still carries remnant HTML tags, HTML entities, or newline / carriage-return /
# tab characters.
# NOTE: move any data cleaning to the proper notebook file

html_specialChar_pattern = r'<[^>]+>|\n|\r|\t|&[a-z]+;'
has_markup = reviews['review_text'].str.contains(html_specialChar_pattern, regex=True, na=False)
problem_rows = reviews.loc[has_markup]
print(problem_rows)

Empty DataFrame
Columns: [review_id, rating, likes, date_review_scraped, review_date, source, review_text, owner_response_text, location_id]
Index: []


In [110]:
# Take the text of the review with index label 0 as a running example.
example = reviews['review_text'][0]
print(example)

i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.


In [111]:
# Split the example review into word/punctuation tokens and show the first 10.
tokens = nltk.word_tokenize(example)
print(tokens[:10])

['i', 'brought', 'my', 'son', 'to', 'this', 'location', 'today', 'for', 'a']


In [112]:
# Attach a part-of-speech tag to every token; result is a list of (token, tag) pairs.
tagged = nltk.pos_tag(tokens) # pos => 'part of speech'
print(tagged[:10])

[('i', 'NN'), ('brought', 'VBD'), ('my', 'PRP$'), ('son', 'NN'), ('to', 'TO'), ('this', 'DT'), ('location', 'NN'), ('today', 'NN'), ('for', 'IN'), ('a', 'DT')]


In [113]:
# Run NLTK's named-entity chunker over the tagged tokens and print the resulting tree.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

(S
  i/NN
  brought/VBD
  my/PRP$
  son/NN
  to/TO
  this/DT
  location/NN
  today/NN
  for/IN
  a/DT
  back-to-school/JJ
  haircut/NN
  ./.
  there/EX
  is/VBZ
  no/DT
  proper/JJ
  welcoming/NN
  into/IN
  the/DT
  store/NN
  too/RB
  they/PRP
  are/VBP
  not/RB
  showing/VBG
  any/DT
  respect/NN
  towards/IN
  customers…/NN
  i/NN
  clearly/RB
  showed/VBD
  reference/NN
  pictures/NNS
  of/IN
  the/DT
  style/NN
  we/PRP
  wanted/VBD
  ./.
  the/DT
  stylist/NN
  (/(
  ben/NN
  )/)
  did/VBD
  not/RB
  listen/VB
  patiently/RB
  ,/,
  rushed/VBD
  through/IN
  the/DT
  process/NN
  ,/,
  and/CC
  completely/RB
  ignored/VBD
  the/DT
  details/NNS
  we/PRP
  explained/VBD
  ./.
  the/DT
  haircut/NN
  looked/VBD
  nothing/NN
  like/IN
  what/WP
  we/PRP
  asked/VBD
  for/IN
  ./.
  my/PRP$
  kid/NN
  was/VBD
  so/RB
  upset/JJ
  with/IN
  the/DT
  result/NN
  that/IN
  he/PRP
  cried/VBD
  the/DT
  entire/JJ
  time/NN
  while/IN
  doing/VBG
  ../PDT
  the/DT
  experience/NN
  upset

# VADER Sentiment Scoring (Classical Method)

In [114]:
# VADER = Valence Aware Dictionary and sEntiment Reasoner --> ('bag of words approach' to sentiment analysis)
#
#       -> this method does not account for relationships between words

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance reused by every VADER scoring cell in this notebook.
sia = SentimentIntensityAnalyzer()

In [115]:
# Show the example review next to its VADER polarity scores.
pprint(example)
pprint(sia.polarity_scores(example))

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.')
{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [116]:
# Run VADER polarity scoring on all review text and store the scores as columns.
tqdm.pandas()  # makes `progress_apply` available on pandas objects

vader_sentiment = reviews['review_text'].progress_apply(lambda review: sia.polarity_scores(review))

# Expand the per-review score dicts into one column per score. The target column
# names rely on the scores arriving in negative, neutral, positive, compound
# order (consistent with the previewed values for the example review).
vader_score_table = vader_sentiment.apply(pd.Series)
reviews[['vader_negative', 'vader_neutral', 'vader_positive', 'vader_compound']] = vader_score_table


  0%|          | 0/1266 [00:00<?, ?it/s]

In [117]:
# Preview the first 10 rows, now including the VADER score columns.
reviews.head(10)

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174
5,ChdDSUhNMG9nS0VJQ0FnTURvckxmQTVBRRAB,5.0,0,2025-12-16,2025-05-20,Google Maps,i got come here for the first time without che...,thank you for the great review! we’re glad you...,1,0.029,0.759,0.212,0.9651
6,Ci9DQUlRQUNvZENodHljRjlvT21WSFRITjFSV1JHYTJKcF...,1.0,0,2025-12-16,2025-09-17,Google Maps,"i rarely leave negative reviews, but this expe...","hi siva prasad, thank you for sharing your fee...",1,0.216,0.592,0.192,-0.8064
7,Ci9DQUlRQUNvZENodHljRjlvT25GQ01rTTViVFUxTm5kM1...,1.0,0,2025-12-16,2025-09-17,Google Maps,possible the worst hair cut i have ever gotten...,"hi gregg, thank you for sharing your feedback ...",1,0.265,0.66,0.075,-0.9424
8,Ci9DQUlRQUNvZENodHljRjlvT21OeFJuUTJjalpTYldkbW...,1.0,0,2025-12-16,2025-07-19,Google Maps,the haircut was not good. there are so many ot...,"hi kinzington, thank you for sharing your feed...",1,0.144,0.713,0.143,0.4335
9,ChZDSUhNMG9nS0VJQ0FnSUNQNEx2RkFnEAE,1.0,0,2025-12-16,2024-12-16,Google Maps,i just hair cut at great clip. one vietnamese ...,"we're sorry to hear this, cathy. if you would ...",1,0.258,0.632,0.11,-0.9558


In [None]:
# Box plot of the VADER compound score, one box per star rating.
fig1 = go.Figure()

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())

for i, rating in enumerate(ratings):
    # compound scores of the reviews that carry this rating
    data = reviews.loc[reviews['rating'] == rating, 'vader_compound']
    rating_box = go.Box(
        y=data,
        name=f'{rating} Rating',
        marker=dict(color=colors[i]),
        boxmean=True,
    )
    fig1.add_trace(rating_box)

layout_options = dict(
    width=800,
    height=600,
    title={'text': 'VADER Compound Sentiment Score by Rating', 'x': 0.5},
    xaxis=dict(title='Rating', ticklabelstandoff=10),
    yaxis=dict(title='Compound Score', ticklabelstandoff=10),
    margin=dict(t=50, b=50, l=50, r=50),
)
fig1.update_layout(**layout_options)

fig1.show()

In [132]:
# Three side-by-side box plots (positive / neutral / negative VADER score),
# each split by star rating.
fig2 = make_subplots(
    rows=1, cols=3,
    subplot_titles=('VADER Positive', 'VADER Neutral', 'VADER Negative')
)

colors = px.colors.sequential.Agsunset[:5]
ratings = sorted(reviews['rating'].unique())

# Custom hover template with box statistics
hover_text = (
    '<b>%{fullData.name}</b><br>'
    'Min: %{customdata[0]:.3f}<br>'
    'Q1: %{customdata[1]:.3f}<br>'
    'Median: %{customdata[2]:.3f}<br>'
    'Mean: %{customdata[3]:.3f}<br>'
    'Q3: %{customdata[4]:.3f}<br>'
    'Max: %{customdata[5]:.3f}<br>'
    '<extra></extra>'
)

# One entry per subplot, left to right: (score column, y-axis title).
panels = [
    ('vader_positive', 'Positive Score'),
    ('vader_neutral', 'Neutral Score'),
    ('vader_negative', 'Negative Score'),
]

for col, (score_column, axis_title) in enumerate(panels, start=1):
    for i, rating in enumerate(ratings):
        data = reviews.loc[reviews['rating'] == rating, score_column]
        # Same order as the placeholders in `hover_text`.
        stats = [data.min(), data.quantile(0.25), data.median(), data.mean(), data.quantile(0.75), data.max()]
        fig2.add_trace(
            go.Box(
                y=data,
                name=f'{rating} Rating',
                marker=dict(color=colors[i]),
                boxmean=True,
                legendgroup=str(rating),
                # Only the first subplot contributes legend entries; the shared
                # legendgroup ties the matching boxes in the other subplots to them.
                showlegend=(col == 1),
                hovertemplate=hover_text,
                # every point of the box carries the same summary statistics
                customdata=[stats] * len(data),
            ),
            row=1, col=col,
        )

    fig2.update_xaxes(ticklabelstandoff=10, row=1, col=col)
    fig2.update_yaxes(title_text=axis_title, row=1, col=col)

fig2.update_layout(
    width=1750, height=750,
    title={'text':'VADER Sentiment Scores by Rating', 'x':0.5},
    showlegend=True,
    hoverlabel=dict(namelength=-1, bgcolor="white", bordercolor="black")
)

fig2.show()


# ROBERTA Sentiment Scoring (Pre-Trained Transformer Model --Hugging Face)

In [121]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [89]:
# # Chandar Research Lab's NeoBERT model
# NeoBERT = 'chandar-lab/NeoBERT'
# NeoBERT_tokenizer = AutoTokenizer.from_pretrained(NeoBERT, trust_remote_code=True)
# NeoBERT_model = AutoModelForSequenceClassification.from_pretrained(NeoBERT, trust_remote_code=True)

# NOTE: cannot run this model on macOS (revisit this later)

In [122]:
# Re-print the example review and its VADER scores as a baseline for the RoBERTa results.
print(example)
pprint(sia.polarity_scores(example))

i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
{'compound': -0.9493, 'neg': 0.191, 'neu': 0.742, 'pos': 0.068}


In [123]:
# Load a RoBERTa-base sentiment model [~125M parameters]. RoBERTa itself was
# released by Meta (Facebook) AI in 2019; this checkpoint is the one published
# under the `cardiffnlp` Hugging Face organisation and, per its name, is tuned
# for sentiment on Twitter text.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Sanity check: parameter count and the architecture name used for column prefixes.
print(roberta_model.num_parameters())
print(roberta_model.config.model_type)

124647939
roberta


In [None]:
# Example of running the model on text
encoded_text = roberta_tokenizer(example, return_tensors='pt')
output = roberta_model(**encoded_text)
# First output field, first (only) sequence in the batch -> one raw score per class.
score = output[0][0].detach().numpy()
# Normalise the raw scores so the three values sum to 1.
scores = softmax(score)
scores_dict = {
    'roberta_negative': float(scores[0]),
    'roberta_neutral': float(scores[1]),
    'roberta_positive' : float(scores[2])
}
pprint(scores_dict)

{'roberta_negative': 0.9759707450866699,
 'roberta_neutral': 0.02143993228673935,
 'roberta_positive': 0.0025893172714859247}


In [125]:
# CALCULATE SENTIMENT SCORES (v1.0)

def sentiment_scores_roberta(text, model, tokenizer):
    '''
    Score one text with a three-class sentiment model (v1.0, no length handling).

    The text is tokenized without truncation, so inputs longer than the model
    accepts make the model call fail (the notebook shows a RuntimeError for a
    597-token review).

    Parameters:
        text: the string to score
        model: sequence-classification model; its first output field is read as
            per-class scores in negative, neutral, positive order
        tokenizer: tokenizer matching `model`

    Returns:
        dict with keys '<model_type>_negative', '<model_type>_neutral' and
        '<model_type>_positive' holding the softmax-normalised scores as floats
    '''
    model_inputs = tokenizer(text, return_tensors='pt')
    raw_scores = model(**model_inputs)[0][0]
    probabilities = softmax(raw_scores.detach().numpy())

    labels = ('negative', 'neutral', 'positive')
    return {
        f'{model.config.model_type}_{label}': float(probabilities[position])
        for position, label in enumerate(labels)
    }

In [94]:
# Apply the v1.0 scorer to every review. As the output shows, this stops with a
# RuntimeError on the first review that is too long for the model, so the
# assignment on the second line is never reached.
roberta_sentiment = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta(text, roberta_model, roberta_tokenizer))
reviews[['roberta_negative', 'roberta_neutral', 'roberta_positive']] = roberta_sentiment.apply(pd.Series)

# ISSUE: RoBERTa-base has a maximum token length of 512 but some review text is longer than that

  0%|          | 0/1266 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (597) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 597].  Tensor sizes: [1, 514]

In [126]:
# CALCULATE SENTIMENT SCORES VIA TOKEN CHUNKING (v1.1)

def sentiment_scores_roberta_chunked(text, model, tokenizer, chunk_size=512, overlap=50):
    '''
    Process long texts by chunking and using a weighted-average by chunk-percentage for sentiment score

    Texts of at most `chunk_size` tokens are scored in a single model call.
    Longer texts are split into windows of up to `chunk_size` tokens, where
    consecutive windows share `overlap` tokens; each window is scored on its
    own and the per-class scores are averaged, weighted by window length.

    Parameters:
        text: the string to score
        model: sequence-classification model; its first output field is read as
            per-class scores in negative, neutral, positive order
        tokenizer: tokenizer matching `model`
        chunk_size: maximum number of tokens per model call; must not exceed
            what the model accepts
        overlap: number of tokens shared by consecutive windows

    Returns:
        dict with keys '<model_type>_negative', '<model_type>_neutral' and
        '<model_type>_positive' holding the scores as floats

    Raises:
        ValueError: if the text needs chunking and `overlap` is not smaller
            than `chunk_size` (the windows could not advance)

    NOTE(review): windows are sliced out of the already-encoded token sequence,
    so interior windows do not start/end with whatever special tokens the
    tokenizer adds around the full text -- confirm this is acceptable for the
    model in use.
    '''
    import torch  # local import: `torch` is not imported at notebook level

    encoded_text = tokenizer(text, return_tensors='pt', truncation=False)
    token_ids = encoded_text['input_ids'][0]
    total_tokens = len(token_ids)

    # Inference only: no gradients are needed, so do not track them.
    with torch.no_grad():
        if total_tokens <= chunk_size:
            # Short enough to fit within one chunk: reuse the encoding from above.
            output = model(**encoded_text)
            scores = softmax(output[0][0].detach().numpy())
        else:
            stride = chunk_size - overlap
            if stride <= 0:
                raise ValueError(
                    f'overlap ({overlap}) must be smaller than chunk_size ({chunk_size})'
                )

            chunk_scores = []
            chunk_weights = []

            for chunk_start in range(0, total_tokens, stride):
                chunk_end = min(chunk_start + chunk_size, total_tokens)
                chunk_ids = token_ids[chunk_start:chunk_end].unsqueeze(0)

                output = model(input_ids=chunk_ids)
                chunk_scores.append(softmax(output[0][0].detach().numpy()))

                # Weight each window by its share of the text's tokens.
                chunk_weights.append((chunk_end - chunk_start) / total_tokens)

                # This window already reaches the end of the text; a further
                # window would lie entirely inside it and be counted twice.
                if chunk_end == total_tokens:
                    break

            # Weighted average (np.average normalises the weights, which sum
            # to more than 1 because of the overlap).
            scores = np.average(np.array(chunk_scores), axis=0, weights=np.array(chunk_weights))

    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }
    return scores_dict

In [127]:
# Score every review with the chunked scorer and store the three class scores
# in the `*_chunked` columns.

chunked_columns = ['roberta_negative_chunked', 'roberta_neutral_chunked', 'roberta_positive_chunked']
roberta_sentiment = reviews['review_text'].progress_apply(
    lambda review: sentiment_scores_roberta_chunked(review, roberta_model, roberta_tokenizer)
)
reviews[chunked_columns] = roberta_sentiment.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [128]:
# Preview the first 10 rows, now including the chunked RoBERTa score columns.
reviews.head(10)

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738
5,ChdDSUhNMG9nS0VJQ0FnTURvckxmQTVBRRAB,5.0,0,2025-12-16,2025-05-20,Google Maps,i got come here for the first time without che...,thank you for the great review! we’re glad you...,1,0.029,0.759,0.212,0.9651,0.002146,0.014832,0.983022
6,Ci9DQUlRQUNvZENodHljRjlvT21WSFRITjFSV1JHYTJKcF...,1.0,0,2025-12-16,2025-09-17,Google Maps,"i rarely leave negative reviews, but this expe...","hi siva prasad, thank you for sharing your fee...",1,0.216,0.592,0.192,-0.8064,0.974687,0.022387,0.002926
7,Ci9DQUlRQUNvZENodHljRjlvT25GQ01rTTViVFUxTm5kM1...,1.0,0,2025-12-16,2025-09-17,Google Maps,possible the worst hair cut i have ever gotten...,"hi gregg, thank you for sharing your feedback ...",1,0.265,0.66,0.075,-0.9424,0.933269,0.05803,0.008701
8,Ci9DQUlRQUNvZENodHljRjlvT21OeFJuUTJjalpTYldkbW...,1.0,0,2025-12-16,2025-07-19,Google Maps,the haircut was not good. there are so many ot...,"hi kinzington, thank you for sharing your feed...",1,0.144,0.713,0.143,0.4335,0.955561,0.040021,0.004418
9,ChZDSUhNMG9nS0VJQ0FnSUNQNEx2RkFnEAE,1.0,0,2025-12-16,2024-12-16,Google Maps,i just hair cut at great clip. one vietnamese ...,"we're sorry to hear this, cathy. if you would ...",1,0.258,0.632,0.11,-0.9558,0.950809,0.044461,0.00473


In [129]:
# CALCULATE SENTIMENT SCORE VIA TOKEN TRUNCATION (v1.2)

def sentiment_scores_roberta_trunc(text, model, tokenizer, max_length=512):
    '''
    Score one text with a three-class sentiment model, truncating long inputs.

    The tokenizer is asked to cut the text to at most `max_length` tokens, so
    anything beyond that limit does not contribute to the score.

    Parameters:
        text: the string to score
        model: sequence-classification model; its first output field is read as
            per-class scores in negative, neutral, positive order
        tokenizer: tokenizer matching `model`
        max_length: token limit passed to the tokenizer

    Returns:
        dict with keys '<model_type>_negative', '<model_type>_neutral' and
        '<model_type>_positive' holding the softmax-normalised scores as floats
    '''
    import torch  # local import: `torch` is not imported at notebook level

    encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)
    # Inference only: no gradients are needed, so do not track them.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        f'{model.config.model_type}_negative': float(scores[0]),
        f'{model.config.model_type}_neutral': float(scores[1]),
        f'{model.config.model_type}_positive': float(scores[2]),
    }

    return scores_dict

In [130]:
# Score every review with the truncating scorer.
roberta_sentiment_trunc = reviews['review_text'].progress_apply(lambda text: sentiment_scores_roberta_trunc(text, roberta_model, roberta_tokenizer))
# The `*_trunc` columns must be filled from the truncated results computed on
# the line above, not from `roberta_sentiment`, which holds the chunked scores.
reviews[['roberta_negative_trunc', 'roberta_neutral_trunc', 'roberta_positive_trunc']] = roberta_sentiment_trunc.apply(pd.Series)

  0%|          | 0/1266 [00:00<?, ?it/s]

In [131]:
# Preview the first 10 rows, now including the truncated RoBERTa score columns.
reviews.head(10)

# NOTE: the chunked and truncated scores should only differ for reviews with more than 512 tokens (in this reviews dataset the first occurrence of this is around row 1000)

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,review_text,owner_response_text,location_id,vader_negative,vader_neutral,vader_positive,vader_compound,roberta_negative_chunked,roberta_neutral_chunked,roberta_positive_chunked,roberta_negative_trunc,roberta_neutral_trunc,roberta_positive_trunc
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025-12-16,2025-08-18,Google Maps,i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1,0.191,0.742,0.068,-0.9493,0.975971,0.02144,0.002589,0.975971,0.02144,0.002589
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025-12-16,2025-10-17,Google Maps,9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1,0.063,0.921,0.016,-0.6116,0.755748,0.209195,0.035058,0.755748,0.209195,0.035058
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025-12-16,2025-05-20,Google Maps,worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1,0.152,0.749,0.098,-0.4389,0.913692,0.076231,0.010077,0.913692,0.076231,0.010077
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025-12-16,2025-10-17,Google Maps,myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1,0.07,0.751,0.179,0.7574,0.005103,0.038017,0.95688,0.005103,0.038017,0.95688
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025-12-16,2025-01-20,Google Maps,terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1,0.178,0.717,0.105,-0.9174,0.965808,0.030455,0.003738,0.965808,0.030455,0.003738
5,ChdDSUhNMG9nS0VJQ0FnTURvckxmQTVBRRAB,5.0,0,2025-12-16,2025-05-20,Google Maps,i got come here for the first time without che...,thank you for the great review! we’re glad you...,1,0.029,0.759,0.212,0.9651,0.002146,0.014832,0.983022,0.002146,0.014832,0.983022
6,Ci9DQUlRQUNvZENodHljRjlvT21WSFRITjFSV1JHYTJKcF...,1.0,0,2025-12-16,2025-09-17,Google Maps,"i rarely leave negative reviews, but this expe...","hi siva prasad, thank you for sharing your fee...",1,0.216,0.592,0.192,-0.8064,0.974687,0.022387,0.002926,0.974687,0.022387,0.002926
7,Ci9DQUlRQUNvZENodHljRjlvT25GQ01rTTViVFUxTm5kM1...,1.0,0,2025-12-16,2025-09-17,Google Maps,possible the worst hair cut i have ever gotten...,"hi gregg, thank you for sharing your feedback ...",1,0.265,0.66,0.075,-0.9424,0.933269,0.05803,0.008701,0.933269,0.05803,0.008701
8,Ci9DQUlRQUNvZENodHljRjlvT21OeFJuUTJjalpTYldkbW...,1.0,0,2025-12-16,2025-07-19,Google Maps,the haircut was not good. there are so many ot...,"hi kinzington, thank you for sharing your feed...",1,0.144,0.713,0.143,0.4335,0.955561,0.040021,0.004418,0.955561,0.040021,0.004418
9,ChZDSUhNMG9nS0VJQ0FnSUNQNEx2RkFnEAE,1.0,0,2025-12-16,2024-12-16,Google Maps,i just hair cut at great clip. one vietnamese ...,"we're sorry to hear this, cathy. if you would ...",1,0.258,0.632,0.11,-0.9558,0.950809,0.044461,0.00473,0.950809,0.044461,0.00473


In [None]:
# Count how many tokens the RoBERTa tokenizer produces for the review at
# position 999 (one of the long reviews).
test_text = reviews['review_text'].iloc[999]

test_encoded = roberta_tokenizer(test_text, return_tensors='pt')
num_tokens = test_encoded['input_ids'].shape[1]
print(num_tokens)

597


In [133]:
# Export both VADER figures as standalone HTML files.
figure_exports = (
    (fig1, '../reports/figures/VADERCompoundSentimentByRating.html'),
    (fig2, '../reports/figures/VADERSentimentScoreDecompByRating.html'),
)
for figure, html_path in figure_exports:
    figure.write_html(html_path)

# Abstractive Summarization Feature

In [150]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Read the Hugging Face access token from the project's .env file so it never
# has to be written into the notebook itself.
load_dotenv('../.env')
HF_TOKEN = os.getenv('HUGGINGFACE_ACCESS_TOKEN')  # None when the variable is not set

# Authenticate this session with the Hugging Face Hub.
login(token=HF_TOKEN)

## Model selection reasoning

Abstractive Summarization Feature GOAL: find a model that can take a large amount of review text (partitioned by location) and summarize what reviewers most commonly say about the location and its service.

1) My first idea was to use summarization-tasked models (e.g. facebook/bart-large-cnn), but after some light usage testing I found that these models did not meet my requirements for summarizing Google reviews. In my tests they behaved like 'extractive summarization' models (largely reusing sentences from the input) rather than the 'abstractive summarization' models I needed for summarizing vast amounts of review text. These models also have relatively small input token limits.

2) My second idea was to find abstractive summarization models (e.g. google/pegasus-xsum), but again, after some light usage testing on only a small portion of example review text, I found that these models were not substantive enough: they produced very short summaries even after some tuning of the generation parameters, and they are prone to hallucination, as demonstrated by other users of the model. These models also have very small input token limits (~500 tokens) and small context windows, which would not satisfy my need to process potentially thousands of input tokens.

3) Upon more research, I compared three task categories: "extractive summarization", "abstractive summarization", and "text generation". I found that text generation models are capable of more abstractive summarization and have larger input token limits but, like abstractive summarization models, they are prone to hallucinations in their outputs. Even though hallucinations are a likely possibility when using a text generation model, I think the risk is well worth the potential to accomplish the goal to a high degree. Based on my trials, these models seem to perform well with user prompting (which I think may lower the rate of hallucinations and other undesirable outputs). For the model, I am selecting the Google Gemma 3 270M-parameter model since it is lightweight (<1B parameters) and has an input/output size limit of 32K tokens, which meets all the requirements I have for this summarization feature.

In [None]:
# REMOVE ME
from transformers import pipeline

# Quick experiment with the high-level text-generation pipeline, using the
# instruction-tuned Gemma 3 270M checkpoint.
review_summary_pipe = pipeline('text-generation',
                               model='google/gemma-3-270m-it')

Device set to use mps:0


In [161]:
# Feed the raw review text to the pipeline with default generation settings.
# The printed text begins with the input review followed by whatever the model appended.
output = review_summary_pipe(example)
print(output[0]['generated_text'])

i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.
The store is clean and well-maintained, with plenty of space for customers to move around.
The staff is friendly and helpful. They are very knowledgeable about the products and services.
The store is clean and well-maintained.
The store is w

In [164]:
# Same call with explicit generation parameters (minimum length, beam search,
# sampling temperature, repetition penalty) to see how they affect the output.
output = review_summary_pipe(example,
                             min_length=100,
                             num_beams=5,
                             temperature=0.75,
                             repetition_penalty=1.2,
                             do_sample=True)
pprint(output[0]['generated_text'])

('i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed reference pictures of the style we '
 'wanted. the stylist (ben)did not listen patiently, rushed through the '
 'process, and completely ignored the details we explained. the haircut looked '
 'nothing like what we asked for. my kid was so upset with the result that he '
 'cried the entire time while doing..the experience upset him so much that now '
 'he doesn’t even want to go to school tomorrow. this was a very disappointing '
 'experience, and i would not recommend this stylist or this location. i hope '
 'management addresses this so other customers don’t go through the same '
 'frustration.\n'
 'i brought my son to this location today for a back-to-school haircut. there '
 'is no proper welcoming into the store too they are not showing any respect '
 'towards customers… i clearly showed

In [183]:
# Load model directly (more fine-grain control with the model -- WILL USE THIS METHOD)
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")

# Single-turn chat prompt asking the model to rewrite the example review.
messages = [
    {"role": "user",
     "content": f"Revise this review text for proper grammar and clarity: {example}"},
]

# Render the messages with the model's chat template and tokenize them,
# then move the tensors to the device the model lives on.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs,
                         max_new_tokens=1092,
                         temperature=0.8)

# `outputs[0]` holds the prompt tokens followed by the generated ones, so slice
# off the prompt. `skip_special_tokens=True` keeps control markers such as
# `<end_of_turn>` out of the printed text.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Here's a revised version of the review text, aiming for better grammar and clarity:

"I brought my son to this location today for a back-to-school haircut. There is no proper welcome into the store, and they are not showing any respect for customers. I clearly showed reference pictures of the style we wanted. The stylist (Ben) did not listen patiently, rushed through the process, and completely ignored the details we explained. The haircut looked nothing like what we asked for. My child was so upset with the result, and he cried for the entire time. The experience was very disappointing, and I would not recommend this stylist or this location. I hope management addresses this issue so other customers do not experience this frustration."
<end_of_turn>
