In [86]:
!pip install pyarrow fastparquet huggingface_hub matplotlib seaborn



In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

In [88]:
# Debugging aid for CUDA errors.
# NOTE(review): presumably left over from debugging a GPU failure; it is
# expected to slow GPU inference, so consider removing it — confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [89]:
# Location of the Yelp review parquet shards on the Hugging Face Hub.
HF_DATASET_ROOT = "hf://datasets/Yelp/yelp_review_full/"
SAMPLE_SEED = 10701  # fixed so the same rows are drawn on every run
N_SAMPLES = 1000     # PoC subset size; keeps per-row inference quick

splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df_train = pd.read_parquet(HF_DATASET_ROOT + splits["train"])
df_test = pd.read_parquet(HF_DATASET_ROOT + splits["test"])

# Pool both splits: the models below are only evaluated, never trained here.
df = pd.concat([df_train, df_test], ignore_index=True)

# Draw a reproducible random subset of row positions.
index_list = range(len(df))
random.seed(SAMPLE_SEED)
indices = random.sample(index_list, N_SAMPLES)

# .copy() makes the subset an independent frame, so the prediction columns
# added in later cells do not trigger SettingWithCopyWarning.
df = df.iloc[indices].copy()  # using only N_SAMPLES (1,000) rows for the PoC

In [90]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [91]:
# Display which device was selected above.
device

device(type='cuda')

### RoBERTa

In [None]:
# Sentiment pipeline backed by the cardiffnlp Twitter RoBERTa checkpoint,
# placed on the device selected earlier in the notebook.
roberta_sentiment_pipeline = pipeline(
    task='sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment',
    tokenizer='cardiffnlp/twitter-roberta-base-sentiment',
    device=device,
)

In [None]:
def analyze_sentiment(row):
    """Classify one review with the RoBERTa sentiment pipeline.

    Parameters
    ----------
    row : mapping-like
        Must provide a ``'text'`` entry (a DataFrame row when called through
        ``df.apply(..., axis=1)``).

    Returns
    -------
    pd.Series
        Two positional values: the readable sentiment name and the score the
        pipeline reported for that prediction.

    Raises
    ------
    KeyError
        If the pipeline returns a label other than LABEL_0 / LABEL_1 / LABEL_2.
    """
    readable_names = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive',
    }
    # The pipeline returns a list of predictions; only the first one is used.
    # Input is truncated to at most 512 tokens.
    predictions = roberta_sentiment_pipeline(row['text'], truncation=True, max_length=512)
    top = predictions[0]
    return pd.Series([readable_names[top['label']], top['score']])

In [None]:
# Score every review row by row; the two returned values become new columns.
df[['sentiment', 'confidence']] = df.apply(analyze_sentiment, axis='columns')

In [None]:
# Preview the review text next to the RoBERTa prediction and its score.
df.loc[:, ['text', 'sentiment', 'confidence']].head()

Unnamed: 0,text,sentiment,confidence
412564,This was the first place in Vegas where the Ye...,Negative,0.660865
171013,We stayed in one of the rooms in the actual Lu...,Negative,0.926861
303456,I've been to Carnevino twice and have been ver...,Negative,0.570523
584751,"I love there food, but the wait.....The long l...",Negative,0.768381
611725,It was on time. That's all the good I can say....,Negative,0.798004


In [None]:
# Read one full review (second row of the sample) to sanity-check its prediction.
df['text'].iat[1]

'We stayed in one of the rooms in the actual Luxor Pyramid.  From a structural standpoint, the elevators in the pyramid are an unpleasant ride.  Every time we went into the elevators, we heard complaints from every guest, us included.  Not only is it uneasy riding \\"sideways\\" to your room, the elevators make you more uneasy because of the squeaking and the lack of smoothness of the ride. In our room, everything looked very worn.  On our dresser wardrobe, the paint was faded and one of the handles was missing.  In the bathroom, some of tile had also fallen off.  Overall lighting was poor in both the bathroom and the main room, even when the drapes were drawn back.  My biggest complaint was a huge 4ft by 4ft wet spot between the foot of one of our beds and the door that neighbors the next room.  We had called a few times and let housekeeping know about this incident but the entire weekend we were there, and nobody came up to fix the problem.  We ended up putting a chair to create a br

In [None]:
# How many sampled reviews RoBERTa assigned to each sentiment class.
df.loc[:, 'sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Positive,589
Negative,329
Neutral,82


In [None]:
# Mean RoBERTa score within each predicted sentiment class.
average_confidence = df.groupby('sentiment')['confidence'].agg('mean')
display(average_confidence)

Unnamed: 0_level_0,confidence
sentiment,Unnamed: 1_level_1
Negative,0.757117
Neutral,0.469608
Positive,0.864947


In [None]:
# Mean dataset `label` within each RoBERTa-predicted class.
# NOTE(review): `label` is presumably the Yelp star-rating class — confirm.
average_label = df.groupby('sentiment')['label'].agg('mean')
display(average_label)

Unnamed: 0_level_0,label
sentiment,Unnamed: 1_level_1
Negative,0.677812
Neutral,1.47561
Positive,2.908319


### DistilBERT

In [None]:
# Sentiment pipeline backed by the DistilBERT SST-2 checkpoint, placed on the
# device selected earlier in the notebook.
distilbert_sentiment_pipeline = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    tokenizer='distilbert-base-uncased-finetuned-sst-2-english',
    device=device,
)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# NOTE: this redefines `analyze_sentiment` from the RoBERTa section. After this
# cell runs, re-running the RoBERTa apply cell would silently use DistilBERT.
def analyze_sentiment(row):
    """Classify one review with the DistilBERT sentiment pipeline.

    Parameters
    ----------
    row : mapping-like
        Must provide a ``'text'`` entry (a DataFrame row when called through
        ``df.apply(..., axis=1)``).

    Returns
    -------
    pd.Series
        Two positional values: the title-cased sentiment name and the score
        the pipeline reported for that prediction.

    Raises
    ------
    KeyError
        If the pipeline returns a label other than NEGATIVE / NEUTRAL / POSITIVE.
    """
    readable_names = {
        'NEGATIVE': 'Negative',
        'NEUTRAL': 'Neutral',
        'POSITIVE': 'Positive',
    }
    # The pipeline returns a list of predictions; only the first one is used.
    # Input is truncated to at most 512 tokens.
    predictions = distilbert_sentiment_pipeline(row['text'], truncation=True, max_length=512)
    top = predictions[0]
    return pd.Series([readable_names[top['label']], top['score']])

In [None]:
# Score every review with DistilBERT; results go into their own columns so the
# RoBERTa predictions are kept for comparison.
df[['sentiment_distil', 'confidence_distil']] = df.apply(analyze_sentiment, axis='columns')

In [None]:
# Preview the first rows, now including the DistilBERT columns.
df.head(n=5)

Unnamed: 0,label,text,sentiment,confidence,sentiment_distil,confidence_distil
412564,1,This was the first place in Vegas where the Ye...,Negative,0.603374,Negative,0.998504
171013,0,We stayed in one of the rooms in the actual Lu...,Negative,0.612196,Negative,0.999731
303456,0,I've been to Carnevino twice and have been ver...,Negative,0.613428,Negative,0.979577
584751,1,"I love there food, but the wait.....The long l...",Negative,0.625324,Negative,0.99715
611725,1,It was on time. That's all the good I can say....,Negative,0.61004,Negative,0.999511


In [None]:
# How many sampled reviews DistilBERT assigned to each sentiment class.
df.loc[:, 'sentiment_distil'].value_counts()

Unnamed: 0_level_0,count
sentiment_distil,Unnamed: 1_level_1
Positive,504
Negative,496


In [None]:
# Mean DistilBERT score within each predicted sentiment class.
average_confidence = df.groupby('sentiment_distil')['confidence_distil'].agg('mean')
display(average_confidence)

Unnamed: 0_level_0,confidence_distil
sentiment_distil,Unnamed: 1_level_1
Negative,0.970956
Positive,0.974071


In [None]:
# Mean dataset `label` within each DistilBERT-predicted class.
# NOTE(review): `label` is presumably the Yelp star-rating class — confirm.
average_label = df.groupby('sentiment_distil')['label'].agg('mean')
display(average_label)

Unnamed: 0_level_0,label
sentiment_distil,Unnamed: 1_level_1
Negative,1.032258
Positive,3.065476


### FinBERT

In [93]:
# Sentiment pipeline backed by the FinBERT tone checkpoint.
# NOTE(review): device=-1 differs from the other two pipelines, which use
# `device`; presumably this pins FinBERT to CPU on purpose — confirm.
fin_sentiment_pipeline = pipeline(
    task='sentiment-analysis',
    model='yiyanghkust/finbert-tone',
    tokenizer='yiyanghkust/finbert-tone',
    device=-1,
)

In [98]:
def analyze_finbert_sentiment(row):
    """Classify one review with the FinBERT sentiment pipeline.

    Parameters
    ----------
    row : mapping-like
        Must provide a ``'text'`` entry (a DataFrame row when called through
        ``df.apply(..., axis=1)``).

    Returns
    -------
    pd.Series
        Two positional values: the label exactly as the pipeline reports it
        (no translation table is applied) and the score for that prediction.
    """
    # The pipeline returns a list of predictions; only the first one is used.
    # Input is truncated to at most 512 tokens.
    predictions = fin_sentiment_pipeline(row['text'], truncation=True, max_length=512)
    top = predictions[0]
    sentiment, confidence = top['label'], top['score']
    return pd.Series([sentiment, confidence])

In [99]:
# Score every review with FinBERT; results go into their own pair of columns.
df[['sentiment_fin', 'confidence_fin']] = df.apply(analyze_finbert_sentiment, axis='columns')

In [100]:
# Preview the first rows, now including the FinBERT columns.
df.head(n=5)

Unnamed: 0,label,text,sentiment_fin,confidence_fin
412564,1,This was the first place in Vegas where the Ye...,Negative,0.867933
171013,0,We stayed in one of the rooms in the actual Lu...,Negative,0.998929
303456,0,I've been to Carnevino twice and have been ver...,Negative,0.96799
584751,1,"I love there food, but the wait.....The long l...",Negative,0.969553
611725,1,It was on time. That's all the good I can say....,Negative,0.995087


In [101]:
# How many sampled reviews FinBERT assigned to each sentiment class.
df.loc[:, 'sentiment_fin'].value_counts()

Unnamed: 0_level_0,count
sentiment_fin,Unnamed: 1_level_1
Neutral,609
Positive,290
Negative,101


In [102]:
# Mean FinBERT score within each predicted sentiment class.
average_confidence = df.groupby('sentiment_fin')['confidence_fin'].agg('mean')
display(average_confidence)

Unnamed: 0_level_0,confidence_fin
sentiment_fin,Unnamed: 1_level_1
Negative,0.913897
Neutral,0.949656
Positive,0.95512


In [103]:
# Mean dataset `label` within each FinBERT-predicted class.
# NOTE(review): `label` is presumably the Yelp star-rating class — confirm.
average_label = df.groupby('sentiment_fin')['label'].agg('mean')
display(average_label)

Unnamed: 0_level_0,label
sentiment_fin,Unnamed: 1_level_1
Negative,0.930693
Neutral,1.891626
Positive,2.796552
