In [1]:
!pip install pyarrow fastparquet huggingface_hub matplotlib seaborn


Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
Successfully installed cramjam-2.9.0 fastparquet-2024.11.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from IPython.display import display, Markdown, Latex
from sklearn.model_selection import train_test_split

In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer #Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import torch.nn as nn
import torch.optim as optim
import torch

In [4]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["test"])
df = pd.concat([df_train, df_test], ignore_index=True)
index_list = range(len(df))
random.seed(10701)
indices = random.sample(index_list, 1000)
df = df.iloc[indices] # using only 10,000 rows for PoC to make training quicker
df['ground_sentiment'] = df['label'].map(lambda x: 1 if x == x>=2 else 0) # ground truth sentiment labels derived from the 'label' feature

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## **Rule-Based Models: Baseline**

Using lexical methods (VADER)

> Add blockquote



In [5]:
nltk.download('vader_lexicon')

rule_based = SentimentIntensityAnalyzer()

def analyze_sentiment_vader(row):
    result = rule_based.polarity_scores(row['text'])
    compound_score = result['compound']
    translated_label = 'Neutral'
    confidence = result['compound']
    if compound_score >= 0.05:
        translated_label = 'Positive'
    elif compound_score <= -0.05:
        translated_label = 'Negative'
        confidence *= -1
    else:
      confidence = (1- abs(confidence))

    return pd.Series([translated_label, confidence])
df[['sentiment', 'confidence']] = df.apply(analyze_sentiment_vader, axis=1)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [6]:
average_confidence = df.groupby('sentiment')['confidence'].mean()
display(Markdown("**Average confidence value of prediction grouped by predicted sentiment**"))
display(average_confidence)

**Average confidence value of prediction grouped by predicted sentiment**

Unnamed: 0_level_0,confidence
sentiment,Unnamed: 1_level_1
Negative,0.632132
Neutral,0.993693
Positive,0.826596


In [7]:
average_label = df.groupby('sentiment')['label'].mean()
display(Markdown("**Average label (number of stars) grouped by predicted sentiment**"))
display(average_label)

**Average label (number of stars) grouped by predicted sentiment**

Unnamed: 0_level_0,label
sentiment,Unnamed: 1_level_1
Negative,0.791444
Neutral,1.466667
Positive,2.364662


In [8]:
accuracy = np.mean(
    (df['label'].isin([0, 1]) & (df['sentiment'] == 'Negative')) |
    (df['label'].isin([2, 3, 4]) & (df['sentiment'] == 'Positive')) |
    ((df['sentiment'] == 'Neutral'))
    )
display(Markdown("**Accuracy compared to ground truth labels**"))
display(accuracy)

**Accuracy compared to ground truth labels**

0.734

## **Sequence Models: RNNs**



In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
X = df['text'].values
y = df['sentiment'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
class SentimentDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, maxlen=128):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=self.maxlen, return_tensors='pt')
        item = {key: encoding[key].squeeze() for key in encoding}  # remove extra dimension
        item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return item

In [13]:
texts = df['text'].tolist()
labels = df['ground_sentiment'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [14]:
class RNNSentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, dropout=0.25):
        super(RNNSentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = (self.embedding(x))
        output, (hidden, cell) = self.rnn(embedded)
        hidden = self.dropout(hidden[-1, :, :])
        output = self.fc(hidden)
        return output

In [15]:
embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 4
dropout = 0.25

model = RNNSentimentModel(vocab_size=len(tokenizer.vocab), embed_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=n_layers, dropout=dropout)

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.to(device)



def evaluate(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, targets)

            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            correct_preds += (preds == targets).sum().item()
            total_preds += targets.size(0)

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_preds / total_preds
    return avg_loss, accuracy


def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0

    correct_preds = 0
    total_preds = 0

    for batch in dataloader:
      optimizer.zero_grad()

      inputs = batch['input_ids'].to(device)
      targets = batch['targets'].to(device)

      preds = model(inputs)
      loss = criterion(preds, targets)

      loss.backward()
      optimizer.step()

      total_loss += loss.item()

      preds = torch.argmax(preds, dim=1)
      correct_preds += (preds == targets).sum().item()
      total_preds += targets.size(0)

    avg_loss = total_loss / len(dataloader)
    acc = correct_preds / total_preds

    return avg_loss, acc

num_epochs = 10

for epoch in range(num_epochs):
  train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
  val_loss, val_acc = evaluate(model, val_loader, criterion)

  print(f"Epoch {epoch + 1}/{num_epochs}")
  print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
  print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

Epoch 1/10
Train Loss: 0.6600, Train Accuracy: 0.6362
Val Loss: 0.7021, Val Accuracy: 0.5200
Epoch 2/10
Train Loss: 0.6311, Train Accuracy: 0.6675
Val Loss: 0.7679, Val Accuracy: 0.5200
Epoch 3/10
Train Loss: 0.5883, Train Accuracy: 0.7175
Val Loss: 0.7426, Val Accuracy: 0.5300
Epoch 4/10
Train Loss: 0.5232, Train Accuracy: 0.7675
Val Loss: 0.8518, Val Accuracy: 0.5450
Epoch 5/10
Train Loss: 0.4653, Train Accuracy: 0.7825
Val Loss: 1.0105, Val Accuracy: 0.5400
Epoch 6/10
Train Loss: 0.3881, Train Accuracy: 0.8200
Val Loss: 1.2400, Val Accuracy: 0.5450
Epoch 7/10
Train Loss: 0.3507, Train Accuracy: 0.8387
Val Loss: 1.1220, Val Accuracy: 0.5450
Epoch 8/10
Train Loss: 0.3801, Train Accuracy: 0.8300
Val Loss: 1.2620, Val Accuracy: 0.5100
Epoch 9/10
Train Loss: 0.3250, Train Accuracy: 0.8438
Val Loss: 1.7225, Val Accuracy: 0.5400
Epoch 10/10
Train Loss: 0.3098, Train Accuracy: 0.8475
Val Loss: 1.6109, Val Accuracy: 0.5200
