# 3-Class BiLSTM Sentiment Classifier on Yelp Reviews

**High-level summary:**  
An end-to-end PyTorch pipeline that loads Yelp review data from Parquet files, maps star ratings to 3 sentiment classes (Negative, Neutral, Positive), preprocesses the text, builds a vocabulary initialized with GloVe embeddings, constructs the training and test DataLoaders (optionally with a class-balanced weighted sampler), defines a BiLSTM classifier, trains it with class-imbalance handling (a class-weighted loss by default), evaluates it with detailed metrics, saves the model, and provides an inference utility.

In [None]:
# Mount Google Drive so the notebook can read and write files stored there.
# This cell is Colab-specific: it relies on the `google.colab` package.

from google.colab import drive
drive.mount('/content/drive')

# Make the project folder on Drive the working directory, so the relative
# paths used further down ("datastore/...", "model/...") resolve against it.

import os

os.chdir('/content/drive/My Drive/CS605-NLP-Project')

Mounted at /content/drive


In [None]:
# Install
#!pip install --upgrade numpy gensim --no-cache-dir


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Reads the Yelp train/test sets (in Parquet format) into pandas.

Reads your USS reviews CSV for later inference.

Prints out the number of rows/columns and shows the first few records of each.

In [None]:
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# 0. LOAD DATA
# Read the train/test splits from Parquet (paths are relative to the current
# working directory; update them as needed). The code below reads the 'text'
# and 'label' columns of both frames.
train_df = pd.read_parquet("datastore/train-00000-of-00001.parquet")
test_df  = pd.read_parquet("datastore/test-00000-of-00001.parquet")

# 1. LOAD & MAP LABELS
def map_label(star):
    """Collapse a raw rating label into one of three sentiment classes.

    Labels 0 and 1 map to 0 (Negative), label 2 maps to 1 (Neutral), and
    every other value maps to 2 (Positive).
    """
    if star == 2:
        return 1   # Neutral
    # Everything that is not neutral is either one of the two lowest labels
    # (Negative) or falls through to Positive.
    return 0 if star in (0, 1) else 2

# Replace the raw rating labels with the 3-class sentiment labels, in place,
# for both splits.
for _split_df in (train_df, test_df):
    _split_df['label'] = _split_df['label'].apply(map_label)

# Show how many reviews fall into each sentiment class, split by split.
for _heading, _split_df in (
    ("Training set class distribution:", train_df),
    ("\nTest set class distribution:", test_df),
):
    print(_heading)
    print(_split_df['label'].value_counts().sort_index())

# 'balanced' class weights are inversely proportional to class frequency;
# they are used later to weight the loss when the balanced sampler is off.
train_labels = train_df['label'].values
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
print(f"\nClass weights: {class_weights}")

# 2. TEXT PREPROCESSING
def preprocess_text(text):
    """Lower-case *text*, strip punctuation, and split it into tokens.

    Every character other than ``a-z``, ``0-9`` and whitespace is deleted
    (not replaced by a space), so ``"I've"`` becomes the single token
    ``"ive"``.

    Args:
        text: The raw review text. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as an empty review; any
            other non-string value is converted with ``str()`` first.

    Returns:
        A list of token strings; empty when there is nothing to tokenize.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.split()

# 3. BUILD VOCAB
MAX_VOCAB_SIZE = 20000
PAD = '<PAD>'
UNK = '<UNK>'

# Count how often every token occurs across the training reviews.
token_counts = {}
for txt in tqdm(train_df['text'], desc='Counting tokens'):
    for tok in preprocess_text(txt):
        if tok in token_counts:
            token_counts[tok] += 1
        else:
            token_counts[tok] = 1

# Rank tokens by descending frequency and keep the most frequent ones.
# Two slots are reserved for the special tokens, which take ids 0 and 1.
most_common = sorted(token_counts.items(), key=lambda pair: pair[1], reverse=True)
itos = [PAD, UNK]
itos.extend(tok for tok, _ in most_common[:MAX_VOCAB_SIZE - 2])
stoi = dict(zip(itos, range(len(itos))))

# 4. LOAD GloVe
EMBED_DIM = 100
GLOVE_PATH = 'datastore/glove.6B.100d.txt'  # adjust if needed

# Parse the GloVe text file: each line is a word followed by its vector
# components, separated by whitespace.
emb_index = {}
try:
    with open(GLOVE_PATH, encoding='utf8') as f:
        for line in f:
            parts = line.split()
            word  = parts[0]
            vec   = np.array(parts[1:], dtype='float32')
            emb_index[word] = vec
    print(f"Loaded {len(emb_index)} GloVe embeddings")
except FileNotFoundError:
    # Best-effort fallback: with an empty index every vocabulary entry
    # (except padding) gets a random vector below.
    print(f"GloVe file not found at {GLOVE_PATH}. Using random embeddings.")
    emb_index = {}

# Build the (vocab_size, EMBED_DIM) matrix that initializes the model's
# embedding layer; row i holds the vector for itos[i].
vocab_size = len(itos)
emb_matrix = np.zeros((vocab_size, EMBED_DIM))
for i, tok in enumerate(itos):
    if tok == PAD:
        # Keep the padding row at zero. The model copies this matrix over its
        # embedding weights and freezes them, so a random vector here would
        # make every padded position feed noise into the LSTM.
        continue
    vec = emb_index.get(tok)
    if vec is None:
        # Out-of-GloVe token (including <UNK>): draw a random vector only
        # when it is actually needed.
        vec = np.random.normal(scale=0.6, size=(EMBED_DIM,))
    emb_matrix[i] = vec

# 5. DATASET & DATALOADER
MAX_LEN = 200      # every review is padded or truncated to this many token ids
BATCH_SIZE = 128   # batch size used for both the training and the test DataLoader

class YelpDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Reads the ``'text'`` and ``'label'`` columns of *df*. Tokenization and
    id lookup happen lazily in ``__getitem__`` using the module-level
    ``preprocess_text``, ``stoi``, ``PAD``, ``UNK`` and ``MAX_LEN``.
    """

    def __init__(self, df):
        self.texts = df['text'].tolist()
        self.labels = df['label'].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Only the first MAX_LEN tokens can survive truncation, so look up
        # ids for those alone; unknown tokens map to the <UNK> id.
        kept_tokens = preprocess_text(self.texts[idx])[:MAX_LEN]
        ids = [stoi.get(tok, stoi[UNK]) for tok in kept_tokens]
        # Right-pad short reviews up to exactly MAX_LEN ids.
        missing = MAX_LEN - len(ids)
        if missing > 0:
            ids.extend([stoi[PAD]] * missing)
        ids_tensor = torch.tensor(ids, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return ids_tensor, label_tensor

# Wrap both splits; tokenization is deferred to YelpDataset.__getitem__.
train_ds = YelpDataset(train_df)
test_ds  = YelpDataset(test_df)

# Create balanced sampler for training
def create_balanced_sampler(labels):
    """Create a weighted sampler that counteracts class imbalance.

    Each sample is weighted by ``1 / (number of samples in its class)``, so
    every class contributes the same total weight to the draw.

    Args:
        labels: 1-D sequence or array of non-negative integer class ids, one
            per training sample.

    Returns:
        A ``WeightedRandomSampler`` that draws ``len(labels)`` indices with
        replacement.

    Raises:
        ValueError: If *labels* is empty.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError("labels must contain at least one sample")

    class_counts = np.bincount(labels)
    # Index the counts by label *before* inverting: a count looked up through
    # an existing label is never zero, so unused class ids cannot trigger a
    # division by zero, and the whole computation stays vectorized.
    sample_weights = 1.0 / class_counts[labels]

    sampler = WeightedRandomSampler(
        weights=sample_weights,
        num_samples=len(sample_weights),
        replacement=True
    )
    return sampler

# Imbalance-handling switch. False (the default) trains on plainly shuffled
# data and relies on the class-weighted loss configured further down; True
# rebalances the training batches with a weighted sampler instead.
use_balanced_sampler = False  # Set to True to use balanced sampling instead of weighted loss

if not use_balanced_sampler:
    train_ld = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
    print("Using weighted loss function")
else:
    train_sampler = create_balanced_sampler(train_labels)
    train_ld = DataLoader(train_ds, sampler=train_sampler, batch_size=BATCH_SIZE)
    print("Using balanced sampler")

# The test loader is the same under either strategy: no shuffling, no sampler.
test_ld = DataLoader(test_ds, batch_size=BATCH_SIZE)

# 6. MODEL
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier on top of frozen pretrained embeddings.

    The final forward and backward hidden states of the last LSTM layer are
    concatenated, passed through dropout, and projected to class logits.

    Args:
        emb_matrix: NumPy array of shape ``(vocab_size, emb_dim)`` used to
            initialize the embedding layer, which is then frozen.
        hidden_dim: Hidden size of each LSTM direction.
        n_classes: Number of output classes.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the output layer, and
            between LSTM layers when ``n_layers > 1``.
        pad_idx: Token id used for padding. Defaults to the module-level
            ``stoi[PAD]`` when omitted.
    """

    def __init__(self, emb_matrix, hidden_dim, n_classes, n_layers=1, dropout=0.5,
                 pad_idx=None):
        super().__init__()
        if pad_idx is None:
            pad_idx = stoi[PAD]
        self.pad_idx = pad_idx
        vocab_sz, emb_dim = emb_matrix.shape
        self.embedding = nn.Embedding(vocab_sz, emb_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(torch.from_numpy(emb_matrix))
        self.embedding.weight.requires_grad = False
        # nn.LSTM only applies dropout between stacked layers, so it is
        # switched off for a single-layer network.
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=True, batch_first=True,
                            dropout=dropout if n_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim*2, n_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)``.

        Args:
            x: Long tensor of token ids, shape ``(batch, seq_len)``, padded
                on the right with ``pad_idx``.
        """
        x_emb = self.embedding(x)
        # Pack by true length so the LSTM never steps over padding. Without
        # this, the forward "final" state is taken after all the trailing PAD
        # positions and the backward pass starts on padding.
        # Counting non-pad ids assumes padding appears only at the end.
        # A fully padded row is clamped to length 1 because packing rejects
        # zero-length sequences; lengths must live on the CPU.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            x_emb, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        h_f = h_n[-2]  # last layer, forward direction
        h_b = h_n[-1]  # last layer, backward direction
        h   = torch.cat([h_f, h_b], dim=1)
        return self.fc(self.dropout(h))

# Model hyper-parameters.
hidden_dim = 64
n_classes = 3

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTMClassifier(emb_matrix, hidden_dim, n_classes)
model = model.to(device)

# Loss: when the balanced sampler already evens out the batches a plain
# cross-entropy is used; otherwise the loss is weighted per class.
if use_balanced_sampler:
    criterion = nn.CrossEntropyLoss()
else:
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
    print(f"Using weighted CrossEntropyLoss with weights: {class_weights}")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 7. TRAIN
EPOCHS = 5
for epoch in range(1, EPOCHS+1):
    model.train()  # training mode, so the model's dropout layer is active
    # Running sums over the epoch: batch-size-weighted loss and the number of
    # correctly classified samples.
    total_loss, total_acc = 0, 0
    for texts, labels in tqdm(train_ld, desc=f'Epoch {epoch}/{EPOCHS}'):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        out = model(texts)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        # criterion uses its default reduction (a per-batch average), so the
        # value is scaled by the batch size before being accumulated.
        total_loss += loss.item() * texts.size(0)
        # Accuracy is measured on the training-mode outputs (dropout on).
        total_acc  += (out.argmax(1)==labels).sum().item()
    # Normalize the running sums by the size of the training set.
    avg_l = total_loss/len(train_ds)
    avg_a = total_acc/len(train_ds)
    print(f"Epoch {epoch}: loss={avg_l:.4f}, acc={avg_a:.4f}")

# 8. DETAILED EVALUATION WITH IMBALANCE-AWARE METRICS
def evaluate_model_detailed(model, test_loader, device, loss_fn=None):
    """Evaluate *model* on *test_loader* and print imbalance-aware metrics.

    Prints the average loss, overall accuracy, macro and weighted F1, a
    per-class classification report, and a confusion matrix.

    Args:
        model: The classifier to evaluate; it is switched to eval mode.
        test_loader: Iterable of ``(texts, labels)`` batches. Every batch it
            yields is evaluated.
        device: Device the batches are moved to before the forward pass.
        loss_fn: Loss used for the reported test loss. Defaults to the
            module-level ``criterion`` when omitted.

    Returns:
        A tuple ``(overall_acc, macro_f1, weighted_f1)``.

    Raises:
        ValueError: If *test_loader* yields no samples.
    """
    # Fixed class ids and names: passing them explicitly keeps the report and
    # the confusion matrix 3x3 even when a class is missing from this split.
    class_ids = [0, 1, 2]
    class_names = ['Negative', 'Neutral', 'Positive']
    short_names = ['Neg', 'Neu', 'Pos']

    if loss_fn is None:
        loss_fn = criterion

    model.eval()
    all_preds = []
    all_labels = []
    test_loss = 0.0

    with torch.no_grad():
        for texts, labels in tqdm(test_loader, desc="Evaluating"):
            texts, labels = texts.to(device), labels.to(device)
            out = model(texts)
            # Scale the per-batch average by the batch size so the final
            # figure is a per-sample average.
            test_loss += loss_fn(out, labels).item() * texts.size(0)

            all_preds.extend(out.argmax(1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Normalize by the samples actually seen rather than by a global dataset,
    # so the function is correct for whatever loader it is given.
    n_samples = len(all_labels)
    if n_samples == 0:
        raise ValueError("test_loader yielded no samples")

    test_loss /= n_samples
    overall_acc = sum(p == l for p, l in zip(all_preds, all_labels)) / n_samples

    # Calculate balanced metrics
    macro_f1 = f1_score(all_labels, all_preds, labels=class_ids, average='macro')
    weighted_f1 = f1_score(all_labels, all_preds, labels=class_ids, average='weighted')

    print(f"\n{'='*50}")
    print(f"DETAILED EVALUATION RESULTS")
    print(f"{'='*50}")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Overall Accuracy: {overall_acc:.4f}")
    print(f"Macro F1-Score: {macro_f1:.4f}")
    print(f"Weighted F1-Score: {weighted_f1:.4f}")

    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds,
                                labels=class_ids,
                                target_names=class_names,
                                digits=4))

    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    # Size the columns to the largest count so they stay aligned for counts
    # of any magnitude (a fixed width of 4 breaks at five digits).
    width = max(4, len(str(int(cm.max()))))
    print("       Predicted")
    print(" " * 6 + "".join(f" {name:>{width}}" for name in short_names))
    for name, row in zip(short_names, cm):
        print(f"{name:>6}" + "".join(f" {int(count):{width}d}" for count in row))

    return overall_acc, macro_f1, weighted_f1

# Run detailed evaluation
evaluate_model_detailed(model, test_ld, device)

# 9. SAVE MODEL
import os

# torch.save does not create missing parent directories; make sure the target
# folder exists so a finished training run is not lost at the very last step.
os.makedirs('model', exist_ok=True)

# The checkpoint bundles the weights with the vocabulary and the settings
# needed to rebuild the model and tokenize text the same way at inference.
# NOTE(review): 'class_weights' is stored as a NumPy array; a loader that uses
# torch.load(..., weights_only=True) may reject it. Confirm how the
# checkpoint is loaded before changing the stored type.
torch.save({
    'model_state_dict': model.state_dict(),
    'vocab': {'itos': itos, 'stoi': stoi},
    'class_weights': class_weights,
    'config': {
        'hidden_dim': hidden_dim,
        'n_classes': n_classes,
        'max_len': MAX_LEN,
        'embed_dim': EMBED_DIM
    }
}, 'model/3class_bilstm_yelp.pth')
print("\nModel saved as 'model/3class_bilstm_yelp.pth'")

# 10. EXAMPLE PREDICTIONS
def predict_text(model, text, stoi, device, max_len=200):
    """Classify a single raw text string.

    The text is tokenized with ``preprocess_text``, mapped to ids through
    *stoi* (unknown tokens become the ``UNK`` id), then truncated or
    right-padded to *max_len* before a single forward pass in eval mode.

    Returns:
        A tuple ``(class_name, confidence, probabilities)`` where
        ``class_name`` is 'Negative', 'Neutral' or 'Positive', ``confidence``
        is the softmax probability of the predicted class, and
        ``probabilities`` is a NumPy array with one probability per class.
    """
    class_names = ['Negative', 'Neutral', 'Positive']
    model.eval()

    # Only the first max_len tokens can survive truncation.
    ids = [stoi.get(tok, stoi[UNK]) for tok in preprocess_text(text)][:max_len]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids = ids + [stoi[PAD]] * shortfall

    # Wrapping the id list in another list yields a batch of size one.
    batch = torch.tensor([ids], dtype=torch.long).to(device)

    with torch.no_grad():
        logits = model(batch)
        class_probs = torch.softmax(logits, dim=1)[0]
        best = logits.argmax(dim=1).item()
        best_prob = class_probs[best].item()

    return class_names[best], best_prob, class_probs.cpu().numpy()

# One clearly positive, one lukewarm, and one clearly negative review.
sample_texts = [
    "This restaurant is absolutely amazing! Great food and service.",
    "The food was okay, nothing special but not bad either.",
    "Terrible experience, worst meal I've ever had."
]

# Section banner.
print(f"\n{'='*50}", "EXAMPLE PREDICTIONS", f"{'='*50}", sep="\n")

# Classify each sample and show the predicted class with its probabilities.
for text in sample_texts:
    sentiment, confidence, probs = predict_text(model, text, stoi, device)
    print(
        f"\nText: {text}",
        f"Predicted: {sentiment} (confidence: {confidence:.3f})",
        f"Probabilities - Negative: {probs[0]:.3f}, Neutral: {probs[1]:.3f}, Positive: {probs[2]:.3f}",
        sep="\n",
    )

Training set class distribution:
label
0    260000
1    130000
2    260000
Name: count, dtype: int64

Test set class distribution:
label
0    20000
1    10000
2    20000
Name: count, dtype: int64

Class weights: [0.83333333 1.66666667 0.83333333]


Counting tokens: 100%|██████████| 650000/650000 [00:33<00:00, 19380.96it/s]


Loaded 400000 GloVe embeddings
Using weighted loss function
Using weighted CrossEntropyLoss with weights: [0.83333333 1.66666667 0.83333333]


Epoch 1/5: 100%|██████████| 5079/5079 [01:18<00:00, 64.36it/s]


Epoch 1: loss=0.7561, acc=0.6759


Epoch 2/5: 100%|██████████| 5079/5079 [01:18<00:00, 64.30it/s]


Epoch 2: loss=0.6356, acc=0.7375


Epoch 3/5: 100%|██████████| 5079/5079 [01:18<00:00, 64.50it/s]


Epoch 3: loss=0.6039, acc=0.7528


Epoch 4/5: 100%|██████████| 5079/5079 [01:17<00:00, 65.47it/s]


Epoch 4: loss=0.5855, acc=0.7616


Epoch 5/5: 100%|██████████| 5079/5079 [01:17<00:00, 65.58it/s]


Epoch 5: loss=0.5730, acc=0.7673


Evaluating: 100%|██████████| 391/391 [00:04<00:00, 79.24it/s]



DETAILED EVALUATION RESULTS
Test Loss: 0.5551
Overall Accuracy: 0.7730
Macro F1-Score: 0.7479
Weighted F1-Score: 0.7840

Classification Report:
              precision    recall  f1-score   support

    Negative     0.8875    0.7871    0.8343     20000
     Neutral     0.4847    0.6830    0.5670     10000
    Positive     0.8847    0.8038    0.8423     20000

    accuracy                         0.7730     50000
   macro avg     0.7523    0.7580    0.7479     50000
weighted avg     0.8058    0.7730    0.7840     50000


Confusion Matrix:
       Predicted
       Neg  Neu  Pos
   Neg 15742 3790  468
   Neu 1543 6830 1627
   Pos  452 3472 16076

Model saved as 'model/3class_bilstm_yelp.pth'

EXAMPLE PREDICTIONS

Text: This restaurant is absolutely amazing! Great food and service.
Predicted: Positive (confidence: 0.993)
Probabilities - Negative: 0.000, Neutral: 0.007, Positive: 0.993

Text: The food was okay, nothing special but not bad either.
Predicted: Neutral (confidence: 0.789)
Proba