# 3-Class BiLSTM Inference on USS Reviews

**High-level summary:**  
Recreates the BiLSTM model architecture from a saved checkpoint, preprocesses new USS review texts, and runs inference to obtain the predicted sentiment and class probabilities. If true labels exist, it also evaluates performance with detailed metrics and a confidence analysis.

In [None]:
# Mount Google Drive so the project files are reachable from this Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder so the relative paths
# used below ("datastore/...", "model/...") resolve against it.

import os

os.chdir('/content/drive/My Drive/CS605-NLP-Project')

Mounted at /content/drive


In [None]:
# Install
#!pip install --upgrade numpy gensim --no-cache-dir


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Reads the Yelp train/test sets (in Parquet format) into pandas.

Reads your USS reviews CSV for later inference.

Prints out the number of rows/columns and shows the first few records of each.

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import joblib

# Load the USS reviews to run inference on. The path is relative to the
# current working directory; `publishedAtDate` is parsed into datetimes.
uss_reviews = pd.read_csv("datastore/USS_Reviews_Silver.csv", parse_dates=["publishedAtDate"])

# ──── 1. DEFINE THE SAME PREPROCESSING AND MODEL ARCHITECTURE ────────────────
# Same text preprocessing as training
def preprocess_text(text):
    """Lower-case, strip punctuation and whitespace-tokenise a review.

    For string input the behaviour is unchanged: the text is lower-cased,
    every character other than ``a-z``, ``0-9`` and whitespace is removed,
    and the result is split on whitespace.

    Args:
        text: The raw review. Missing values (``None`` or a float NaN, which
            is what pandas yields for empty CSV cells) produce no tokens;
            any other non-string value is converted with ``str()`` first.

    Returns:
        list[str]: The tokens, possibly empty.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return []
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.split()

# Same model architecture as training
class BiLSTMClassifier(nn.Module):
    """Text classifier: embedding -> bidirectional LSTM -> dropout -> linear.

    The attribute names (``embedding``, ``lstm``, ``dropout``, ``fc``) define
    the state-dict keys, so they have to stay aligned with the checkpoint
    this model is loaded from.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, padding_idx, n_layers=1, dropout=0.5):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            emb_dim: Embedding size (also the LSTM input size).
            hidden_dim: LSTM hidden size per direction.
            n_classes: Number of output logits.
            padding_idx: Vocabulary index treated as padding by the embedding.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability applied before the linear layer and,
                when ``n_layers > 1``, between LSTM layers.
        """
        super().__init__()
        # Inter-layer LSTM dropout is only requested for stacked LSTMs;
        # a single layer gets 0.
        inter_layer_dropout = dropout if n_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(p=dropout)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(in_features=hidden_dim * 2, out_features=n_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of vocabulary indices; presumably shaped
                (batch, seq_len) since the LSTM is built with
                ``batch_first=True`` -- confirm against the caller.

        Returns:
            Tensor of unnormalised class scores, one row per input sequence.
        """
        embedded = self.embedding(x)
        final_hidden = self.lstm(embedded)[1][0]
        # The last two entries of the final hidden state are used as the
        # forward (-2) and backward (-1) summaries of the sequence.
        sentence_vec = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        return self.fc(self.dropout(sentence_vec))

# ──── 2. LOAD SAVED MODEL AND VOCABULARY ─────────────────────────────────────
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Read the training checkpoint, remapping its tensors onto `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source; newer PyTorch versions warn about this call.
checkpoint = torch.load('model/3class_bilstm_yelp.pth', map_location=device)

# Pull out the hyper-parameters and the vocabulary saved with the weights.
config = checkpoint['config']
itos, stoi = checkpoint['vocab']['itos'], checkpoint['vocab']['stoi']
padding_idx = stoi['<PAD>']

# Rebuild the network with the saved dimensions, then move it to `device`.
model = BiLSTMClassifier(
    len(itos),
    config['embed_dim'],
    config['hidden_dim'],
    config['n_classes'],
    padding_idx,
)
model.to(device)

# Restore the trained weights and switch to evaluation mode (disables dropout).
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
print("Model loaded successfully")

# ──── 3. CREATE INFERENCE DATASET ───────────────────────────────────────────
# Fixed sequence length every review is padded/truncated to (same as training).
MAX_LEN = config['max_len']

class InferenceDataset(Dataset):
    """Turn raw review texts into fixed-length tensors of vocabulary indices.

    Each item is tokenised with ``preprocess_text``, mapped through ``stoi``
    (unknown tokens become ``<UNK>``), then truncated or right-padded with
    ``<PAD>`` to a fixed length.
    """

    def __init__(self, texts, stoi, max_len=None):
        """Store the inputs; nothing is tokenised until an item is requested.

        Args:
            texts: The reviews, either a pandas Series (indexed positionally
                through ``.iloc``) or any sequence indexable by integer.
            stoi: Mapping from token to vocabulary index; must contain the
                ``'<PAD>'`` and ``'<UNK>'`` entries.
            max_len: Length of every returned sequence. When ``None`` (the
                default) the module-level ``MAX_LEN`` is used, as before.
        """
        self.texts = texts
        self.stoi = stoi
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th text as a 1-D ``torch.long`` tensor.

        Raises:
            KeyError: If ``stoi`` lacks the ``'<PAD>'`` or ``'<UNK>'`` entry.
        """
        # Resolve the length lazily so the MAX_LEN fallback keeps the
        # original behaviour of reading the global at access time.
        max_len = MAX_LEN if self.max_len is None else self.max_len
        # Look the special ids up once per item instead of once per token.
        pad_idx = self.stoi['<PAD>']
        unk_idx = self.stoi['<UNK>']

        text = self.texts.iloc[idx] if hasattr(self.texts, 'iloc') else self.texts[idx]
        # Tokens beyond max_len are dropped before the vocabulary lookup.
        toks = preprocess_text(text)[:max_len]
        seq = [self.stoi.get(t, unk_idx) for t in toks]
        # A non-positive multiplier yields an empty list, so full-length
        # sequences are left untouched.
        seq += [pad_idx] * (max_len - len(seq))
        return torch.tensor(seq, dtype=torch.long)

# Wrap the review column in a dataset and iterate it in order (no shuffling),
# so predictions line up with the dataframe rows.
inference_dataset = InferenceDataset(uss_reviews['integrated_review'], stoi)
inference_loader = DataLoader(inference_dataset, batch_size=128, shuffle=False)

# ──── 4. RUN PREDICTIONS ─────────────────────────────────────────────────────
all_predictions, all_probabilities = [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for token_ids in inference_loader:
        logits = model(token_ids.to(device))
        # Softmax turns logits into per-class probabilities; the predicted
        # class is the arg-max of the logits.
        all_probabilities.extend(torch.softmax(logits, dim=1).cpu().numpy())
        all_predictions.extend(logits.argmax(dim=1).cpu().numpy())

# Attach the predicted class and one probability column per class.
uss_reviews['pred_sentiment_lstm'] = all_predictions
for col_idx, class_name in enumerate(('negative', 'neutral', 'positive')):
    uss_reviews[f'prob_{class_name}'] = [row[col_idx] for row in all_probabilities]

# ──── 5. EVALUATION (IF TRUE LABELS AVAILABLE) ──────────────────────────────
# Same label mapping as training
def map_label(star):
    """Convert a star rating into a 3-class sentiment id.

    Returns 0 (negative) for 1 or 2 stars, 1 (neutral) for exactly 3 stars,
    and 2 (positive) for every other value.
    """
    if star in (1, 2):
        return 0   # Negative
    return 1 if star == 3 else 2   # Neutral, otherwise Positive

# Evaluate only when ground-truth star ratings are present in the data.
if 'stars' in uss_reviews.columns:
    uss_reviews['true_sentiment'] = uss_reviews['stars'].apply(map_label)

    print("\nUSS Reviews Performance (3-class BiLSTM):")
    print(classification_report(
        uss_reviews['true_sentiment'],
        uss_reviews['pred_sentiment_lstm'],
        # Pin the label set so the three names below always line up, even if
        # one class never occurs in the data or in the predictions.
        labels=[0, 1, 2],
        # Display names follow map_label: 1-2 stars -> 0, 3 stars -> 1,
        # everything else (4-5 stars) -> 2.
        target_names=['Negative (1-2 stars)', 'Neutral (3 stars)', 'Positive (4-5 stars)'],
        digits=4
    ))

    # Confidence of a prediction = the highest of the three class
    # probabilities, i.e. the probability of the predicted class.
    uss_reviews['max_confidence'] = uss_reviews[['prob_negative', 'prob_neutral', 'prob_positive']].max(axis=1)

    # Confidence analysis: summarise the predicted-class probability per
    # predicted class (not prob_positive, which is only meaningful for class 2).
    print("\nConfidence by Predicted Class:")
    print(uss_reviews.groupby('pred_sentiment_lstm')['max_confidence'].describe())

    # Borderline cases (low confidence predictions)
    borderline = uss_reviews[uss_reviews['max_confidence'] < 0.7]
    print(f"\nBorderline Predictions (confidence<0.7): {len(borderline)} samples")
    print(borderline[['integrated_review', 'pred_sentiment_lstm', 'max_confidence']].head(10))


Using device: cuda
Model loaded successfully


  checkpoint = torch.load('model/3class_bilstm_yelp.pth', map_location=device)



USS Reviews Performance (3-class BiLSTM):
                      precision    recall  f1-score   support

Negative (0-1 stars)     0.4327    0.7738    0.5550      2206
   Neutral (2 stars)     0.1440    0.5054    0.2241      2133
Positive (3-5 stars)     0.9787    0.7018    0.8175     25073

            accuracy                         0.6930     29412
           macro avg     0.5185    0.6603    0.5322     29412
        weighted avg     0.8772    0.6930    0.7547     29412


Confidence by Predicted Class:
                       count      mean       std       min       25%  \
pred_sentiment_lstm                                                    
0                     3945.0  0.086268  0.097573  0.000005  0.008110   
1                     7487.0  0.265098  0.124054  0.007346  0.163122   
2                    17980.0  0.778727  0.173511  0.334709  0.638421   

                          50%       75%       max  
pred_sentiment_lstm                                
0                    0.