# Yelp Review Prediction - Model Inference
**Author:** Ben

**Project:** Capstone - Star Rating Prediction

**Objective:** Load trained models and make predictions on new reviews

---

## Overview

This notebook provides inference capabilities for the trained models:
- Random Forest (Text-Only)
- Random Forest (Text + Metadata)
- XGBoost (Text + Metadata)
- Bidirectional LSTM (Text-Only)

**Usage:** Set the `review_text` variable (and, optionally, the `review_metadata` values) in Section 4 to your own review, then run all cells.

## 1. Setup and Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import pickle
import os

# Sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
from scipy.sparse import hstack

# XGBoost import
from xgboost import XGBClassifier

# PyTorch imports for LSTM
import torch
import torch.nn as nn

# Suppress every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure produced in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Confirm the environment is ready and show when this cell was run
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("Libraries imported successfully")
print(f"Current time: {started_at}")

Libraries imported successfully
Current time: 2026-01-28 17:13:39


## 2. Define Helper Functions

In [2]:
def extract_metadata_features(df, encoders=None, fit=True):
    """
    Turn the metadata columns of a review dataframe into a dense feature matrix.

    Only columns that are present in `df` contribute features; an absent column
    is skipped silently, so the frame used at inference time must carry the same
    columns as the frame the encoders were fitted on for the widths to agree.

    Args:
        df: Input dataframe (one row per review)
        encoders: Dictionary of fitted encoders. With fit=False it must already
            hold the '<column>_scaler', 'top_states' and 'top_categories'
            entries for the columns present in `df`.
        fit: True to fit the scalers / top-value lists and store them in
            `encoders`; False to reuse what is already stored there.

    Returns:
        fit=True:  (X_metadata, encoders, feature_names)
        fit=False: (X_metadata, feature_names)
        X_metadata is a 2-D numpy array with one column per feature name.
    """
    from collections import Counter

    def as_column(series):
        # (n_rows,) -> (n_rows, 1) so every feature can be stacked side by side
        return series.values.reshape(-1, 1)

    if encoders is None:
        encoders = {}

    columns = []
    feature_names = []

    # 1. Numerical features: missing -> 0, then standardised per column
    for col in ('word_count', 'char_count', 'useful', 'funny', 'cool',
                'total_engagement', 'review_count', 'user_review_count'):
        if col not in df.columns:
            continue
        raw = as_column(df[col].fillna(0))
        scaler_key = f'{col}_scaler'
        if fit:
            scaler = StandardScaler()
            scaled = scaler.fit_transform(raw)
            encoders[scaler_key] = scaler
        else:
            scaled = encoders[scaler_key].transform(raw)
        columns.append(scaled)
        feature_names.append(col)

    # 2. Binary features: a missing flag is treated as 1
    if 'is_open' in df.columns:
        columns.append(as_column(df['is_open'].fillna(1)))
        feature_names.append('is_open')

    # 3. Temporal features: gaps are filled with the most frequent value of
    #    the frame being processed (0 if the column has no values at all)
    for col in ('year', 'month', 'day_of_week'):
        if col not in df.columns:
            continue
        modes = df[col].mode()
        fill_value = modes[0] if len(modes) > 0 else 0
        columns.append(as_column(df[col].fillna(fill_value)))
        feature_names.append(col)

    # 4. Categorical features - State (one indicator per top-10 state)
    if 'state' in df.columns:
        if fit:
            encoders['top_states'] = df['state'].value_counts().head(10).index.tolist()
        for state in encoders['top_states']:
            is_state = (df['state'] == state).astype(int)
            columns.append(as_column(is_state))
            feature_names.append(f'state_{state}')

    # 5. Categories - one indicator per frequent category
    if 'categories' in df.columns:
        category_text = df['categories'].fillna('')
        if fit:
            # Count every ', '-separated entry, keep the 20 most common and
            # then discard the empty string if it made the cut
            tally = Counter(
                entry
                for entries in category_text.str.split(', ')
                for entry in entries
            )
            encoders['top_categories'] = [
                name for name, _ in tally.most_common(20) if name
            ]
        for category in encoders['top_categories']:
            # Case-insensitive literal substring match against the raw string
            mentions = category_text.str.contains(category, case=False, regex=False)
            columns.append(as_column(mentions.astype(int)))
            feature_names.append(f'category_{category.replace(" ", "_")}')

    # 6. User activity level: ordinal code, unknown labels fall back to 1
    if 'user_activity' in df.columns:
        activity_levels = {
            '1 review': 1,
            '2-5 reviews': 2,
            '6-20 reviews': 3,
            '21-100 reviews': 4,
            '100+ reviews': 5
        }
        columns.append(as_column(df['user_activity'].map(activity_levels).fillna(1)))
        feature_names.append('user_activity_level')

    # Combine all features
    X_metadata = np.hstack(columns)

    return (X_metadata, encoders, feature_names) if fit else (X_metadata, feature_names)


# Define BiLSTM model architecture (must match training)
class BiLSTM(nn.Module):
    """
    Bidirectional LSTM text classifier.

    Token ids are embedded (index 0 is the padding id), run through a stacked
    bidirectional LSTM, and the last layer's final forward and backward hidden
    states are concatenated and projected to `output_dim` scores.

    The submodule names (embedding, lstm, dropout, fc) define the state-dict
    keys, so they must stay as they are for saved weights to load.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, dropout=0.5):
        super().__init__()

        # nn.LSTM only applies dropout between stacked layers, so pass 0 for a
        # single-layer network
        between_layer_dropout = dropout if n_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=between_layer_dropout,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        # Two directions -> twice the hidden size feeds the classifier
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) scores."""
        token_vectors = self.dropout(self.embedding(text))
        _, (final_hidden, _) = self.lstm(token_vectors)
        # The last two entries are the top layer's forward and backward states
        summary = torch.cat((final_hidden[-2, :, :], final_hidden[-1, :, :]), dim=1)
        summary = self.dropout(summary)
        return self.fc(summary)


def simple_tokenizer(text):
    """Lowercase the text, then split it into tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()


def text_to_sequence(text, word2idx, max_len=200):
    """
    Encode a text as a list of vocabulary indices.

    The text is tokenized with simple_tokenizer; tokens missing from
    `word2idx` are replaced by the index stored under '<UNK>'. When `max_len`
    is truthy the result is cut to its first `max_len` entries; otherwise it
    is returned in full.
    """
    indices = []
    for token in simple_tokenizer(text):
        indices.append(word2idx.get(token, word2idx['<UNK>']))
    return indices[:max_len] if max_len else indices


def pad_sequences(sequences, max_len, pad_value=0):
    """
    Right-pad (or truncate) index sequences to a common length.

    Args:
        sequences: Iterable-with-len of integer sequences (lists or arrays)
        max_len: Number of columns in the result; longer sequences are cut to
            their first `max_len` entries
        pad_value: Integer written into the unused trailing positions
            (default 0, the padding index used by the embedding layer)

    Returns:
        int64 numpy array of shape (len(sequences), max_len)
    """
    # Start from a buffer filled with pad_value so the argument is honoured;
    # a zero-initialised buffer would silently ignore any non-zero pad_value.
    padded = np.full((len(sequences), max_len), pad_value, dtype=np.int64)
    for row, seq in enumerate(sequences):
        kept = min(len(seq), max_len)
        if kept:  # nothing to copy for an empty sequence (or max_len == 0)
            padded[row, :kept] = seq[:kept]
    return padded


print("Helper functions defined")

Helper functions defined


## 3. Load Pre-Trained Models

In [3]:
# Directory holding every artifact written by the training notebooks
MODEL_DIR = '../Outputs/Models'

# Every file this notebook needs before it can make a prediction
REQUIRED_MODEL_FILES = [
    'rf_text.pkl',
    'rf_combined.pkl',
    'gbm.pkl',
    'tfidf_vectorizer.pkl',
    'metadata_encoders.pkl',
    'lstm_best.pt',
    'vocab.pkl',
]


def _load_pickle(filename):
    """Unpickle one artifact from MODEL_DIR and report it as loaded."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts produced by your own training run.
    with open(os.path.join(MODEL_DIR, filename), 'rb') as f:
        loaded = pickle.load(f)
    print(f"Loaded: {filename}")
    return loaded


# Check if models exist, and name the ones that do not so the user knows
# which training notebook still has to be run
missing_files = [
    name for name in REQUIRED_MODEL_FILES
    if not os.path.exists(os.path.join(MODEL_DIR, name))
]
models_exist = not missing_files

if not models_exist:
    raise FileNotFoundError(
        "Pre-trained models not found!\n"
        "Please run the training notebooks first to train the models.\n"
        f"Missing from {MODEL_DIR}: {', '.join(missing_files)}"
    )

print("=" * 80)
print("LOADING PRE-TRAINED MODELS")
print("=" * 80)

# Load statistical models
rf_text = _load_pickle('rf_text.pkl')
rf_combined = _load_pickle('rf_combined.pkl')
gbm = _load_pickle('gbm.pkl')
tfidf = _load_pickle('tfidf_vectorizer.pkl')
metadata_encoders = _load_pickle('metadata_encoders.pkl')

# The forests were saved with verbose logging enabled, which prints joblib
# "[Parallel(n_jobs=...)]" progress lines on every predict call. Turn it off
# so the prediction output stays readable.
for forest in (rf_text, rf_combined):
    if hasattr(forest, 'verbose'):
        forest.set_params(verbose=0)

# Load LSTM vocabulary
vocab_data = _load_pickle('vocab.pkl')
word2idx = vocab_data['word2idx']
idx2word = vocab_data['idx2word']

# LSTM hyperparameters (must match training)
VOCAB_SIZE = len(word2idx)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 5
MAX_LEN = 200

# Initialize LSTM model on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lstm_model = BiLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=2,
    dropout=0.5
).to(device)

# Load trained weights; map_location lets weights saved on a GPU load on CPU
lstm_model.load_state_dict(
    torch.load(os.path.join(MODEL_DIR, 'lstm_best.pt'), map_location=device)
)
lstm_model.eval()  # inference mode: disables dropout
print("Loaded: lstm_best.pt")

print("\n" + "=" * 80)
print("All models loaded successfully!")
print(f"Using device for LSTM: {device}")
print("=" * 80)

LOADING PRE-TRAINED MODELS
Loaded: rf_text.pkl
Loaded: rf_combined.pkl
Loaded: gbm.pkl
Loaded: tfidf_vectorizer.pkl
Loaded: metadata_encoders.pkl
Loaded: vocab.pkl
Loaded: lstm_best.pt

All models loaded successfully!
Using device for LSTM: cuda


## 4. Make Predictions

### Configuration
Modify the variables below to customize your prediction:

In [14]:
# ============================================================================
# MODIFY THESE VARIABLES TO TEST YOUR OWN REVIEWS
# ============================================================================

# Review text (required)
review_text = "The food was just fine"

# Optional metadata (adjust as needed)
# NOTE(review): user_review_count=5 would fall in the '2-5 reviews' bucket of
# the activity levels, yet user_activity is '6-20 reviews' - confirm whether
# these two values are meant to agree.
review_metadata = dict(
    useful=0,                      # Number of useful votes
    funny=0,                       # Number of funny votes
    cool=0,                        # Number of cool votes
    review_count=5,                # Business review count
    user_review_count=5,           # User's total review count
    is_open=1,                     # Is business open? (1=Yes, 0=No)
    year=2024,                     # Year of review
    month=1,                       # Month of review (1-12)
    day_of_week=0,                 # Day of week (0=Monday, 6=Sunday)
    state='CA',                    # State abbreviation (e.g., 'CA', 'PA', 'FL')
    categories='Restaurants',      # Business categories
    user_activity='6-20 reviews',  # User activity level
)

# ============================================================================

### Run Prediction

In [15]:
print("=" * 80)
print("MAKING PREDICTIONS")
print("=" * 80)

print(f"\nReview Text: {review_text}")
print(f"Review Length: {len(review_text)} characters, {len(review_text.split())} words")

# Step 1: Transform text with TF-IDF
review_tfidf = tfidf.transform([review_text])
print(f"\nText transformed to TF-IDF: {review_tfidf.shape}")

# Step 2: Create metadata features (one-row frame with the training columns)
review_df = pd.DataFrame({
    'text': [review_text],
    'word_count': [len(review_text.split())],
    'char_count': [len(review_text)],
    'useful': [review_metadata['useful']],
    'funny': [review_metadata['funny']],
    'cool': [review_metadata['cool']],
    'total_engagement': [review_metadata['useful'] + review_metadata['funny'] + review_metadata['cool']],
    'review_count': [review_metadata['review_count']],
    'user_review_count': [review_metadata['user_review_count']],
    'is_open': [review_metadata['is_open']],
    'year': [review_metadata['year']],
    'month': [review_metadata['month']],
    'day_of_week': [review_metadata['day_of_week']],
    'state': [review_metadata['state']],
    'categories': [review_metadata['categories']],
    'user_activity': [review_metadata['user_activity']]
})

metadata_array, _ = extract_metadata_features(review_df, encoders=metadata_encoders, fit=False)
metadata_sparse = sparse.csr_matrix(metadata_array)
print(f"Metadata features created: {metadata_sparse.shape}")

# Step 3: Combine text + metadata
review_combined = hstack([review_tfidf, metadata_sparse])

# Take the expected width from the fitted model rather than a hard-coded
# number, so the check stays correct if the models are retrained with a
# different vocabulary or metadata set.
expected_n_features = getattr(rf_combined, 'n_features_in_', None)
if expected_n_features is None:
    print(f"Combined features: {review_combined.shape}")
else:
    print(f"Combined features: {review_combined.shape} (expected: (1, {expected_n_features}))")
    if review_combined.shape[1] != expected_n_features:
        raise ValueError(
            f"Feature width mismatch: built {review_combined.shape[1]} features "
            f"but rf_combined was trained on {expected_n_features}. "
            "Check that the TF-IDF vectorizer and metadata encoders belong to the same training run."
        )

# Step 4: Prepare text for LSTM
review_sequence = text_to_sequence(review_text, word2idx, MAX_LEN)
review_padded = pad_sequences([review_sequence], MAX_LEN)
review_tensor = torch.LongTensor(review_padded).to(device)
print(f"LSTM sequence prepared: {review_tensor.shape}")

# Step 5: Make predictions
print("\n" + "=" * 80)
print("PREDICTIONS")
print("=" * 80)

# Get class predictions
pred_rf_text = rf_text.predict(review_tfidf)[0]
pred_rf_combined = rf_combined.predict(review_combined)[0]
pred_xgb = gbm.predict(review_combined)[0] + 1  # Convert 0-4 to 1-5

# LSTM prediction
with torch.no_grad():
    lstm_output = lstm_model(review_tensor)
    lstm_probs = torch.softmax(lstm_output, dim=1).cpu().numpy()[0]
    pred_lstm = lstm_output.argmax(1).cpu().numpy()[0] + 1  # Convert 0-4 to 1-5

# Get probability distributions
prob_rf_text = rf_text.predict_proba(review_tfidf)[0]
prob_rf_combined = rf_combined.predict_proba(review_combined)[0]
prob_xgb = gbm.predict_proba(review_combined)[0]
prob_lstm = lstm_probs

print(f"\nRandom Forest (Text-Only):       {int(pred_rf_text)} stars")
print(f"Random Forest (Text + Metadata): {int(pred_rf_combined)} stars")
print(f"XGBoost (Text + Metadata):       {int(pred_xgb)} stars")
print(f"LSTM (Text-Only):                {int(pred_lstm)} stars")

# One entry per model: (plot title, probability vector, predicted stars).
# The newline in the title is for the plot; the console label swaps it for a space.
models_data = [
    ('Random Forest\n(Text-Only)', prob_rf_text, pred_rf_text),
    ('Random Forest\n(Text + Metadata)', prob_rf_combined, pred_rf_combined),
    ('XGBoost\n(Text + Metadata)', prob_xgb, pred_xgb),
    ('LSTM\n(Text-Only)', prob_lstm, pred_lstm)
]

# Display probability distributions
print("\n" + "=" * 80)
print("PROBABILITY DISTRIBUTIONS")
print("=" * 80)

for model_name, probs, _ in models_data:
    label = model_name.replace('\n', ' ')
    print(f"\n{label}:")
    for star, prob in enumerate(probs, 1):
        bar = '█' * int(prob * 50)
        print(f"  {star} star: {prob:6.2%} {bar}")

# Show consensus
predictions = [pred_rf_text, pred_rf_combined, pred_xgb, pred_lstm]
votes = [int(p) for p in predictions]
vote_counts = np.bincount(votes)
unanimous = len(set(votes)) == 1

# Star ratings that share the highest vote count. More than one means there
# is no majority; picking the lowest star (what a bare argmax does) would be
# arbitrary, so break the tie with the probability averaged over all models.
tied_stars = np.flatnonzero(vote_counts == vote_counts.max())
has_majority = len(tied_stars) == 1
if has_majority:
    consensus = int(tied_stars[0])
else:
    mean_probs = np.mean([prob_rf_text, prob_rf_combined, prob_xgb, prob_lstm], axis=0)
    consensus = int(max(tied_stars, key=lambda star: mean_probs[star - 1]))

print("\n" + "=" * 80)
if unanimous:
    print(f"UNANIMOUS PREDICTION: {int(consensus)} STARS")
elif has_majority:
    print(f"CONSENSUS PREDICTION: {int(consensus)} STARS")
else:
    print(f"NO MAJORITY - TIE BROKEN BY MEAN PROBABILITY: {int(consensus)} STARS")
print("=" * 80)

# Create visualization
print("\n" + "=" * 80)
print("GENERATING VISUALIZATION")
print("=" * 80)

# Make sure the output directory exists, then pick a filename that does not
# overwrite an earlier plot
base_filename = '../Outputs/Plots/prediction_probabilities'
os.makedirs(os.path.dirname(base_filename), exist_ok=True)
filename = f'{base_filename}.png'
counter = 1

while os.path.exists(filename):
    filename = f'{base_filename}_{counter}.png'
    counter += 1

# Create figure: one panel per model
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
star_ratings = np.arange(1, 6)

# Plot each model
for ax, (model_name, probs, prediction) in zip(axes, models_data):
    predicted_star = int(prediction)

    # Create bar chart with the predicted class in a contrasting colour
    colors = ['#ff6b6b' if star == predicted_star else '#4ecdc4' for star in star_ratings]
    bars = ax.bar(star_ratings, probs, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)

    # Highlight predicted class
    bars[predicted_star - 1].set_edgecolor('red')
    bars[predicted_star - 1].set_linewidth(3)

    # Add percentage labels on bars
    for star, prob in zip(star_ratings, probs):
        ax.text(star, prob + 0.02, f'{prob:.1%}', ha='center', va='bottom', fontweight='bold', fontsize=10)

    # Formatting
    ax.set_xlabel('Star Rating', fontweight='bold', fontsize=12)
    ax.set_ylabel('Probability', fontweight='bold', fontsize=12)
    ax.set_title(f'{model_name}\nPrediction: {predicted_star} stars', fontweight='bold', fontsize=12)
    ax.set_xticks(star_ratings)
    ax.set_ylim(0, max(probs) + 0.15)  # headroom for the percentage labels
    ax.grid(axis='y', alpha=0.3, linestyle='--')

# Add review text as suptitle (truncate if too long)
review_display = review_text if len(review_text) <= 80 else review_text[:77] + '...'
fig.suptitle(f'Review: "{review_display}"', fontsize=14, fontweight='bold', y=1.02)

plt.tight_layout()
plt.savefig(filename, dpi=300, bbox_inches='tight')
plt.close(fig)  # the figure is only written to disk; release it explicitly

print(f"\nVisualization saved: {filename}")
print("=" * 80)

MAKING PREDICTIONS

Review Text: The food was just fine
Review Length: 22 characters, 5 words

Text transformed to TF-IDF: (1, 7000)
Metadata features created: (1, 43)
Combined features: (1, 7043) (expected: (1, 7043))
LSTM sequence prepared: torch.Size([1, 200])

PREDICTIONS

Random Forest (Text-Only):       2 stars
Random Forest (Text + Metadata): 5 stars
XGBoost (Text + Metadata):       1 stars
LSTM (Text-Only):                3 stars

PROBABILITY DISTRIBUTIONS

Random Forest (Text-Only):
  1 star: 19.52% █████████
  2 star: 20.86% ██████████
  3 star: 20.41% ██████████
  4 star: 19.07% █████████
  5 star: 20.14% ██████████

Random Forest (Text + Metadata):
  1 star: 21.85% ██████████
  2 star: 20.15% ██████████
  3 star: 18.15% █████████
  4 star: 17.42% ████████
  5 star: 22.43% ███████████

XGBoost (Text + Metadata):
  1 star: 37.42% ██████████████████
  2 star: 23.05% ███████████
  3 star: 15.86% ███████
  4 star:  8.87% ████
  5 star: 14.80% ███████

LSTM (Text-Only):
  1 star:

[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished



Visualization saved: ../Outputs/Plots/prediction_probabilities_7.png


## 5. Batch Predictions (Optional)

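Use this cell to make predictions on multiple reviews at once. Each review is scored by the XGBoost and LSTM models, using the same default metadata for every review: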

In [6]:
# Define multiple reviews to test
reviews_to_test = [
    "Absolutely terrible! The worst experience ever. Never coming back!",
    "It was okay, nothing special. Average food and service.",
    "Amazing! Best restaurant in town! Highly recommend to everyone!",
    "Pretty good experience overall. Would come back.",
    "Disappointing. Expected much better based on the reviews."
]

# Metadata applied to every review in the batch (only the text varies).
# NOTE(review): user_review_count=50 would fall in the '21-100 reviews' bucket
# of the activity levels, yet user_activity is '6-20 reviews' - confirm.
default_metadata = {
    'useful': 0,
    'funny': 0,
    'cool': 0,
    'total_engagement': 0,
    'review_count': 50,
    'user_review_count': 50,
    'is_open': 1,
    'year': 2024,
    'month': 1,
    'day_of_week': 0,
    'state': 'CA',
    'categories': 'Restaurants',
    'user_activity': '6-20 reviews',
}


def _truncate(text, limit):
    """Return `text` unchanged if it fits in `limit` characters, else its first `limit` characters plus '...'."""
    return text if len(text) <= limit else text[:limit] + '...'


print("=" * 80)
print("BATCH PREDICTIONS")
print("=" * 80)

results = []

for i, review in enumerate(reviews_to_test, 1):
    # Transform for statistical models
    review_tfidf = tfidf.transform([review])

    # Create metadata: text-derived counts plus the shared defaults
    review_df = pd.DataFrame({
        'text': [review],
        'word_count': [len(review.split())],
        'char_count': [len(review)],
        **{column: [value] for column, value in default_metadata.items()},
    })

    metadata_array, _ = extract_metadata_features(review_df, encoders=metadata_encoders, fit=False)
    metadata_sparse = sparse.csr_matrix(metadata_array)
    review_combined = hstack([review_tfidf, metadata_sparse])

    # Prepare for LSTM
    review_sequence = text_to_sequence(review, word2idx, MAX_LEN)
    review_padded = pad_sequences([review_sequence], MAX_LEN)
    review_tensor = torch.LongTensor(review_padded).to(device)

    # Predict with XGBoost and the LSTM (the Random Forests are not part of
    # the batch summary); both emit classes 0-4, shifted here to 1-5 stars
    pred_xgb = int(gbm.predict(review_combined)[0] + 1)

    with torch.no_grad():
        lstm_output = lstm_model(review_tensor)
        pred_lstm = int(lstm_output.argmax(1).cpu().numpy()[0] + 1)

    results.append({
        'Review': _truncate(review, 60),
        'XGBoost': pred_xgb,
        'LSTM': pred_lstm
    })

    # Only add the ellipsis when the review was actually cut short
    print(f"\n{i}. {_truncate(review, 70)}")
    print(f"   XGBoost: {pred_xgb} stars | LSTM: {pred_lstm} stars")

# Summary
results_df = pd.DataFrame(results)
print("\n" + "=" * 80)
print("BATCH RESULTS SUMMARY")
print("=" * 80)
print(results_df.to_string(index=False))

BATCH PREDICTIONS

1. Absolutely terrible! The worst experience ever. Never coming back!...
   XGBoost: 1 stars | LSTM: 1 stars

2. It was okay, nothing special. Average food and service....
   XGBoost: 3 stars | LSTM: 3 stars

3. Amazing! Best restaurant in town! Highly recommend to everyone!...
   XGBoost: 5 stars | LSTM: 5 stars

4. Pretty good experience overall. Would come back....
   XGBoost: 3 stars | LSTM: 4 stars

5. Disappointing. Expected much better based on the reviews....
   XGBoost: 2 stars | LSTM: 2 stars

BATCH RESULTS SUMMARY
                                                         Review  XGBoost  LSTM
Absolutely terrible! The worst experience ever. Never coming...        1     1
        It was okay, nothing special. Average food and service.        3     3
Amazing! Best restaurant in town! Highly recommend to everyo...        5     5
               Pretty good experience overall. Would come back.        3     4
      Disappointing. Expected much better based on the 