In [None]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import time
import os
import gc

In [None]:
# ============================================================================
# GPU CHECK
# ============================================================================
# Abort early on a CPU-only runtime, then report the GPU and pick an encode
# batch size from the amount of device memory.
has_gpu = torch.cuda.is_available()
device = 'cuda' if has_gpu else 'cpu'
if not has_gpu:
    raise SystemExit("NO GPU! Go to Runtime > Change runtime type > Select GPU")

gpu_name = torch.cuda.get_device_name(0)
gpu_props = torch.cuda.get_device_properties(0)
total_vram = gpu_props.total_memory / 1e9  # bytes -> decimal GB

print(f"\n✓ GPU: {gpu_name}")
print(f"✓ VRAM: {total_vram:.2f} GB")

# Aggressive batch sizing for speed: (minimum VRAM in GB, batch size),
# checked from the largest tier down; anything below the last tier gets 512.
vram_tiers = ((15, 1024), (12, 768))
batch_size = next(
    (tier_batch for tier_floor, tier_batch in vram_tiers if total_vram >= tier_floor),
    512,
)

print(f"✓ Batch size: {batch_size}\n")


✓ GPU: Tesla T4
✓ VRAM: 15.83 GB
✓ Batch size: 1024



In [None]:
# ============================================================================
#  DATA LOADING
# ============================================================================
# Defines the input/output locations on the mounted Drive, makes sure the
# output directory exists, and counts the records in the JSON-lines input.
filename = '/content/drive/MyDrive/reviews_complete.json'
output_path = '/content/drive/MyDrive/embeddings/'
os.makedirs(output_path, exist_ok=True)

print(f"Loading from: {filename}")
print("Counting reviews...")

# Count records efficiently. The file is opened in binary mode so nothing is
# decoded (no dependence on the platform's default text encoding), and
# whitespace-only lines are skipped because they hold no review; counting them
# would over-size the array that is allocated from `num_lines`.
with open(filename, 'rb') as f:
    num_lines = sum(1 for raw_line in f if raw_line.strip())
print(f"✓ Found {num_lines:,} reviews\n")

Loading from: /content/drive/MyDrive/reviews_complete.json
Counting reviews...
✓ Found 2,516,821 reviews



In [None]:
# ============================================================================
# LOAD MODEL FIRST
# ============================================================================
# The model is loaded before any allocation so its embedding width can size
# the output array.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
model.eval()  # inference only
embedding_dim = model.get_sentence_embedding_dimension()
print(f"✓ Model loaded ({embedding_dim} dimensions)\n")

# Rough runtime estimate from an assumed throughput (reviews per second).
est_speed = 2500  # T4 with batch_size=1024
est_minutes = (num_lines / est_speed) / 60
print(f"⏱️  Estimated time: {est_minutes:.1f} minutes\n")

# ============================================================================
# PRE-ALLOCATE EVERYTHING
# ============================================================================
# One float32 row per counted review; rows are filled in place during encoding.
print("Pre-allocating arrays...")
embeddings = np.zeros(shape=(num_lines, embedding_dim), dtype=np.float32)
print(f"✓ Embeddings array: {embeddings.nbytes/1e9:.2f} GB\n")


Loading embedding model...
✓ Model loaded (384 dimensions)

⏱️  Estimated time: 16.8 minutes

Pre-allocating arrays...
✓ Embeddings array: 3.87 GB



In [None]:

# ============================================================================
# STREAMING APPROACH - PROCESS WITHOUT LOADING FULL DF
# ============================================================================
# Streams the JSON-lines file chunk by chunk, encodes each chunk's review text
# into the pre-allocated `embeddings` array, and keeps a slim metadata frame
# per chunk so that row i of the embeddings corresponds to row i of the
# metadata. Both are written to `output_path` at the end.
print("Processing reviews in streaming mode...")
start_time = time.time()

chunk_size = 50000
current_idx = 0  # next unwritten row of `embeddings`

# Store metadata separately as we go
metadata_list = []
optional_meta_cols = ['business_id', 'stars', 'sentiment', 'date', 'user_id']

try:
    with tqdm(total=num_lines, desc="Processing") as pbar:
        for chunk in pd.read_json(filename, lines=True, chunksize=chunk_size):
            # Extract only what we need; missing text is embedded as ''.
            texts = chunk['text'].fillna('').tolist()
            if not texts:
                continue

            # Fail with a clear message instead of a NumPy broadcast error if
            # the file yields more rows than were counted up front.
            chunk_end = current_idx + len(texts)
            if chunk_end > len(embeddings):
                raise RuntimeError(
                    f"Input yielded more than the {len(embeddings):,} rows that "
                    "were pre-allocated; recount the input and re-allocate "
                    "`embeddings` before re-running."
                )

            # Save metadata (only essential columns): review_id is required,
            # the remaining columns are kept when the chunk has them.
            keep_cols = ['review_id'] + [
                col for col in optional_meta_cols if col in chunk.columns
            ]
            metadata_list.append(chunk[keep_cols].copy())

            # Hand the whole chunk to encode() and let it do the batching.
            # Per the sentence-transformers docs it orders inputs by length
            # before batching, so each GPU batch holds similarly sized texts;
            # pre-slicing into `batch_size` pieces forced every batch to be
            # padded to its single longest review.
            chunk_embeddings = model.encode(
                texts,
                batch_size=batch_size,
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=True,
            )

            # Store directly
            embeddings[current_idx:chunk_end] = chunk_embeddings
            current_idx = chunk_end
            pbar.update(len(texts))

            # Release the chunk and cached GPU blocks once per chunk.
            del chunk, texts, chunk_embeddings
            gc.collect()
            torch.cuda.empty_cache()

    elapsed = time.time() - start_time
    print(f"\n✓ Complete! {current_idx:,} embeddings in {elapsed/60:.1f} min")
    print(f"✓ Speed: {current_idx/elapsed:.0f} reviews/sec\n")

except Exception as e:
    print(f"\n ERROR: {e}")
    raise

# The up-front line count can exceed the number of parsed records (for
# example when the file contains blank lines). Drop the never-written rows so
# no all-zero vectors are saved and the array stays aligned with the metadata.
if current_idx < len(embeddings):
    print(f"⚠ Parsed {current_idx:,} reviews but pre-allocated "
          f"{len(embeddings):,} rows; trimming unused rows")
    embeddings = embeddings[:current_idx]

# ============================================================================
# SAVE EMBEDDINGS IMMEDIATELY
# ============================================================================
print("Saving embeddings...")
np.save(f'{output_path}review_embeddings.npy', embeddings)
print(f"✓ Saved: review_embeddings.npy ({embeddings.nbytes/1e9:.2f} GB)")

# Free embedding memory
del embeddings
gc.collect()

# ============================================================================
# SAVE METADATA
# ============================================================================
print("Saving metadata...")
metadata = pd.concat(metadata_list, ignore_index=True)
del metadata_list
gc.collect()

# Every embedded row must have exactly one metadata row.
assert len(metadata) == current_idx, (
    f"metadata has {len(metadata):,} rows but {current_idx:,} embeddings were written"
)

metadata.to_parquet(f'{output_path}review_metadata.parquet', index=False)
print(f"✓ Saved: review_metadata.parquet ({len(metadata):,} rows)\n")

print("=" * 70)
print("✓ ALL DONE!")
print(f"✓ Total time: {elapsed/60:.1f} minutes")
print("=" * 70)

Processing reviews in streaming mode...


Processing: 100%|██████████| 2516821/2516821 [1:58:42<00:00, 353.37it/s]



✓ Complete! 2,516,821 embeddings in 118.7 min
✓ Speed: 353 reviews/sec

Saving embeddings...
✓ Saved: review_embeddings.npy (3.87 GB)
Saving metadata...
✓ Saved: review_metadata.parquet (2,516,821 rows)

✓ ALL DONE!
✓ Total time: 118.7 minutes


In [None]:
# ============================================================================
# QUICK TEST
# ============================================================================
# Re-opens both saved artifacts and verifies they have the same number of
# rows before announcing success.
print("Quick test:")
# mmap_mode='r' maps the file read-only instead of reading the whole array
# into RAM; only the header is needed to report the shape.
test = np.load(f'{output_path}review_embeddings.npy', mmap_mode='r')
meta = pd.read_parquet(f'{output_path}review_metadata.parquet')
print(f"✓ Embeddings: {test.shape}")
print(f"✓ Metadata: {meta.shape}")

aligned = len(test) == len(meta)
print(f"✓ Aligned: {aligned}")
# Stop here rather than print the success banner over mismatched files.
assert aligned, (
    f"row mismatch: {len(test):,} embeddings vs {len(meta):,} metadata rows"
)

print("\n" + "="*70)
print("SUCCESS! Files saved to Google Drive:")
print(f"  {output_path}review_embeddings.npy")
print(f"  {output_path}review_metadata.parquet")
print("="*70)

Quick test:
✓ Embeddings: (2516821, 384)
✓ Metadata: (2516821, 5)
✓ Aligned: True

SUCCESS! Files saved to Google Drive:
  /content/drive/MyDrive/embeddings/review_embeddings.npy
  /content/drive/MyDrive/embeddings/review_metadata.parquet
