In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import helper
import importlib
importlib.reload(helper)
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
import sentencepiece as spm
import os

In [21]:
df_original = pd.read_csv("train.csv")
df = df_original

In [22]:
df['review'].unique()

array(['Excellent', 'Very good', 'Bad', 'Good', 'Very bad'], dtype=object)

# Removing URLs

In [23]:
df_with_url_detection = helper.apply_url_processing(df, text_column='text', operation='detect')
print(f"Number of reviews with URLs: {df_with_url_detection['has_url'].sum()}")
print(f"Percentage: {df_with_url_detection['has_url'].mean() * 100:.2f}%")

Number of reviews with URLs: 140
Percentage: 2.00%


In [24]:
pd.set_option('display.max_colwidth', None)
df_cleaned = helper.apply_url_processing(df, text_column='text', operation='remove', replacement='')
print(f"\n✅ Created 'text_no_urls' column with URLs removed")

# # Show examples if URLs were found
# if df_with_url_detection['has_url'].any():
#     print(f"\nExamples of cleaned text (first 3 with URLs):")
#     sample_indices = df_with_url_detection[df_with_url_detection['has_url']].head(3).index
#     for idx in sample_indices:
#         original = df.loc[idx, 'text'][:150]
#         cleaned = df_cleaned.loc[idx, 'text'][:150]
#         print(f"\nOriginal: {original}...")
#         print(f"Cleaned:  {cleaned}...")



✅ Created 'text_no_urls' column with URLs removed


In [25]:
df = df_cleaned

# Converting to LowerCase

In [26]:
df['text'] = df['text'].str.lower()

In [27]:
x = df.loc[df['id'] == 7961,'text']
print(x)

0    honestly the best part of this place is the unbelievable deal you can get on pet related products. because they are one of the primary income sources for halo animal rescue, people that have pets, tend to make donations. you can almost always find pet carriers for under $10. they have dog crates for training and you don't have to shell out a fortune for them. they have fish tanks that will allow you to let little johnny get his feet wet in the world of pet care and the associated responsibilities without breaking the bank. if you are a person that shares your life with animals, you understand the rarity of these kinds of deals!\n\nyes they have clothes, furniture, tchatchkies, etc., but the real deals are on the pet products.\n\nas a side note, halo recently leased the mccac location at 5231 north 35th avenue (it was the previous maricopa county run cat adoption center). so if you are looking for a companion (cat or dog), check them out and support one of the fastest growing no-ki

# Target Distribution

In [28]:
# Calculate counts and percentages of target classes
summary = helper.target_distribution(df, target_col="review")

print("Target distribution (count, percent):")
print(summary)
print(f"\nTotal samples: {len(df):,}")
print(
    f"Imbalance ratio (max/min): {summary['count'].max() / summary['count'].min():.2f}"
)

Target distribution (count, percent):
           count    percent
review                     
Bad          648   9.257143
Excellent   2335  33.357143
Good        1024  14.628571
Very bad     524   7.485714
Very good   2469  35.271429

Total samples: 7,000
Imbalance ratio (max/min): 4.71


# Data Augmentation

In [None]:
# # Set to True to augment minority classes (Bad, Good, Very bad)
# run_augment = True
# if run_augment:
#     classes_to_augment = ["Bad", "Good", "Very bad"]
#     df_augmented = helper.augment_classes(
#         df,
#         text_col="text",
#         target_col="review",
#         classes=classes_to_augment,
#         augment_fn=lambda texts: helper.augment_with_bert_insert(
#             texts,
#             model_path="bert-base-uncased",
#             n=1,
#             aug_p=0.2,
#             action="insert",
#         ),
#         n_per_sample=1,
#         target_count=None,  # defaults to current max class size  
#         random_state=42,
#     )

#     print("Original counts:\n", helper.target_distribution(df, "review"))
#     print("\nAugmented counts:\n", helper.target_distribution(df_augmented, "review"))
#     print(f"\nTotal rows before: {len(df):,} | after: {len(df_augmented):,}")

#     # If you want to continue with augmented data:
#     df = df_augmented
# else:
#     print("Set run_augment=True to perform class-specific augmentation.")


Original counts:
            count    percent
review                     
Bad          648   9.257143
Excellent   2335  33.357143
Good        1024  14.628571
Very bad     524   7.485714
Very good   2469  35.271429

Augmented counts:
            count    percent
review                     
Bad         2469  20.219474
Excellent   2335  19.122103
Good        2469  20.219474
Very bad    2469  20.219474
Very good   2469  20.219474

Total rows before: 7,000 | after: 12,211


# Save Augmented Data

In [32]:
# Save augmented dataset to CSV file
save_augmented = True
output_filename = 'train_augmented.csv'  # Change filename if needed

if save_augmented:
    # Use df_augmented if it exists, otherwise use df (if augmentation was applied)
    if 'df_augmented' in locals():
        data_to_save = df_augmented
    elif run_augment and len(df) > len(df_original):
        # df was updated with augmented data
        data_to_save = df
    else:
        data_to_save = None
        print("⚠️ No augmented data found. Run augmentation first (set run_augment=True).")
    
    if data_to_save is not None:
        # Save to CSV
        data_to_save.to_csv(output_filename, index=False, encoding='utf-8')
        
        print(f"✅ Augmented data saved to: {output_filename}")
        print(f"   Total rows: {len(data_to_save):,}")
        print(f"   Columns: {list(data_to_save.columns)}")
        
        if os.path.exists(output_filename):
            file_size_mb = os.path.getsize(output_filename) / 1024 / 1024
            print(f"   File size: {file_size_mb:.2f} MB")
        
        # Show a preview
        print(f"\n   Preview (first 3 rows):")
        print(data_to_save.head(3))
else:
    print("Set save_augmented=True to save the augmented data to CSV.")


✅ Augmented data saved to: train_augmented.csv
   Total rows: 12,211
   Columns: ['id', 'text', 'review']
   File size: 9.25 MB

   Preview (first 3 rows):
       id  \
0  7961.0   
1  4697.0   
2  4459.0   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

# Custom Tokenizer Training

In [42]:
# Train custom tokenizer on your corpus
# Options: 'bpe', 'sentencepiece', 'wordpiece'
# SentencePiece is recommended for smaller datasets

train_tokenizer = True
tokenizer_method = 'bpe'  # or 'bpe', 'wordpiece'
vocab_size = 8000
output_dir = './tokenizers'

if train_tokenizer:
    # Get all texts from dataframe
    texts = df['text'].dropna().tolist()
    print(f"Training tokenizer on {len(texts):,} texts...")
    print(f"Method: {tokenizer_method}, Vocab size: {vocab_size}")
    
    # Create and train tokenizer
    tokenizer = helper.CustomTokenizer(
        vocab_size=vocab_size,
        method=tokenizer_method
    )
    tokenizer.train_from_texts(texts, output_dir=output_dir)
    
    # Test encoding
    sample_texts = df['text'].head(3).tolist()
    print("\n" + "="*60)
    print("Sample encodings:")
    print("="*60)
    for i, text in enumerate(sample_texts, 1):
        encoded = tokenizer.encode(text, max_length=128)
        print(f"\n{i}. Original: {text[:80]}...")
        print(f"   Encoded (first 20 tokens): {encoded[:20]}")
        print(f"   Length: {len([t for t in encoded if t != 0])}")
    
    print("\n✅ Tokenizer ready to use!")
else:
    print("Set train_tokenizer=True to train the custom tokenizer.")


Training tokenizer on 12,211 texts...
Method: bpe, Vocab size: 8000

Sample encodings:

1. Original: honestly the best part of this place is the unbelievable deal you can get on pet...
   Encoded (first 20 tokens): [1746, 111, 475, 660, 135, 179, 218, 126, 111, 4403, 959, 156, 263, 229, 118, 1709, 6275, 2553, 18, 362]
   Length: 128

2. Original: found indulge on a whim, based on their huge "gluten-free menu" outside - i was ...
   Encoded (first 20 tokens): [869, 6680, 118, 44, 7769, 16, 1754, 118, 283, 902, 6, 3359, 17, 827, 465, 6, 907, 17, 52, 139]
   Length: 128

3. Original: my take on mill street is that it's your classic college-town main strip area wi...
   Encoded (first 20 tokens): [164, 520, 118, 1624, 1270, 126, 161, 124, 11, 62, 313, 2881, 2281, 17, 716, 1088, 2266, 644, 175, 44]
   Length: 128

✅ Tokenizer ready to use!


# Model Training Setup


In [43]:
# Prepare data for training
# Load augmented data if available, otherwise use current df
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load augmented data or use current dataframe
if os.path.exists('train_augmented.csv'):
    print("Loading augmented data from train_augmented.csv...")
    df_train = pd.read_csv('train_augmented.csv')
else:
    print("Using current dataframe (no augmented file found)...")
    df_train = df.copy()

# Encode texts using trained tokenizer
if 'tokenizer' not in locals():
    print("⚠️ Tokenizer not found! Train tokenizer first.")
    print("   Loading saved tokenizer...")
    # Try to load saved tokenizer
    if os.path.exists('./tokenizers/bpe_tokenizer.json'):
        from tokenizers import Tokenizer
        loaded_tokenizer = Tokenizer.from_file('./tokenizers/bpe_tokenizer.json')
        # Wrap in CustomTokenizer for consistent interface
        tokenizer = helper.CustomTokenizer(vocab_size=8000, method='bpe')
        tokenizer.tokenizer = loaded_tokenizer
        print("✅ Loaded saved tokenizer")
    else:
        raise ValueError("No tokenizer found. Please train tokenizer first.")

# Encode all texts
print(f"\nEncoding {len(df_train):,} texts...")
texts = df_train['text'].dropna().tolist()

# Use CustomTokenizer's encode_batch if available, otherwise handle HuggingFace tokenizer
if isinstance(tokenizer, helper.CustomTokenizer):
    encoded_texts = tokenizer.encode_batch(texts, max_length=128, return_tensors='pt')
else:
    # Handle HuggingFace tokenizer directly
    encodings = tokenizer.encode_batch(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128
    )
    encoded_texts = torch.tensor([enc.ids for enc in encodings])

# Create labels mapping
label_map = {'Very bad': 0, 'Bad': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}
reverse_label_map = {v: k for k, v in label_map.items()}
labels = torch.tensor([label_map[label] for label in df_train['review'].values], dtype=torch.long)

print(f"✅ Data prepared:")
print(f"   Texts encoded: {encoded_texts.shape}")
print(f"   Labels shape: {labels.shape}")
print(f"   Label distribution:")
for label_name, label_id in label_map.items():
    count = (labels == label_id).sum().item()
    print(f"     {label_name}: {count:,} ({count/len(labels)*100:.1f}%)")


Loading augmented data from train_augmented.csv...

Encoding 12,211 texts...
✅ Data prepared:
   Texts encoded: torch.Size([12211, 128])
   Labels shape: torch.Size([12211])
   Label distribution:
     Very bad: 2,469 (20.2%)
     Bad: 2,469 (20.2%)
     Good: 2,469 (20.2%)
     Very good: 2,469 (20.2%)
     Excellent: 2,335 (19.1%)


In [44]:
# Create train/validation/test splits
train_size = 0.7
val_size = 0.15
test_size = 0.15

# First split: train vs (val + test)
X_temp, X_test, y_temp, y_test = train_test_split(
    encoded_texts, labels, test_size=test_size, random_state=42, stratify=labels
)

# Second split: train vs val
val_size_adjusted = val_size / (train_size + val_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size_adjusted, random_state=42, stratify=y_temp
)

print(f"✅ Data splits created:")
print(f"   Train: {len(X_train):,} samples ({len(X_train)/len(encoded_texts)*100:.1f}%)")
print(f"   Validation: {len(X_val):,} samples ({len(X_val)/len(encoded_texts)*100:.1f}%)")
print(f"   Test: {len(X_test):,} samples ({len(X_test)/len(encoded_texts)*100:.1f}%)")


✅ Data splits created:
   Train: 8,547 samples (70.0%)
   Validation: 1,832 samples (15.0%)
   Test: 1,832 samples (15.0%)


In [45]:
# Create PyTorch Dataset and DataLoaders
class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.texts[idx],
            'labels': self.labels[idx]
        }

# Create datasets
train_dataset = ReviewDataset(X_train, y_train)
val_dataset = ReviewDataset(X_val, y_val)
test_dataset = ReviewDataset(X_test, y_test)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"✅ DataLoaders created:")
print(f"   Batch size: {batch_size}")
print(f"   Train batches: {len(train_loader)}")
print(f"   Val batches: {len(val_loader)}")
print(f"   Test batches: {len(test_loader)}")


✅ DataLoaders created:
   Batch size: 32
   Train batches: 268
   Val batches: 58
   Test batches: 58


# Initialize Model


In [46]:
# Initialize BalancedBERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Get vocab size from tokenizer
if isinstance(tokenizer, helper.CustomTokenizer) and tokenizer.tokenizer is not None:
    # For CustomTokenizer with HuggingFace tokenizer
    if hasattr(tokenizer.tokenizer, 'get_vocab_size'):
        vocab_size = tokenizer.tokenizer.get_vocab_size()
    elif hasattr(tokenizer.tokenizer, 'get_vocab'):
        vocab_size = len(tokenizer.tokenizer.get_vocab())
    else:
        vocab_size = tokenizer.vocab_size  # Use the vocab_size from CustomTokenizer
elif hasattr(tokenizer, 'get_vocab_size'):
    vocab_size = tokenizer.get_vocab_size()
elif hasattr(tokenizer, 'vocab_size'):
    vocab_size = tokenizer.vocab_size
else:
    # Default fallback
    vocab_size = 8000
    print(f"⚠️ Could not determine vocab size, using default: {vocab_size}")

print(f"Using vocab size: {vocab_size}")

# Model hyperparameters
num_classes = 5
max_len = 128
hidden_size = 256
num_layers = 4
num_heads = 8
intermediate_size = 512
dropout = 0.2

# Create model
model = helper.BalancedBERT(
    vocab_size=vocab_size,
    num_classes=num_classes,
    max_len=max_len,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_heads=num_heads,
    intermediate_size=intermediate_size,
    dropout=dropout
).to(device)

print(f"\n✅ Model initialized on {device}")


Using device: cpu
Using vocab size: 8000
BalancedBERT parameters: 5,081,349

✅ Model initialized on cpu


# Training Loop


In [48]:
# Training configuration
train_model = True
num_epochs = 10
learning_rate = 2e-4
weight_decay = 1e-4

if train_model:
    # Loss and optimizer
    import torch.nn as nn
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    
    # Training history
    train_losses = []
    val_losses = []
    val_accuracies = []
    best_val_acc = 0.0
    
    print(f"Starting training for {num_epochs} epochs...")
    print("="*60)
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
        
        avg_train_loss = train_loss / len(train_loader)
        train_acc = 100 * train_correct / train_total
        train_losses.append(avg_train_loss)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        avg_val_loss = val_loss / len(val_loader)
        val_acc = 100 * val_correct / val_total
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_acc)
        
        scheduler.step(avg_val_loss)
        
        # Print progress
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"  Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")
        
        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
            print(f"  ✅ New best model saved! (Val Acc: {best_val_acc:.2f}%)")
        
        print("-"*60)
    
    print(f"\n✅ Training completed!")
    print(f"   Best validation accuracy: {best_val_acc:.2f}%")
    print(f"   Model saved to: best_model.pt")
else:
    print("Set train_model=True to start training.")


Starting training for 10 epochs...
Epoch 1/10
  Train Loss: nan, Train Acc: 20.21%
  Val Loss: nan, Val Acc: 20.20%
  ✅ New best model saved! (Val Acc: 20.20%)
------------------------------------------------------------
Epoch 2/10
  Train Loss: nan, Train Acc: 20.22%
  Val Loss: nan, Val Acc: 20.20%
------------------------------------------------------------
Epoch 3/10
  Train Loss: nan, Train Acc: 20.22%
  Val Loss: nan, Val Acc: 20.20%
------------------------------------------------------------
Epoch 4/10
  Train Loss: nan, Train Acc: 20.22%
  Val Loss: nan, Val Acc: 20.20%
------------------------------------------------------------
Epoch 5/10
  Train Loss: nan, Train Acc: 20.22%
  Val Loss: nan, Val Acc: 20.20%
------------------------------------------------------------
Epoch 6/10
  Train Loss: nan, Train Acc: 20.22%
  Val Loss: nan, Val Acc: 20.20%
------------------------------------------------------------
Epoch 7/10
  Train Loss: nan, Train Acc: 20.22%
  Val Loss: nan, Val 

In [None]:
# Load best model and evaluate on test set
if train_model and os.path.exists('best_model.pt'):
    import torch.nn as nn
    print("Loading best model for evaluation...")
    model.load_state_dict(torch.load('best_model.pt'))
    
    model.eval()
    all_predictions = []
    all_labels = []
    test_loss = 0.0
    
    criterion = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    test_accuracy = accuracy_score(all_labels, all_predictions) * 100
    avg_test_loss = test_loss / len(test_loader)
    
    print(f"\n{'='*60}")
    print("TEST SET RESULTS")
    print(f"{'='*60}")
    print(f"Test Loss: {avg_test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.2f}%")
    print(f"\nClassification Report:")
    print(classification_report(
        all_labels, all_predictions,
        target_names=['Very bad', 'Bad', 'Good', 'Very good', 'Excellent']
    ))
    
    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(all_labels, all_predictions)
    print(cm)
else:
    print("Train model first (set train_model=True in previous cell).")


In [None]:
# Example: Encode batches for model training
# This is useful when preparing data for neural network training

if train_tokenizer and 'tokenizer' in locals():
    # Example batch encoding
    batch_texts = df['text'].head(10).tolist()
    encoded_batch = tokenizer.encode_batch(batch_texts, max_length=128, return_tensors='pt')
    
    print(f"Batch shape: {encoded_batch.shape}")
    print(f"Batch size: {len(batch_texts)}, Max length: 128")
    print(f"\nFirst 3 sequences (first 10 tokens each):")
    for i in range(min(3, len(batch_texts))):
        print(f"  {i+1}. {encoded_batch[i][:10].tolist()}")
    
    # You can use this for training:
    # X = tokenizer.encode_batch(df['text'].tolist(), max_length=128, return_tensors='pt')
    # y = df['review'].map({'Very bad': 0, 'Bad': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}).values
else:
    print("Train tokenizer first (set train_tokenizer=True in previous cell).")
