<a href="https://colab.research.google.com/github/FujunhaoFc/Word2Vec/blob/main/Bert_for_BagOfWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade required packages (run in first cell)
!pip install -q transformers datasets evaluate accelerate

# Core imports
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from transformers.trainer_utils import get_last_checkpoint

# Report whether a CUDA device is visible and, if so, its name and total memory
has_gpu = torch.cuda.is_available()
print(f"GPU Available: {has_gpu}")
if has_gpu:
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_properties.total_memory / 1024**3:.1f} GB")


In [None]:
import random

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different algorithms from
    # run to run, which would undermine the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
# Load the competition data (tab-separated files stored on Google Drive)
DATA_DIR = '/content/drive/MyDrive/Word2Vec/BagOfWords'
TSV_OPTIONS = dict(delimiter='\t', quoting=3)  # quoting=3 is csv.QUOTE_NONE

train = pd.read_csv(f'{DATA_DIR}/labeledTrainData.tsv', **TSV_OPTIONS)
test = pd.read_csv(f'{DATA_DIR}/testData.tsv', **TSV_OPTIONS)

for frame_label, frame in (("Train", train), ("Test", test)):
    print(f"{frame_label} shape: {frame.shape}")

first_review = train['review'][0]
print(f"\nSample review (raw):\n{first_review[:200]}...")

In [None]:
import nltk
# Download the NLTK stop word corpus; clean_review reads it through
# stopwords.words("english").
nltk.download("stopwords")

from nltk.corpus import stopwords # Import the stop word list

In [None]:
# Negations and scarcity adverbs that carry sentiment; never removed as stop words.
_SENTIMENT_KEEP_WORDS = frozenset({
    'not', 'no', 'nor', 'never', 'neither', 'nobody',
    'nothing', 'nowhere', 'none', 'barely', 'hardly',
    'scarcely', 'seldom',
})

# Contraction expansions, applied in order. The irregular forms come first so
# that "can't"/"won't" do not degrade into the junk tokens "ca"/"wo".
_CONTRACTIONS = (
    ("can't", "can not"),
    ("won't", "will not"),
    ("n't", " not"),
    ("'m", " am"),
    ("'s", " is"),
    ("'re", " are"),
    ("'ll", " will"),
    ("'ve", " have"),
)

# Lazily built set of stop words to drop (see _removable_stopwords).
_removable_stopword_cache = None


def _removable_stopwords():
    """Return the English stop words minus the sentiment words, building the set once."""
    global _removable_stopword_cache
    if _removable_stopword_cache is None:
        _removable_stopword_cache = (
            frozenset(stopwords.words("english")) - _SENTIMENT_KEEP_WORDS
        )
    return _removable_stopword_cache


def clean_review(raw_review, remove_stopwords=True):
    """Normalize one raw review into a lowercase, space-separated string of words.

    Steps: strip HTML, lowercase, expand common contractions, replace every
    non-letter with a space, and optionally drop English stop words while
    keeping negations and scarcity adverbs such as "not" or "hardly".

    Args:
        raw_review: Review text, possibly containing HTML markup.
        remove_stopwords: When True (default), remove NLTK English stop words
            other than the sentiment-bearing words listed in
            ``_SENTIMENT_KEEP_WORDS``.

    Returns:
        The cleaned review as a single string of words joined by spaces.

    NOTE(review): this bag-of-words style cleaning removes punctuation and
    function words that a BERT tokenizer can normally use; it may be worth
    comparing against ``remove_stopwords=False`` - TODO confirm empirically.
    """
    # Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # Lowercase *before* expanding contractions so that "DON'T" / "Isn't" are
    # matched too; otherwise they split into stop words and the negation is lost.
    review_text = review_text.lower()

    # Treat the typographic apostrophe like the ASCII one.
    review_text = review_text.replace("\u2019", "'")

    # Handle contractions
    for contraction, expansion in _CONTRACTIONS:
        review_text = review_text.replace(contraction, expansion)

    # Remove non-letters and split on whitespace
    words = re.sub("[^a-zA-Z]", " ", review_text).split()

    if remove_stopwords:
        # The stop word set is cached: rebuilding it for every review is wasteful.
        stops = _removable_stopwords()
        words = [w for w in words if w not in stops]

    return " ".join(words)

In [None]:
# Clean the review column of both frames in place (default stop-word removal)
for reviews_frame in (train, test):
    reviews_frame['review'] = reviews_frame['review'].apply(clean_review)

print(train['review'])

In [None]:
# Build a Hugging Face Dataset from the two relevant columns and rename them
# to the names the Trainer pipeline below works with ('text' / 'labels').
train_dataset = (
    Dataset.from_pandas(train[['review', 'sentiment']])
    .rename_column('review', 'text')
    .rename_column('sentiment', 'labels')
)

# 90/10 train/validation split; the held-out part is stored under the 'test' key
dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
for split_label, split_key in (("Train", "train"), ("Validation", "test")):
    print(f"{split_label} samples: {len(dataset[split_key])}")

In [None]:
# Initialize tokenizer - must match your chosen model
# model_name is reused further down when the classification model is loaded,
# so tokenizer and model always come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define tokenization function with memory-optimized settings
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the notebook's tokenizer.

    Sequences are truncated to 256 tokens and left unpadded; padding is
    deferred to the data collator. Plain Python lists are returned, which is
    what ``datasets.map`` works with.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the global ``tokenizer`` returns for the batch.
    """
    tokenizer_options = {
        "padding": False,        # dynamic padding happens in the data collator
        "truncation": True,      # cut sequences longer than max_length
        "max_length": 256,       # shorter than 512 to reduce memory usage
        "return_tensors": None,  # lists, not tensors
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits; batched=True passes lists of examples to tokenize_function
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Look at the first training example to confirm the new columns exist
first_tokenized = tokenized_dataset['train'][0]
print("Tokenized sample:")
print(f"Input IDs length: {len(first_tokenized['input_ids'])}")
print(f"Attention mask length: {len(first_tokenized['attention_mask'])}")

In [None]:
# Create data collator for efficient dynamic padding
# tokenize_function leaves sequences unpadded (padding=False), so padding is
# deferred to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the pretrained encoder with a two-class classification head
id2label = {0: "NEGATIVE", 1: "POSITIVE"}                  # label mapping for interpretability
label2id = {name: idx for idx, name in id2label.items()}   # reverse mapping
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Report model size: one pass over the parameters, then two sums
parameter_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in parameter_sizes)
trainable_params = sum(size for size, is_trainable in parameter_sizes if is_trainable)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size: ~{total_params * 4 / 1024**2:.1f} MB (fp32)")  # 4 bytes per fp32 value

In [None]:
# Training configuration for fine-tuning the classifier.
# NOTE: resuming after a disconnect is NOT configured here. Trainer does not act
# on TrainingArguments.resume_from_checkpoint; a checkpoint must be passed to
# trainer.train(resume_from_checkpoint=...) instead, so the former no-op
# `resume_from_checkpoint=True` entry has been dropped.
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir="/content/drive/MyDrive/Word2Vec/imdb_bert_results",
    save_strategy="steps",                 # Frequent saves because the runtime may disconnect
    save_steps=500,                        # Save every 500 steps (same cadence as eval_steps)
    save_total_limit=2,                    # Limit the number of checkpoints kept on disk
    load_best_model_at_end=True,           # Load the best checkpoint after training

    # Training parameters
    num_train_epochs=3,                    # 2-3 epochs typical for BERT fine-tuning
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,         # Evaluation can use larger batches

    # Optimizer and learning rate
    learning_rate=2e-5,                    # Standard BERT fine-tuning LR (2e-5 to 5e-5)
    weight_decay=0.01,
    warmup_steps=500,                      # Gradual LR warmup for stability

    # Evaluation strategy
    eval_strategy="steps",
    eval_steps=500,                        # Evaluate every 500 steps
    metric_for_best_model="f1",            # Use F1 for model selection
    greater_is_better=True,

    # Memory optimization
    fp16=torch.cuda.is_available(),        # Mixed precision only when a CUDA device exists
    gradient_accumulation_steps=2,         # Effective batch size = 16 * 2 = 32
    gradient_checkpointing=False,          # Disabled for speed (enable if OOM)

    # Performance optimization
    dataloader_num_workers=2,              # Parallel data loading
    dataloader_pin_memory=True,

    # Logging
    logging_dir="./logs",
    logging_steps=100,
    logging_strategy="steps",
    report_to="none",                      # Disable wandb/tensorboard reporting

    # Disable push to hub (not needed for competition)
    push_to_hub=False,
)

In [None]:
# Define evaluation metrics
# Both metric objects are loaded once here and reused by compute_metrics on
# every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and binary F1.

    Passed to the Trainer as its ``compute_metrics`` callback.

    Args:
        eval_pred: Two-item iterable of logits and reference labels.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    shared_inputs = {
        "predictions": np.argmax(raw_scores, axis=-1),
        "references": gold_labels,
    }
    return {
        "accuracy": accuracy_metric.compute(**shared_inputs)["accuracy"],
        "f1": f1_metric.compute(average="binary", **shared_inputs)["f1"],
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Trainer only resumes when a checkpoint is handed to train(). Look for the
# newest checkpoint left in output_dir by an interrupted run; fall back to a
# fresh start when the directory is missing or holds no checkpoint.
# To force a fresh run, empty or change output_dir first.
checkpoint_root = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)

# Train the model
print("Starting training...")
if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")
trainer.train(resume_from_checkpoint=last_checkpoint)

# Evaluate on validation set
print("\nEvaluating on validation set...")
results = trainer.evaluate()
print(f"Validation Accuracy: {results['eval_accuracy']:.4f}")
print(f"Validation F1: {results['eval_f1']:.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def detailed_evaluation(trainer, dataset, dataset_name="Validation"):
    """Print a classification report, confusion matrix and summary metrics.

    Args:
        trainer: Object whose ``predict(dataset)`` returns ``predictions``
            (per-class scores) and ``label_ids`` (reference labels).
        dataset: Labeled dataset to evaluate.
        dataset_name: Name used in the printed headings.

    Returns:
        Tuple ``(pred_labels, true_labels, cm)`` where ``cm`` is the 2x2
        confusion matrix with rows = true class and columns = predicted class.

    Raises:
        ValueError: If the prediction output carries no reference labels.
    """
    # Get predictions
    predictions = trainer.predict(dataset)
    true_labels = predictions.label_ids
    if true_labels is None:
        raise ValueError(
            f"{dataset_name} dataset has no labels; cannot compute evaluation metrics."
        )
    pred_labels = predictions.predictions.argmax(axis=-1)

    # Fix the label set explicitly so the report and the matrix always cover
    # both classes, even if one never occurs in the labels or predictions.
    class_ids = [0, 1]

    # Classification report
    print(f"\n{dataset_name} Set Classification Report:")
    print(classification_report(
        true_labels,
        pred_labels,
        labels=class_ids,
        target_names=["Negative", "Positive"],
        digits=4
    ))

    # Confusion matrix
    cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)
    tn, fp, fn, tp = (int(count) for count in cm.ravel())
    print(f"\nConfusion Matrix:")
    print(f"TN: {tn}, FP: {fp}")
    print(f"FN: {fn}, TP: {tp}")

    def _safe_ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0.0, not NaN.
        return numerator / denominator if denominator else 0.0

    # Calculate additional metrics
    accuracy = _safe_ratio(tn + tp, tn + fp + fn + tp)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    print(f"\nSummary Metrics:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    return pred_labels, true_labels, cm

# Detailed evaluation on the held-out validation split
pred_labels, true_labels, cm = detailed_evaluation(
    trainer=trainer,
    dataset=tokenized_dataset["test"],
    dataset_name="Validation",
)

In [None]:
# Wrap the cleaned test reviews in a Dataset and tokenize them like the training data
test_dataset = Dataset.from_pandas(test[['review']]).rename_column('review', 'text')
test_tokenized = test_dataset.map(tokenize_function, batched=True)

# Run the fine-tuned model over the test set
print("Generating predictions on test set...")
predictions = trainer.predict(test_tokenized)
test_logits = predictions.predictions

# Predicted label = index of the largest logit
pred_labels = test_logits.argmax(axis=-1)

# Optional: softmax probabilities; confidence is the larger of the two per row
import torch.nn.functional as F
probs = F.softmax(torch.tensor(test_logits), dim=-1)
confidence_scores = probs.max(dim=-1).values.numpy()

for summary_line in (
    f"Predictions generated: {len(pred_labels)}",
    f"Class distribution: {np.bincount(pred_labels)}",
    f"Average confidence: {confidence_scores.mean():.4f}",
):
    print(summary_line)

In [None]:
def predict_sentiment(text, trainer, tokenizer, threshold=0.5):
    """Predict the sentiment of a single raw text.

    The text goes through ``clean_review`` and the given tokenizer (truncated
    to 256 tokens) before being scored by ``trainer.model``.

    Args:
        text: Raw review text.
        trainer: Object exposing the fine-tuned model as ``trainer.model``.
        tokenizer: Tokenizer matching the model.
        threshold: The text is labeled POSITIVE when the positive-class
            probability is strictly greater than this value. The default of
            0.5 picks the more probable class.

    Returns:
        Dict with ``"sentiment"`` ("POSITIVE"/"NEGATIVE"), ``"confidence"``
        (probability of the returned class) and ``"label"`` (1 or 0).
    """
    # Clean and tokenize
    cleaned_text = clean_review(text)
    inputs = tokenizer(
        cleaned_text,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(trainer.model.device)

    # Make sure dropout is disabled even if the model was left in training mode.
    trainer.model.eval()

    # Get prediction; torch.softmax avoids depending on an alias imported in another cell
    with torch.no_grad():
        outputs = trainer.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0]  # single input -> first row

    positive_probability = probs[1].item()
    predicted_class = 1 if positive_probability > threshold else 0
    confidence = probs[predicted_class].item()

    return {
        "sentiment": "POSITIVE" if predicted_class == 1 else "NEGATIVE",
        "confidence": confidence,
        "label": predicted_class
    }

# Smoke-test the single-text helper on a positive, a negative and a mixed review
sample_texts = [
    "This movie was absolutely fantastic! Highly recommend it.",
    "Terrible film. Waste of time and money.",
    "It was okay, nothing special but not terrible either."
]

for text in sample_texts:
    result = predict_sentiment(text, trainer, tokenizer)
    predicted_sentiment, predicted_confidence = result['sentiment'], result['confidence']
    print(f"Text: {text[:60]}...")
    print(f"Prediction: {predicted_sentiment} (confidence: {predicted_confidence:.2%})\n")

In [None]:
# Single place for the output location (used for writing, reporting and re-reading)
SUBMISSION_PATH = '/content/drive/MyDrive/Word2Vec/submission.csv'

# Pair every test id with its predicted label
submission = pd.DataFrame({'id': test['id'], 'sentiment': pred_labels})

# Show what is about to be written
print("Submission Preview:")
print(submission.head(10))
print(f"\nSubmission shape: {submission.shape}")
print(f"Sentiment distribution:\n{submission['sentiment'].value_counts()}")

# Sanity checks: row count, label values, unique ids
row_count = submission.shape[0]
distinct_labels = set(submission['sentiment'].unique())
distinct_ids = submission['id'].nunique()
assert row_count == 25000, "Wrong number of predictions!"
assert distinct_labels == {0, 1}, "Invalid labels!"
assert distinct_ids == 25000, "Duplicate IDs!"

# Write the file without quoting any field; backslash is the escape character
import csv
submission.to_csv(
    SUBMISSION_PATH,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)
print(f"\nSubmission file saved to: {SUBMISSION_PATH}")

# Echo the first six lines (header + five rows) to verify the format
with open(SUBMISSION_PATH, 'r') as submission_file:
    print("\nSubmission file contents:")
    for _, line in zip(range(6), submission_file):
        print(line.strip())

The next part of this task is to repeat the experiment with other models, such as RoBERTa and DeBERTa.