# Simplified Sentiment Analysis Model Training

A streamlined approach to train a custom sentiment analysis model without API keys or complex setup.

## Quick Overview
- **Dataset**: Cardiff NLP Tweet Eval (60k tweets)
- **Model**: Twitter-optimized RoBERTa
- **Training**: 2 epochs, no external services required
- **Output**: Ready-to-use model for your project

In [None]:
# Quick Setup - Install dependencies and disable wandb
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb to avoid API key prompts

!pip install -q transformers datasets torch scikit-learn matplotlib seaborn
print("✅ Dependencies installed")

✅ Dependencies installed - no API keys required!


In [None]:
# Import everything we need
import torch
import numpy as np
from datetime import datetime
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

print(f"🚀 Setup complete! Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
print(f"📅 Training started: {datetime.now().strftime('%H:%M:%S')}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load and prepare data (this takes ~30 seconds)
print("📥 Loading dataset...")
dataset = load_dataset("tweet_eval", "sentiment")

# Quick preprocessing
import re
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Lowercases the text, replaces URLs with ``[URL]`` and @mentions with
    ``[USER]``, drops the ``#`` from hashtags (keeping the word), and strips
    leading/trailing whitespace.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text.
    """
    # Lowercase first so the upper-case placeholder tokens inserted below
    # are not themselves lowercased.
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '[URL]', text)  # URLs
    text = re.sub(r'@\w+', '[USER]', text)  # Mentions get their own token, not [URL]
    text = re.sub(r'#(\w+)', r'\1', text)  # Remove # from hashtags
    return text.strip()

def preprocess_batch(examples):
    """Clean every entry of the batch's 'text' column and return the batch."""
    cleaned = list(map(clean_text, examples['text']))
    examples['text'] = cleaned
    return examples

dataset = dataset.map(preprocess_batch, batched=True)
print(f"✅ Data loaded: {len(dataset['train'])} train, {len(dataset['validation'])} val, {len(dataset['test'])} test")

In [None]:
# Setup model and tokenizer
# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
print(f"🤖 Loading model: {MODEL_NAME}")

# Tokenizer and classifier are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    # Class ids 0/1/2 get upper-case names; the two maps are inverses of each other.
    id2label={0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"},
    label2id={"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}
)

# Tokenize data
def tokenize_batch(examples):
    """Tokenize the batch's 'text' column with truncation and padding, capped at 128 tokens."""
    encode_options = {'truncation': True, 'padding': True, 'max_length': 128}
    return tokenizer(examples['text'], **encode_options)

tokenized_dataset = dataset.map(tokenize_batch, batched=True, remove_columns=['text'])
tokenized_dataset.set_format("torch")

print("✅ Model and data ready for training!")

In [None]:
# Simple training configuration
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, and the checkpoint ranked best by `eval_accuracy` is reloaded when
# training finishes.
training_args = TrainingArguments(
    output_dir='./simple_sentiment_model',
    num_train_epochs=2,                    # Reduced for faster training
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=200,
    eval_strategy="epoch",                 # Evaluate after each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    seed=42,
    fp16=torch.cuda.is_available(),        # Mixed precision only when CUDA is present
    dataloader_num_workers=0,              # Simplified for stability
    remove_unused_columns=False,
    report_to=[],                          # No external reporting
)

# Metrics function
def compute_metrics(eval_pred):
    """Score one round of evaluation predictions.

    Unpacks ``eval_pred`` into (scores, labels), takes the arg-max class along
    axis 1, and returns accuracy together with weighted precision, recall and F1.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='weighted'
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

print("⚙️ Training configuration ready!")

In [None]:
# Train the model (takes ~5-10 minutes)
# Wire the model, arguments, tokenized train/validation splits and the
# metrics callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🚀 Starting training...")
# Wall-clock timestamps taken around the training call to report its duration.
start_time = datetime.now()

training_result = trainer.train()

end_time = datetime.now()
# Elapsed time converted from seconds to minutes.
duration = (end_time - start_time).total_seconds() / 60

print(f"\n✅ Training completed in {duration:.1f} minutes!")
print(f"📊 Final training loss: {training_result.training_loss:.4f}")
print(f"🎯 Training steps: {training_result.global_step}")

In [None]:
# Quick evaluation and save
print("📈 Evaluating on test set...")
# Run the trainer's evaluation on the held-out test split instead of the
# validation split it was configured with.
test_results = trainer.evaluate(eval_dataset=tokenized_dataset['test'])

print(f"🎯 Test Results:")
# Only accuracy and F1 are printed here; both are read by their 'eval_'-prefixed keys.
print(f"   Accuracy: {test_results['eval_accuracy']:.3f}")
print(f"   F1 Score: {test_results['eval_f1']:.3f}")

# Save the trained model
# The model and the tokenizer are written to the same directory.
save_path = "./trained_sentiment_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"\n💾 Model saved to: {save_path}")
print("🎉 Ready to use in your sentiment analysis project!")

In [None]:
# Quick test with sample texts
# Build an inference pipeline around the in-memory model and tokenizer.
sentiment_analyzer = pipeline(
    "sentiment-analysis", 
    model=model, 
    tokenizer=tokenizer,
    # Device index 0 when CUDA is available, otherwise -1.
    device=0 if torch.cuda.is_available() else -1
)

# Hand-written examples, including a mention/hashtag and a URL case.
test_texts = [
    "I love this product! It's amazing!",
    "This is terrible, worst experience ever.",
    "It's okay, nothing special.",
    "@company thanks for the great service! #happy",
    "Check this out: https://example.com not sure about it"
]

print("🧪 Testing your new model:")
print("=" * 50)
for text in test_texts:
    # Apply the same cleaning that was applied to the dataset before tokenization.
    clean = clean_text(text)
    # The pipeline returns a list; only its first entry is used.
    result = sentiment_analyzer(clean)[0]
    print(f"Text: {text}")
    print(f"Prediction: {result['label']} ({result['score']:.3f})")
    print("-" * 30)

# Integration Instructions

Now you can use your trained model in your project! Here's how:

## Simple Usage (copy this code):

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load your trained model
model_path = "./trained_sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Create analyzer
analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Use it
result = analyzer("I love this product!")
print(result)  # [{'label': 'POSITIVE', 'score': 0.999}]
```

## Integration Steps:
1. Copy the `trained_sentiment_model` folder to your project
2. Replace TextBlob in `src/sentiment_analysis/sentiment_analyzer.py` 
3. Update `config/settings.py` to use the new model
4. Test with your existing data pipeline

**That's it! No API keys, no complex setup required.**

# Custom Sentiment Analysis Model Training

This notebook trains a custom sentiment analysis model using the cardiffnlp/tweet_eval dataset from Hugging Face. The trained model will replace the default TextBlob analyzer in our sentiment analysis project for improved accuracy on social media text.

## Dataset Information
- **Source**: Cardiff NLP Tweet Eval Dataset
- **Task**: Sentiment Classification
- **Labels**: 3 classes (Negative, Neutral, Positive)
- **Size**: ~60k tweets
- **Language**: English

## 1. Install and Import Required Libraries

First, we'll install the necessary libraries and import the required modules for model training and evaluation.

In [None]:
!pip install torch torchvision torchaudio
!pip install transformers datasets vaderSentiment
!pip install scikit-learn matplotlib seaborn accelerate
# Optional: experiment tracking
!pip install wandb

# Verify PyTorch installation and GPU availability
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset, DatasetDict

# PyTorch
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Scikit-learn for metrics
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support, 
    confusion_matrix,
    classification_report
)

# Set random seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("All libraries imported successfully!")

## 2. Load and Explore the Tweet Eval Dataset

Let's load the tweet_eval sentiment dataset and explore its structure, distribution, and sample content.

In [None]:
# Load the tweet_eval sentiment dataset
print("Loading tweet_eval sentiment dataset...")
# Load the "sentiment" configuration of the tweet_eval dataset.
dataset = load_dataset("tweet_eval", "sentiment")

# Display dataset information
print(f"Dataset structure: {dataset}")
print(f"\nDataset features: {dataset['train'].features}")
# Row counts of the three splits accessed below: train, validation, test.
print(f"\nTraining samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

In [None]:
# Explore label distribution
def explore_dataset_labels(dataset_split, split_name):
    """Print and return the label distribution of one dataset split.

    Args:
        dataset_split: Mapping-like split whose ``'label'`` entry is a sized
            iterable of integer class ids (0=Negative, 1=Neutral, 2=Positive).
        split_name: Human-readable split name used in the printed header.

    Returns:
        Dict mapping each label name to its count. Keys are always in
        Negative/Neutral/Positive order and classes with no samples are
        included with a count of 0, so pairing the result positionally with a
        fixed list (e.g. colors) stays correct.
    """
    from collections import Counter

    labels = dataset_split['label']
    label_names = ['Negative', 'Neutral', 'Positive']

    # Tally by class id, then emit in canonical order rather than in the
    # order the classes happen to appear in the data.
    id_counts = Counter(int(label) for label in labels)
    label_counts = {name: id_counts[idx] for idx, name in enumerate(label_names)}

    total = len(labels)
    print(f"\n{split_name} Set Label Distribution:")
    for label, count in label_counts.items():
        # An empty split reports 0.0% instead of raising ZeroDivisionError.
        percentage = (count / total) * 100 if total else 0.0
        print(f"  {label}: {count} ({percentage:.1f}%)")

    return label_counts

# Explore all splits
train_counts = explore_dataset_labels(dataset['train'], 'Training')
val_counts = explore_dataset_labels(dataset['validation'], 'Validation')
test_counts = explore_dataset_labels(dataset['test'], 'Test')

In [None]:
# Visualize label distribution
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
splits = [('Training', train_counts), ('Validation', val_counts), ('Test', test_counts)]

# Colors are looked up by label name so each class always gets the same
# color, whatever order (or subset of classes) a count dict contains.
label_colors = {
    'Negative': '#ff6b6b',  # Red
    'Neutral': '#ffd93d',   # Yellow
    'Positive': '#6bcf7f',  # Green
}

for ax, (split_name, counts) in zip(axes, splits):
    labels = list(counts.keys())
    values = list(counts.values())
    colors = [label_colors[label] for label in labels]

    ax.pie(values, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
    ax.set_title(f'{split_name} Set Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Display sample tweets from each category
label_names = ['Negative', 'Neutral', 'Positive']
samples_per_label = 3

# Collect the first few tweets of every class in a single pass over the
# training split, stopping as soon as every class has enough examples
# (instead of scanning the full split once per class).
samples_by_label = {label_idx: [] for label_idx in range(len(label_names))}
for item in dataset['train']:
    bucket = samples_by_label.get(item['label'])
    if bucket is None or len(bucket) >= samples_per_label:
        continue
    bucket.append(item['text'])
    if all(len(found) >= samples_per_label for found in samples_by_label.values()):
        break

print("Sample tweets by sentiment category:\n")
for label_idx, label_name in enumerate(label_names):
    print(f"=== {label_name.upper()} TWEETS ===")

    for i, text in enumerate(samples_by_label[label_idx], 1):
        print(f"{i}. {text}")
    print()

## 3. Preprocess Text Data

We'll clean and preprocess the tweet text to handle URLs, mentions, hashtags, and other social media-specific elements.

In [None]:
import re
from typing import List, Dict

# Patterns are compiled once instead of on every call.
# A URL starts with http://, https:// or www. and runs over any printable
# ASCII character except '"', '<' and '>'. This keeps query strings,
# fragments ("#...") and "~" inside the match so no URL residue is left behind.
_URL_PATTERN = re.compile(r'(?:https?://|www\.)[!#-;=?-~]+')
_MENTION_PATTERN = re.compile(r'@[A-Za-z0-9_]+')
_HASHTAG_PATTERN = re.compile(r'#([A-Za-z0-9_]+)')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def preprocess_tweet(text: str) -> str:
    """
    Preprocess tweet text by cleaning URLs, mentions, hashtags, etc.

    Steps, in order: lowercase, replace URLs with ``[URL]``, replace
    @mentions with ``[USER]``, strip the ``#`` from hashtags (keeping the
    word), collapse runs of whitespace to one space, and trim the ends.

    Args:
        text: Raw tweet text

    Returns:
        Cleaned tweet text, or an empty string when ``text`` is not a ``str``
    """
    if not isinstance(text, str):
        return ""

    # Lowercase before inserting placeholders so they keep their upper case.
    text = text.lower()

    # URLs go first: a URL may contain '@' or '#', which must not be treated
    # as a mention or hashtag afterwards.
    text = _URL_PATTERN.sub('[URL]', text)
    text = _MENTION_PATTERN.sub('[USER]', text)
    text = _HASHTAG_PATTERN.sub(r'\1', text)

    text = _WHITESPACE_PATTERN.sub(' ', text)
    return text.strip()

# Test preprocessing function
sample_tweets = [
    "I love this product! #amazing @company https://example.com",
    "@user This is terrible... why would anyone buy this? #disappointed",
    "It's okay I guess. Nothing special. Check out www.example.com for more info"
]

# Show each raw tweet next to its preprocessed form.
print("Preprocessing examples:")
for original, cleaned in zip(sample_tweets, map(preprocess_tweet, sample_tweets)):
    print(f"Original: {original}")
    print(f"Cleaned:  {cleaned}\n")

In [None]:
# Apply preprocessing to the entire dataset
def preprocess_dataset(examples):
    """Run preprocess_tweet over the batch's 'text' column and return the batch."""
    examples['text'] = [*map(preprocess_tweet, examples['text'])]
    return examples

# Apply preprocessing to all splits
print("Preprocessing dataset splits...")
processed_dataset = dataset.map(preprocess_dataset, batched=True)

print("Preprocessing completed!")
print(f"Sample processed tweet: {processed_dataset['train'][0]['text']}")

## 4. Split Dataset and Create Data Loaders

We'll use the existing train/validation/test splits, tokenize them, and create a data collator that pads each batch for training.

In [None]:
# Defined before the try block so the install-and-retry path can use it too
# (it was previously assigned only after a successful import).
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"  # Pre-trained on tweets

try:
    from transformers import AutoTokenizer
    print("✓ AutoTokenizer imported successfully")
except ImportError as e:
    print(f"❌ Error importing transformers: {e}")
    print("Installing transformers...")
    import subprocess
    import sys
    # Install with the running interpreter's pip so the package lands in the
    # environment this kernel actually uses.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import AutoTokenizer

# Initialize tokenizer right away (shared by both the normal and fallback paths)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"✓ Tokenizer loaded: {MODEL_NAME}")
print(f"✓ Tokenizer vocab size: {tokenizer.vocab_size}")

In [None]:
# OPTION 2: Import other components when needed (lazy loading approach)
# This spreads the import time across multiple cells instead of one big import

def lazy_import_transformers() -> bool:
    """Import transformers components as needed

    The imported names are declared ``global``, so after a call they are
    available at module (notebook) level, not just inside this function.

    Returns:
        True once the import statement has completed.
    """
    # Declared global so the `from ... import` below binds module-level names.
    global AutoModelForSequenceClassification, Trainer, TrainingArguments
    global DataCollatorWithPadding, pipeline
    
    print("🔄 Loading remaining transformers components...")
    
    from transformers import (
        AutoModelForSequenceClassification,
        Trainer, 
        TrainingArguments,
        DataCollatorWithPadding,
        pipeline
    )
    
    print("✅ All transformers components loaded!")
    return True

# Call this function when you actually need these components
# lazy_import_transformers()

In [None]:
try:
    # Optional dependency: the import itself serves as the availability check.
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    print("✓ VADER available - ultra-fast sentiment analysis option")
    
    # Quick test
    # `analyzer`, `sample` and `score` are cell-level names; `sample` is
    # reassigned by a later cell.
    analyzer = SentimentIntensityAnalyzer()
    sample = "I love this product!"
    score = analyzer.polarity_scores(sample)
    print(f"VADER test: '{sample}' -> {score}")
    
except ImportError:
    # A missing package only produces an install hint.
    print("💡 Install VADER for ultra-fast alternative: pip install vaderSentiment")


In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the batch's 'text' column with the shared tokenizer."""
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=128,  # Tweets are typically short
        return_tensors=None,
    )
    return tokenizer(examples['text'], **tokenizer_options)

# Apply tokenization to all splits
print("Tokenizing dataset...")
tokenized_dataset = processed_dataset.map(
    tokenize_function, 
    batched=True,
    remove_columns=['text']  # Remove original text to save memory
)

print("Tokenization completed!")
print(f"Tokenized dataset structure: {tokenized_dataset}")

In [None]:
# FAST ALTERNATIVE: Create a simple data collator without importing DataCollatorWithPadding
class SimpleDataCollator:
    """Lightweight alternative to DataCollatorWithPadding.

    Pads ``input_ids`` and ``attention_mask`` to the longest sequence in the
    batch, converts every other field to a tensor, and exposes the targets
    under the ``labels`` key.

    Args:
        tokenizer: Object exposing ``pad_token_id``, used to pad ``input_ids``.
        padding: When True (default), right-pad the sequence fields to a
            common length. When False, the sequences are stacked as-is and
            must already share a length.
    """

    # Fields holding per-token sequences that may need padding.
    _SEQUENCE_KEYS = ("input_ids", "attention_mask")
    # Dataset column names that are renamed to ``labels`` in the batch.
    _LABEL_ALIASES = ("label", "label_ids")

    def __init__(self, tokenizer, padding=True):
        self.tokenizer = tokenizer
        self.padding = padding

    def __call__(self, features):
        """Collate a list of per-example dicts into one dict of batched tensors.

        Args:
            features: Non-empty list of dicts sharing the same keys; values
                may be tensors, lists or plain numbers.

        Returns:
            Dict mapping each key to a tensor whose first dimension is the
            batch. A ``label``/``label_ids`` field is returned as ``labels``.

        Raises:
            ValueError: If ``input_ids`` need padding but the tokenizer has
                no ``pad_token_id``.
        """
        batch = {}
        # Keys are taken from the first example; all examples are expected
        # to carry the same ones.
        for key in features[0].keys():
            values = [feature[key] for feature in features]
            if self.padding and key in self._SEQUENCE_KEYS:
                batch[key] = self._pad_sequences(key, values)
            else:
                batch[key] = self._to_tensor(values)

        # The dataset column is named ``label``, while the model's forward
        # pass takes ``labels``; DataCollatorWithPadding performs the same
        # rename, which this replacement has to reproduce.
        for alias in self._LABEL_ALIASES:
            if alias in batch:
                batch["labels"] = batch.pop(alias)

        return batch

    def _pad_sequences(self, key, values):
        """Right-pad the sequences of one field to the batch's max length."""
        import torch

        sequences = [v.tolist() if isinstance(v, torch.Tensor) else list(v) for v in values]
        max_length = max(len(seq) for seq in sequences)

        # input_ids are padded with the tokenizer's pad id, attention_mask with 0.
        pad_value = self.tokenizer.pad_token_id if key == "input_ids" else 0
        if pad_value is None and any(len(seq) < max_length for seq in sequences):
            raise ValueError("tokenizer.pad_token_id is None; cannot pad input_ids")

        return torch.tensor(
            [seq + [pad_value] * (max_length - len(seq)) for seq in sequences]
        )

    @staticmethod
    def _to_tensor(values):
        """Stack tensors, or convert a list of plain values to a tensor."""
        import torch

        if isinstance(values[0], torch.Tensor):
            return torch.stack(values)
        return torch.tensor(values)

# Set dataset format for PyTorch
# Have the tokenized dataset return torch tensors when indexed.
tokenized_dataset.set_format("torch")

# Create our fast data collator
data_collator = SimpleDataCollator(tokenizer)

print("✅ Fast data collator created!")

# Display dataset statistics
print("Dataset Statistics:")
print(f"Training set: {len(tokenized_dataset['train'])} samples")
print(f"Validation set: {len(tokenized_dataset['validation'])} samples")
print(f"Test set: {len(tokenized_dataset['test'])} samples")

# Show sample tokenized data
# NOTE: rebinds the cell-level name `sample`, which an earlier cell used for a string.
sample = tokenized_dataset['train'][0]
print(f"\nSample tokenized data:")
print(f"Input IDs shape: {sample['input_ids'].shape}")
print(f"Attention mask shape: {sample['attention_mask'].shape}")
print(f"Label: {sample['label']}")

print(f"\n🚀 Ready to proceed without waiting for DataCollatorWithPadding import!")

## 5. Define the Model Architecture

We'll use a pre-trained RoBERTa model fine-tuned for Twitter sentiment analysis as our base model.

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,  # 3 sentiment classes
    # Class ids 0/1/2 map to capitalized names; the two dicts are inverses.
    id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
)

# Display model information
print(f"Model: {model.__class__.__name__}")
print(f"Number of parameters: {model.num_parameters():,}")
print(f"Number of labels: {model.num_labels}")
print(f"Label mapping: {model.config.id2label}")

# Move model to GPU if available
# `device` is "cuda" when CUDA is available and "cpu" otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model moved to device: {device}")

## 6. Set Up Training Configuration

Configure training parameters including learning rate, epochs, batch size, and other hyperparameters.

In [None]:
from transformers import TrainingArguments
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',                    # Output directory
    num_train_epochs=3,                        # Number of training epochs
    per_device_train_batch_size=16,           # Batch size for training
    per_device_eval_batch_size=32,            # Batch size for evaluation
    warmup_steps=500,                         # Number of warmup steps
    weight_decay=0.01,                        # Strength of weight decay
    learning_rate=2e-5,                       # Learning rate
    logging_dir='./logs',                     # Directory for storing logs
    logging_steps=100,                        # Log every N steps
    eval_strategy="steps",                    # Changed from evaluation_strategy
    eval_steps=500,                           # Evaluation frequency
    save_strategy="steps",                    # Save every N steps
    save_steps=1000,                          # Save frequency (a multiple of eval_steps)
    load_best_model_at_end=True,             # Load best model at end
    metric_for_best_model="eval_accuracy",    # Metric for best model
    greater_is_better=True,                   # Higher accuracy is better
    save_total_limit=3,                       # Only save last N checkpoints
    seed=42,                                  # Random seed
    fp16=torch.cuda.is_available(),           # Use mixed precision if GPU available
    dataloader_num_workers=4,                 # Number of data loading workers
    remove_unused_columns=False,              # Keep all columns
    push_to_hub=False,                        # Don't push to HuggingFace Hub
    # Turn experiment trackers off explicitly. wandb is installed by the
    # setup cell, and the WANDB_DISABLED environment variable is only set in
    # the training cell, after these arguments already exist.
    report_to="none",
)

print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Train batch size: {training_args.per_device_train_batch_size}")
print(f"  Eval batch size: {training_args.per_device_eval_batch_size}")
print(f"  Mixed precision: {training_args.fp16}")
print("✅ Training arguments configured successfully!")

In [None]:
# Define metrics computation function
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1/precision/recall for one evaluation pass.

    ``eval_pred`` unpacks into (scores, labels); the predicted class is the
    arg-max of the scores along axis 1.
    """
    logits, references = eval_pred
    hard_predictions = np.argmax(logits, axis=1)

    # Index 0/1/2 of the result are precision, recall and F1.
    weighted = precision_recall_fscore_support(
        references, hard_predictions, average='weighted'
    )

    metrics = {'accuracy': accuracy_score(references, hard_predictions)}
    metrics['f1'] = weighted[2]
    metrics['precision'] = weighted[0]
    metrics['recall'] = weighted[1]
    return metrics

print("Metrics computation function defined.")

## 7. Train the Sentiment Analysis Model

Implement the training loop with progress tracking and validation monitoring.

In [None]:
from transformers import Trainer
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    # Batches are assembled by the custom SimpleDataCollator instance.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully!")
print(f"Training dataset size: {len(tokenized_dataset['train'])}")
print(f"Validation dataset size: {len(tokenized_dataset['validation'])}")

In [None]:
import os
# NOTE(review): this is set after TrainingArguments and Trainer were already
# constructed in earlier cells; it may be too late to stop wandb reporting
# from being enabled — confirm, or disable reporting where the arguments are built.
os.environ["WANDB_DISABLED"] = "true"
# Start training
print("Starting model training...")
print(f"Training start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Train the model
training_result = trainer.train()

print(f"Training completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Training loss: {training_result.training_loss:.4f}")
print(f"Total training steps: {training_result.global_step}")

In [None]:
# Plot training history
training_logs = trainer.state.log_history

# Split the log: entries with a 'loss' key are training logs; entries
# without it but with 'eval_loss' are evaluation logs.
train_entries = [log for log in training_logs if 'loss' in log]
eval_entries = [log for log in training_logs if 'loss' not in log and 'eval_loss' in log]

train_loss = [log['loss'] for log in train_entries]
steps = [log['step'] for log in train_entries]
eval_loss = [log['eval_loss'] for log in eval_entries]
eval_accuracy = [log['eval_accuracy'] for log in eval_entries]
eval_f1 = [log['eval_f1'] for log in eval_entries]


def _style_axes(ax, title, ylabel):
    """Apply the title, axis labels, legend and grid shared by every panel."""
    ax.set_title(title)
    ax.set_xlabel('Steps')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True)


# Create training plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Training loss (always drawn)
axes[0, 0].plot(steps, train_loss, 'b-', label='Training Loss')
_style_axes(axes[0, 0], 'Training Loss Over Time', 'Loss')

# Step positions shared by the three validation panels
if eval_loss:
    eval_steps = [log['step'] for log in training_logs if 'eval_loss' in log]

# One row per validation panel: axes, series, line format, legend label, title, y-label
validation_panels = [
    (axes[0, 1], eval_loss, 'r-', 'Validation Loss', 'Validation Loss Over Time', 'Loss'),
    (axes[1, 0], eval_accuracy, 'g-', 'Validation Accuracy', 'Validation Accuracy Over Time', 'Accuracy'),
    (axes[1, 1], eval_f1, 'purple', 'Validation F1', 'Validation F1 Score Over Time', 'F1 Score'),
]
for panel_ax, series, line_format, legend_label, title, ylabel in validation_panels:
    # Panels with no data are left empty.
    if series:
        panel_ax.plot(eval_steps, series, line_format, label=legend_label)
        _style_axes(panel_ax, title, ylabel)

plt.tight_layout()
plt.show()

## 8. Evaluate Model Performance

Evaluate the trained model on the test dataset and calculate comprehensive performance metrics.

In [None]:
# Evaluate on test set
print("Evaluating model on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_dataset['test'])

# Report only the 'eval_'-prefixed entries, with the prefix stripped and the
# remainder title-cased.
print("Test Set Results:")
for key, value in test_results.items():
    if not key.startswith('eval_'):
        continue
    print(f"  {key.replace('eval_', '').title()}: {value:.4f}")

In [None]:
# Get detailed predictions for confusion matrix
# Raw prediction output for the test split; class ids are the arg-max of the
# scores along axis 1, and the reference labels come from `label_ids`.
predictions = trainer.predict(tokenized_dataset['test'])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Calculate detailed metrics
# `output_dict=True` gives a nested dict, indexed by label name below.
report = classification_report(
    y_true, y_pred, 
    target_names=['Negative', 'Neutral', 'Positive'],
    output_dict=True
)

# Display detailed classification report
print("\nDetailed Classification Report:")
print("-" * 50)
for label in ['Negative', 'Neutral', 'Positive']:
    metrics = report[label]
    print(f"{label}:")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall:    {metrics['recall']:.4f}")
    print(f"  F1-score:  {metrics['f1-score']:.4f}")
    # Cast to int so the support prints as a whole number.
    print(f"  Support:   {int(metrics['support'])}")
    print()

# Aggregate rows of the same report dict.
print(f"Overall Accuracy: {report['accuracy']:.4f}")
print(f"Macro Average F1: {report['macro avg']['f1-score']:.4f}")
print(f"Weighted Average F1: {report['weighted avg']['f1-score']:.4f}")

In [None]:
# Create confusion matrix
# The heatmap below puts true labels on the y-axis and predicted labels on the x-axis.
cm = confusion_matrix(y_true, y_pred)
labels = ['Negative', 'Neutral', 'Positive']

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=labels, yticklabels=labels,
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Test Set')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')

# Add percentage annotations
# Each percentage is the cell's share of ALL test samples (not of its row).
total = cm.sum()
for i in range(len(labels)):
    for j in range(len(labels)):
        percentage = (cm[i, j] / total) * 100
        # Offset by 0.7 within the cell so the text sits apart from the count annotation.
        plt.text(j + 0.7, i + 0.7, f'({percentage:.1f}%)', 
                ha='center', va='center', fontsize=10, color='red')

plt.tight_layout()
plt.show()

# Calculate per-class accuracy
# Diagonal (correct predictions) divided by the row sums (samples per true class).
# NOTE(review): a class with no true samples would divide by zero here.
class_accuracy = cm.diagonal() / cm.sum(axis=1)
print("\nPer-class Accuracy:")
for i, label in enumerate(labels):
    print(f"  {label}: {class_accuracy[i]:.4f}")

## 9. Save and Export the Trained Model

Save the trained model, tokenizer, and configuration files for use in the sentiment analysis project.

In [None]:
# Create directory for saving the model
# Target directory for the exported model; created if it does not exist yet.
model_save_path = "./trained_sentiment_model"
os.makedirs(model_save_path, exist_ok=True)

# Save the model and tokenizer
# Both are written into the same directory.
print("Saving trained model and tokenizer...")
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Files saved:")
# List whatever ended up in the directory.
for file in os.listdir(model_save_path):
    print(f"  - {file}")

In [None]:
# Create model configuration file for the project
# Metadata describing the exported model, written next to the model files.
model_config = {
    "model_name": "custom_twitter_sentiment",
    "model_path": model_save_path,
    "base_model": MODEL_NAME,
    "num_labels": 3,
    # NOTE(review): these names are lower-case, while the model's own id2label
    # uses capitalized names — confirm which form consumers expect.
    "label_mapping": {0: "negative", 1: "neutral", 2: "positive"},
    "preprocessing_required": True,
    "max_length": 128,
    "training_date": datetime.now().isoformat(),
    "test_accuracy": float(test_results['eval_accuracy']),
    "test_f1": float(test_results['eval_f1']),
    # Total size of the entries directly inside the save directory, in MiB.
    # Computed before model_config.json itself is written.
    "model_size_mb": sum(os.path.getsize(os.path.join(model_save_path, f)) 
                        for f in os.listdir(model_save_path)) / (1024 * 1024)
}

# Save configuration
import json
config_path = os.path.join(model_save_path, "model_config.json")
with open(config_path, 'w') as f:
    json.dump(model_config, f, indent=2)

print(f"Model configuration saved to: {config_path}")
print("Configuration:")
for key, value in model_config.items():
    print(f"  {key}: {value}")

In [None]:
# Create sentiment analysis pipeline
# Imported here so this cell does not depend on lazy_import_transformers()
# having been called (its call is left commented out).
from transformers import pipeline

# Inference pipeline around the in-memory model and tokenizer.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test samples representing different sentiment types
test_samples = [
    # Positive samples
    "I absolutely love this new feature! It's amazing! 🎉",
    "Best product ever! Highly recommended to everyone!",
    "Great job team! This update is fantastic!",

    # Negative samples
    "This is terrible. Worst experience ever. 😞",
    "I hate this app. It never works properly!",
    "Completely disappointed. Waste of money.",

    # Neutral samples
    "It's okay, nothing special but works fine.",
    "The product arrived on time. Standard quality.",
    "Meeting scheduled for tomorrow at 3 PM.",

    # Social media specific
    "@company your service is down again #frustrated",
    "Check out this link https://example.com not sure about it",
    "Thanks @support for the quick response! #grateful"
]

print("Testing Custom Model Predictions:")
print("=" * 60)

for i, text in enumerate(test_samples, 1):
    # Preprocess once and reuse the result for both prediction and display.
    preprocessed = preprocess_tweet(text)

    # Get prediction from our custom model
    result = sentiment_pipeline(preprocessed)[0]

    print(f"\n{i}. Text: {text}")
    print(f"   Prediction: {result['label']} (confidence: {result['score']:.4f})")

    # Show preprocessed text if it differs by more than lowercasing
    if preprocessed != text.lower():
        print(f"   Preprocessed: {preprocessed}")

In [None]:
# Compare with baseline TextBlob (if available)
# Only the optional import is guarded, so an ImportError raised anywhere else
# in the comparison is not misreported as "TextBlob not installed".
try:
    from textblob import TextBlob
except ImportError:
    print("\nTextBlob not installed. Install with: pip install textblob")
    print("Skipping baseline comparison.")
else:
    def textblob_sentiment(text):
        """Map TextBlob polarity to a label.

        Polarity above 0.1 is "positive", below -0.1 is "negative", and
        anything in between is "neutral".
        """
        polarity = TextBlob(text).sentiment.polarity

        if polarity > 0.1:
            return "positive"
        if polarity < -0.1:
            return "negative"
        return "neutral"

    print("\n" + "=" * 80)
    print("COMPARISON: Custom Model vs TextBlob Baseline")
    print("=" * 80)

    comparison_samples = test_samples[:6]  # Use first 6 samples

    for i, text in enumerate(comparison_samples, 1):
        # Custom model prediction (on preprocessed text, label lower-cased
        # to be comparable with TextBlob's labels)
        custom_result = sentiment_pipeline(preprocess_tweet(text))[0]
        custom_sentiment = custom_result['label'].lower()
        custom_confidence = custom_result['score']

        # TextBlob prediction (on the raw text)
        textblob_sentiment_result = textblob_sentiment(text)

        # Agreement indicator
        agreement = "✓" if custom_sentiment == textblob_sentiment_result else "✗"

        print(f"\n{i}. {text}")
        print(f"   Custom Model:  {custom_sentiment} ({custom_confidence:.4f})")
        print(f"   TextBlob:      {textblob_sentiment_result}")
        print(f"   Agreement:     {agreement}")

In [None]:
# Performance summary and recommendations
banner = "=" * 80
print("\n" + banner)
print("TRAINING SUMMARY AND RECOMMENDATIONS")
print(banner)

# Each section is a heading followed by its already-formatted lines; every
# line is printed with a three-space indent.
summary_sections = [
    ("📊 Model Performance:", [
        f"• Test Accuracy: {test_results['eval_accuracy']:.4f}",
        f"• Test F1 Score: {test_results['eval_f1']:.4f}",
        f"• Model Size: {model_config['model_size_mb']:.1f} MB",
    ]),
    ("🔧 Integration Steps:", [
        f"1. Copy model files from: {model_save_path}",
        "2. Update src/sentiment_analysis/sentiment_analyzer.py",
        "3. Use the integration code provided above",
        "4. Update config/settings.py to use custom model",
    ]),
    ("⚡ Performance Optimizations:", [
        "• Model is optimized for Twitter/social media text",
        "• Preprocessing pipeline handles URLs, mentions, hashtags",
        "• GPU acceleration available if CUDA is present",
        "• Confidence scores provided for filtering low-quality predictions",
    ]),
    ("🎯 Recommendations:", [
        "• Use confidence threshold of 0.7+ for high-quality predictions",
        "• Monitor model performance on new data",
        "• Consider retraining if accuracy drops below 0.80",
        "• Implement A/B testing between custom model and TextBlob",
    ]),
    ("✅ Training completed successfully!", [
        "Model ready for production use in sentiment analysis project.",
    ]),
]

for heading, lines in summary_sections:
    print(f"\n{heading}")
    for line in lines:
        print(f"   {line}")