# Fine-tuning BERT on Review Classification Task

## Model Evaluation

In [None]:
# Run the trainer's evaluation pass and print every metric it returns
if trainer is None:
    print("Cannot evaluate - no trained model available")
else:
    eval_results = trainer.evaluate()
    print("Evaluation Results:")
    for metric_name, metric_value in eval_results.items():
        # Floats are shown with four decimals; any other value is printed as-is
        shown = f"{metric_value:.4f}" if isinstance(metric_value, float) else f"{metric_value}"
        print(f"{metric_name}: {shown}")

In [None]:
# Detailed evaluation with classification report
if trainer is not None:
    from sklearn.metrics import confusion_matrix

    # Get predictions on validation set
    predictions = trainer.predict(tokenized_dataset['validation'])
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids

    # Pin the label set to every known category (label id i <-> CATEGORIES[i]).
    # Without this, a category that never shows up in y_true/y_pred makes the
    # report disagree with target_names and shrinks the confusion matrix so it
    # no longer fits the CATEGORIES index/columns below.
    label_ids = list(range(len(CATEGORIES)))

    # Print detailed classification report
    print("\nDetailed Classification Report:")
    print(classification_report(
        y_true, y_pred,
        labels=label_ids,
        target_names=CATEGORIES,
        digits=4
    ))

    # Confusion matrix (rows: true category, columns: predicted category)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    print("\nConfusion Matrix:")
    cm_df = pd.DataFrame(cm, index=CATEGORIES, columns=CATEGORIES)
    display(cm_df)
else:
    print("Cannot evaluate - no trained model available")

## Model Inference

In [None]:
def predict_reviews(model, tokenizer, texts, batch_size=16, max_length=512):
    """Predict a class index for each review text.

    The model is switched to eval mode (and left in it), the texts are
    tokenized in batches, and each batch is run without gradient tracking on
    whatever device the model's first parameter lives on.

    Args:
        model: Callable module exposing ``eval()``, ``parameters()`` and
            returning an object with a ``logits`` attribute when called with
            the tokenizer output as keyword arguments.
        tokenizer: Callable accepting a list of strings plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments and returning a mapping of tensors.
        texts: A sequence of review strings. A single string is treated as
            one review rather than being sliced into characters.
        batch_size: Number of texts per forward pass; must be >= 1.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A list with one predicted class index per input text, in input order
        (NumPy integer scalars).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialise the input so slicing always yields a list of strings
    texts = [texts] if isinstance(texts, str) else list(texts)

    model.eval()
    predictions = []

    # Inputs must live on the same device as the model weights
    device = next(model.parameters()).device

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize batch
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Move inputs to same device as model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        # Softmax is monotonic, so argmax over the raw logits picks the same
        # class as argmax over the probabilities.
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())

    return predictions

In [None]:
if trainer is not None:
    # Extract the base model for inference (important for multi-GPU)
    if hasattr(trainer.model, 'module'):
        # Model is wrapped in DataParallel/DistributedDataParallel
        inference_model = trainer.model.module
    else:
        # Single GPU or CPU
        inference_model = trainer.model

    # Run inference on one device: the first GPU when CUDA is available,
    # otherwise the CPU (a hard-coded 'cuda:0' fails on CPU-only machines).
    inference_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    inference_model = inference_model.to(inference_device)
    inference_model.eval()

    # Sample some validation examples; cap the sample size so a validation
    # split with fewer than 5 rows does not make random.sample raise.
    validation_size = len(tokenized_dataset['validation'])
    sample_indices = random.sample(range(validation_size), min(5, validation_size))
    sample_texts = [dataset['validation'][i]['combined_text'] for i in sample_indices]
    # int() normalises the label whether it is stored as a plain int or a scalar array/tensor
    sample_labels = [int(tokenized_dataset['validation'][i]['labels']) for i in sample_indices]

    # Make predictions with the unwrapped model
    predictions = predict_reviews(inference_model, tokenizer, sample_texts)

    print("Sample Predictions:")
    print("=" * 80)

    for i, (text, true_label, pred_label) in enumerate(zip(sample_texts, sample_labels, predictions)):
        pred_label = int(pred_label)
        print(f"\nExample {i+1}:")
        # Long reviews are cut to their first 150 characters for display
        print(f"Text: {text[:150]}..." if len(text) > 150 else f"Text: {text}")
        print(f"True Label: {CATEGORIES[true_label]}")
        print(f"Predicted: {CATEGORIES[pred_label]}")
        print(f"Correct: {'✓' if true_label == pred_label else '✗'}")
        print("-" * 40)
else:
    print("Cannot test inference - no trained model available")

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def save_evaluation_results(trainer, tokenized_dataset, categories, output_dir="./evaluation_results"):
    """
    Save comprehensive evaluation results to CSV files.

    Runs ``trainer.predict`` on ``tokenized_dataset['validation']`` and writes
    the following files into ``output_dir`` (created if it does not exist):

    - ``validation_predictions.csv``: one row per validation sample with a
      text preview, true/predicted category, correctness and confidence.
    - ``classification_metrics.csv``: precision, recall, F1 and support per
      category plus the macro and weighted averages.
    - ``confusion_matrix.csv``: rows are true categories, columns predicted.
    - ``evaluation_summary.csv``: sample counts, overall accuracy and
      per-category accuracy for categories that occur in the true labels.
    - ``training_history.csv``: only when ``trainer.state`` has a
      ``log_history`` attribute.

    Args:
        trainer: Object providing ``predict``, ``state`` and a tokenizer
            (``processing_class`` or ``tokenizer``); ``None`` means no model
            was trained.
        tokenized_dataset: Mapping with ``'validation'`` and ``'train'``
            splits; validation rows must expose ``'input_ids'``.
        categories: Sequence of category names; label id ``i`` is treated as
            ``categories[i]``.
        output_dir: Directory the CSV files are written to.

    Returns:
        A dict with ``'accuracy'``, ``'total_samples'`` and ``'results_files'``
        (paths of the files written), or ``None`` when ``trainer`` is ``None``.
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    if trainer is None:
        print("Cannot save evaluation results - no trained model available")
        return None

    # Get predictions on validation set
    validation_split = tokenized_dataset['validation']
    predictions = trainer.predict(validation_split)
    logits = predictions.predictions
    y_pred = np.argmax(logits, axis=1)
    y_true = predictions.label_ids

    # Pin the label set to every known category so the report and confusion
    # matrix keep one row per category even if some category never appears in
    # y_true/y_pred (otherwise target_names/index lengths stop matching).
    label_ids = list(range(len(categories)))

    # Newer Trainer versions expose the tokenizer as `processing_class`;
    # fall back to the older `tokenizer` attribute when that is absent.
    tokenizer = getattr(trainer, 'processing_class', None)
    if tokenizer is None:
        tokenizer = getattr(trainer, 'tokenizer', None)
    if tokenizer is None:
        print("No tokenizer found on trainer - text previews will be empty")

    # Softmax over the class axis once for all rows; the confidence of a row
    # is the probability assigned to its predicted class.
    probs = torch.nn.functional.softmax(torch.as_tensor(logits), dim=-1).numpy()
    confidences = probs[np.arange(len(y_pred)), y_pred]

    # 1. Per-sample results CSV
    validation_results = []
    for i, (true_label, pred_label) in enumerate(zip(y_true, y_pred)):
        # The preview is reconstructed from the token ids, so it reflects the
        # tokenizer's view of the text and may differ from the raw review.
        if tokenizer is not None:
            decoded_text = tokenizer.decode(validation_split[i]['input_ids'], skip_special_tokens=True)
        else:
            decoded_text = ""

        validation_results.append({
            'sample_id': i,
            'text_preview': decoded_text[:100] + "..." if len(decoded_text) > 100 else decoded_text,
            'true_label': categories[true_label],
            'predicted_label': categories[pred_label],
            'correct': bool(true_label == pred_label),
            'confidence': float(confidences[i]),
            'true_label_id': int(true_label),
            'predicted_label_id': int(pred_label)
        })

    # Save per-sample results
    results_df = pd.DataFrame(validation_results)
    results_df.to_csv(f"{output_dir}/validation_predictions.csv", index=False)
    print(f"Per-sample results saved to: {output_dir}/validation_predictions.csv")

    # 2. Classification metrics CSV
    report_dict = classification_report(
        y_true, y_pred,
        labels=label_ids,
        target_names=categories,
        output_dict=True
    )

    def _metrics_row(row_name, report_key):
        # Flatten one entry of the sklearn report dict into a CSV row
        entry = report_dict[report_key]
        return {
            'category': row_name,
            'precision': entry['precision'],
            'recall': entry['recall'],
            'f1_score': entry['f1-score'],
            'support': int(entry['support'])
        }

    metrics_data = [_metrics_row(label, label) for label in categories]
    # Add overall metrics
    metrics_data.append(_metrics_row('macro_avg', 'macro avg'))
    metrics_data.append(_metrics_row('weighted_avg', 'weighted avg'))

    metrics_df = pd.DataFrame(metrics_data)
    metrics_df.to_csv(f"{output_dir}/classification_metrics.csv", index=False)
    print(f"Classification metrics saved to: {output_dir}/classification_metrics.csv")

    # 3. Confusion matrix CSV
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    cm_df = pd.DataFrame(cm, index=categories, columns=categories)
    cm_df.to_csv(f"{output_dir}/confusion_matrix.csv")
    print(f"Confusion matrix saved to: {output_dir}/confusion_matrix.csv")

    # 4. Summary statistics CSV
    total_samples = len(y_true)
    is_correct = (y_true == y_pred)
    correct_predictions = np.sum(is_correct)
    accuracy = correct_predictions / total_samples

    summary_data = [{
        'metric': 'total_samples',
        'value': total_samples
    }, {
        'metric': 'correct_predictions',
        'value': correct_predictions
    }, {
        'metric': 'accuracy',
        'value': accuracy
    }, {
        'metric': 'training_samples',
        'value': len(tokenized_dataset['train'])
    }]

    # Add per-class accuracy (skipped for categories with no true samples)
    for i, category in enumerate(categories):
        class_mask = (y_true == i)
        class_count = np.sum(class_mask)
        if class_count > 0:
            summary_data.append({
                'metric': f'{category}_accuracy',
                'value': np.sum(is_correct & class_mask) / class_count
            })

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv(f"{output_dir}/evaluation_summary.csv", index=False)
    print(f"Evaluation summary saved to: {output_dir}/evaluation_summary.csv")

    results_files = [
        f"{output_dir}/validation_predictions.csv",
        f"{output_dir}/classification_metrics.csv",
        f"{output_dir}/confusion_matrix.csv",
        f"{output_dir}/evaluation_summary.csv"
    ]

    # 5. Training history (if available)
    if hasattr(trainer.state, 'log_history'):
        history_df = pd.DataFrame(trainer.state.log_history)
        history_df.to_csv(f"{output_dir}/training_history.csv", index=False)
        print(f"Training history saved to: {output_dir}/training_history.csv")
        # Report the history file too, since it was written alongside the rest
        results_files.append(f"{output_dir}/training_history.csv")

    return {
        'accuracy': accuracy,
        'total_samples': total_samples,
        'results_files': results_files
    }

# Usage after training: write the CSV reports and echo the headline accuracy
if trainer is None:
    print("No trainer available for evaluation")
else:
    eval_results = save_evaluation_results(trainer, tokenized_dataset, CATEGORIES)
    overall_accuracy = eval_results['accuracy']
    print(f"\nEvaluation complete! Overall accuracy: {overall_accuracy:.4f}")

In [None]:
# Generate evaluation results as CSV with validation predictions
if trainer is not None:
    print("Generating evaluation CSV with validation predictions...")

    # Get predictions on validation set
    predictions = trainer.predict(tokenized_dataset['validation'])
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids
    probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions.predictions), dim=-1)

    # Locate the 'valid' class by name instead of assuming it is column 0, so
    # the probability column stays right if CATEGORIES is reordered. Falls
    # back to column 0 when no category is literally named 'valid'.
    category_names = list(CATEGORIES)
    valid_index = category_names.index('valid') if 'valid' in category_names else 0

    # Get the original validation data
    val_indices = val_df.index.tolist()  # Get original indices

    # NOTE(review): val_df and val_dataset are assumed to hold the same rows,
    # in the same order, as tokenized_dataset['validation'] - confirm upstream.
    # Fail with a clear message rather than an opaque pandas length error.
    if len(val_indices) != len(y_pred) or len(val_dataset) != len(y_pred):
        raise ValueError(
            f"Validation data is misaligned with predictions: "
            f"{len(val_indices)} val_df rows, {len(val_dataset)} val_dataset rows, "
            f"{len(y_pred)} predictions"
        )

    # Create evaluation results dataframe
    eval_results = pd.DataFrame({
        'original_index': val_indices,
        'true_category': [CATEGORIES[label] for label in y_true],
        'predicted_category': [CATEGORIES[pred] for pred in y_pred],
        'classification_result': ['valid' if CATEGORIES[pred] == 'valid' else 'invalid' for pred in y_pred],
        'valid_probability': probabilities[:, valid_index].numpy(),  # Probability of the 'valid' class
        'correct_prediction': y_true == y_pred
    })

    # Add the original text for reference
    eval_results['review_text'] = [val_dataset[i]['combined_text'] for i in range(len(val_dataset))]

    # Save to CSV
    eval_csv_path = "validation_evaluation_results.csv"
    eval_results.to_csv(eval_csv_path, index=False)

    print(f"Evaluation results saved to: {eval_csv_path}")
    print(f"Total validation samples: {len(eval_results)}")
    print(f"Correct predictions: {eval_results['correct_prediction'].sum()}")
    print(f"Accuracy: {eval_results['correct_prediction'].mean():.4f}")

    # Show sample of results
    print("\nSample of evaluation results:")
    display(eval_results.head(10))

else:
    print("No trained model available for evaluation CSV generation")