# Baseline Sentiment Analysis Model

This notebook implements a baseline sentiment analysis model using TF-IDF features and Logistic Regression. It serves as a benchmark for comparison with BERT and as a reference point for robustness testing. It is adapted from `baseline_model.py` to run in Google Colab and reads preprocessed data stored on Google Drive.

## Setup and Imports

Mount Google Drive, install dependencies, and import required libraries.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
!pip install scikit-learn joblib matplotlib seaborn

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import joblib
import logging
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Setup logging: INFO level so training/evaluation progress is visible in the cell output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
# All inputs and outputs live under one folder on the mounted Google Drive.
DATA_DIR = Path('/content/drive/MyDrive/twitter_data')
MODELS_DIR = DATA_DIR / 'models'
RESULTS_DIR = DATA_DIR / 'results'

# Hyperparameters read by BaselineSentimentModel when it builds its pipeline.
BASELINE_CONFIG = {
    'tfidf_max_features': 5000,    # passed to TfidfVectorizer(max_features=...)
    'tfidf_ngram_range': (1, 2),   # passed to TfidfVectorizer(ngram_range=...)
    'lr_C': 1.0,                   # passed to LogisticRegression(C=...)
    'lr_max_iter': 1000            # passed to LogisticRegression(max_iter=...)
}
RANDOM_STATE = 42

# Create output directories if they don't exist.
# parents=True: without it, mkdir raises FileNotFoundError when DATA_DIR itself
# is missing, which aborts this cell before the data-loading cell can report
# the clearer "processed data not found" message.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

## Baseline Sentiment Model Class

Define the `BaselineSentimentModel` class for TF-IDF + Logistic Regression.

In [None]:
class BaselineSentimentModel:
    """
    Baseline sentiment analysis model using TF-IDF features and Logistic Regression.

    The model is a scikit-learn ``Pipeline`` with two steps, ``'tfidf'``
    (``TfidfVectorizer``) and ``'classifier'`` (``LogisticRegression``).
    The plotting and feature-importance helpers assume a binary problem
    whose labels are ``0`` (negative) and ``1`` (positive).
    """

    def __init__(self, config=BASELINE_CONFIG):
        """
        Initialize the baseline model with configuration

        Args:
            config (dict): Model configuration parameters. Must contain the
                keys 'tfidf_max_features', 'tfidf_ngram_range', 'lr_C' and
                'lr_max_iter'.
        """
        # Store a private copy: the default is a shared module-level dict, and
        # a later mutation of it must not make the config saved with the model
        # disagree with the pipeline that was actually built from it.
        self.config = dict(config)
        self.model = None
        self.is_trained = False

        # Create the pipeline
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=self.config['tfidf_max_features'],
                ngram_range=self.config['tfidf_ngram_range'],
                stop_words='english',
                lowercase=True
            )),
            ('classifier', LogisticRegression(
                C=self.config['lr_C'],
                max_iter=self.config['lr_max_iter'],
                random_state=RANDOM_STATE
            ))
        ])

        logger.info("Baseline model initialized")

    def train(self, X_train, y_train, X_val=None, y_val=None):
        """
        Train the baseline model

        Args:
            X_train (array-like): Training texts
            y_train (array-like): Training labels
            X_val (array-like): Validation texts (optional)
            y_val (array-like): Validation labels (optional)

        Returns:
            dict: 'train_accuracy' and 'train_size', plus 'val_accuracy' when
            both X_val and y_val are provided
        """
        logger.info("Training baseline model...")

        # Fit the pipeline
        self.pipeline.fit(X_train, y_train)
        self.is_trained = True

        # Evaluate on training set
        train_pred = self.pipeline.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_pred)

        results = {
            'train_accuracy': train_accuracy,
            'train_size': len(X_train)
        }

        logger.info(f"Training accuracy: {train_accuracy:.4f}")

        # Evaluate on validation set if provided (both halves are required)
        if X_val is not None and y_val is not None:
            val_pred = self.pipeline.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            results['val_accuracy'] = val_accuracy
            logger.info(f"Validation accuracy: {val_accuracy:.4f}")

        # `model` is kept as an alias of the fitted pipeline
        self.model = self.pipeline
        return results

    def predict(self, X):
        """
        Make predictions on new data

        Args:
            X (array-like): Input texts

        Returns:
            array: Predicted labels

        Raises:
            ValueError: If the model has not been trained or loaded
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")

        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        Args:
            X (array-like): Input texts

        Returns:
            array: Prediction probabilities, one column per class

        Raises:
            ValueError: If the model has not been trained or loaded
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")

        return self.pipeline.predict_proba(X)

    def evaluate(self, X_test, y_test, save_results=True):
        """
        Comprehensive model evaluation

        Args:
            X_test (array-like): Test texts
            y_test (array-like): Test labels
            save_results (bool): Whether to save evaluation plots

        Returns:
            dict: 'accuracy', weighted-average 'precision', 'recall' and
            'f1_score', the 'confusion_matrix' as nested lists, and the full
            'classification_report' dict

        Raises:
            ValueError: If the model has not been trained or loaded
        """
        logger.info("Evaluating baseline model...")

        # Make predictions
        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        weighted = report['weighted avg']
        results = {
            'accuracy': accuracy,
            'precision': weighted['precision'],
            'recall': weighted['recall'],
            'f1_score': weighted['f1-score'],
            'confusion_matrix': cm.tolist(),
            'classification_report': report
        }

        logger.info(f"Test accuracy: {accuracy:.4f}")
        logger.info(f"Test F1-score: {results['f1_score']:.4f}")

        # Save visualizations
        if save_results:
            self._save_evaluation_plots(cm, y_test, y_pred_proba)

        return results

    def _save_evaluation_plots(self, cm, y_test, y_pred_proba):
        """
        Save evaluation plots and visualizations

        Writes 'confusion_matrix.png' and 'confidence_distribution.png' into
        RESULTS_DIR / 'baseline_plots'. Both plots assume binary labels 0/1
        with probability column 0 = negative and column 1 = positive.

        Args:
            cm (array): Confusion matrix
            y_test (array-like): True labels
            y_pred_proba (array): Prediction probabilities
        """
        # Create plots directory (parents=True so a missing RESULTS_DIR is not fatal)
        plots_dir = RESULTS_DIR / 'baseline_plots'
        plots_dir.mkdir(parents=True, exist_ok=True)

        # Coerce to arrays so the boolean masks below are element-wise even
        # when labels arrive as a plain list (`[0, 1] == 0` is just False).
        y_true = np.asarray(y_test)
        proba = np.asarray(y_pred_proba)
        is_negative = y_true == 0
        is_positive = y_true == 1

        # 1. Confusion Matrix
        plt.figure(figsize=(8, 6))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=['Negative', 'Positive'],
                        yticklabels=['Negative', 'Positive'])
            plt.title('Baseline Model - Confusion Matrix')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.tight_layout()
            plt.savefig(plots_dir / 'confusion_matrix.png', dpi=300, bbox_inches='tight')
        finally:
            # Close even if drawing/saving fails so figures do not accumulate
            plt.close()

        # 2. Prediction Confidence Distribution
        # One panel per probability column; each overlays the histogram for
        # truly-negative samples (red) and truly-positive samples (blue).
        panels = (
            (0, 'Confidence for Negative Class', 'Confidence Distribution - Negative Class'),
            (1, 'Confidence for Positive Class', 'Confidence Distribution - Positive Class'),
        )
        plt.figure(figsize=(12, 5))
        try:
            for column, xlabel, title in panels:
                plt.subplot(1, 2, column + 1)
                plt.hist(proba[is_negative, column], bins=30, alpha=0.7,
                         color='red', label='True Negative')
                plt.hist(proba[is_positive, column], bins=30, alpha=0.7,
                         color='blue', label='True Positive')
                plt.xlabel(xlabel)
                plt.ylabel('Frequency')
                plt.title(title)
                plt.legend()

            plt.tight_layout()
            plt.savefig(plots_dir / 'confidence_distribution.png', dpi=300, bbox_inches='tight')
        finally:
            plt.close()

        logger.info(f"Evaluation plots saved to {plots_dir}")

    def get_feature_importance(self, top_n=20):
        """
        Get the most important features (words) for each class

        Features are ranked by the first row of the classifier's coefficient
        matrix: the largest coefficients are reported as positive features,
        the smallest as negative features.

        Args:
            top_n (int): Number of top features to return per side. 0 yields
                empty lists; values larger than the vocabulary return every
                feature.

        Returns:
            dict: 'positive_features' and 'negative_features', each a list of
            (feature_name, coefficient) tuples ordered from most to least
            extreme

        Raises:
            ValueError: If the model is not trained or top_n is negative
        """
        if not self.is_trained:
            raise ValueError("Model must be trained first")
        if top_n < 0:
            raise ValueError("top_n must be non-negative")

        # Get feature names and coefficients
        feature_names = self.pipeline.named_steps['tfidf'].get_feature_names_out()
        # NOTE: only row 0 is used, which is the complete picture for a binary
        # classifier; a multi-class model would have further rows ignored here.
        coef = self.pipeline.named_steps['classifier'].coef_[0]

        # Sort once (ascending). Slicing from the reversed order instead of
        # using `[-top_n:]` avoids the `-0` pitfall, where top_n == 0 would
        # select every feature rather than none.
        ascending = np.argsort(coef)
        top_positive_idx = ascending[::-1][:top_n]
        top_negative_idx = ascending[:top_n]

        top_positive_features = [(feature_names[i], coef[i]) for i in top_positive_idx]
        top_negative_features = [(feature_names[i], coef[i]) for i in top_negative_idx]

        return {
            'positive_features': top_positive_features,
            'negative_features': top_negative_features
        }

    def save_model(self, model_path=None):
        """
        Save the trained model

        The pipeline, the config and the trained flag are written together
        with joblib.

        Args:
            model_path (str): Path to save the model. Defaults to
                MODELS_DIR / 'baseline_model.pkl'.

        Raises:
            ValueError: If the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before saving")

        if model_path is None:
            model_path = MODELS_DIR / 'baseline_model.pkl'

        # Make sure the destination folder exists for filesystem paths;
        # anything else (e.g. an open file object) is handed to joblib as-is.
        if isinstance(model_path, (str, Path)):
            Path(model_path).parent.mkdir(parents=True, exist_ok=True)

        model_data = {
            'pipeline': self.pipeline,
            'config': self.config,
            'is_trained': self.is_trained
        }

        joblib.dump(model_data, model_path)
        logger.info(f"Model saved to {model_path}")

    def load_model(self, model_path=None):
        """
        Load a previously trained model

        Replaces this instance's pipeline, config and trained flag with the
        values stored in the file.

        Args:
            model_path (str): Path to the saved model. Defaults to
                MODELS_DIR / 'baseline_model.pkl'.

        Raises:
            FileNotFoundError: If model_path does not exist
        """
        if model_path is None:
            model_path = MODELS_DIR / 'baseline_model.pkl'

        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model file not found: {model_path}")

        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files that come from a trusted source.
        model_data = joblib.load(model_path)
        self.pipeline = model_data['pipeline']
        self.config = model_data['config']
        self.is_trained = model_data['is_trained']
        self.model = self.pipeline

        logger.info(f"Model loaded from {model_path}")

## Main Execution

Load preprocessed data, train, and evaluate the baseline model.

In [None]:
# Locate the pickled train/validation/test splits and fail early, with a hint,
# if the preprocessing notebook has not produced them yet.
data_path = DATA_DIR / 'processed_splits.pkl'
if not data_path.exists():
    logger.error(f"Processed data not found: {data_path}")
    logger.info("Please run data_preprocessing.ipynb first to generate the processed data")
    raise FileNotFoundError(f"Processed data not found: {data_path}")

# Unpack the six arrays stored in the splits file
data = joblib.load(data_path)
X_train, X_val, X_test = data['X_train'], data['X_val'], data['X_test']
y_train, y_val, y_test = data['y_train'], data['y_val'], data['y_test']

logger.info("Loaded preprocessed data splits")

# Build the model with the default configuration, fit it (scoring the
# validation split along the way), then score the held-out test split.
baseline_model = BaselineSentimentModel()
train_results = baseline_model.train(X_train, y_train, X_val, y_val)
test_results = baseline_model.evaluate(X_test, y_test)

# Rank vocabulary terms by classifier coefficient
feature_importance = baseline_model.get_feature_importance()

# Log the ten strongest terms on each side, positive first
for header, key in (
    ("\nTop features for POSITIVE sentiment:", 'positive_features'),
    ("\nTop features for NEGATIVE sentiment:", 'negative_features'),
):
    logger.info(header)
    for term, weight in feature_importance[key][:10]:
        logger.info(f"  {term}: {weight:.4f}")

# Persist the fitted model to its default location
baseline_model.save_model()

# Bundle every metric gathered above and persist it next to the plots
all_results = dict(
    train_results=train_results,
    test_results=test_results,
    feature_importance=feature_importance,
)
joblib.dump(all_results, RESULTS_DIR / 'baseline_results.pkl')

logger.info("Baseline model training and evaluation completed!")

## Usage Instructions

1. **Prerequisites**:
   - Ensure the preprocessed data file (`processed_splits.pkl`) is available in the `twitter_data` folder on your Google Drive, generated by running `data_preprocessing.ipynb`.
2. **Run the Notebook**:
   - Execute all cells in sequence. The first cell mounts Google Drive and installs dependencies.
   - The notebook loads the preprocessed data, trains the TF-IDF + Logistic Regression model, evaluates it, and saves the results.
3. **Outputs**:
   - The trained model is saved as `baseline_model.pkl` in the `twitter_data/models` folder.
   - Evaluation results are saved as `baseline_results.pkl` in the `twitter_data/results` folder.
   - Plots (confusion matrix and confidence distribution) are saved in the `twitter_data/results/baseline_plots` folder.
4. **Notes**:
   - The model uses a TF-IDF vectorizer limited to 5,000 features built from unigrams and bigrams (with English stop words removed), and a Logistic Regression classifier with C=1.0 and up to 1,000 iterations. Adjust these values in `BASELINE_CONFIG` if needed.
   - The notebook logs training, validation, and test accuracies, as well as the top 10 features for positive and negative sentiments.
   - This model is lightweight and does not require a GPU, making it suitable for CPU-only Colab runtimes.