In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%writefile requirements.txt
# This file lists all the Python packages and their exact versions required for this project.
# Using specific versions ensures reproducibility across different environments.

# Deep Learning Frameworks and NLP Libraries
tensorflow==2.15.0        # Core deep learning library by Google, enabling neural network creation and training.
tensorflow-hub==0.14.0    # For publishing, discovering, and reusing pre-trained ML modules.
tensorflow-text==2.15.0   # Provides TensorFlow operations for text processing (crucially, version must match core TensorFlow for compatibility).
transformers==4.35.0      # Hugging Face library for state-of-the-art NLP models (like BERT) for model architecture and tokenization.
torch==2.1.0              # PyTorch deep learning framework (often a dependency for Hugging Face Transformers internal operations).

# Data Manipulation and Machine Learning Utilities
pandas==2.0.3             # Fundamental library for data manipulation and analysis using DataFrames.
numpy==1.26.4             # Basic numerical computing library for array operations.
scikit-learn==1.3.0       # Machine learning library for data splitting, preprocessing, and evaluation metrics.

# Visualization Libraries
matplotlib==3.7.2         # Basic plotting library.
seaborn==0.12.2           # High-level statistical data visualization based on Matplotlib.
plotly==5.17.0            # Interactive graphing library for web-based, dynamic plots.

# FastAPI API Development and Server Components
fastapi==0.104.1          # Modern, high-performance web framework for building APIs.
uvicorn==0.24.0           # ASGI server to run FastAPI applications.
pydantic==2.4.2           # Data validation and settings management (used by FastAPI for request/response models).
python-multipart==0.0.6   # Supports handling form data in FastAPI requests.

# Web and Utility Libraries
jinja2==3.1.2             # Powerful templating engine (useful for rendering HTML in web UIs, though not directly used in API core).
aiofiles==23.2.0          # Enables asynchronous file I/O operations (for non-blocking file access in async apps).
python-dotenv==1.0.0      # Loads environment variables from .env files (for secure management of sensitive data like API keys).

# Logging and Rich Terminal Output
loguru==0.7.2             # Simplified and powerful logging library.
rich==13.6.0              # Library for rich text and beautiful formatting in the terminal.

# Streamlit (Optional: Included for convenience as a potential future UI framework)
streamlit==1.28.0         # Open-source framework for building interactive web applications for ML/Data Science.

In [None]:
%%writefile config.py
import os # Module for interacting with the operating system (e.g., path operations).
from pathlib import Path # Object-oriented filesystem paths, preferred for cross-platform path handling.
from dataclasses import dataclass # Decorator to easily create classes that store data (config settings).
from typing import Optional # For type hinting, indicating a value can be None or a specific type.

@dataclass
class ModelConfig:
    """Hyperparameters describing the BERT sentiment model and how it is fine-tuned.

    Every field has a default, so `ModelConfig()` yields the project's baseline setup.
    """

    # Identifier of the pre-trained BERT checkpoint (Hugging Face Transformers name).
    model_name: str = "bert-base-uncased"

    # Maximum token sequence length used for tokenization (longer inputs are truncated).
    max_length: int = 128

    # Number of sentiment classes predicted by the model (2 = negative / positive).
    num_classes: int = 2

    # Dropout probability used in the classification head for regularization.
    dropout_rate: float = 0.1

    # Initial learning rate handed to the optimizer.
    learning_rate: float = 2e-5

    # Number of samples per training batch.
    batch_size: int = 16

    # Number of complete passes over the training data.
    epochs: int = 3

    # Fraction of the training data held out for validation.
    validation_split: float = 0.2

@dataclass
class TrainingConfig:
    """Settings that steer the training loop: checkpointing, evaluation cadence and model selection.

    Every field has a default, so `TrainingConfig()` yields the project's baseline setup.
    """

    # When checkpoints are written ('epoch' = after every epoch).
    save_strategy: str = "epoch"

    # When the validation set is evaluated ('epoch' = after every epoch).
    evaluation_strategy: str = "epoch"

    # Number of training steps between progress log entries.
    logging_steps: int = 100

    # Upper bound on the number of checkpoints kept on disk.
    save_total_limit: int = 3

    # Whether the best-scoring checkpoint should be restored once training ends.
    load_best_model_at_end: bool = True

    # Name of the metric that decides which checkpoint is "best".
    metric_for_best_model: str = "eval_accuracy"

    # True when larger values of `metric_for_best_model` are better.
    greater_is_better: bool = True

@dataclass
class ProjectConfig:
    """Project paths and settings, defining the directory structure for inputs and outputs.

    Only `project_root` needs to be supplied; every sub-directory that is left as
    ``None`` is derived from the *instance's* `project_root` in `__post_init__`.
    (Previously the sub-directory defaults were computed once in the class body from
    the default root, so passing a custom `project_root` silently kept pointing the
    sub-directories at `/kaggle/working`.)

    Attributes:
        project_root: Root directory of the project. Defaults to '/kaggle/working'
            for execution within Kaggle Notebooks.
        data_dir: Raw and processed data files. Defaults to `<project_root>/data`.
        models_dir: Trained model checkpoints. Defaults to `<project_root>/models`.
        logs_dir: Application logs. Defaults to `<project_root>/logs`.
        output_dir: Evaluation results, reports and plots. Defaults to `<project_root>/outputs`.
        temp_dir: Temporary files. Defaults to `<project_root>/temp`.
    """
    project_root: Path = Path("/kaggle/working").absolute()
    data_dir: Optional[Path] = None
    models_dir: Optional[Path] = None
    logs_dir: Optional[Path] = None
    output_dir: Optional[Path] = None
    temp_dir: Optional[Path] = None

    def __post_init__(self):
        """
        Runs automatically after an instance is created.

        Resolves every directory that was not given explicitly relative to
        `self.project_root`, then makes sure each directory exists on disk.
        """
        # Accept plain strings as well as Path objects for the root.
        self.project_root = Path(self.project_root)

        # attribute name -> sub-directory name used when the attribute is left unset
        default_subdirs = {
            "data_dir": "data",
            "models_dir": "models",
            "logs_dir": "logs",
            "output_dir": "outputs",
            "temp_dir": "temp",
        }
        for attr_name, subdir in default_subdirs.items():
            explicit = getattr(self, attr_name)
            dir_path = self.project_root / subdir if explicit is None else Path(explicit)
            setattr(self, attr_name, dir_path)
            # `parents=True` creates any missing parent directories.
            dir_path.mkdir(parents=True, exist_ok=True)

# Module-level configuration singletons: one instance of each config class, created at import time.
# Other project modules import these objects (or the classes above) from this module.
# NOTE: ProjectConfig.__post_init__ calls mkdir() on every project directory, so simply
# importing this module creates those directories on disk.
model_config = ModelConfig()
training_config = TrainingConfig()
project_config = ProjectConfig()

In [None]:
%%writefile logger.py
import logging # Standard Python logging module (used by RichHandler internally).
import sys     # System-specific parameters and functions (for stderr).
from loguru import logger # Simplified and powerful logging library.
from rich.console import Console # Rich console for beautiful terminal output.
from rich.logging import RichHandler # Loguru handler to integrate with Rich.

# Import project_config to access log directory path defined in config.py.
from config import project_config 

def setup_logging(level: str = "INFO") -> None:
    """
    Configure Loguru with two sinks: a Rich-rendered console sink and a log file.

    All previously registered Loguru handlers are removed first, so this function
    fully owns the logging destinations after it returns.

    Args:
        level (str): Minimum severity (e.g., "DEBUG", "INFO", "ERROR") accepted by
                     both sinks; records below it are dropped.
    """
    # Both sinks share one record layout: timestamp | level | module:function:line - message.
    record_format = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"

    # Start from a clean slate (drops Loguru's default handler as well).
    logger.remove()

    # Console sink: a RichHandler writing to stderr with enhanced traceback rendering.
    console_sink = RichHandler(console=Console(stderr=True), rich_tracebacks=True)
    logger.add(console_sink, format=record_format, level=level)

    # File sink: <logs_dir>/app.log, rotated at 10 MB, rotated files kept for 7 days.
    log_file = project_config.logs_dir / "app.log"
    logger.add(
        log_file,
        rotation="10 MB",
        retention="7 days",
        level=level,
        format=record_format,
    )

    # Emit a first record so it is visible that logging is live.
    logger.info("Logging setup completed")

In [None]:
%%writefile data_loader.py
import pandas as pd # Library for data manipulation (e.g., concatenating arrays, not directly used for DataFrame operations here).
import numpy as np # Fundamental package for numerical computing, used for array operations like concatenation.
import tensorflow as tf # Core deep learning library, specifically used here for loading the IMDB dataset from Keras.
from sklearn.model_selection import train_test_split # Utility for splitting datasets into random train and test subsets.
from sklearn.utils import shuffle # Utility for randomizing the order of elements in lists.
from typing import Tuple, List, Dict, Optional # Used for type hints, improving code readability and maintainability.
import re # Module for regular expressions, used for pattern matching and text manipulation (cleaning).
import html # Module for working with HTML entities, used for decoding HTML in text cleaning.
from loguru import logger # Used for structured and informative logging of data processing steps.

class DataProcessor:
    """
    Data processing pipeline for sentiment analysis.

    Responsibilities:
      * load the IMDB dataset shipped with Keras and decode it back to text,
      * clean raw text (`clean_text`),
      * split data into stratified train / validation / test sets,
      * wrap texts and labels in a batched `tf.data.Dataset`.
    """

    # Patterns used by `clean_text`, compiled once for the whole class.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,!?;:-]')
    _WHITESPACE_RE = re.compile(r'\s+')

    # With the default `load_data()` arguments Keras shifts real word indices by 3 and
    # reserves the lower indices for padding (0), start-of-sequence (1) and OOV (2).
    _IMDB_INDEX_OFFSET = 3

    def __init__(self):
        """
        Initializes the label mappings used to convert between string labels
        and numerical IDs.
        """
        self.label_mapping = {"negative": 0, "positive": 1}          # string label -> numerical ID
        self.reverse_label_mapping = {0: "negative", 1: "positive"}  # numerical ID -> string label

    def clean_text(self, text: str) -> str:
        """
        Cleans a single text string so it is in a consistent form before tokenization.

        Steps, in order:
          1. Replace HTML tags with a space (so 'good<br />bad' does not become 'goodbad').
          2. Decode HTML entities (e.g., '&amp;' becomes '&'). Done *after* tag removal so
             that escaped angle brackets in ordinary text ('5 &lt; 10 &gt; 3') are not
             mistaken for markup.
          3. Drop every character that is not a word character, whitespace, or one of
             . , ! ? ; : -  (note: this also removes quotes and apostrophes).
          4. Collapse runs of whitespace into a single space. Done last so that gaps left
             by steps 1-3 are normalized as well.
          5. Lowercase and strip leading/trailing whitespace.

        Args:
            text (str): The input text string to be cleaned.

        Returns:
            str: The cleaned text; an empty string if `text` is not a `str`.
        """
        # Non-string input (None, NaN, numbers, ...) is treated as "no text".
        if not isinstance(text, str):
            return ""

        text = self._HTML_TAG_RE.sub(' ', text)
        text = html.unescape(text)
        text = self._DISALLOWED_CHARS_RE.sub('', text)
        text = self._WHITESPACE_RE.sub(' ', text)

        return text.lower().strip()

    def load_imdb_dataset(self, num_samples: Optional[int] = None) -> Tuple[List[str], List[int]]:
        """
        Loads the IMDB movie review dataset from `tf.keras.datasets.imdb`, decodes the
        integer-encoded reviews back into text, cleans them, drops reviews whose cleaned
        text is 10 characters or shorter, and optionally subsamples the result.

        The Keras train and test portions are merged; use `create_data_splits` to
        split the returned data again.

        Args:
            num_samples (Optional[int]): If given (and smaller than the number of usable
                                         reviews), the data is shuffled with a fixed seed
                                         and only this many samples are returned.
                                         `None` or 0 means "no limit".

        Returns:
            Tuple[List[str], List[int]]: A tuple containing:
                - List of cleaned text reviews.
                - List of corresponding sentiment labels as plain Python ints
                  (0 for negative, 1 for positive).

        Raises:
            Exception: Any error raised while loading or decoding the dataset is logged
                       and re-raised unchanged.
        """
        logger.info("Loading IMDB dataset...")

        try:
            # Integer-encoded reviews and their labels.
            (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

            # word -> rank mapping, inverted to rank -> word for decoding.
            word_index = tf.keras.datasets.imdb.get_word_index()
            reverse_word_index = {index: word for word, index in word_index.items()}
            offset = self._IMDB_INDEX_OFFSET

            def decode_review(encoded_review):
                """Turn one integer-encoded review back into a space-joined string."""
                # Indices below `offset` are reserved markers (padding / start / OOV), not
                # words. Skipping them avoids emitting a literal '?' for the start marker
                # at the beginning of every review.
                return ' '.join(
                    reverse_word_index.get(token_id - offset, '?')
                    for token_id in encoded_review
                    if token_id >= offset
                )

            # Decode and merge both Keras portions so they can be processed uniformly.
            all_texts = [decode_review(review) for review in x_train]
            all_texts += [decode_review(review) for review in x_test]
            all_labels = np.concatenate([y_train, y_test])

            logger.info("Cleaning text data...")
            cleaned_texts = [self.clean_text(text) for text in all_texts]

            # Keep only reviews that still carry some content after cleaning. Labels are
            # converted to plain ints so the result matches the declared List[int].
            kept_pairs = [
                (text, int(label))
                for text, label in zip(cleaned_texts, all_labels)
                if len(text.strip()) > 10
            ]
            cleaned_texts = [text for text, _ in kept_pairs]
            all_labels = [label for _, label in kept_pairs]

            # Optionally limit the number of samples for faster experimentation.
            if num_samples and num_samples < len(cleaned_texts):
                # Shuffle before slicing so the subset is a random sample, not a prefix.
                cleaned_texts, all_labels = shuffle(cleaned_texts, all_labels, random_state=42)
                cleaned_texts = cleaned_texts[:num_samples]
                all_labels = all_labels[:num_samples]

            logger.info(f"Loaded {len(cleaned_texts)} samples")
            return cleaned_texts, all_labels

        except Exception as e:
            # Log any errors that occur during dataset loading and re-raise them.
            logger.error(f"Error loading IMDB dataset: {e}")
            raise

    def create_data_splits(self, texts: List[str], labels: List[int],
                          test_size: float = 0.2, val_size: float = 0.1) -> Dict:
        """
        Splits texts and labels into stratified training, validation and test sets.

        Both `test_size` and `val_size` are fractions of the ORIGINAL dataset: with the
        defaults the result is 70% train, 10% validation, 20% test.

        Args:
            texts (List[str]): A list of cleaned text samples.
            labels (List[int]): A list of corresponding numerical sentiment labels.
            test_size (float): Fraction of the full dataset used for the test set.
            val_size (float): Fraction of the full dataset used for the validation set.

        Returns:
            Dict: A dictionary with the keys 'train', 'validation' and 'test', each mapping
                  to a dictionary with 'texts' and 'labels' for that split.

        Raises:
            ValueError: If the fractions are not in (0, 1) or leave no data for training.
        """
        logger.info("Creating data splits...")

        # Fail early with a clear message instead of a division by zero or an opaque
        # error from the second split.
        if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
            raise ValueError(
                "test_size and val_size must each be in (0, 1) and sum to less than 1; "
                f"got test_size={test_size}, val_size={val_size}"
            )

        # First split: carve out the test set, preserving the class balance.
        X_temp, X_test, y_temp, y_test = train_test_split(
            texts, labels, test_size=test_size, random_state=42, stratify=labels
        )

        # Second split: the validation fraction is re-expressed relative to what is left
        # after the test split, so it still equals `val_size` of the full dataset.
        val_size_adjusted = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_size_adjusted, random_state=42, stratify=y_temp
        )

        data_splits = {
            'train': {'texts': X_train, 'labels': y_train},
            'validation': {'texts': X_val, 'labels': y_val},
            'test': {'texts': X_test, 'labels': y_test}
        }

        logger.info(f"Data splits - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
        return data_splits

    def create_tf_dataset(self, texts: List[str], labels: List[int],
                         batch_size: int, shuffle: bool = True) -> "tf.data.Dataset":
        """
        Wraps texts and labels in a batched, prefetched `tf.data.Dataset`.

        The dataset yields (text, label) batches of raw strings; no tokenization is
        performed here.

        Args:
            texts (List[str]): A list of text samples.
            labels (List[int]): A list of corresponding numerical labels.
            batch_size (int): The number of samples per batch.
            shuffle (bool): If True, the samples are shuffled (seeded) before batching.
                            Recommended for training data. Note that inside this method
                            the name shadows the module-level `sklearn.utils.shuffle`.

        Returns:
            tf.data.Dataset: The batched and prefetched dataset.
        """
        dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

        if shuffle:
            # A buffer covering the whole dataset gives a uniform shuffle; a small fixed
            # buffer only mixes neighbouring samples. The data is already in memory, so
            # the larger buffer costs nothing extra. `max(..., 1)` keeps the size valid
            # for empty input.
            dataset = dataset.shuffle(buffer_size=max(len(texts), 1), seed=42)

        dataset = dataset.batch(batch_size)
        # Prepare upcoming batches in the background while the current one is consumed.
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

In [None]:
%%writefile model.py
import tensorflow as tf # Core deep learning library for building and training models.
import tensorflow_hub as hub # For reusable machine learning modules.
from transformers import TFBertModel, BertTokenizer # Hugging Face BERT model and tokenizer for TensorFlow.
from typing import Dict, Any, Optional # For type hinting.
from loguru import logger # For logging messages.

# Import model_config to access model-specific hyperparameters from config.py.
from config import ModelConfig 

class BERTSentimentClassifier(tf.keras.Model):
    """
    Subclassed Keras model: a pre-trained BERT encoder followed by dropout and a
    softmax classification layer. The matching tokenizer is bundled with the model
    so raw text can be prepared via `tokenize_texts`.
    """

    def __init__(self,
                 model_name: str = "bert-base-uncased",
                 num_classes: int = 2,
                 dropout_rate: float = 0.1,
                 max_length: int = 128,
                 **kwargs):
        """
        Args:
            model_name (str): Pre-trained BERT checkpoint to load (e.g., 'bert-base-uncased').
            num_classes (int): Number of output sentiment classes.
            dropout_rate (float): Dropout rate applied before the classification layer.
            max_length (int): Maximum token sequence length used by `tokenize_texts`.
            **kwargs: Forwarded unchanged to `tf.keras.Model.__init__`.
        """
        super().__init__(**kwargs)

        # Hyperparameters are kept on the instance; `get_config` reads them back.
        self.model_name = model_name
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.max_length = max_length

        # Tokenizer and encoder are loaded from the same checkpoint name.
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.bert = TFBertModel.from_pretrained(model_name)

        # Classification head: dropout followed by a softmax layer over the classes.
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')

        logger.info(f"Initialized BERT model: {model_name}")

    def tokenize_texts(self, texts):
        """
        Tokenizes raw texts with the model's own BERT tokenizer.

        Args:
            texts: A list of text strings to be tokenized.

        Returns:
            The tokenizer output holding TensorFlow tensors (input_ids, attention_mask, etc.).
        """
        tokenizer_options = dict(
            padding=True,                # pad within the batch (see the Hugging Face padding docs)
            truncation=True,             # cut sequences longer than `max_length`
            max_length=self.max_length,
            return_tensors='tf',         # return TensorFlow tensors
        )
        return self.tokenizer(texts, **tokenizer_options)

    def call(self, inputs, training=False):
        """
        Forward pass of the model.

        Args:
            inputs: Tokenized inputs accepted by the BERT encoder
                    (typically 'input_ids' and 'attention_mask').
            training (bool): Forwarded to the encoder and to the dropout layer so that
                             dropout is only active while training.

        Returns:
            tf.Tensor: Class probabilities produced by the softmax classification layer.
        """
        encoder_outputs = self.bert(inputs, training=training)

        # The pooled output is the encoder's sequence-level summary used for classification.
        sequence_summary = self.dropout(encoder_outputs.pooler_output, training=training)

        class_probabilities = self.classifier(sequence_summary)
        return class_probabilities

    def get_config(self):
        """
        Returns the constructor arguments needed to rebuild this model,
        as used by Keras serialization.
        """
        config_keys = ('model_name', 'num_classes', 'dropout_rate', 'max_length')
        return {key: getattr(self, key) for key in config_keys}

class BERTModelBuilder:
    """
    A static helper class for building BERT-based models.
    It provides a clean interface to construct model instances using the Keras Functional API.
    """

    @staticmethod
    def build_functional_model(model_config: ModelConfig, hidden_units: int = 128) -> tf.keras.Model:
        """
        Builds a BERT-based sentiment classification model using the Keras Functional API.

        Architecture: BERT pooled output -> Dropout -> Dense(hidden_units, relu)
        -> Dropout -> Dense(num_classes, softmax).

        Args:
            model_config (ModelConfig): Configuration object providing `model_name`,
                                        `max_length`, `num_classes` and `dropout_rate`.
            hidden_units (int): Width of the intermediate ReLU layer in the classification
                                head. Defaults to 128, the value that was previously
                                hard-coded.

        Returns:
            tf.keras.Model: The assembled model. It is NOT compiled here; the caller must
                            call `compile()` (optimizer, loss, metrics) before training.
        """

        # Two fixed-length integer inputs; the batch dimension is implicit.
        # `input_ids` holds the token IDs.
        input_ids = tf.keras.layers.Input(
            shape=(model_config.max_length,),
            dtype=tf.int32,
            name='input_ids'
        )
        # `attention_mask` marks which positions are real tokens and which are padding.
        attention_mask = tf.keras.layers.Input(
            shape=(model_config.max_length,),
            dtype=tf.int32,
            name='attention_mask'
        )

        # Pre-trained BERT encoder from Hugging Face: the backbone of the model.
        bert = TFBertModel.from_pretrained(model_config.model_name)
        bert_outputs = bert(input_ids, attention_mask=attention_mask)

        # Sequence-level summary produced by BERT's pooler, used as the classifier input.
        pooled_output = bert_outputs.pooler_output

        # Classification head.
        x = tf.keras.layers.Dropout(model_config.dropout_rate)(pooled_output)
        x = tf.keras.layers.Dense(hidden_units, activation='relu')(x)
        x = tf.keras.layers.Dropout(model_config.dropout_rate)(x)
        # Softmax yields a probability distribution over the classes.
        outputs = tf.keras.layers.Dense(
            model_config.num_classes,
            activation='softmax'
        )(x)

        model = tf.keras.Model(
            inputs=[input_ids, attention_mask],
            outputs=outputs,
            name='bert_sentiment_classifier'
        )

        return model

In [None]:
%%writefile trainer.py
import tensorflow as tf # Core deep learning library for model training.
from tensorflow.keras.optimizers import Adam # Optimizer for updating model weights.
from tensorflow.keras.losses import SparseCategoricalCrossentropy # Loss function for integer labels.
from tensorflow.keras.metrics import SparseCategoricalAccuracy # Metric to track accuracy for integer labels.
from tensorflow.keras.callbacks import (
    ModelCheckpoint,       # Callback to save the best model during training.
    EarlyStopping,         # Callback to stop training early if validation metric plateaus.
    ReduceLROnPlateau,     # Callback to reduce learning rate when a metric stops improving.
    TensorBoard,           # Callback for visualizing training progress.
    CSVLogger              # Callback to save training history to a CSV file.
)
from sklearn.metrics import classification_report, confusion_matrix # For detailed evaluation metrics.
import numpy as np # For numerical operations, especially converting labels to NumPy arrays.
import matplotlib.pyplot as plt # For plotting (though mostly handled by Plotly for interactive plots).
import seaborn as sns # For enhanced statistical data visualization (often used with Matplotlib).
from typing import Dict, List, Tuple, Optional # For type hinting.
from loguru import logger # For structured logging.
import json # For saving training history and configurations to JSON files.
import time # For measuring training time.

# Import configuration objects from config.py.
from config import ModelConfig, TrainingConfig, project_config

class BERTTrainer:
    """
    Professional BERT training pipeline.

    Bundles a Keras model with its configuration objects and handles
    tokenization, model compilation, callback setup, the training loop,
    and persisting training artifacts to disk.

    Attributes:
        model: The Keras model being trained.
        model_config: Model settings. This class reads `model_name`,
            `max_length`, `learning_rate`, `epochs` and `batch_size` from it.
        training_config: Training settings. Only stored and serialized here.
        history: The History object returned by the most recent `train` call,
            or None if `train` has not been run yet.
        tokenizer: BertTokenizer loaded for `model_config.model_name`.
    """

    def __init__(self,
                 model: tf.keras.Model,           # The Keras model to be trained.
                 model_config: ModelConfig,       # Model configuration object.
                 training_config: TrainingConfig): # Training configuration object.
        """
        Stores the model and configuration and loads the matching BERT tokenizer.

        Args:
            model: The Keras model to be trained.
            model_config: Model configuration; `model_name` selects the tokenizer.
            training_config: Training configuration (kept for artifact export).
        """
        self.model = model
        self.model_config = model_config
        self.training_config = training_config
        self.history = None  # Populated by `train`.

        # `transformers` is not imported at module level in this file, so the
        # tokenizer class is imported here, right where it is needed.
        from transformers import BertTokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_config.model_name)

        logger.info("BERT Trainer initialized")

    def _tokenize(self, texts: List[str]):
        """
        Tokenizes texts into fixed-width TensorFlow tensors.

        `padding='max_length'` is used instead of `padding=True`: the latter only
        pads to the longest sequence of the current call, so training, validation
        and inference tensors could end up with different widths.
        NOTE(review): presumably the model's Input layers use a fixed
        (max_length,) shape -- confirm in model.py. Fixed-width padding is valid
        either way.

        Args:
            texts: List of raw text strings.

        Returns:
            The tokenizer output containing 'input_ids' and 'attention_mask',
            each padded/truncated to `model_config.max_length` tokens.
        """
        return self.tokenizer(
            texts,
            padding='max_length',                     # Always pad to exactly `max_length` tokens.
            truncation=True,                          # Cut sequences longer than `max_length`.
            max_length=self.model_config.max_length,  # Sequence width from ModelConfig.
            return_tensors='tf'                       # Return TensorFlow tensors.
        )

    def prepare_dataset(self,
                        texts: List[str],
                        labels: List[int],
                        shuffle: bool = False,
                        batch_size: Optional[int] = None) -> tf.data.Dataset:
        """
        Builds a batched `tf.data.Dataset` of tokenized inputs and labels.

        Note: `train` does not use this method; it tokenizes and feeds tensors
        to `model.fit` directly. This method is an alternative data pipeline.

        Earlier revisions referenced undefined `shuffle` / `batch_size` names and
        never applied the tokenizer, so any call failed. Both are now keyword
        parameters with defaults, and tokenization is performed eagerly.

        Args:
            texts: List of raw text strings.
            labels: Integer class labels aligned with `texts`.
            shuffle: Whether to shuffle the samples (seeded) before batching.
            batch_size: Batch size; defaults to `model_config.batch_size`.

        Returns:
            A dataset yielding `((input_ids, attention_mask), labels)` batches.
            The feature tuple follows the same positional order that `train`
            uses when calling `model.fit`.
        """
        if batch_size is None:
            batch_size = self.model_config.batch_size

        encodings = self._tokenize(texts)
        features = (encodings['input_ids'], encodings['attention_mask'])
        dataset = tf.data.Dataset.from_tensor_slices((features, np.asarray(labels)))

        if shuffle:
            # The tensors are already in memory, so a buffer covering every
            # sample gives a uniform shuffle at no extra cost.
            sample_count = int(encodings['input_ids'].shape[0])
            dataset = dataset.shuffle(buffer_size=max(sample_count, 1), seed=42)

        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)  # Overlap data preparation with training.
        return dataset

    def compile_model(self):
        """
        Compiles the Keras model with Adam, sparse categorical cross-entropy,
        and a sparse categorical accuracy metric reported as 'accuracy'.
        """
        optimizer = Adam(learning_rate=self.model_config.learning_rate)
        # `from_logits=False`: the loss is told to expect probabilities, i.e. the
        # model's final layer is expected to apply softmax.
        loss = SparseCategoricalCrossentropy(from_logits=False)
        # The metric name 'accuracy' is what makes 'val_accuracy' available to
        # the callbacks configured in `setup_callbacks`.
        metrics = [SparseCategoricalAccuracy(name='accuracy')]

        self.model.compile(
            optimizer=optimizer,
            loss=loss,
            metrics=metrics
        )
        logger.info("Model compiled successfully")

    def setup_callbacks(self) -> List[tf.keras.callbacks.Callback]:
        """
        Builds the Keras callbacks used during training: best-model checkpoint,
        early stopping, learning-rate reduction, TensorBoard and CSV logging.

        Returns:
            List[tf.keras.callbacks.Callback]: The configured callback instances.
        """
        # ModelCheckpoint: keep only the full model with the best validation accuracy.
        checkpoint_path = project_config.models_dir / "best_model.h5"
        checkpoint = ModelCheckpoint(
            filepath=str(checkpoint_path),
            monitor='val_accuracy',
            save_best_only=True,
            save_weights_only=False,       # Save architecture + weights, not weights only.
            mode='max',                    # Higher validation accuracy is better.
            verbose=1
        )

        # EarlyStopping: stop after 2 epochs without validation-accuracy
        # improvement and roll back to the best weights seen.
        early_stopping = EarlyStopping(
            monitor='val_accuracy',
            patience=2,
            restore_best_weights=True,
            mode='max',
            verbose=1
        )

        # ReduceLROnPlateau: halve the learning rate after 1 epoch without
        # validation-loss improvement, never going below 1e-7.
        reduce_lr = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=1,
            min_lr=1e-7,
            mode='min',                    # Lower validation loss is better.
            verbose=1
        )

        # TensorBoard: per-epoch logs including weight histograms and the graph.
        tensorboard = TensorBoard(
            log_dir=str(project_config.logs_dir / "tensorboard"),
            histogram_freq=1,
            write_graph=True,
            update_freq='epoch'
        )

        # CSVLogger: append per-epoch results to a CSV file across runs.
        csv_logger = CSVLogger(
            str(project_config.logs_dir / "training_log.csv"),
            append=True
        )

        return [checkpoint, early_stopping, reduce_lr, tensorboard, csv_logger]

    def train(self,
              train_data: Dict,     # Dictionary containing training texts and their corresponding labels.
              val_data: Dict) -> tf.keras.callbacks.History: # Dictionary containing validation texts and their labels.
        """
        Tokenizes the data, compiles the model and runs the training loop.

        The model is (re)compiled on every call, so optimizer state does not
        carry over between successive `train` invocations.

        Args:
            train_data (Dict): Dictionary with 'texts' (list of strings) and
                               'labels' (list of integers) for the training set.
            val_data (Dict): Dictionary with 'texts' (list of strings) and
                             'labels' (list of integers) for the validation set.

        Returns:
            tf.keras.callbacks.History: The History object returned by
                                        `model.fit`, also stored in `self.history`.

        Raises:
            KeyError: If 'texts' or 'labels' is missing from either dictionary.
        """
        logger.info("Starting model training...")
        start_time = time.perf_counter()  # Monotonic clock for measuring the duration.

        # Read all inputs up front so a malformed dictionary fails before the
        # (expensive) tokenization starts.
        train_texts = train_data['texts']
        train_labels = train_data['labels']
        val_texts = val_data['texts']
        val_labels = val_data['labels']

        logger.info("Tokenizing training data...")
        train_encodings = self._tokenize(train_texts)

        logger.info("Tokenizing validation data...")
        val_encodings = self._tokenize(val_texts)

        # Compile unconditionally with the configured optimizer, loss and metrics.
        self.compile_model()

        callbacks = self.setup_callbacks()

        self.history = self.model.fit(
            x=[train_encodings['input_ids'], train_encodings['attention_mask']],
            y=np.array(train_labels),
            validation_data=(
                [val_encodings['input_ids'], val_encodings['attention_mask']],
                np.array(val_labels)
            ),
            epochs=self.model_config.epochs,
            batch_size=self.model_config.batch_size,
            callbacks=callbacks,
            verbose=1
        )

        training_time = time.perf_counter() - start_time
        logger.info(f"Training completed in {training_time:.2f} seconds")

        return self.history

    def save_training_artifacts(self):
        """
        Writes training artifacts into `project_config.output_dir`:
        - `training_history.json`: per-epoch loss/metric values (only if
          `train` has been run).
        - `model_config.json`: the attributes of the model and training
          configuration objects.
        """
        # Explicit None check: whether training happened should not depend on
        # the truthiness of the History object.
        if self.history is not None:
            history_path = project_config.output_dir / "training_history.json"

            # Keras records metric values as NumPy/TensorFlow scalars, which the
            # json module cannot encode; convert them to plain Python floats.
            serializable_history = {
                key: [float(val) for val in value_list]
                for key, value_list in self.history.history.items()
            }

            with open(history_path, 'w') as f:
                json.dump(serializable_history, f, indent=2)
            logger.info(f"Training history saved to {history_path}")

        config_path = project_config.output_dir / "model_config.json"
        config_dict = {
            'model_config': vars(self.model_config),
            'training_config': vars(self.training_config)
        }
        # `default=str` keeps the dump from failing on values that are not
        # natively JSON-encodable (for example filesystem paths).
        with open(config_path, 'w') as f:
            json.dump(config_dict, f, indent=2, default=str)
        logger.info(f"Model configuration saved to {config_path}")

In [None]:
%%writefile evaluator.py
import numpy as np # For numerical operations and array manipulation.
import matplotlib.pyplot as plt # Basic plotting functions (used by seaborn).
import seaborn as sns # High-level statistical plotting library.
from sklearn.metrics import ( # Various metrics for model evaluation.
    accuracy_score,           # Overall classification accuracy.
    precision_score,          # Precision for positive and negative classes.
    recall_score,             # Recall for positive and negative classes.
    f1_score,                 # F1-score (harmonic mean of precision and recall).
    classification_report,    # Detailed report of precision, recall, f1-score for each class.
    confusion_matrix,         # Matrix showing correct and incorrect predictions.
    roc_auc_score,            # Area Under the Receiver Operating Characteristic (ROC) Curve.
    roc_curve                 # Data points for plotting the ROC curve.
)
import plotly.graph_objects as go # For creating interactive graph objects (e.g., scatter plots).
import plotly.express as px # Simplified interface for Plotly for quick plots.
from plotly.subplots import make_subplots # For creating subplots in Plotly.
from typing import Dict, List, Tuple, Optional # For type hints, improving code readability.
from loguru import logger # For structured and informative logging.

# Crucial import: TensorFlow is needed for Keras model type hints and model operations.
import tensorflow as tf 
# Import project and model configurations from config.py.
from config import project_config, model_config 

class ModelEvaluator:
    """
    Comprehensive model evaluation suite.

    Makes predictions with a trained model, computes performance metrics, and
    writes interactive Plotly visualizations as HTML files into
    `project_config.output_dir`.

    Attributes:
        model: The Keras model used for prediction.
        tokenizer: The tokenizer used to encode raw texts.
        class_names: Human-readable class names; index i names class label i.
    """

    def __init__(self, model: tf.keras.Model, tokenizer):
        """
        Stores an already initialized model and tokenizer.

        Args:
            model: The trained Keras model.
            tokenizer: The tokenizer matching the model (e.g. the one held by
                the trainer), passed in so it is not loaded a second time.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.class_names = ['Negative', 'Positive']
        logger.info("Model Evaluator initialized with provided tokenizer")

    def predict(self, texts: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Predicts class probabilities and class labels for raw text strings.

        Args:
            texts (List[str]): Text strings to classify.

        Returns:
            Tuple[np.ndarray, np.ndarray]:
                - predictions: The model output for each text (one row per text).
                - predicted_classes: Index of the highest-scoring class per text.
              For empty input, both arrays are empty (shapes `(0, n_classes)`
              and `(0,)`) and the model is not called.
        """
        if len(texts) == 0:
            # The tokenizer cannot encode an empty batch; return empty results
            # with the expected ranks instead of failing.
            logger.warning("predict() called with no texts; returning empty results")
            return (
                np.empty((0, len(self.class_names)), dtype=np.float32),
                np.empty((0,), dtype=np.int64)
            )

        # `padding='max_length'` rather than `padding=True`: the latter pads only
        # to the longest text of this call, so short inputs would produce a
        # narrower tensor than the one used elsewhere.
        # NOTE(review): presumably the model's Input layers use a fixed
        # (max_length,) shape -- confirm in model.py. Fixed-width padding is
        # valid either way.
        encodings = self.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=model_config.max_length,
            return_tensors='tf'
        )

        # The model takes input IDs and attention mask as two separate inputs.
        predictions = self.model.predict([
            encodings['input_ids'],
            encodings['attention_mask']
        ])

        predicted_classes = np.argmax(predictions, axis=1)

        return predictions, predicted_classes

    def evaluate_model(self, test_data: Dict) -> Dict:
        """
        Evaluates the model on a labelled test set.

        Args:
            test_data (Dict): Dictionary with 'texts' (list of strings) and
                              'labels' (list of integers).

        Returns:
            Dict: A dictionary containing:
                - 'metrics': accuracy, weighted precision/recall/F1 and ROC AUC.
                  'roc_auc' is NaN when the true labels contain a single class,
                  because ROC AUC is undefined in that case.
                - 'classification_report': per-class report as a dictionary.
                - 'confusion_matrix': matrix with one row/column per entry of
                  `class_names`, in that order.
                - 'predictions': model output per text.
                - 'predicted_classes': predicted class index per text.
                - 'true_labels': the true labels as a NumPy array.
        """
        logger.info("Starting model evaluation...")

        test_texts = test_data['texts']
        test_labels = np.array(test_data['labels'])

        probabilities, predicted_classes = self.predict(test_texts)

        # Explicit label list so the report and the confusion matrix always
        # cover every entry of `class_names`, even if a class is absent from
        # this particular test set.
        label_ids = list(range(len(self.class_names)))

        if np.unique(test_labels).size < 2:
            # roc_auc_score raises when only one class is present in y_true.
            logger.warning("ROC AUC is undefined: test labels contain fewer than two classes")
            roc_auc = float('nan')
        else:
            roc_auc = roc_auc_score(test_labels, probabilities[:, 1])  # Column 1 = positive class.

        metrics = {
            'accuracy': accuracy_score(test_labels, predicted_classes),
            'precision': precision_score(test_labels, predicted_classes, average='weighted'),
            'recall': recall_score(test_labels, predicted_classes, average='weighted'),
            'f1_score': f1_score(test_labels, predicted_classes, average='weighted'),
            'roc_auc': roc_auc
        }

        class_report = classification_report(
            test_labels, predicted_classes,
            labels=label_ids,
            target_names=self.class_names,
            output_dict=True
        )

        cm = confusion_matrix(test_labels, predicted_classes, labels=label_ids)

        results = {
            'metrics': metrics,
            'classification_report': class_report,
            'confusion_matrix': cm,
            'predictions': probabilities,
            'predicted_classes': predicted_classes,
            'true_labels': test_labels
        }

        logger.info(f"Evaluation completed. Accuracy: {metrics['accuracy']:.4f}")
        return results

    def plot_training_history(self, history: Optional[tf.keras.callbacks.History]):
        """
        Saves an interactive Plotly figure of training/validation accuracy and
        loss per epoch to `training_history.html`.

        Args:
            history: The History object returned by `model.fit()`. If it is
                None or holds no records (e.g. the model was loaded from disk
                rather than trained), nothing is plotted and a warning is logged.
        """
        records = getattr(history, 'history', None) if history is not None else None
        if not records:
            logger.warning("No training history available; skipping training history plot")
            return

        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=['Model Accuracy', 'Model Loss'],
            specs=[[{"secondary_y": False}, {"secondary_y": False}]]
        )

        # (history key, display label, subplot column)
        panels = (('accuracy', 'Accuracy', 1), ('loss', 'Loss', 2))
        # (key prefix, legend prefix, line color)
        series_styles = (('', 'Training', '#1f77b4'), ('val_', 'Validation', '#ff7f0e'))

        for metric_key, metric_label, column in panels:
            for key_prefix, legend_prefix, color in series_styles:
                values = records.get(f'{key_prefix}{metric_key}')
                if not values:
                    # Validation series are absent when training ran without
                    # validation data; plot whatever is available.
                    continue
                fig.add_trace(
                    go.Scatter(
                        x=list(range(1, len(values) + 1)),  # Epochs are numbered from 1.
                        y=values,
                        mode='lines+markers',
                        name=f'{legend_prefix} {metric_label}',
                        line=dict(color=color, width=3)
                    ),
                    row=1, col=column
                )

        fig.update_layout(
            title="Training History",
            height=500,
            showlegend=True,
            template="plotly_white"
        )

        fig.update_xaxes(title_text="Epoch")
        fig.update_yaxes(title_text="Accuracy", row=1, col=1)
        fig.update_yaxes(title_text="Loss", row=1, col=2)

        # Written to disk instead of calling fig.show(), so this also works in
        # non-interactive runs.
        fig.write_html(str(project_config.output_dir / "training_history.html"))

    def plot_confusion_matrix(self, cm: np.ndarray):
        """
        Saves an interactive Plotly heatmap of the confusion matrix to
        `confusion_matrix.html`.

        Args:
            cm (np.ndarray): Confusion matrix whose rows (actual) and columns
                             (predicted) follow the order of `class_names`.
        """
        fig = px.imshow(
            cm,
            labels=dict(x="Predicted", y="Actual", color="Count"),
            x=self.class_names,
            y=self.class_names,
            text_auto=True,                 # Print each cell's count on the heatmap.
            aspect="auto",
            color_continuous_scale="Blues"
        )

        fig.update_layout(
            title="Confusion Matrix",
            width=500,
            height=500
        )

        fig.write_html(str(project_config.output_dir / "confusion_matrix.html"))

    def plot_roc_curve(self, y_true: np.ndarray, y_proba: np.ndarray):
        """
        Saves an interactive Plotly ROC curve (with AUC in the legend) to
        `roc_curve.html`.

        Args:
            y_true (np.ndarray): True binary labels (0s and 1s).
            y_proba (np.ndarray): Either the 2-D probability matrix returned by
                `predict` (column 1 is taken as the positive class) or a 1-D
                array of positive-class scores.
        """
        positive_scores = np.asarray(y_proba)
        if positive_scores.ndim == 2:
            positive_scores = positive_scores[:, 1]  # Column 1 = positive class.

        fpr, tpr, _ = roc_curve(y_true, positive_scores)
        auc_score = roc_auc_score(y_true, positive_scores)

        fig = go.Figure()

        # ROC curve of the model.
        fig.add_trace(go.Scatter(
            x=fpr, y=tpr,
            mode='lines',
            name=f'ROC Curve (AUC = {auc_score:.3f})',
            line=dict(color='#1f77b4', width=3)
        ))

        # Diagonal reference line: the expected curve of a random classifier.
        fig.add_trace(go.Scatter(
            x=[0, 1], y=[0, 1],
            mode='lines',
            name='Random Classifier',
            line=dict(color='red', dash='dash')
        ))

        fig.update_layout(
            title='ROC Curve',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            template="plotly_white"
        )

        fig.write_html(str(project_config.output_dir / "roc_curve.html"))

In [None]:
%%writefile main.py
import argparse # For command-line argument parsing (not directly used in Notebook but good practice for standalone scripts).
from pathlib import Path # For object-oriented filesystem paths, making path manipulation robust and cross-platform.
import sys # For system-specific parameters and functions, used here to add current directory to Python path.
import json # For working with JSON data, used for saving/loading configuration and history.
from loguru import logger # For professional and highly configurable logging.
from rich.console import Console # From the Rich library, for beautiful and structured console output.
from rich.table import Table # From Rich, for creating formatted tables in the console.
from rich.panel import Panel # From Rich, for creating visually distinct panels in console output.
from rich.progress import track # From Rich, for easily displaying progress bars (used implicitly by Rich when logging).
import tensorflow as tf # Core deep learning library by Google, fundamental for model operations.
from transformers import TFBertModel # Hugging Face's TensorFlow implementation of the BERT model, crucial for loading.

# Import project's custom modules, ensuring modular and organized code.
from config import model_config, training_config, project_config # Project-wide configuration settings.
from logger import setup_logging # Function to configure the logging system.
from data_loader import DataProcessor # Class to handle data loading, cleaning, and splitting.
from model import BERTModelBuilder # Class to build our BERT model architecture.
from trainer import BERTTrainer # Class to encapsulate the model training process.
from evaluator import ModelEvaluator # Class to handle model evaluation and visualization.

console = Console() # Initialize a Rich Console instance for pretty console printing.

def print_project_info():
    """
    Renders a Rich table listing the project's major components, each with a
    "ready" status and a short description, and prints it to the console.
    """
    # Column heading -> keyword options passed to `Table.add_column`.
    column_specs = (
        ("Component", {"style": "cyan", "no_wrap": True}),
        ("Status", {"style": "green"}),
        ("Description", {"style": "white"}),
    )

    # (component name, description); every component is reported as ready.
    component_rows = (
        ("Data Processing", "Advanced text cleaning and tokenization"),
        ("Model Architecture", "BERT-base with custom classification head"),
        ("Training Pipeline", "Professional training with callbacks"),
        ("Evaluation Suite", "Comprehensive metrics and visualizations"),
        ("Configuration", "Modular configuration management"),
        ("Logging", "Enhanced logging with Rich and Loguru"),
    )

    overview = Table(title="🚀 Professional BERT Sentiment Analysis")
    for heading, options in column_specs:
        overview.add_column(heading, **options)
    for component, description in component_rows:
        overview.add_row(component, "✅ Ready", description)

    console.print(overview)  # Display the formatted table to the console.

def main():
    """
    Run the end-to-end BERT sentiment analysis pipeline.

    Steps, in order:
      1. Configure logging and print the project overview table.
      2. Load the IMDB data and split it into train/validation/test sets.
      3. Load `best_model.h5` from `project_config.models_dir` if it exists;
         otherwise build a new model, train it and save the training artifacts.
      4. Evaluate on the test split and print the metrics and classification report.
      5. Generate the confusion-matrix and ROC plots, plus the training-history
         plot when the model was trained during this run.

    Any exception raised inside the pipeline is logged with its traceback and
    shown in a red panel. It is not re-raised, so the function returns None
    whether or not the run succeeded.
    """
    # Configure logging first so every later message is captured.
    setup_logging()

    print_project_info()

    try:
        logger.info("Starting BERT Sentiment Analysis Pipeline")

        data_processor = DataProcessor()

        console.print("\n[bold blue]📊 Loading and Processing Data...[/bold blue]")
        # The sample count is fixed here (not read from config); change this
        # argument to train/evaluate on a different subset size.
        texts, labels = data_processor.load_imdb_dataset(num_samples=10000)

        data_splits = data_processor.create_data_splits(texts, labels)

        model_path = project_config.models_dir / "best_model.h5"

        model = None
        history = None   # Only populated when the model is trained in this run.
        trainer = None   # Created by the training branch, or below just for its tokenizer.

        if model_path.exists():
            console.print(f"\n[bold green]✅ Found existing model at {model_path}. Loading model...[/bold green]")
            # TFBertModel must be supplied via custom_objects so Keras can
            # reconstruct that layer when deserializing the H5 file.
            model = tf.keras.models.load_model(
                str(model_path),
                custom_objects={'TFBertModel': TFBertModel},
                compile=False  # Compiled explicitly right below.
            )
            # Re-compile so the loss and metrics are configured for evaluation.
            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=model_config.learning_rate),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
            )
            console.print("[green]✅ Model loaded and re-compiled successfully for evaluation.[/green]")
        else:
            console.print("\n[bold blue]🏗️ No existing model found. Building and Training BERT Model...[/bold blue]")
            model = BERTModelBuilder.build_functional_model(model_config)

            console.print("[green]✅ Model built successfully![/green]")
            console.print(f"Model parameters: {model.count_params():,}")

            trainer = BERTTrainer(model, model_config, training_config)

            console.print("\n[bold blue]🎯 Training Model...[/bold blue]")
            history = trainer.train(data_splits['train'], data_splits['validation'])

            trainer.save_training_artifacts()
            console.print(f"[green]✅ Model trained and saved to {model_path}[/green]")

        # The evaluator needs `trainer.tokenizer`. When the model was loaded from
        # disk no trainer exists yet, so create one solely to obtain the tokenizer.
        if trainer is None:
            trainer = BERTTrainer(model, model_config, training_config)

        console.print("\n[bold blue]📈 Evaluating Model...[/bold blue]")
        evaluator = ModelEvaluator(model, trainer.tokenizer)
        evaluation_results = evaluator.evaluate_model(data_splits['test'])

        console.print(Panel("[green]✅ Model evaluation complete![/green]"))
        console.print("\n[bold yellow]Performance Metrics:[/bold yellow]")
        metrics_table = Table(show_header=True, header_style="bold magenta")
        metrics_table.add_column("Metric", style="dim")
        metrics_table.add_column("Value", style="bold green")
        for metric, value in evaluation_results['metrics'].items():
            metrics_table.add_row(metric.replace('_', ' ').title(), f"{value:.4f}")
        console.print(metrics_table)

        console.print("\n[bold yellow]Classification Report:[/bold yellow]")
        console.print(json.dumps(evaluation_results['classification_report'], indent=2))

        console.print("\n[bold blue]📊 Generating Visualizations...[/bold blue]")
        # A training history only exists when the model was trained in this run.
        if history:
            evaluator.plot_training_history(history)
            console.print("[green]✅ Training history plot generated.[/green]")
        else:
            console.print("[yellow]Skipping training history plot as model was loaded, not trained.[/yellow]")

        evaluator.plot_confusion_matrix(evaluation_results['confusion_matrix'])
        evaluator.plot_roc_curve(evaluation_results['true_labels'], evaluation_results['predictions'])
        console.print("[green]✅ Confusion Matrix and ROC Curve plots generated.[/green]")
        console.print("[green]✅ All visualizations saved to 'outputs' directory.[/green]")

        console.print(Panel("[bold green]✨ Project completed successfully![/bold green]"))

    except Exception as e:
        # Best-effort pipeline: record the full traceback and surface the error
        # in the console instead of propagating it to the caller.
        logger.exception(f"An error occurred during the pipeline execution: {e}")
        console.print(Panel(f"[bold red]❌ An error occurred: {e}[/bold red]", style="red"))

if __name__ == "__main__":
    # Run the pipeline only when main.py is executed as a script. Importing the
    # module (as the notebook does with `from main import main`) does not trigger it.
    main()

In [None]:
%%writefile app.py
import uvicorn # ASGI server, used to run FastAPI applications.
import tensorflow as tf # Core deep learning library, used for loading and running the model.
from fastapi import FastAPI, Request # FastAPI framework for building web APIs. `Request` is for accessing request details.
from pydantic import BaseModel # Used by FastAPI for data validation and defining API request/response schemas.
from typing import List, Dict, Union # For type hints, ensuring data integrity for lists, dictionaries, and flexible types.
from transformers import AutoTokenizer, TFBertModel # Hugging Face components for tokenizer and BERT model in TensorFlow.
from pathlib import Path # For object-oriented filesystem paths, used for model file paths.
from loguru import logger # Professional logging library for structured output.
from rich.console import Console # From Rich library, for visually appealing console output.
from rich.panel import Panel # From Rich, for creating distinct, framed panels in console output.
import json # For handling JSON data, used for serialization/deserialization.
import re # Regular expression module for text pattern matching (used in cleaning).
import html # Module for decoding HTML entities in text (used in cleaning).

# Import project-specific configurations and the logging setup function.
from config import project_config, model_config # Access project directories and model hyperparameters.
from logger import setup_logging # Function to initialize our logging system.

setup_logging() # Runs at import time, before the app and its handlers are defined, so their log calls use this configuration.
console = Console() # Module-wide Rich console used by the startup hook and the /predict handler.

class PredictionRequest(BaseModel):
    """
    Request body schema for the POST /predict endpoint.

    The handler cleans, tokenizes and classifies every entry of `texts` in a
    single batch and returns one result per entry, in the same order.
    """
    texts: List[str] # Raw texts to classify; each one is passed through clean_text() before tokenization.

class PredictionResponse(BaseModel):
    """
    Response body schema for the POST /predict endpoint.

    On success each entry of `predictions` has the keys "text", "sentiment"
    ("Negative" or "Positive") and "confidence". When the model is not loaded or
    prediction fails, the handler still returns this schema, but each entry holds
    only "text" and a "sentiment" string that starts with "Error:".
    """
    # NOTE(review): depending on the installed pydantic version, `Union[str, float]`
    # may coerce the float confidence to a string — confirm against the pinned version.
    predictions: List[Dict[str, Union[str, float]]] # One dict per input text, in request order.

# FastAPI application instance. The title, description and version are passed as
# API metadata (shown in the generated documentation).
app = FastAPI(
    title="BERT Sentiment Analysis API",
    description="API for classifying movie review sentiment using a fine-tuned BERT model.",
    version="1.0.0",
)

# Module-level state filled in by the `load_model` startup hook. Both stay None when
# loading fails or the model file is missing; /health reports that and /predict
# answers with an error payload instead of predictions.
model = None # Loaded Keras model, or None when unavailable.
tokenizer = None # Loaded Hugging Face tokenizer, or None when unavailable.

def clean_text(text: str) -> str:
    """
    Normalize one raw text string before tokenization.

    The steps and their order must stay IDENTICAL to `clean_text` in
    `data_loader.py`, so inference sees text in the same form as training did.

    Args:
        text (str): The raw input text.

    Returns:
        str: The cleaned, lower-cased text, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Applied in order after HTML entities are decoded:
    #   1. drop HTML tags,
    #   2. collapse every whitespace run into a single space,
    #   3. drop characters other than word characters, whitespace and . , ! ? ; : -
    # Step 3 runs after step 2, so removing a symbol that sat between two spaces
    # leaves a double space behind; that matches the original behaviour.
    substitutions = (
        (r'<[^>]+>', ''),
        (r'\s+', ' '),
        (r'[^\w\s.,!?;:-]', ''),
    )

    cleaned = html.unescape(text)  # Decode HTML entities (e.g. "&amp;" -> "&").
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

@app.on_event("startup")
async def load_model():
    """
    Startup hook: load the saved Keras model and its tokenizer into module globals.

    Registered to run when the FastAPI application starts, so prediction requests
    reuse the already-loaded objects instead of reloading them per request.

    Outcomes:
      * Success: the globals `model` and `tokenizer` are set.
      * Model file missing: an error is logged and printed, and the function
        returns early without touching the globals (they are None at startup).
      * Any exception: it is logged with its traceback, an error panel is
        printed, and both globals are reset to None. The exception is not
        re-raised, so the application still starts and /health reports the
        components as not loaded.
    """
    global model, tokenizer  # Assigned below; shared with the request handlers.
    try:
        model_path = project_config.models_dir / "best_model.h5"
        tokenizer_name = model_config.model_name

        if not model_path.exists():
            logger.error(f"Model file not found at {model_path}. Please train the model first.")
            console.print(Panel(f"[bold red]❌ Error: Model file not found at {model_path}. Please train the model first.[/bold red]", style="red"))
            return

        console.print(f"[bold blue]Loading model from: {model_path}[/bold blue]")
        # TFBertModel must be supplied via custom_objects so Keras can reconstruct
        # that layer when deserializing the H5 file. The path is passed as a string,
        # the same way main.py loads this file.
        model = tf.keras.models.load_model(
            str(model_path),
            custom_objects={'TFBertModel': TFBertModel},
            compile=False  # Compiled explicitly right below.
        )
        # Mirror the re-compile step main.py performs after loading the model.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=model_config.learning_rate),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
        console.print("[green]✅ Model loaded successfully![/green]")

        console.print(f"[bold blue]Loading tokenizer: {tokenizer_name}[/bold blue]")
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        console.print("[green]✅ Tokenizer loaded successfully![/green]")

        logger.info("Model and tokenizer loaded successfully.")

    except Exception as e:
        # logger.exception records the traceback, which a plain logger.error drops.
        logger.exception(f"Failed to load model or tokenizer: {e}")
        console.print(Panel(f"[bold red]❌ Failed to load model or tokenizer: {e}[/bold red]", style="red"))
        # Reset both so a half-loaded state is never served.
        model = None
        tokenizer = None

@app.get("/health")
async def health_check():
    """
    GET /health: report whether the service is up and its components are loaded.

    Returns:
        dict: `status` is always "ok"; `model_loaded` and `tokenizer_loaded`
              are True when the corresponding module-level global is not None.
    """
    components = {
        "model_loaded": model is not None,
        "tokenizer_loaded": tokenizer is not None,
    }
    return {"status": "ok", **components}

@app.post("/predict", response_model=PredictionResponse)
async def predict_sentiment(request: PredictionRequest):
    """
    POST /predict: classify the sentiment of each text in the request.

    Args:
        request (PredictionRequest): Request body carrying the list of texts.

    Returns:
        PredictionResponse: One dict per input text, in request order. On success
            each dict holds the original "text", the predicted "sentiment"
            ("Negative" or "Positive") and its "confidence". If the model or
            tokenizer is not loaded, or prediction raises, each dict instead holds
            "text" and a "sentiment" string starting with "Error:". An empty
            `texts` list yields an empty `predictions` list.
    """
    if model is None or tokenizer is None:
        logger.error("Model or tokenizer not loaded. Cannot process prediction.")
        return PredictionResponse(predictions=[{"text": t, "sentiment": "Error: Model not loaded"} for t in request.texts])

    if not request.texts:
        # Nothing to classify: answer directly instead of sending an empty batch
        # through the tokenizer and model.
        return PredictionResponse(predictions=[])

    try:
        console.print(f"[bold blue]Received {len(request.texts)} texts for prediction.[/bold blue]")
        # Apply the same cleaning that was used for the training data.
        cleaned_texts = [clean_text(text) for text in request.texts]

        # Pad/truncate every sequence to exactly `model_config.max_length` so the
        # batch has the fixed sequence length the tokenized inputs were built with.
        inputs = tokenizer(
            cleaned_texts,
            padding='max_length',
            truncation=True,
            max_length=model_config.max_length,
            return_tensors='tf'
        )

        predictions = model.predict([inputs['input_ids'], inputs['attention_mask']])
        predicted_classes = tf.argmax(predictions, axis=1).numpy()      # Index of the top-scoring class per text.
        confidence_scores = tf.reduce_max(predictions, axis=1).numpy()  # Score of that class per text.

        sentiment_map = {0: "Negative", 1: "Positive"}
        results = [
            {
                "text": text,
                "sentiment": sentiment_map[int(class_index)],
                "confidence": float(confidence),  # Plain Python float for JSON serialization.
            }
            for text, class_index, confidence in zip(request.texts, predicted_classes, confidence_scores)
        ]
        logger.info(f"Successfully predicted sentiment for {len(request.texts)} texts.")
        return PredictionResponse(predictions=results)

    except Exception as e:
        # Loguru has no `exc_info` flag: extra keyword arguments are treated as
        # str.format() arguments for the message, so the traceback was never logged
        # and an error text containing braces could raise inside this handler.
        # logger.exception attaches the active traceback without formatting the message.
        logger.exception(f"Error during prediction: {e}")
        return PredictionResponse(predictions=[{"text": t, "sentiment": f"Error: {e}"} for t in request.texts])

In [None]:
# Install the pinned packages listed in the requirements.txt file that an earlier
# %%writefile cell created in the working directory.
print("Installing dependencies from requirements.txt...")
!pip install -r requirements.txt
print("Dependencies installation complete.")

In [None]:
print("Setting up ngrok...") # Informative print statement indicating the start of ngrok setup.
import os # Module for interacting with the operating system (used for file path checks and environment variables).
import time # For time-related functions (though not directly used for delays in this specific cell).

# ====== IMPORTANT: INSTALL pyngrok AND DOWNLOAD/CONFIGURE ngrok EXECUTABLE HERE ======
# This section ensures that pyngrok (the Python wrapper) is installed and
# the ngrok executable binary is downloaded and configured in the Kaggle environment.
# It's placed at the top of this cell so `pyngrok` is available before any import statement
# that uses it within this same cell, preventing ModuleNotFoundError.

print("Installing pyngrok...") # Inform the user that pyngrok is being installed.
!pip install pyngrok -q # Install the pyngrok Python library. `-q` means quiet output.
print("pyngrok installed.") # Confirmation message.

# Define paths for the ngrok zip file and the executable itself.
ngrok_zip_path = "ngrok-stable-linux-amd64.zip"
ngrok_executable_path = "ngrok"

# Check if the ngrok executable already exists to avoid re-downloading unnecessarily.
if not os.path.exists(ngrok_executable_path):
    print("Downloading ngrok executable...") # Inform the user about the download.
    # Download the ngrok executable zip file for Linux (Kaggle's environment).
    # `-O` specifies the output filename. `-q` is for quiet download progress.
    !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip -O {ngrok_zip_path} -q 
    !unzip -o {ngrok_zip_path} # Unzip the downloaded file. `-o` means overwrite if already exists.
    !chmod +x {ngrok_executable_path} # Grant execute permissions to the unzipped ngrok binary.
    print("Ngrok executable downloaded and configured.") # Confirmation message.
else:
    print("Ngrok executable already present.") # Inform if ngrok is already there.

# ====================================================================================

# Now, import ngrok AFTER it's confirmed installed/present in the environment.
from pyngrok import ngrok # Import the pyngrok library, which provides Python bindings for ngrok.

# Kill any residual ngrok processes or tunnels from previous sessions.
# This is crucial for free ngrok accounts as they are limited to 1 simultaneous ngrok agent session (ERR_NGROK_108).
# It ensures a clean slate before attempting to open a new tunnel.
ngrok.kill() 
print("Killed any residual ngrok processes.") # Confirmation message.

# Authenticate ngrok with your personal authentication token.
# This token is required for using ngrok's service and creating public tunnels under your account.
ngrok_auth_token = "2xgYVbiMX6umq43cgdupTkDYOoA_6sEh5wdWfe7YBsTq7HhGf" # REPLACE this with your actual ngrok auth token from ngrok.com dashboard.

# Validate if the authentication token has been set correctly.
if ngrok_auth_token == "YOUR_NGROK_AUTH_TOKEN" or not ngrok_auth_token:
    print("🚨 Warning: Please replace 'YOUR_NGROK_AUTH_TOKEN' with your actual ngrok auth token from ngrok.com")
    # Raise a ValueError if the token is not set, as ngrok will fail without it.
    raise ValueError("Ngrok auth token is not set. Cannot start tunnel.") 
else:
    # Set the ngrok authentication token as an environment variable.
    os.environ["NGROK_AUTH_TOKEN"] = ngrok_auth_token 
    # Authenticate pyngrok with the provided auth token. This connects your pyngrok client to your ngrok account.
    ngrok.set_auth_token(ngrok_auth_token) 
    print("Ngrok auth token set and authenticated.") # Confirmation message.

print("Ngrok setup complete. Ready to start tunnel.") # Final confirmation for the cell.

In [None]:
import sys # Needed to extend the module search path below.

# The project modules (config, logger, data_loader, model, trainer, evaluator, main)
# are written to the working directory by `%%writefile` cells, so that directory
# must be on sys.path before they can be imported.
if '/kaggle/working/' not in sys.path: # Avoid adding a duplicate entry when the cell is re-run.
    sys.path.append('/kaggle/working/') 

# `main` is the pipeline entry point defined in main.py.
from main import main

print("Running main.py to ensure model is saved (will train if not found)...")

# main() loads best_model.h5 when it already exists, otherwise it builds and trains
# a model first; in both cases it then evaluates the model and generates the plots.
main()

# NOTE: main() catches and logs its own exceptions, so this line is printed even
# when the pipeline failed — check the output above for an error panel.
print("main.py finished. Model should now be saved.")

In [None]:
# This cell is designed to start the FastAPI application and expose it to the internet
# using an ngrok public tunnel. It integrates Uvicorn (the ASGI server) with ngrok.

print("Starting FastAPI application...") # Informative message for the user.

import os # Module for interacting with the operating system (e.g., managing environment variables).
import threading # Module for running tasks in separate threads, allowing the notebook cell to remain active.
from IPython.display import display, HTML # For displaying HTML content (like clickable links) in the notebook output.
import time # For pausing execution (e.g., to allow services to start).
import subprocess  # For running external commands (like Uvicorn) as subprocesses.

# ====== IMPORTANT: This section ensures pyngrok is installed and ngrok executable is ready. ======
# It is placed here for robustness, although usually handled in a dedicated setup cell (Cell 12).
# If Cell 12 consistently runs first, these lines might be redundant but harmless.

print("Installing pyngrok...") # Inform the user that pyngrok is being installed.
!pip install pyngrok -q # Install the pyngrok Python library. `-q` ensures quiet output.

# Define paths for the ngrok zip file and the executable.
ngrok_zip_path = "ngrok-stable-linux-amd64.zip"
ngrok_executable_path = "ngrok"

# Check if the ngrok executable already exists to avoid re-downloading.
if not os.path.exists(ngrok_executable_path):
    print("Downloading ngrok executable...") # Inform the user about the download.
    # Download the ngrok executable zip file for Linux (Kaggle's environment).
    !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip -O {ngrok_zip_path} -q 
    !unzip -o {ngrok_zip_path} # Unzip the downloaded file. `-o` overwrites if exists.
    !chmod +x {ngrok_executable_path} # Grant execute permissions to the ngrok binary.
    print("Ngrok executable downloaded and configured.") # Confirmation message.
else:
    print("Ngrok executable already present.") # Inform if ngrok is already there.
# =================================================================================================

# Import ngrok only after ensuring its installation and executable are ready.
from pyngrok import ngrok # pyngrok is the Python wrapper for the ngrok service.

# Retrieve your ngrok authentication token. This token is essential for ngrok to operate.
# It should have been set as an environment variable in a previous setup cell (Cell 12).
ngrok_auth_token = "2xgYVbiMX6umq43cgdupTkDYOoA_6sEh5wdWfe7YBsTq7HhGf" # REPLACE this with your actual ngrok auth token.

# Validate the ngrok auth token. If it's not set, raise an error as ngrok won't function.
if ngrok_auth_token == "YOUR_NGROK_AUTH_TOKEN" or not ngrok_auth_token:
    print("🚨 Warning: Please replace 'YOUR_NGROK_AUTH_TOKEN' with your actual ngrok auth token from ngrok.com")
    print("Register for free at https://ngrok.com/signup and get your token from https://dashboard.ngrok.com/get-started/your-authtoken")
    raise ValueError("Ngrok auth token is not set. Cannot start tunnel.") 
else:
    os.environ["NGROK_AUTH_TOKEN"] = ngrok_auth_token # Set the token as an environment variable.
    ngrok.set_auth_token(ngrok_auth_token) # Authenticate pyngrok with the provided auth token.

# Define a function to run the Uvicorn server. This function will be executed in a separate thread.
def run_uvicorn():
    """
    Launch Uvicorn as a subprocess serving `app:app` and relay its output.

    Meant to run in a background thread: the call blocks until the Uvicorn
    process exits. Every output line is echoed with a "Uvicorn: " prefix, and a
    confirmation is printed once when the startup banner is seen.

    Failures to launch are printed rather than raised; nothing is returned.
    """
    try:
        # Serve the `app` object from app.py on all interfaces, port 8000.
        command = ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

        # stderr is merged into stdout because Uvicorn writes its server log
        # (including the "Uvicorn running on" banner) to stderr by default. A
        # piped stream that nobody reads fills up and stalls the child process.
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        # Keep draining the pipe for the whole lifetime of the server; stopping
        # after the banner would let the pipe buffer fill and block Uvicorn.
        startup_confirmed = False
        for line in process.stdout:
            print(f"Uvicorn: {line.strip()}")
            if not startup_confirmed and "Uvicorn running on" in line:
                print("Uvicorn server confirmed to be starting...")
                startup_confirmed = True

        # The loop ends when the process closes its output; reap it.
        process.wait()

    except FileNotFoundError:
        print("Error: 'uvicorn' command not found. Ensure uvicorn is installed (from requirements.txt).")
    except Exception as e:
        print(f"Error in Uvicorn thread: {e}")

try:
    # Host the API from a daemon thread so it does not outlive the kernel process.
    uvicorn_thread = threading.Thread(target=run_uvicorn, daemon=True)
    uvicorn_thread.start()

    # Fixed grace period for Uvicorn and the app's startup hook (model loading).
    time.sleep(15)

    # Open a public ngrok tunnel to the local server on port 8000.
    public_url = ngrok.connect(8000).public_url
    print(f"Ngrok Tunnel URL: {public_url}")
    # Clickable link to the interactive API docs served under /docs.
    display(HTML(f'<h2>Your FastAPI app is running at: <a href="{public_url}/docs" target="_blank">{public_url}/docs</a></h2>'))
    print("Uvicorn server started in a background thread.")

    # Blocks this cell for as long as the server thread runs, so no later cell
    # executes in this kernel until the server stops or the cell is interrupted.
    uvicorn_thread.join()

except Exception as e:
    # Report tunnel/server startup problems instead of raising from the cell.
    print(f"❌ Error starting ngrok or Uvicorn: {e}")
    print("Please check your ngrok auth token and ensure no other process is using port 8000.")

In [None]:
import requests # Library for making HTTP requests (e.g., GET, POST) to web services/APIs.
import json # For working with JSON data (serializing Python objects to JSON, parsing JSON responses).
import time # For pausing execution (e.g., to wait for the API to start up).
from rich.console import Console # From Rich library, for beautiful and structured console output.
from rich.panel import Panel # From Rich, for displaying visually distinct panels in console output.

console = Console() # Initialize a Rich Console instance for formatted printing to the notebook output.

# ====== API base URL ======================================================================================
# The ngrok URL is temporary and changes each time the tunnel is established.
# If the startup cell already ran in this kernel, it left the live tunnel URL in `public_url`;
# that value is used automatically so nothing has to be copied by hand.
# Otherwise (e.g. this cell is run from a different session) the literal below is used:
# REPLACE IT with the URL printed after "Ngrok Tunnel URL:" in the startup cell's output (Cell 14).
_FALLBACK_NGROK_URL = "https://f085-146-148-39-26.ngrok-free.app" # Placeholder: stale unless updated manually.
# `rstrip("/")` avoids a double slash when endpoint paths such as "/health" are appended.
ngrok_url = (globals().get("public_url") or _FALLBACK_NGROK_URL).rstrip("/")
# =========================================================================================================

console.print(f"[bold blue]Testing API at: {ngrok_url}[/bold blue]") # Inform the user about the API URL being tested.

# Pause execution to give the FastAPI application (running via Uvicorn and Ngrok)
# time to start up before the first request is sent.
# This duration might need adjustment based on model size, internet speed, and Kaggle's GPU/CPU speed.
time.sleep(20)

# Defaults for the two names the prediction step reads, so they are defined on every path.
health_response = None
model_loaded = False

try:
    # Send a GET request to the /health endpoint of the API.
    # The timeout stops the cell from hanging forever when the tunnel accepts the
    # connection but the server behind it never answers.
    health_response = requests.get(f"{ngrok_url}/health", timeout=30)
except requests.exceptions.RequestException as e:
    # Any transport-level failure (refused connection, DNS error, timeout, malformed URL, ...).
    # This often indicates that ngrok or Uvicorn is not running, or the URL is incorrect.
    console.print(Panel(f"[bold red]Error: Could not connect to the API. Make sure ngrok and uvicorn are running. Error: {e}[/bold red]", style="red"))
    health_response = None # Set response to None to prevent further processing.
else:
    console.print(f"Health Check Status: {health_response.status_code}") # Print the HTTP status code (e.g., 200 OK).
    try:
        # Parse the body once. Every JSON decode error raised by `Response.json()`
        # (stdlib json, simplejson, or requests' own wrapper) is a ValueError subclass.
        health_payload = health_response.json()
    except ValueError as e:
        # The API responded, but its content was not valid JSON
        # (e.g. an HTML error page instead of the expected JSON document).
        console.print(Panel(f"[bold red]Error: Could not decode JSON response from health check. Error: {e}[/bold red]", style="red"))
        health_response = None
    else:
        console.print(f"Health Check Response: {health_payload}") # Print the JSON response from the health check.

        # Extract the 'model_loaded' status; a non-object payload counts as "not loaded"
        # instead of raising AttributeError on `.get`.
        model_loaded = isinstance(health_payload, dict) and bool(health_payload.get("model_loaded", False))
        if not model_loaded:
            # If the model is not reported as loaded by the health check, display a warning.
            console.print(Panel("[bold yellow]Warning: Model not yet loaded according to health check. Prediction might fail.[/bold yellow]", style="yellow"))


# Proceed to send prediction requests only if the health check passed (status 200)
# and the model was confirmed as loaded.
if health_response and health_response.status_code == 200 and model_loaded:
    # Define a list of sample texts to send to the API for sentiment prediction.
    test_texts = [
        "This movie was absolutely fantastic! I loved every moment of it.",
        "The plot was confusing and the acting was terrible. A complete waste of time.",
        "It was an okay movie, nothing special, but not bad either.",
        "What a masterpiece! Highly recommend."
    ]

    headers = {"Content-Type": "application/json"} # Set HTTP header to indicate the request body is JSON.
    data = {"texts": test_texts} # Prepare the request body as a Python dictionary.

    console.print("\n[bold blue]Sending prediction request...[/bold blue]") # Inform the user about sending the prediction request.
    try:
        # Send a POST request to the /predict endpoint.
        # `data=json.dumps(data)` converts the Python dictionary to a JSON string for the request body.
        # The timeout bounds how long the cell waits for the server to answer.
        predict_response = requests.post(f"{ngrok_url}/predict", headers=headers, data=json.dumps(data), timeout=120)
        console.print(f"Prediction Status: {predict_response.status_code}") # Print the HTTP status code for prediction.
        console.print("Prediction Response:") # Label for the prediction response.
        console.print(json.dumps(predict_response.json(), indent=2)) # Parse and print the JSON prediction response, pretty-printed.
        if predict_response.ok:
            console.print(Panel("[bold green]✅ Prediction successful![/bold green]")) # Confirm successful prediction.
        else:
            # A 4xx/5xx reply can still carry a JSON body (printed above); it must not be reported as success.
            console.print(Panel(f"[bold red]Prediction failed with HTTP status {predict_response.status_code}.[/bold red]", style="red"))
    except requests.exceptions.Timeout as e:
        # The server did not answer within the timeout.
        console.print(Panel(f"[bold red]Error: Prediction request timed out. Error: {e}[/bold red]", style="red"))
    except requests.exceptions.ConnectionError as e:
        # Handle connection errors during the prediction request.
        console.print(Panel(f"[bold red]Error: Could not connect to the prediction endpoint. Error: {e}[/bold red]", style="red"))
    except Exception as e:
        # Catch any other general errors during the prediction request (e.g. a non-JSON response body).
        console.print(Panel(f"[bold red]Error during prediction request: {e}[/bold red]", style="red"))
else:
    # If the API is not ready (e.g., model not loaded, health check failed), inform the user.
    console.print(Panel("[bold red]API is not fully ready for predictions. Check previous output for errors.[/bold red]", style="red"))