In [None]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg') 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import Counter
from wordcloud import WordCloud
import pickle
import warnings
import logging
import os
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             BaggingClassifier, ExtraTreesClassifier,
                             GradientBoostingClassifier, VotingClassifier,
                             StackingClassifier)
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna
from datetime import datetime

# --- Determine Base Directory for Notebook/Script ---
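# This block determines the base directory: the script's own folder when run as a script,
# otherwise the current working directory (e.g. inside a notebook).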

if "__file__" in globals():
    # Script execution: __file__ is defined, so anchor every path to the
    # folder containing this file (spam.csv is expected to sit next to it).
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    base_directory = current_script_dir
    print(f"Running as a script. Base directory set to: '{base_directory}'")
else:
    # Notebook / interactive session: __file__ does not exist, so fall back
    # to the current working directory.
    base_directory = os.getcwd()
    print(f"Running in a notebook environment. Base directory set to CWD: '{base_directory}'")
    print("Confirm 'spam.csv' is directly in this directory for the current DATA_PATH setting.")


# --- Configuration (Externalize for production) ---
class Config:
    """Central pipeline settings; every path is anchored at ``base_directory``."""

    # --- Reproducibility and experiment sizing ---
    RANDOM_STATE = 42      # seed passed to estimators, splitters and samplers
    TEST_SIZE = 0.2        # fraction of the data held out as the test split
    N_TRIALS_OPTUNA = 10   # Optuna trials run per tuned model

    # --- Embedding model ---
    SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'  # pre-trained SentenceTransformer name

    # --- Input / output locations ---
    # Alternative if the CSV lives one level above the base directory:
    #   os.path.join(os.path.dirname(base_directory), 'spam.csv')
    DATA_PATH = os.path.join(base_directory, 'spam.csv')
    LOG_FILE = os.path.join(base_directory, 'spam_classifier.log')
    PLOTS_DIR = os.path.join(base_directory, 'plots')    # EDA plots are written here
    MODELS_DIR = os.path.join(base_directory, 'models')  # intended for trained models

# Create the plot and model folders at import time so they exist before
# anything tries to write into them; existing folders are left untouched.
for _output_dir in (Config.PLOTS_DIR, Config.MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)


class SpamClassifier:
    def __init__(self):
        """Prepare the runtime environment and create empty pipeline state.

        Logging, NLTK resources and Matplotlib are configured first; all data
        and model attributes start out empty, and the default classifier set
        is built last.
        """
        # Environment bootstrap (logging first, so the later steps can log).
        self._configure_logging()
        self._verify_nltk_resources()
        self._configure_matplotlib()

        # Text-processing helpers.
        self.encoder = LabelEncoder()           # maps 'ham'/'spam' to integer labels
        self.ps = PorterStemmer()               # stemmer used by transform_text()
        self.sentence_transformer_model = None  # loaded on first vectorization

        # Data containers, filled in by the loading/vectorizing/splitting steps.
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Model bookkeeping.
        self.clfs = {}                      # model key -> estimator instance
        self.best_tuned_models_params = {}  # model key -> best Optuna params
        self.best_model = None              # no best model selected yet
        self.best_model_name = None
        self.performance_df = pd.DataFrame()  # empty metrics table

        self._initialize_classifiers()
        logging.info("SpamClassifier initialized successfully.")

    def _configure_logging(self) -> None:
        """Send INFO-and-above records to the log file and stdout, and silence warnings."""
        log_handlers = [
            logging.FileHandler(Config.LOG_FILE),  # persistent log on disk
            logging.StreamHandler(sys.stdout),     # mirror to the console
        ]
        # NOTE: basicConfig() does nothing if the root logger already has handlers.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=log_handlers,
        )
        # Process-wide: every Python warning is suppressed from here on.
        warnings.filterwarnings('ignore')

    def _verify_nltk_resources(self) -> None:
        """Ensure the NLTK data used by this pipeline is installed, downloading what is missing.

        Checks the punkt tokenizer, the stopwords corpus and the punkt_tab
        tables. A resource that is not found locally is downloaded. If the
        download fails, a critical message is logged and the process exits
        with status 1.
        """
        # (lookup path for nltk.data.find, package id for nltk.download)
        resources = [
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt_tab', 'punkt_tab'),
        ]
        for path, package in resources:
            try:
                nltk.data.find(path)  # raises LookupError when the resource is absent
            except LookupError:
                logging.warning(f"NLTK {package} not found. Attempting to download...")
            else:
                logging.info(f"NLTK {package} resource found.")
                continue

            try:
                # nltk.download() reports failures (network error, unknown id)
                # by returning False rather than raising, so the return value
                # must be checked or a failed download would look successful.
                if not nltk.download(package, quiet=True):
                    raise RuntimeError(f"nltk.download('{package}') reported failure")
            except Exception as e:
                logging.critical(f"Failed to download NLTK {package}. Please check network or proxy settings. Error: {e}")
                sys.exit(1)
            logging.info(f"NLTK {package} downloaded successfully.")

    def _configure_matplotlib(self) -> None:
        """Turn off interactive plotting and apply the Seaborn theme used by the plots."""
        # Figures are only saved to disk, so interactive drawing is disabled.
        plt.ioff()
        theme_options = {'style': 'whitegrid', 'palette': 'viridis'}
        sns.set(**theme_options)

    def _initialize_classifiers(self) -> None:
        """Build the default (untuned) estimator for each model key and store them in ``self.clfs``."""
        seed = Config.RANDOM_STATE
        classifiers = {}

        # Linear baseline: L1-penalised logistic regression with balanced class weights.
        classifiers['LR'] = LogisticRegression(
            penalty='l1',
            solver='liblinear',
            class_weight='balanced',
            max_iter=1000,
            random_state=seed,
        )
        # Random forest of 100 trees, trained on all CPU cores.
        classifiers['RF'] = RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            n_jobs=-1,
            random_state=seed,
        )
        # scale_pos_weight=1 is only a placeholder; eda() overwrites it with
        # the ham/spam ratio.
        classifiers['XGB'] = XGBClassifier(
            n_estimators=100,
            eval_metric='logloss',
            scale_pos_weight=1,
            random_state=seed,
        )
        classifiers['SVC'] = SVC(
            kernel='sigmoid', gamma=1.0, probability=True,
            class_weight='balanced', random_state=seed,
        )
        classifiers['KN'] = KNeighborsClassifier()
        # No MultinomialNB entry: the features are dense sentence embeddings, not counts.
        classifiers['AdaBoost'] = AdaBoostClassifier(n_estimators=100, random_state=seed)
        classifiers['BgC'] = BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=seed)
        classifiers['ETC'] = ExtraTreesClassifier(
            n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=seed,
        )
        classifiers['GBDT'] = GradientBoostingClassifier(n_estimators=100, random_state=seed)
        classifiers['DT'] = DecisionTreeClassifier(
            max_depth=5, class_weight='balanced', random_state=seed,
        )

        self.clfs = classifiers

    def load_data(self) -> None:
        """Read the CSV at ``Config.DATA_PATH`` into ``self.df``.

        A missing file, a read error or a dataset with fewer than 100 rows is
        logged as critical and terminates the process with exit status 1.
        """
        try:
            csv_path = Config.DATA_PATH
            if not os.path.exists(csv_path):
                raise FileNotFoundError(f"Data file not found at {os.path.abspath(csv_path)}")

            # The file is decoded as latin-1.
            self.df = pd.read_csv(csv_path, encoding='latin-1')

            row_count = len(self.df)
            if row_count < 100:
                raise ValueError(f"Dataset too small ({row_count} samples). Minimum 100 samples required for robust analysis.")

            logging.info(f"Loaded {row_count} records from {csv_path}.")
        except Exception as e:
            logging.critical(f"Data loading failed: {e}")
            sys.exit(1)  # nothing downstream can run without data

    def clean_data(self) -> None:
        """Reduce ``self.df`` to encoded ``target`` / ``text`` columns without duplicates or nulls.

        The label and message columns are taken from ``v1``/``v2`` when both
        exist, otherwise from the first columns whose names contain a known
        hint. Rows whose label is not 'ham' or 'spam' are dropped, labels are
        encoded with ``self.encoder``, and duplicate or null rows are removed.
        Any failure is logged as critical and ends the process with status 1.
        """
        label_hints = ('target', 'label', 'type')
        text_hints = ('text', 'message', 'sms')

        try:
            def first_column_matching(hints):
                # First column whose lower-cased name contains any hint, else None.
                for col in self.df.columns:
                    lowered = col.lower()
                    if any(hint in lowered for hint in hints):
                        return col
                return None

            if 'v1' in self.df.columns and 'v2' in self.df.columns:
                self.df = self.df[['v1', 'v2']].copy()
                logging.info("Selected 'v1' and 'v2' columns from the dataset.")
            else:
                found_v1 = first_column_matching(label_hints)
                found_v2 = first_column_matching(text_hints)
                if not (found_v1 and found_v2):
                    # Neither the standard names nor the heuristics matched.
                    raise ValueError(f"Could not find required 'target' and 'text' columns (v1/v2 or equivalents) in dataset. Found columns: {self.df.columns.tolist()}")
                self.df = self.df[[found_v1, found_v2]].copy()
                logging.info(f"Mapped columns '{found_v1}' to 'target' and '{found_v2}' to 'text' using heuristics.")

            # Exactly two columns remain at this point, so a positional rename is safe.
            self.df.columns = ['target', 'text']

            # Keep only rows labelled 'ham' or 'spam'.
            allowed_labels = {'ham', 'spam'}
            unexpected_labels = set(self.df['target'].unique()) - allowed_labels
            if unexpected_labels:
                logging.warning(f"Invalid target values found: {unexpected_labels}. Filtering out rows with these values.")
                self.df = self.df[self.df['target'].isin(allowed_labels)]
                if self.df.empty:
                    raise ValueError("No valid 'ham' or 'spam' records remaining after filtering invalid targets. Dataset is empty.")

            # Encode the labels; the fitted encoder keeps the mapping for later lookups.
            self.df['target'] = self.encoder.fit_transform(self.df['target'])

            rows_before = len(self.df)
            self.df.drop_duplicates(inplace=True)
            self.df.dropna(inplace=True)
            rows_removed = rows_before - len(self.df)

            logging.info(f"Cleaned dataset. Removed {rows_removed} duplicates/nulls. Remaining: {len(self.df)} records.")
            if self.df.empty:
                raise ValueError("Dataset became empty after cleaning steps. Check data quality or initial loading.")
        except Exception as e:
            logging.critical(f"Data cleaning failed: {e}")
            sys.exit(1)  # cleaning is mandatory for every later stage

    def _safe_tokenize(self, text: str) -> list[str]:
        """Lower-case and tokenize *text*, keeping only alphanumeric tokens.

        Non-string input is converted with ``str()`` first. If tokenization
        raises, a warning is logged and an empty list is returned.
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for tokenization: {text[:50]}...")
        try:
            kept_tokens = []
            for token in nltk.word_tokenize(text.lower()):
                # Skip anything that is not purely alphanumeric or is a punctuation mark.
                if not token.isalnum() or token in string.punctuation:
                    continue
                kept_tokens.append(token)
            return kept_tokens
        except Exception as e:
            logging.warning(f"Tokenization failed for text (first 50 chars: '{text[:50]}...'). Returning empty list. Error: {e}")
            return []

    def eda(self) -> None:
        """Derive length features, set XGBoost's class weight and save the EDA plots.

        Adds ``num_words``, ``num_chars`` and ``num_sentences`` columns to
        ``self.df``, sets ``scale_pos_weight`` on the XGBoost classifier to the
        ham/spam ratio, and writes four timestamped PNG files to
        ``Config.PLOTS_DIR``: class pie chart, word-count histogram,
        character-count histogram and correlation heatmap. Per-class
        descriptive statistics are logged.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            # Coerce once so len() and sent_tokenize() accept the same inputs
            # that _safe_tokenize() already tolerates.
            text = self.df['text'].astype(str)
            self.df['num_words'] = text.apply(lambda x: len(self._safe_tokenize(x)))
            self.df['num_chars'] = text.apply(len)
            self.df['num_sentences'] = text.apply(lambda x: len(nltk.sent_tokenize(x)))

            # Resolve the encoded labels and per-class subsets once; they are
            # reused by the weight calculation, the histograms and the stats.
            ham_label, spam_label = self.encoder.transform(['ham', 'spam'])
            ham_df = self.df[self.df['target'] == ham_label]
            spam_df = self.df[self.df['target'] == spam_label]
            ham_count, spam_count = len(ham_df), len(spam_df)

            # scale_pos_weight = negatives / positives up-weights the spam class.
            if spam_count > 0:
                scale_pos_weight_val = ham_count / spam_count
                self.clfs['XGB'].set_params(scale_pos_weight=scale_pos_weight_val)
                logging.info(f"Set XGBoost scale_pos_weight to: {scale_pos_weight_val:.2f} (Ham:{ham_count}, Spam:{spam_count})")
            else:
                logging.warning("No spam samples found to calculate scale_pos_weight for XGBoost. Defaulting to 1.")

            # --- Plotting Section ---
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # shared by all file names of this run

            # 1. Target distribution pie chart.
            class_counts = self.df['target'].value_counts()
            class_names = self.encoder.inverse_transform(class_counts.index)
            fig1, ax1 = plt.subplots(figsize=(8, 8))
            class_counts.plot(
                kind='pie', ax=ax1, autopct='%1.1f%%',
                labels=class_names,
                colors=sns.color_palette('pastel')[:len(class_counts)],
                # Pick the exploded slice by label, not by position, so the
                # emphasis stays on spam whatever the class order or count.
                explode=[0.1 if name == 'spam' else 0 for name in class_names],
            )
            ax1.set_title('Target Class Distribution')
            ax1.set_ylabel('')  # the default y-label adds nothing on a pie chart
            self._save_eda_figure(fig1, 'target_distribution', timestamp, 'Target distribution')

            # 2 + 3. Word-count and character-count histograms, ham vs. spam.
            histogram_specs = [
                ('num_words', 'Word Count Distribution by Target Class', 'Number of Words',
                 'word_count_distribution', 'Word count distribution'),
                ('num_chars', 'Character Count Distribution by Target Class', 'Number of Characters',
                 'char_count_distribution', 'Character count distribution'),
            ]
            for column, title, xlabel, file_stem, description in histogram_specs:
                fig, ax = plt.subplots(figsize=(14, 6))
                sns.histplot(data=ham_df, x=column, ax=ax, bins=50, kde=True, color='blue', label='Ham')
                sns.histplot(data=spam_df, x=column, ax=ax, bins=50, kde=True, color='red', label='Spam')
                ax.set_title(title)
                ax.set_xlabel(xlabel)
                ax.set_ylabel('Count')
                ax.legend()
                self._save_eda_figure(fig, file_stem, timestamp, description)

            # 4. Correlation heatmap of the derived features and the target.
            feature_cols = ['num_chars', 'num_words', 'num_sentences']
            fig4, ax4 = plt.subplots(figsize=(8, 6))
            sns.heatmap(self.df[feature_cols + ['target']].corr(), annot=True, cmap='coolwarm', ax=ax4)
            ax4.set_title('Correlation Matrix of Text Features and Target')
            self._save_eda_figure(fig4, 'correlation_heatmap', timestamp, 'Correlation heatmap')

            logging.info(f"Descriptive statistics for Ham emails:\n{ham_df[feature_cols].describe()}")
            logging.info(f"Descriptive statistics for Spam emails:\n{spam_df[feature_cols].describe()}")

        except Exception as e:
            logging.error(f"EDA process failed: {e}")
            raise  # let the caller decide whether a failed EDA stops the pipeline

    def _save_eda_figure(self, fig, file_stem: str, timestamp: str, description: str) -> None:
        """Save *fig* as ``<file_stem>_<timestamp>.png`` in ``Config.PLOTS_DIR``, close it and log the path."""
        path = os.path.join(Config.PLOTS_DIR, f'{file_stem}_{timestamp}.png')
        try:
            # Save the explicit figure rather than whatever pyplot considers current;
            # bbox_inches='tight' keeps titles and labels from being clipped.
            fig.savefig(path, bbox_inches='tight')
        finally:
            plt.close(fig)  # release the figure even if saving failed
        logging.info(f"{description} plot saved to {path}.")

    def transform_text(self, text: str) -> str:
        """Normalize *text* into a space-joined string of stemmed content words.

        Steps: lower-case, tokenize with NLTK, keep alphanumeric tokens, drop
        English stop words, Porter-stem, then drop one-character stems unless
        they are digits. Non-string input is converted with ``str()`` first.

        Returns:
            The surviving stems joined by single spaces (possibly empty).
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for transform_text: {text[:50]}...")

        tokens = nltk.word_tokenize(text.lower())

        # This method runs once per row, so build the stop-word set a single
        # time per instance instead of reloading the list on every call.
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            stop_words = self._stop_words = frozenset(stopwords.words('english'))

        # isalnum() already rejects punctuation tokens, so no separate
        # punctuation check is needed.
        stems = (
            self.ps.stem(token)
            for token in tokens
            if token.isalnum() and token not in stop_words
        )
        return " ".join(stem for stem in stems if len(stem) > 1 or stem.isdigit())

    def preprocess_text(self) -> None:
        """Add a ``transformed_text`` column and save word-cloud and top-word plots per class.

        For spam and then ham, a word cloud is written to ``Config.PLOTS_DIR``;
        afterwards a bar chart of the 30 most common words is produced for each
        class. Any failure is logged as critical and ends the process with
        status 1.
        """
        try:
            logging.info("\n--- Text Preprocessing for EDA and Visualizations ---")
            self.df['transformed_text'] = self.df['text'].apply(self.transform_text)
            logging.info("Text transformation for EDA complete. Example:")
            logging.info(f"\n{self.df[['text', 'transformed_text']].head().to_string()}")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            class_order = ('spam', 'ham')  # spam is always processed first

            def transformed_texts(label):
                # Transformed messages belonging to the given class name.
                encoded = self.encoder.transform([label])[0]
                return self.df[self.df['target'] == encoded]['transformed_text']

            logging.info("\nGenerating Word Clouds (saved to plots directory):")
            for label in class_order:
                display_name = label.capitalize()
                cloud = WordCloud(width=800, height=400, min_font_size=10, background_color='white').generate(
                    transformed_texts(label).str.cat(sep=" ")
                )
                plt.figure(figsize=(10, 5))
                plt.imshow(cloud)
                plt.title(f'{display_name} Word Cloud')
                plt.axis('off')
                cloud_path = os.path.join(Config.PLOTS_DIR, f'{label}_wordcloud_{timestamp}.png')
                plt.savefig(cloud_path, bbox_inches='tight')
                plt.close()
                logging.info(f"{display_name} word cloud saved to {cloud_path}.")

            for label in class_order:
                display_name = label.capitalize()
                logging.info(f"\nMost common words in {display_name} (saved as plot):")
                corpus = ' '.join(transformed_texts(label)).split()
                self._plot_most_common_words(
                    corpus,
                    title=f'Top 30 {display_name} Words',
                    filename=f'top_{label}_words_{timestamp}.png',
                )

        except Exception as e:
            logging.critical(f"Text preprocessing for EDA failed: {e}")
            sys.exit(1)  # treated as fatal for the pipeline

    def _plot_most_common_words(self, corpus: list[str], title: str, n: int = 30, filename: str = "common_words.png") -> None:
        """Save a bar chart of the *n* most frequent tokens in *corpus* to ``Config.PLOTS_DIR``.

        Args:
            corpus: Flat list of tokens to count.
            title: Chart title, also used in the log message.
            n: Number of top tokens to show.
            filename: File name of the image inside ``Config.PLOTS_DIR``.
        """
        frequency_table = pd.DataFrame(
            Counter(corpus).most_common(n),
            columns=['Word', 'Count'],
        )

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=frequency_table, x='Word', y='Count', palette='viridis', ax=ax)
        # Rotate the word labels so neighbouring bars do not overlap.
        ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical')
        ax.set_title(title)

        plot_filepath = os.path.join(Config.PLOTS_DIR, filename)
        plt.savefig(plot_filepath, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Plot '{title}' saved to {plot_filepath}.")


    def vectorize_text_with_embeddings(self) -> None:
        """Encode the raw messages with SentenceTransformer and store features and labels.

        Loads the model named by ``Config.SENTENCE_TRANSFORMER_MODEL`` on first
        use, writes the embeddings to ``self.X`` and the encoded targets to
        ``self.y``. Any failure is logged as critical and ends the process
        with status 1.
        """
        try:
            model_name = Config.SENTENCE_TRANSFORMER_MODEL
            logging.info(f"\n--- Text Vectorization (SentenceTransformer: {model_name}) ---")

            # Load the model lazily and reuse it on later calls.
            if self.sentence_transformer_model is None:
                self.sentence_transformer_model = SentenceTransformer(model_name)

            # The untouched 'text' column is encoded, not the stemmed
            # 'transformed_text' used for the EDA plots.
            raw_messages = self.df['text'].tolist()
            self.X = self.sentence_transformer_model.encode(
                raw_messages,
                batch_size=64,  # adjust to the available memory
                show_progress_bar=True,
                convert_to_tensor=False,
            )
            self.y = self.df['target'].values
            logging.info(f"SentenceTransformer embedding complete. X shape: {self.X.shape}, Y shape: {self.y.shape}.")
        except Exception as e:
            logging.critical(f"Text vectorization failed: {e}")
            sys.exit(1)

    def split_data(self) -> None:
        """Create the stratified train/test split and log the class counts of each part.

        Uses ``Config.TEST_SIZE`` and ``Config.RANDOM_STATE``. Any failure is
        logged as critical and ends the process with status 1.
        """
        try:
            # stratify=self.y keeps the class proportions equal in both parts.
            parts = train_test_split(
                self.X,
                self.y,
                test_size=Config.TEST_SIZE,
                random_state=Config.RANDOM_STATE,
                stratify=self.y,
            )
            self.X_train, self.X_test, self.y_train, self.y_test = parts

            logging.info(f"Data split: Train {len(self.X_train)} samples, Test {len(self.X_test)} samples.")
            for part_name, labels in (("Train", self.y_train), ("Test", self.y_test)):
                logging.info(f"{part_name} target distribution: {np.bincount(labels)}")
        except Exception as e:
            logging.critical(f"Data splitting failed: {e}")
            sys.exit(1)

    def _objective(self, trial: optuna.trial.Trial, model_name: str) -> float:
        """Optuna objective: mean cross-validated F1 of one sampled configuration.

        Samples hyperparameters for ``model_name``, wraps the resulting
        estimator in an imbalanced-learn pipeline behind SMOTE, and scores it
        with 5-fold stratified cross-validation on the training split.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            model_name: Key of the model to tune (e.g. 'LR', 'RF', 'XGB').

        Returns:
            Mean F1-score across the folds (to be maximized).

        Raises:
            ValueError: if ``model_name`` has no search space defined.
        """
        model = self._build_trial_model(trial, model_name)

        # SMOTE sits inside the pipeline so that, during cross-validation, it
        # is fitted on each training fold only and never sees validation data.
        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=Config.RANDOM_STATE)),
            ('classifier', model)
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.RANDOM_STATE)
        # scoring='f1' treats label 1 as the positive class; this assumes
        # 'spam' is encoded as 1.
        scores = cross_val_score(pipeline, self.X_train, self.y_train, cv=cv, scoring='f1', n_jobs=-1)
        return scores.mean()

    def _build_trial_model(self, trial: optuna.trial.Trial, model_name: str):
        """Sample hyperparameters from *trial* and return an unfitted estimator for *model_name*.

        ``suggest_float`` (with ``log=True`` where a log scale is wanted)
        replaces the deprecated ``suggest_loguniform`` / ``suggest_uniform``;
        parameter names and ranges are unchanged.
        """
        seed = Config.RANDOM_STATE

        if model_name == 'LR':
            # NOTE(review): tune_models() applies the winning params to
            # self.clfs['LR'], which is built with penalty='l1' and
            # max_iter=1000, while this trial model uses the default penalty
            # and max_iter=2000 - confirm which setup is intended.
            c_param = trial.suggest_float('C', 1e-4, 1e2, log=True)
            solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
            return LogisticRegression(C=c_param, solver=solver, random_state=seed,
                                      class_weight='balanced', max_iter=2000,
                                      n_jobs=-1 if solver == 'saga' else None)

        if model_name in ('RF', 'ETC'):
            # Both tree ensembles share one search space.
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            ensemble_cls = RandomForestClassifier if model_name == 'RF' else ExtraTreesClassifier
            return ensemble_cls(n_estimators=n_estimators, max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                random_state=seed, class_weight='balanced', n_jobs=-1)

        if model_name == 'XGB':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 3, 12)
            learning_rate = trial.suggest_float('learning_rate', 0.005, 0.5, log=True)
            subsample = trial.suggest_float('subsample', 0.6, 1.0)
            colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
            gamma = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
            # Not tuned: reuse the class-imbalance weight already set on the base classifier.
            current_scale_pos_weight = self.clfs['XGB'].get_params().get('scale_pos_weight', 1)
            return XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                 learning_rate=learning_rate, subsample=subsample,
                                 colsample_bytree=colsample_bytree, gamma=gamma,
                                 random_state=seed,
                                 eval_metric='logloss',
                                 scale_pos_weight=current_scale_pos_weight)

        if model_name == 'SVC':
            c_param = trial.suggest_float('C', 1e-2, 1e2, log=True)
            gamma_param = trial.suggest_float('gamma', 1e-3, 1e1, log=True)
            kernel = trial.suggest_categorical('kernel', ['rbf', 'sigmoid'])
            return SVC(C=c_param, gamma=gamma_param, kernel=kernel, probability=True,
                       random_state=seed, class_weight='balanced')

        if model_name == 'KN':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
            return KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, n_jobs=-1)

        if model_name == 'AdaBoost':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
            return AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=seed)

        if model_name == 'BgC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            return BaggingClassifier(n_estimators=n_estimators, random_state=seed, n_jobs=-1)

        if model_name == 'GBDT':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
            max_depth = trial.suggest_int('max_depth', 3, 10)
            return GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                              max_depth=max_depth, random_state=seed)

        if model_name == 'DT':
            max_depth = trial.suggest_int('max_depth', 3, 20)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            return DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                          min_samples_leaf=min_samples_leaf, criterion=criterion,
                                          random_state=seed, class_weight='balanced')

        raise ValueError(f"Model '{model_name}' is not configured for Optuna tuning.")

    def tune_models(self) -> None:
        """Run one Optuna study per tunable classifier and apply the winning parameters.

        Only the keys 'LR', 'RF', 'XGB', 'SVC' and 'ETC' are considered; a key that
        is missing from ``self.clfs`` is skipped with a warning. For every tuned
        model the best trial's parameters are stored in
        ``self.best_tuned_models_params`` and set on the estimator held in
        ``self.clfs``. Any exception is logged as critical and the process exits
        with status 1.
        """
        tunable_keys = ('LR', 'RF', 'XGB', 'SVC', 'ETC')
        try:
            # Tuning needs the training split; create it on demand when it is absent.
            if self.X_train is None or self.y_train is None:
                logging.error("Data not split for tuning. Calling split_data().")
                self.split_data()

            logging.info("Starting hyperparameter tuning with Optuna for selected models...")

            for model_key in tunable_keys:
                if model_key not in self.clfs:
                    logging.warning(f"Model '{model_key}' not found in initialized classifiers, skipping tuning.")
                    continue

                logging.info(f"Tuning {model_key} model with {Config.N_TRIALS_OPTUNA} trials...")

                # A seeded sampler keeps the sequence of suggested parameters repeatable.
                seeded_sampler = optuna.samplers.TPESampler(seed=Config.RANDOM_STATE)
                study = optuna.create_study(
                    direction='maximize',
                    sampler=seeded_sampler,
                    study_name=f"{model_key}_tuning_study",
                )

                def score_trial(trial, key=model_key):
                    # `key` is bound at definition time so each study scores its own model.
                    return self._objective(trial, key)

                # UserWarning is silenced only while the study is running.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    study.optimize(
                        score_trial,
                        n_trials=Config.N_TRIALS_OPTUNA,
                        show_progress_bar=True,
                        gc_after_trial=True,
                    )

                winner = study.best_trial
                winning_params = winner.params
                self.best_tuned_models_params[model_key] = winning_params
                logging.info(f"Best parameters for {model_key}: {winning_params}")
                logging.info(f"Best cross-validated F1-score for {model_key}: {winner.value:.4f}")

                estimator = self.clfs[model_key]
                estimator.set_params(**winning_params)
                if model_key == 'XGB':
                    # Reads the estimator's current scale_pos_weight and sets that same value back.
                    current_scale_pos_weight = estimator.get_params().get('scale_pos_weight', 1)
                    estimator.set_params(scale_pos_weight=current_scale_pos_weight)

            logging.info("Hyperparameter tuning completed for all selected models.")
        except Exception as e:
            logging.critical(f"Model tuning failed: {e}")
            sys.exit(1)

    def train_final_models(self) -> None:
        """Train every classifier on SMOTE-resampled data and score it on the test split.

        Each estimator in ``self.clfs`` is fitted on the resampled training data,
        evaluated on ``self.X_test``/``self.y_test``, and its confusion matrix is
        written to ``Config.PLOTS_DIR``. The metrics of all models are collected in
        ``self.performance_df`` (sorted by spam F1, best first). The model with the
        highest spam F1 is kept in ``self.best_model``/``self.best_model_name`` and
        handed to ``_save_best_model``; finally the comparison chart is plotted.

        A failure of a single model is logged and recorded with NaN metrics so the
        remaining models still run. Any other failure is logged as critical and the
        process exits with status 1.
        """
        try:
            # Ensure X_train, y_train, X_test, y_test are set.
            if self.X_train is None or self.X_test is None:
                logging.error("Data not split for final training. Calling split_data().")
                self.split_data()

            logging.info("Applying SMOTE to the entire training data for final model training...")
            smote = SMOTE(random_state=Config.RANDOM_STATE)
            X_train_resampled, y_train_resampled = smote.fit_resample(self.X_train, self.y_train)
            logging.info(f"SMOTE applied. Original train: {len(self.X_train)} samples. Resampled train: {len(X_train_resampled)} samples.")

            # The encoded 'spam' label and the class names do not change between
            # models, so they are looked up once instead of on every metric call.
            spam_label = self.encoder.transform(['spam'])[0]
            class_names = self.encoder.classes_

            results = []  # One metrics record per model
            best_f1_overall = -1  # Highest spam F1 seen so far
            self.best_model = None  # Reset before evaluation
            self.best_model_name = None

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # Shared by all plot filenames of this run

            for name, model in self.clfs.items():
                logging.info(f"Training final {name} model on resampled data and evaluating...")

                fig_cm = None  # Tracked so the figure is closed even if plotting fails
                # A failure of one model must not stop the evaluation of the others.
                try:
                    model.fit(X_train_resampled, y_train_resampled)
                    y_pred = model.predict(self.X_test)

                    accuracy = accuracy_score(self.y_test, y_pred)
                    # Precision, recall and F1 are reported for the 'spam' class only.
                    precision = precision_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    recall = recall_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    f1 = f1_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)

                    report_dict = classification_report(self.y_test, y_pred, target_names=class_names, output_dict=True)

                    results.append({
                        'Model': name,
                        'Accuracy': accuracy,
                        'Precision (Spam)': precision,
                        'Recall (Spam)': recall,
                        'F1-Score (Spam)': f1,
                        'Full Classification Report': report_dict
                    })

                    logging.info(f"\n--- Performance for {name} ---")
                    logging.info(f"Accuracy: {accuracy:.4f}")
                    logging.info(f"Precision (Spam): {precision:.4f}")
                    logging.info(f"Recall (Spam): {recall:.4f}")
                    logging.info(f"F1-Score (Spam): {f1:.4f}")
                    logging.info(f"\nFull Classification Report for {name}:\n{classification_report(self.y_test, y_pred, target_names=class_names)}")

                    # --- Plot and log the confusion matrix ---
                    cm = confusion_matrix(self.y_test, y_pred)
                    logging.info(f"\nRaw Confusion Matrix for {name}:\n{cm}")

                    fig_cm, ax_cm = plt.subplots(figsize=(7, 6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                                xticklabels=class_names,  # Predicted labels
                                yticklabels=class_names,  # True labels
                                linecolor='gray', linewidths=0.5,
                                annot_kws={"size": 14},
                                ax=ax_cm)
                    ax_cm.set_xlabel('Predicted Label', fontsize=12)
                    ax_cm.set_ylabel('True Label', fontsize=12)
                    ax_cm.set_title(f'Confusion Matrix for {name}', fontsize=14)
                    cm_filename = os.path.join(Config.PLOTS_DIR, f'confusion_matrix_{name}_{timestamp}.png')
                    # Save through the figure object rather than the implicit current figure.
                    fig_cm.savefig(cm_filename, bbox_inches='tight')
                    logging.info(f"Confusion matrix plot for {name} saved to {cm_filename}.")

                    # Track the model with the best spam F1.
                    if f1 > best_f1_overall:
                        best_f1_overall = f1
                        self.best_model_name = name
                        self.best_model = model
                except Exception as model_e:
                    logging.error(f"Error training or evaluating model {name}: {model_e}")
                    # Keep a placeholder row so the failed model still shows up in the summary.
                    results.append({
                        'Model': name,
                        'Accuracy': np.nan,
                        'Precision (Spam)': np.nan,
                        'Recall (Spam)': np.nan,
                        'F1-Score (Spam)': np.nan,
                        'Full Classification Report': {'error': str(model_e)}
                    })
                finally:
                    # Release the figure on success and on failure alike.
                    if fig_cm is not None:
                        plt.close(fig_cm)

            self.performance_df = pd.DataFrame(results)
            # Best spam F1 first; NaN rows of failed models sort to the end.
            self.performance_df = self.performance_df.sort_values(by='F1-Score (Spam)', ascending=False).reset_index(drop=True)
            logging.info(f"\n--- Overall Best Model Identified: {self.best_model_name} (F1-Score on Spam: {best_f1_overall:.4f}) ---")
            logging.info("All model evaluations completed.")

            self._save_best_model()  # Persist only the best performing model

            # Plot overall performance comparison
            self._plot_performance_comparison(timestamp)

        except Exception as e:
            logging.critical(f"Final model training and evaluation failed: {e}")
            sys.exit(1)  # Critical error, stop pipeline

    def _save_best_model(self) -> None:
        """Saves the best performing model and related components to a pickle file."""
        try:
            if self.best_model is None or self.best_model_name is None:
                logging.warning("No best model identified or stored. Skipping model save operation.")
                return

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            # Create a descriptive filename including the model name and timestamp
            model_filename = os.path.join(Config.MODELS_DIR, f'best_model_{self.best_model_name}_{timestamp}.pkl')

            # Serialize and save the best model along with the SentenceTransformer and LabelEncoder.
            # These are essential for making predictions on new, raw text.
            with open(model_filename, 'wb') as f:
                pickle.dump({
                    'model': self.best_model,
                    'transformer': self.sentence_transformer_model,
                    'encoder': self.encoder,
                    'model_name': self.best_model_name,
                    'performance_summary': self.performance_df.to_dict('records') # Save full performance summary
                }, f)
            logging.info(f"Best performing model ({self.best_model_name}) saved to {model_filename}")
        except Exception as e:
            logging.error(f"Failed to save the best model: {e}")

    def _plot_performance_comparison(self, timestamp: str) -> None:
        """Plots and saves the comparison of model performance metrics."""
        if self.performance_df.empty:
            logging.warning("Performance DataFrame is empty, cannot plot comparison.")
            return

        # Prepare data for plotting
        plot_df = self.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].copy()
        plot_df_melted = plot_df.melt(id_vars="Model", var_name="Metric", value_name="Score")

        fig, ax = plt.subplots(figsize=(14, 7))
        sns.barplot(x='Model', y='Score', hue='Metric', data=plot_df_melted, palette='tab10', ax=ax)
        ax.set_ylim(0.5, 1.0) # Set a sensible y-limit for scores
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title('Model Performance Comparison (Test Set)')
        ax.set_ylabel('Score')
        ax.set_xlabel('Model')
        ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()

        plot_filename = os.path.join(Config.PLOTS_DIR, f'model_performance_comparison_{timestamp}.png')
        plt.savefig(plot_filename, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Model performance comparison plot saved to {plot_filename}.")

    def run_pipeline(self) -> bool:
        """Run every pipeline stage in order and report whether all of them finished.

        Returns:
            True when each stage completed; False as soon as a stage raises an
            exception or terminates through ``sys.exit``.
        """
        stages = (
            ('Data Loading', self.load_data),
            ('Data Cleaning', self.clean_data),
            ('EDA and Feature Engineering', self.eda),
            ('Text Preprocessing for EDA', self.preprocess_text),
            ('Text Vectorization (Embeddings)', self.vectorize_text_with_embeddings),
            ('Data Splitting', self.split_data),
            ('Hyperparameter Tuning', self.tune_models),
            ('Final Model Training & Evaluation', self.train_final_models),
        )

        for stage_label, run_stage in stages:
            try:
                logging.info(f"\n--- Starting Pipeline Step: {stage_label} ---")
                run_stage()
                logging.info(f"--- Completed Pipeline Step: {stage_label} ---\n")
            except SystemExit:
                # Stages signal critical errors through sys.exit(); turn that into a result.
                logging.critical(f"Pipeline stopped due to critical error in step: '{stage_label}'.")
                return False
            except Exception as stage_error:
                logging.critical(f"Pipeline failed unexpectedly in step '{stage_label}': {stage_error}")
                return False

        logging.info("Spam classification pipeline completed successfully.")
        return True

    @staticmethod
    def load_for_inference(model_path: str) -> 'SpamClassifier':
        """
        Loads a saved model and its components (SentenceTransformer, LabelEncoder) for making predictions.
        This is a static method, allowing loading without first initializing the full SpamClassifier class.
        """
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found at {os.path.abspath(model_path)}")

            with open(model_path, 'rb') as f:
                data = pickle.load(f)

            # Create a new SpamClassifier instance to encapsulate the loaded components.
            # This ensures all necessary helper methods (like _safe_tokenize) are available.
            classifier = SpamClassifier()
            classifier.best_model = data['model']
            classifier.sentence_transformer_model = data['transformer']
            classifier.encoder = data['encoder']
            classifier.best_model_name = data.get('model_name', 'Unknown_Model') # Get saved model name

            # Re-initialize PorterStemmer as it's not directly picklable within the model object.
            # It's a stateless component.
            classifier.ps = PorterStemmer()

            logging.info(f"Model '{classifier.best_model_name}' loaded successfully from {model_path} for inference.")
            return classifier
        except Exception as e:
            logging.critical(f"Failed to load model for inference from {model_path}: {e}")
            raise # Re-raise the exception for external handling (e.g., in __main__)

    def predict(self, text: str) -> str:
        """Classify one message and return its decoded label.

        Args:
            text: The raw message to classify.

        Returns:
            The label produced by ``self.encoder.inverse_transform`` for the model's
            prediction, or the string ``"error"`` when embedding, prediction or
            decoding fails.

        Raises:
            RuntimeError: If the model, sentence transformer or encoder is missing.
        """
        if self.best_model is None or self.sentence_transformer_model is None or self.encoder is None:
            logging.error("Model components not loaded. Please run run_pipeline() or load model using load_for_inference() before calling predict().")
            raise RuntimeError("Model components not available for prediction.")

        try:
            # Vectorize the input text. `encode` expects a list of strings.
            vector = self.sentence_transformer_model.encode([text], convert_to_tensor=False)

            # Make prediction using the best model
            prediction_encoded = self.best_model.predict(vector)[0]

            # Decode the numerical prediction back to its label
            prediction_label = self.encoder.inverse_transform([prediction_encoded])[0]
            return prediction_label
        except Exception as e:
            # str() first: slicing a non-string input (None, int, ...) here would raise
            # a TypeError inside the handler and defeat the "error" return contract.
            logging.error(f"Prediction failed for text '{str(text)[:50]}...': {e}")
            return "error"  # Return an error state for production systems


# --- Main execution block for training, evaluation, and inference demonstration ---
if __name__ == '__main__':
    # Train, tune and evaluate everything end to end.
    classifier = SpamClassifier()
    pipeline_success = classifier.run_pipeline()

    # Bail out first so the success path below reads without nesting.
    if not pipeline_success:
        logging.critical("Spam classification pipeline failed during execution. Please review the log file for details.")
        sys.exit(1)

    logging.info("\n=== Spam Classification Pipeline Completed Successfully ===")
    logging.info("Overall Model Performance Summary (Sorted by F1-Score on Spam):")
    summary_columns = ['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']
    print(classifier.performance_df[summary_columns].to_string())

    logging.info(f"\nBest Performing Model Identified: {classifier.best_model_name}")
    logging.info(f"Check '{Config.PLOTS_DIR}' for EDA and Confusion Matrix plots.")
    logging.info(f"Check '{Config.MODELS_DIR}' for the saved best model.")

    # --- Inference demo: reload the newest saved model and classify sample messages ---
    try:
        logging.info("\n--- Demonstrating Model Inference from Saved Model ---")
        # The newest file is picked for the demo only; a deployment would name its model file explicitly.
        model_files = [f for f in os.listdir(Config.MODELS_DIR) if f.startswith('best_model_') and f.endswith('.pkl')]
        if not model_files:
            logging.warning("No model files found in the 'models' directory to demonstrate inference. Run the pipeline first.")
        else:
            # Most recent modification time wins.
            latest_model_file = max(model_files, key=lambda f: os.path.getmtime(os.path.join(Config.MODELS_DIR, f)))
            latest_model_path = os.path.join(Config.MODELS_DIR, latest_model_file)

            logging.info(f"Attempting to load the latest best model from: {latest_model_path}")
            loaded_classifier = SpamClassifier.load_for_inference(latest_model_path)

            # Sample messages
            test_spam_text_1 = "WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply."
            test_spam_text_2 = "URGENT! Your bank account has been locked due to suspicious activity. Verify immediately at http://bit.ly/malicious-site to avoid closure."
            test_ham_text_1 = "Hey, just checking in. How are you doing today? Let's catch up soon for coffee!"
            test_ham_text_2 = "Hi mom, can you pick up milk and bread on your way home? Thanks, love you!"
            test_empty_text = "???!!!#@%"  # Punctuation-only input

            demo_cases = [
                ("SPAM text 1", test_spam_text_1),
                ("SPAM text 2", test_spam_text_2),
                ("HAM text 1", test_ham_text_1),
                ("HAM text 2", test_ham_text_2),
                ("EMPTY/NOISY text", test_empty_text),
            ]
            for case_position, (case_label, case_text) in enumerate(demo_cases):
                # Only the first line is preceded by a blank line.
                lead = "\n" if case_position == 0 else ""
                print(f"{lead}Prediction for {case_label}: '{case_text}' -> {loaded_classifier.predict(case_text)}")

    except Exception as e:
        logging.error(f"An error occurred during the inference demonstration: {e}")
        sys.exit(1)  # Indicate an error in the inference demo phase

In [1]:
# Spam Classification Pipeline with EDA, Text Preprocessing, and Model Training
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import Counter
from wordcloud import WordCloud
import pickle
import warnings
import logging
import os
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             BaggingClassifier, ExtraTreesClassifier,
                             GradientBoostingClassifier, VotingClassifier,
                             StackingClassifier)
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna
from datetime import datetime

# --- Determine Base Directory for Notebook/Script ---
# `__file__` exists when this runs as a script and is undefined in a notebook
# kernel, where the current working directory is used instead.
try:
    _script_path = __file__
except NameError:
    base_directory = os.getcwd()
    print(f"Running in a notebook environment. Base directory set to CWD: '{base_directory}'")
else:
    current_script_dir = os.path.dirname(os.path.abspath(_script_path))
    base_directory = current_script_dir
    print(f"Running as a script. Base directory set to: '{base_directory}'")


# --- Configuration (Externalize for production) ---
class Config:
    """Central settings of the spam-classification pipeline.

    Every filesystem location is resolved relative to ``base_directory``.
    """

    # --- Reproducibility and experiment sizing ---
    RANDOM_STATE = 42       # Seed handed to estimators, samplers and resamplers
    TEST_SIZE = 0.2         # Presumably the held-out fraction used by split_data -- confirm there
    N_TRIALS_OPTUNA = 15    # Number of Optuna trials per tuned model

    # --- Embedding model ---
    # Presumably the model name given to SentenceTransformer -- confirm at the vectorization step.
    SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'

    # --- Input and output locations ---
    DATA_PATH = os.path.join(base_directory, 'spam.csv')
    LOG_FILE = os.path.join(base_directory, 'spam_classifier.log')
    PLOTS_DIR = os.path.join(base_directory, 'plots')
    MODELS_DIR = os.path.join(base_directory, 'models')

# Ensure plot and model directories exist at startup
for _output_dir in (Config.PLOTS_DIR, Config.MODELS_DIR):
    # exist_ok keeps repeated runs (or re-executed notebook cells) from failing.
    os.makedirs(_output_dir, exist_ok=True)


class SpamClassifier:
    def __init__(self):
        """Set up logging, NLTK data and plotting, then create empty pipeline state.

        All data holders start as ``None``; the classifier registry is filled by
        ``_initialize_classifiers``.
        """
        # --- Process-wide setup (logging first so the later steps can log) ---
        self._configure_logging()
        self._verify_nltk_resources()
        self._configure_matplotlib()

        # --- Data holders, filled in by the pipeline steps ---
        self.df = None
        self.X = None
        self.y = None
        self.X_train = self.X_test = self.y_train = self.y_test = None

        # --- Text and label tooling ---
        self.encoder = LabelEncoder()
        self.ps = PorterStemmer()
        self.sentence_transformer_model = None

        # --- Models and their results ---
        self.clfs = {}
        self.best_tuned_models_params = {}
        self.best_model = None
        self.best_model_name = None
        self.performance_df = pd.DataFrame()

        self._initialize_classifiers()
        logging.info("SpamClassifier initialized successfully.")

    def _configure_logging(self) -> None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(Config.LOG_FILE),
                logging.StreamHandler(sys.stdout)
            ]
        )
        warnings.filterwarnings('ignore')

    def _verify_nltk_resources(self) -> None:
        """Make sure the NLTK data packages used by this class are available.

        Each resource is looked up locally first and downloaded when missing.
        ``nltk.download`` reports a failed download by returning ``False`` rather
        than raising, so its result is checked before success is logged. A failed
        download is logged as an error and the check moves on to the next
        resource, as it did before; only an exception raised by the download call
        stops the process with status 1.
        """
        resources = [
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt_tab', 'punkt_tab')
        ]
        for path, package in resources:
            try:
                nltk.data.find(path)
            except LookupError:
                logging.warning(f"NLTK {package} not found. Attempting to download...")
            else:
                logging.info(f"NLTK {package} resource found.")
                continue

            try:
                downloaded = nltk.download(package, quiet=True)
            except Exception as e:
                logging.critical(f"Failed to download NLTK {package}. Error: {e}")
                sys.exit(1)

            if downloaded:
                logging.info(f"NLTK {package} downloaded successfully.")
            else:
                # Do not claim success: the downloader signalled failure via its return value.
                logging.error(f"NLTK {package} download did not complete (nltk.download returned False).")

    def _configure_matplotlib(self) -> None:
        """Turn off pyplot's interactive mode and apply the seaborn theme for plots."""
        # Interactive mode off: figures are not drawn as a side effect of plotting calls.
        plt.ioff()
        # Global seaborn style and colour palette.
        sns.set(style='whitegrid', palette='viridis')

    def _initialize_classifiers(self) -> None:
        """Populate ``self.clfs`` with the untrained estimators, keyed by short name.

        Insertion order is kept deliberately: other steps iterate this mapping.
        """
        seed = Config.RANDOM_STATE
        n_trees = 100

        registry = {}
        registry['LR'] = LogisticRegression(
            solver='liblinear', penalty='l1', max_iter=1000,
            class_weight='balanced', random_state=seed,
        )
        registry['RF'] = RandomForestClassifier(
            n_estimators=n_trees, class_weight='balanced', n_jobs=-1, random_state=seed,
        )
        registry['XGB'] = XGBClassifier(
            n_estimators=n_trees, eval_metric='logloss', scale_pos_weight=1, random_state=seed,
        )
        registry['SVC'] = SVC(
            kernel='sigmoid', gamma=1.0, probability=True,
            class_weight='balanced', random_state=seed,
        )
        registry['KN'] = KNeighborsClassifier()
        registry['AdaBoost'] = AdaBoostClassifier(n_estimators=n_trees, random_state=seed)
        registry['BgC'] = BaggingClassifier(n_estimators=n_trees, n_jobs=-1, random_state=seed)
        registry['ETC'] = ExtraTreesClassifier(
            n_estimators=n_trees, class_weight='balanced', n_jobs=-1, random_state=seed,
        )
        registry['GBDT'] = GradientBoostingClassifier(n_estimators=n_trees, random_state=seed)
        registry['DT'] = DecisionTreeClassifier(
            max_depth=5, class_weight='balanced', random_state=seed,
        )

        self.clfs = registry

    def load_data(self) -> None:
        """Read the CSV at ``Config.DATA_PATH`` into ``self.df``.

        The file is decoded as latin-1 and must hold at least 100 rows. A missing
        file, a too-small dataset or any read error is logged as critical and the
        process exits with status 1.
        """
        try:
            csv_path = Config.DATA_PATH
            if not os.path.exists(csv_path):
                raise FileNotFoundError(f"Data file not found at {os.path.abspath(csv_path)}")

            self.df = pd.read_csv(csv_path, encoding='latin-1')

            row_count = len(self.df)
            if row_count < 100:
                raise ValueError(f"Dataset too small ({row_count} samples). Minimum 100 samples required for robust analysis.")
            logging.info(f"Loaded {row_count} records from {csv_path}.")
        except Exception as e:
            logging.critical(f"Data loading failed: {e}")
            sys.exit(1)

    def clean_data(self) -> None:
        """Reduce ``self.df`` to encoded ``target`` and ``text`` columns and drop bad rows.

        Columns 'v1'/'v2' are used when present; otherwise the first column whose
        name contains a label-like or text-like keyword is taken. Rows whose target
        is neither 'ham' nor 'spam' are removed, the target is encoded with
        ``self.encoder``, and duplicate or null rows are dropped. Any failure is
        logged as critical and the process exits with status 1.
        """
        label_hints = ('target', 'label', 'type')
        text_hints = ('text', 'message', 'sms')
        try:
            available = self.df.columns
            if 'v1' in available and 'v2' in available:
                self.df = self.df[['v1', 'v2']].copy()
                logging.info("Selected 'v1' and 'v2' columns from the dataset.")
            else:
                # Fall back to the first column name containing one of the hint words.
                found_v1 = next(
                    (col for col in available if any(hint in col.lower() for hint in label_hints)),
                    None,
                )
                found_v2 = next(
                    (col for col in available if any(hint in col.lower() for hint in text_hints)),
                    None,
                )
                if not (found_v1 and found_v2):
                    raise ValueError(f"Could not find required 'target' and 'text' columns (v1/v2 or equivalents) in dataset. Found columns: {self.df.columns.tolist()}")
                self.df = self.df[[found_v1, found_v2]].copy()
                logging.info(f"Mapped columns '{found_v1}' to 'target' and '{found_v2}' to 'text' using heuristics.")

            self.df.columns = ['target', 'text']

            # Keep only rows labelled with one of the two known classes.
            valid_targets = {'ham', 'spam'}
            invalid_targets = set(self.df['target'].unique()) - valid_targets
            if invalid_targets:
                logging.warning(f"Invalid target values found: {invalid_targets}. Filtering out rows with these values.")
                keep_mask = self.df['target'].isin(valid_targets)
                self.df = self.df[keep_mask]
                if self.df.empty:
                    raise ValueError("No valid 'ham' or 'spam' records remaining after filtering invalid targets. Dataset is empty.")

            self.df['target'] = self.encoder.fit_transform(self.df['target'])

            initial_rows = len(self.df)
            self.df.drop_duplicates(inplace=True)
            self.df.dropna(inplace=True)
            removed_rows = initial_rows - len(self.df)

            logging.info(f"Cleaned dataset. Removed {removed_rows} duplicates/nulls. Remaining: {len(self.df)} records.")
            if self.df.empty:
                raise ValueError("Dataset became empty after cleaning steps. Check data quality or initial loading.")
        except Exception as e:
            logging.critical(f"Data cleaning failed: {e}")
            sys.exit(1)

    def _safe_tokenize(self, text: str) -> list[str]:
        """Return the lower-cased alphanumeric tokens of ``text``.

        Non-string input is converted with ``str()``. If tokenization raises, a
        warning is logged and an empty list is returned.
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for tokenization: {text[:50]}...")
        try:
            kept_tokens = []
            for candidate in nltk.word_tokenize(text.lower()):
                if candidate.isalnum() and candidate not in string.punctuation:
                    kept_tokens.append(candidate)
            return kept_tokens
        except Exception as e:
            logging.warning(f"Tokenization failed for text (first 50 chars: '{text[:50]}...'). Returning empty list. Error: {e}")
            return []

    def eda(self) -> None:
        """Add length features to ``self.df``, set XGBoost's class weight and save EDA plots.

        Steps:
          * adds ``num_words``, ``num_chars`` and ``num_sentences`` columns;
          * sets ``scale_pos_weight`` of the 'XGB' classifier to the ham/spam ratio
            (left unchanged when there are no spam rows);
          * saves the class-distribution pie, the word/character count histograms
            and the correlation heatmap under ``Config.PLOTS_DIR``;
          * logs descriptive statistics per class.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            self.df['num_words'] = self.df['text'].apply(lambda x: len(self._safe_tokenize(x)))
            self.df['num_chars'] = self.df['text'].apply(len)
            self.df['num_sentences'] = self.df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))

            # Encode the two labels and split the frame once; every plot and
            # statistic below reuses these instead of re-encoding and re-filtering.
            ham_label = self.encoder.transform(['ham'])[0]
            spam_label = self.encoder.transform(['spam'])[0]
            ham_df = self.df[self.df['target'] == ham_label]
            spam_df = self.df[self.df['target'] == spam_label]
            stat_columns = ['num_chars', 'num_words', 'num_sentences']

            ham_count = ham_df.shape[0]
            spam_count = spam_df.shape[0]
            if spam_count > 0:
                scale_pos_weight_val = ham_count / spam_count
                self.clfs['XGB'].set_params(scale_pos_weight=scale_pos_weight_val)
                logging.info(f"Set XGBoost scale_pos_weight to: {scale_pos_weight_val:.2f} (Ham:{ham_count}, Spam:{spam_count})")
            else:
                logging.warning("No spam samples found to calculate scale_pos_weight for XGBoost. Defaulting to 1.")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # --- Class distribution pie ---
            class_counts = self.df['target'].value_counts()
            fig1, ax1 = plt.subplots(figsize=(8, 8))
            class_counts.plot(
                kind='pie', ax=ax1, autopct='%1.1f%%',
                labels=self.encoder.inverse_transform(class_counts.index),
                colors=sns.color_palette('pastel')[0:2],
                explode=[0, 0.1]  # One offset per wedge; written for exactly two classes
            )
            ax1.set_title('Target Class Distribution')
            ax1.set_ylabel('')
            fig1_filename = os.path.join(Config.PLOTS_DIR, f'target_distribution_{timestamp}.png')
            fig1.savefig(fig1_filename, bbox_inches='tight')
            plt.close(fig1)
            logging.info(f"Target distribution plot saved to {fig1_filename}.")

            # --- Word count histogram per class ---
            fig2, ax2 = plt.subplots(figsize=(14, 6))
            sns.histplot(data=ham_df, x='num_words', ax=ax2, bins=50, kde=True, color='blue', label='Ham')
            sns.histplot(data=spam_df, x='num_words', ax=ax2, bins=50, kde=True, color='red', label='Spam')
            ax2.set_title('Word Count Distribution by Target Class')
            ax2.set_xlabel('Number of Words')
            ax2.set_ylabel('Count')
            ax2.legend()
            fig2_filename = os.path.join(Config.PLOTS_DIR, f'word_count_distribution_{timestamp}.png')
            fig2.savefig(fig2_filename, bbox_inches='tight')
            plt.close(fig2)
            logging.info(f"Word count distribution plot saved to {fig2_filename}.")

            # --- Character count histogram per class ---
            fig3, ax3 = plt.subplots(figsize=(14, 6))
            sns.histplot(data=ham_df, x='num_chars', ax=ax3, bins=50, kde=True, color='blue', label='Ham')
            sns.histplot(data=spam_df, x='num_chars', ax=ax3, bins=50, kde=True, color='red', label='Spam')
            ax3.set_title('Character Count Distribution by Target Class')
            ax3.set_xlabel('Number of Characters')
            ax3.set_ylabel('Count')
            ax3.legend()
            fig3_filename = os.path.join(Config.PLOTS_DIR, f'char_count_distribution_{timestamp}.png')
            fig3.savefig(fig3_filename, bbox_inches='tight')
            plt.close(fig3)
            logging.info(f"Character count distribution plot saved to {fig3_filename}.")

            # --- Correlation between the length features and the target ---
            fig4, ax4 = plt.subplots(figsize=(8, 6))
            sns.heatmap(self.df[stat_columns + ['target']].corr(), annot=True, cmap='coolwarm', ax=ax4)
            ax4.set_title('Correlation Matrix of Text Features and Target')
            fig4_filename = os.path.join(Config.PLOTS_DIR, f'correlation_heatmap_{timestamp}.png')
            fig4.savefig(fig4_filename, bbox_inches='tight')
            plt.close(fig4)
            logging.info(f"Correlation heatmap plot saved to {fig4_filename}.")

            logging.info(f"Descriptive statistics for Ham emails:\n{ham_df[stat_columns].describe()}")
            logging.info(f"Descriptive statistics for Spam emails:\n{spam_df[stat_columns].describe()}")

        except Exception as e:
            logging.error(f"EDA process failed: {e}")
            raise

    def transform_text(self, text: str) -> str:
        """Normalize raw message text into a space-separated string of stems.

        The text is lower-cased and tokenized with NLTK, reduced to purely
        alphanumeric tokens, stripped of English stop words, stemmed with
        ``self.ps`` and finally filtered so that single-character stems
        survive only when they are digits.

        Args:
            text: Message to normalize. Non-string values are coerced with
                ``str()``.

        Returns:
            The surviving stems joined by single spaces (possibly empty).
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for transform_text: {text[:50]}...")

        # This method is applied to every row of the DataFrame, so load the
        # stop word list once per instance instead of on every call.
        stop_words = getattr(self, '_stop_words_cache', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            self._stop_words_cache = stop_words

        # isalnum() already rejects every punctuation token, so a separate
        # string.punctuation test after it can never filter anything.
        kept_tokens = [
            token for token in nltk.word_tokenize(text.lower())
            if token.isalnum() and token not in stop_words
        ]
        stems = (self.ps.stem(token) for token in kept_tokens)
        return " ".join(stem for stem in stems if len(stem) > 1 or stem.isdigit())

    def preprocess_text(self) -> None:
        """Build the ``transformed_text`` column and save text-based EDA plots.

        Applies ``transform_text`` to every row of ``self.df['text']``, then
        writes one word cloud and one most-common-words bar chart per class
        (spam first, then ham) into ``Config.PLOTS_DIR``. Any failure is
        logged as critical and ends the process via ``sys.exit(1)``.
        """
        try:
            logging.info("\n--- Text Preprocessing for EDA and Visualizations ---")
            self.df['transformed_text'] = self.df['text'].apply(self.transform_text)
            logging.info("Text transformation for EDA complete. Example:")
            preview = self.df[['text', 'transformed_text']].head().to_string()
            logging.info(f"\n{preview}")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            class_labels = ('spam', 'ham')

            logging.info("\nGenerating Word Clouds (saved to plots directory):")
            for label in class_labels:
                display = label.capitalize()
                cloud_maker = WordCloud(width=800, height=400, min_font_size=10, background_color='white')
                class_rows = self.df[self.df['target'] == self.encoder.transform([label])[0]]
                cloud = cloud_maker.generate(class_rows['transformed_text'].str.cat(sep=" "))
                plt.figure(figsize=(10, 5))
                plt.imshow(cloud)
                plt.title(f'{display} Word Cloud')
                plt.axis('off')
                cloud_path = os.path.join(Config.PLOTS_DIR, f'{label}_wordcloud_{timestamp}.png')
                plt.savefig(cloud_path, bbox_inches='tight')
                plt.close()
                logging.info(f"{display} word cloud saved to {cloud_path}.")

            # The bar charts come after both clouds so the log keeps its order.
            for label in class_labels:
                display = label.capitalize()
                logging.info(f"\nMost common words in {display} (saved as plot):")
                class_rows = self.df[self.df['target'] == self.encoder.transform([label])[0]]
                corpus = ' '.join(class_rows['transformed_text']).split()
                self._plot_most_common_words(
                    corpus,
                    title=f'Top 30 {display} Words',
                    filename=f'top_{label}_words_{timestamp}.png',
                )

        except Exception as e:
            logging.critical(f"Text preprocessing for EDA failed: {e}")
            sys.exit(1)

    def _plot_most_common_words(self, corpus: list[str], title: str, n: int = 30, filename: str = "common_words.png") -> None:
        """Save a bar chart of the ``n`` most frequent tokens in ``corpus``.

        Args:
            corpus: Flat list of tokens to count.
            title: Title drawn on the chart and echoed in the log message.
            n: How many of the most frequent tokens to plot.
            filename: Image file name, created inside ``Config.PLOTS_DIR``.
        """
        frequency_table = pd.DataFrame(Counter(corpus).most_common(n), columns=['Word', 'Count'])

        figure, axes = plt.subplots(figsize=(12, 6))
        sns.barplot(data=frequency_table, x='Word', y='Count', palette='viridis', ax=axes)
        axes.set_xticklabels(axes.get_xticklabels(), rotation='vertical')
        axes.set_title(title)

        destination = os.path.join(Config.PLOTS_DIR, filename)
        plt.savefig(destination, bbox_inches='tight')
        plt.close(figure)
        logging.info(f"Plot '{title}' saved to {destination}.")

    def vectorize_text_with_embeddings(self) -> None:
        """Encode ``self.df['text']`` into embeddings and set ``self.X`` / ``self.y``.

        The SentenceTransformer named by ``Config.SENTENCE_TRANSFORMER_MODEL``
        is created on first use and kept on the instance. ``self.y`` is taken
        from the ``target`` column. Any failure is logged as critical and ends
        the process via ``sys.exit(1)``.
        """
        try:
            logging.info(f"\n--- Text Vectorization (SentenceTransformer: {Config.SENTENCE_TRANSFORMER_MODEL}) ---")

            embedder = self.sentence_transformer_model
            if embedder is None:
                embedder = SentenceTransformer(Config.SENTENCE_TRANSFORMER_MODEL)
                self.sentence_transformer_model = embedder

            messages = self.df['text'].tolist()
            self.X = embedder.encode(
                messages,
                show_progress_bar=True,
                convert_to_tensor=False,
                batch_size=64,
            )
            self.y = self.df['target'].values
            logging.info(f"SentenceTransformer embedding complete. X shape: {self.X.shape}, Y shape: {self.y.shape}.")
        except Exception as e:
            logging.critical(f"Text vectorization failed: {e}")
            sys.exit(1)

    def split_data(self) -> None:
        """Create stratified train/test partitions of ``self.X`` and ``self.y``.

        The split uses ``Config.TEST_SIZE`` and ``Config.RANDOM_STATE`` and is
        stratified on ``self.y``. The per-class counts of both partitions are
        logged. Any failure is logged as critical and ends the process via
        ``sys.exit(1)``.
        """
        try:
            partitions = train_test_split(
                self.X,
                self.y,
                test_size=Config.TEST_SIZE,
                random_state=Config.RANDOM_STATE,
                stratify=self.y,
            )
            self.X_train, self.X_test, self.y_train, self.y_test = partitions

            train_count, test_count = len(self.X_train), len(self.X_test)
            logging.info(f"Data split: Train {train_count} samples, Test {test_count} samples.")
            for split_name, labels in (('Train', self.y_train), ('Test', self.y_test)):
                logging.info(f"{split_name} target distribution: {np.bincount(labels)}")
        except Exception as e:
            logging.critical(f"Data splitting failed: {e}")
            sys.exit(1)

    def _objective(self, trial: optuna.trial.Trial, model_name: str) -> float:
        """Score one Optuna trial for ``model_name`` by cross-validated F1.

        Samples hyperparameters from ``trial``, builds the matching
        classifier, places it behind SMOTE in an imbalanced-learn pipeline and
        evaluates it with 5-fold stratified cross-validation on
        ``self.X_train`` / ``self.y_train``.

        Float parameters are drawn with ``trial.suggest_float`` (``log=True``
        for the log-scaled ranges) rather than ``suggest_loguniform`` /
        ``suggest_uniform``, which Optuna has deprecated. Parameter names and
        ranges are unchanged, so ``study.best_trial.params`` keeps the same
        keys.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            model_name: Key of the model family to tune; one of ``'LR'``,
                ``'RF'``, ``'XGB'``, ``'SVC'``, ``'KN'``, ``'AdaBoost'``,
                ``'BgC'``, ``'ETC'``, ``'GBDT'`` or ``'DT'``.

        Returns:
            Mean of the per-fold scores computed with ``scoring='f1'``.

        Raises:
            ValueError: If ``model_name`` has no search space defined here.
        """
        if model_name == 'LR':
            c_param = trial.suggest_float('C', 1e-4, 1e2, log=True)
            solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
            model = LogisticRegression(C=c_param, solver=solver, random_state=Config.RANDOM_STATE,
                                       class_weight='balanced', max_iter=2000,
                                       n_jobs=-1 if solver == 'saga' else None)
        elif model_name == 'RF':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'XGB':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 3, 12)
            learning_rate = trial.suggest_float('learning_rate', 0.005, 0.5, log=True)
            subsample = trial.suggest_float('subsample', 0.6, 1.0)
            colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
            gamma = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
            # Reuse whatever class weighting the pre-built XGB classifier carries.
            current_scale_pos_weight = self.clfs['XGB'].get_params().get('scale_pos_weight', 1)
            model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  learning_rate=learning_rate, subsample=subsample,
                                  colsample_bytree=colsample_bytree, gamma=gamma,
                                  random_state=Config.RANDOM_STATE,
                                  eval_metric='logloss',
                                  scale_pos_weight=current_scale_pos_weight)
        elif model_name == 'SVC':
            C_param = trial.suggest_float('C', 1e-2, 1e2, log=True)
            gamma_param = trial.suggest_float('gamma', 1e-3, 1e1, log=True)
            kernel = trial.suggest_categorical('kernel', ['rbf', 'sigmoid'])
            model = SVC(C=C_param, gamma=gamma_param, kernel=kernel, probability=True,
                        random_state=Config.RANDOM_STATE, class_weight='balanced')
        elif model_name == 'KN':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, n_jobs=-1)
        elif model_name == 'AdaBoost':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
            model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=Config.RANDOM_STATE)
        elif model_name == 'BgC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            model = BaggingClassifier(n_estimators=n_estimators, random_state=Config.RANDOM_STATE, n_jobs=-1)
        elif model_name == 'ETC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'GBDT':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
            max_depth = trial.suggest_int('max_depth', 3, 10)
            model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=Config.RANDOM_STATE)
        elif model_name == 'DT':
            max_depth = trial.suggest_int('max_depth', 3, 20)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf, criterion=criterion,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced')
        else:
            raise ValueError(f"Model '{model_name}' is not configured for Optuna tuning.")

        # SMOTE sits inside the pipeline, so cross_val_score refits it
        # together with the classifier for every fold.
        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=Config.RANDOM_STATE)),
            ('classifier', model)
        ])
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.RANDOM_STATE)
        scores = cross_val_score(pipeline, self.X_train, self.y_train, cv=cv, scoring='f1', n_jobs=-1)
        return scores.mean()

    def tune_models(self) -> None:
        """Tune a fixed set of classifiers with Optuna and apply the best parameters.

        For each of ``'LR'``, ``'RF'``, ``'XGB'``, ``'SVC'`` and ``'ETC'`` that
        exists in ``self.clfs``, a seeded TPE study maximizes ``_objective``
        for ``Config.N_TRIALS_OPTUNA`` trials. The winning parameters are
        recorded in ``self.best_tuned_models_params`` and set on the
        classifier. ``split_data()`` is called first if no training split
        exists yet. Any failure is logged as critical and ends the process via
        ``sys.exit(1)``.
        """
        try:
            if self.X_train is None or self.y_train is None:
                logging.error("Data not split for tuning. Calling split_data().")
                self.split_data()

            logging.info("Starting hyperparameter tuning with Optuna for selected models...")

            for name in ('LR', 'RF', 'XGB', 'SVC', 'ETC'):
                if name not in self.clfs:
                    logging.warning(f"Model '{name}' not found in initialized classifiers, skipping tuning.")
                    continue

                logging.info(f"Tuning {name} model with {Config.N_TRIALS_OPTUNA} trials...")
                study = optuna.create_study(
                    direction='maximize',
                    sampler=optuna.samplers.TPESampler(seed=Config.RANDOM_STATE),
                    study_name=f"{name}_tuning_study",
                )

                def run_trial(trial, model_key=name):
                    return self._objective(trial, model_key)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    study.optimize(
                        run_trial,
                        n_trials=Config.N_TRIALS_OPTUNA,
                        show_progress_bar=True,
                        gc_after_trial=True,
                    )

                winner = study.best_trial
                self.best_tuned_models_params[name] = winner.params
                logging.info(f"Best parameters for {name}: {winner.params}")
                logging.info(f"Best cross-validated F1-score for {name}: {winner.value:.4f}")

                tuned_clf = self.clfs[name]
                tuned_clf.set_params(**winner.params)
                if name == 'XGB':
                    # Reads the classifier's current scale_pos_weight and
                    # writes that same value back.
                    current_scale_pos_weight = tuned_clf.get_params().get('scale_pos_weight', 1)
                    tuned_clf.set_params(scale_pos_weight=current_scale_pos_weight)

            logging.info("Hyperparameter tuning completed for all selected models.")
        except Exception as e:
            logging.critical(f"Model tuning failed: {e}")
            sys.exit(1)

    def train_final_models(self) -> None:
        """Fit, evaluate and rank every classifier in ``self.clfs``.

        Each classifier is wrapped in a SMOTE pipeline, fitted on the training
        split and scored on the test split (accuracy plus spam-class
        precision, recall and F1). A confusion-matrix image is written per
        model, and the pipeline with the highest spam F1 is kept as
        ``self.best_model``. Results are stored in ``self.performance_df``
        sorted by spam F1, after which the best model is saved and the
        comparison chart is drawn.

        A failure of a single model is logged and recorded as a row of NaN
        metrics. Any other failure is logged as critical and ends the process
        via ``sys.exit(1)``.
        """
        try:
            if self.X_train is None or self.X_test is None:
                 logging.error("Data not split for final training. Calling split_data().")
                 self.split_data()

            logging.info("Applying SMOTE to the entire training data for final model training...")
            # This resampled copy only feeds the size report below; every
            # model is fitted through its own SMOTE pipeline further down.
            size_probe = SMOTE(random_state=Config.RANDOM_STATE)
            resampled_X, _ = size_probe.fit_resample(self.X_train, self.y_train)
            logging.info(f"SMOTE applied. Original train: {len(self.X_train)} samples. Resampled train: {len(resampled_X)} samples.")

            metric_rows = []
            top_f1 = -1
            self.best_model = None
            self.best_model_name = None
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            for name, model in self.clfs.items():
                logging.info(f"Training final {name} model on resampled data and evaluating...")
                try:
                    # The sampler and the classifier travel together as one estimator.
                    pipeline = ImbPipeline([('smote', SMOTE(random_state=Config.RANDOM_STATE)), ('classifier', model)])
                    y_pred = pipeline.fit(self.X_train, self.y_train).predict(self.X_test)

                    accuracy = accuracy_score(self.y_test, y_pred)
                    spam_label = self.encoder.transform(['spam'])[0]
                    precision = precision_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    recall = recall_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    f1 = f1_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)

                    class_names = self.encoder.classes_
                    report_dict = classification_report(self.y_test, y_pred, target_names=class_names, output_dict=True)

                    metric_rows.append({
                        'Model': name,
                        'Accuracy': accuracy,
                        'Precision (Spam)': precision,
                        'Recall (Spam)': recall,
                        'F1-Score (Spam)': f1,
                        'Full Classification Report': report_dict,
                    })

                    logging.info(f"\n--- Performance for {name} ---")
                    logging.info(f"Accuracy: {accuracy:.4f}")
                    logging.info(f"Precision (Spam): {precision:.4f}")
                    logging.info(f"Recall (Spam): {recall:.4f}")
                    logging.info(f"F1-Score (Spam): {f1:.4f}")
                    report_text = classification_report(self.y_test, y_pred, target_names=class_names)
                    logging.info(f"\nFull Classification Report for {name}:\n{report_text}")

                    cm = confusion_matrix(self.y_test, y_pred)
                    logging.info(f"\nRaw Confusion Matrix for {name}:\n{cm}")

                    cm_figure, cm_axes = plt.subplots(figsize=(7, 6))
                    sns.heatmap(
                        cm,
                        annot=True,
                        fmt='d',
                        cmap='Blues',
                        cbar=False,
                        xticklabels=class_names,
                        yticklabels=class_names,
                        linecolor='gray',
                        linewidths=0.5,
                        annot_kws={"size": 14},
                    )
                    cm_axes.set_xlabel('Predicted Label', fontsize=12)
                    cm_axes.set_ylabel('True Label', fontsize=12)
                    cm_axes.set_title(f'Confusion Matrix for {name}', fontsize=14)
                    cm_path = os.path.join(Config.PLOTS_DIR, f'confusion_matrix_{name}_{timestamp}.png')
                    plt.savefig(cm_path, bbox_inches='tight')
                    plt.close(cm_figure)
                    logging.info(f"Confusion matrix plot for {name} saved to {cm_path}.")

                    if f1 > top_f1:
                        top_f1 = f1
                        self.best_model_name = name
                        # Keep the whole pipeline, not just the classifier step.
                        self.best_model = pipeline
                except Exception as model_e:
                    logging.error(f"Error training or evaluating model {name}: {model_e}")
                    failed_row = {'Model': name}
                    for column in ('Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)'):
                        failed_row[column] = np.nan
                    failed_row['Full Classification Report'] = {'error': str(model_e)}
                    metric_rows.append(failed_row)

            ranked = pd.DataFrame(metric_rows).sort_values(by='F1-Score (Spam)', ascending=False)
            self.performance_df = ranked.reset_index(drop=True)
            logging.info(f"\n--- Overall Best Model Identified: {self.best_model_name} (F1-Score on Spam: {top_f1:.4f}) ---")
            logging.info("All model evaluations completed.")
            self._save_best_model()
            self._plot_performance_comparison(timestamp)

        except Exception as e:
            logging.critical(f"Final model training and evaluation failed: {e}")
            sys.exit(1)

    def _save_best_model(self) -> None:
        """Saves the best performing model and related components to a pickle file."""
        try:
            if self.best_model is None or self.best_model_name is None:
                logging.warning("No best model identified or stored. Skipping model save operation.")
                return
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = os.path.join(Config.MODELS_DIR, f'best_model_{self.best_model_name}_{timestamp}.pkl')
            with open(model_filename, 'wb') as f:
                pickle.dump({
                    'model': self.best_model,
                    'transformer': Config.SENTENCE_TRANSFORMER_MODEL, # THIS IS THE FIX
                    'encoder': self.encoder,
                    'model_name': self.best_model_name,
                    'performance_summary': self.performance_df.to_dict('records')
                }, f)
            logging.info(f"Best performing model ({self.best_model_name}) saved to {model_filename}")
        except Exception as e:
            logging.error(f"Failed to save the best model: {e}")

    def _plot_performance_comparison(self, timestamp: str) -> None:
        if self.performance_df.empty:
            logging.warning("Performance DataFrame is empty, cannot plot comparison.")
            return
        plot_df = self.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].copy()
        plot_df_melted = plot_df.melt(id_vars="Model", var_name="Metric", value_name="Score")
        fig, ax = plt.subplots(figsize=(14, 7))
        sns.barplot(x='Model', y='Score', hue='Metric', data=plot_df_melted, palette='tab10', ax=ax)
        ax.set_ylim(0.5, 1.0)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title('Model Performance Comparison (Test Set)')
        ax.set_ylabel('Score')
        ax.set_xlabel('Model')
        ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plot_filename = os.path.join(Config.PLOTS_DIR, f'model_performance_comparison_{timestamp}.png')
        plt.savefig(plot_filename, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Model performance comparison plot saved to {plot_filename}.")

    def run_pipeline(self) -> bool:
        """Run every pipeline stage in order, stopping at the first failure.

        Returns:
            ``True`` when all stages finish. ``False`` as soon as a stage
            raises ``SystemExit`` or any other ``Exception``; later stages are
            not run.
        """
        stages = (
            ('Data Loading', self.load_data),
            ('Data Cleaning', self.clean_data),
            ('EDA and Feature Engineering', self.eda),
            ('Text Preprocessing for EDA', self.preprocess_text),
            ('Text Vectorization (Embeddings)', self.vectorize_text_with_embeddings),
            ('Data Splitting', self.split_data),
            ('Hyperparameter Tuning', self.tune_models),
            ('Final Model Training & Evaluation', self.train_final_models),
        )

        for label, run_stage in stages:
            try:
                logging.info(f"\n--- Starting Pipeline Step: {label} ---")
                run_stage()
            except SystemExit:
                # Stages signal a fatal error by calling sys.exit(1).
                logging.critical(f"Pipeline stopped due to critical error in step: '{label}'.")
                return False
            except Exception as e:
                logging.critical(f"Pipeline failed unexpectedly in step '{label}': {e}")
                return False
            else:
                logging.info(f"--- Completed Pipeline Step: {label} ---\n")

        logging.info("Spam classification pipeline completed successfully.")
        return True

    @staticmethod
    def load_for_inference(model_path: str) -> 'SpamClassifier':
        """Rebuild a classifier for prediction from a saved pickle file.

        Args:
            model_path: Path of the pickle file holding the ``'model'``,
                ``'encoder'`` and ``'transformer'`` entries (``'model_name'``
                is optional).

        Returns:
            A new ``SpamClassifier`` with the model, encoder, sentence
            transformer and stemmer attached.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
            Exception: Any other loading error is logged as critical and
                re-raised.
        """
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found at {os.path.abspath(model_path)}")

            # NOTE(security): unpickling can execute code stored in the file;
            # only load model files from a trusted source.
            with open(model_path, 'rb') as handle:
                bundle = pickle.load(handle)

            restored = SpamClassifier()
            restored.best_model = bundle['model']
            restored.encoder = bundle['encoder']
            restored.best_model_name = bundle.get('model_name', 'Unknown_Model')

            stored_transformer = bundle['transformer']
            if not isinstance(stored_transformer, str):
                # Anything that is not a model name is used as the
                # transformer object itself.
                logging.warning("Loaded SentenceTransformer object directly from pickle.")
                restored.sentence_transformer_model = stored_transformer
            else:
                logging.info(f"Loading SentenceTransformer by name: '{stored_transformer}'")
                restored.sentence_transformer_model = SentenceTransformer(stored_transformer)

            restored.ps = PorterStemmer()
            logging.info(f"Model '{restored.best_model_name}' loaded successfully from {model_path} for inference.")
            return restored
        except Exception as e:
            logging.critical(f"Failed to load model for inference from {model_path}: {e}")
            raise

    def predict(self, text: str) -> str:
        """Classify a single message and return its decoded label.

        The text is embedded with ``self.sentence_transformer_model``,
        classified by ``self.best_model`` and the encoded prediction is mapped
        back to its label by ``self.encoder``.

        Args:
            text: Message to classify.

        Returns:
            The predicted label, or the string ``"error"`` when embedding or
            prediction fails.

        Raises:
            RuntimeError: If the model, the sentence transformer or the
                encoder has not been loaded.
        """
        if self.best_model is None or self.sentence_transformer_model is None or self.encoder is None:
            logging.error("Model components not loaded. Please run run_pipeline() or load model using load_for_inference() before calling predict().")
            raise RuntimeError("Model components not available for prediction.")
        try:
            vector = self.sentence_transformer_model.encode([text], convert_to_tensor=False)
            prediction_encoded = self.best_model.predict(vector)[0]
            prediction_label = self.encoder.inverse_transform([prediction_encoded])[0]
            return prediction_label
        except Exception as e:
            # str() before slicing: a non-string input (None, a number, ...)
            # must not raise a second error here and bypass the "error" result.
            logging.error(f"Prediction failed for text '{str(text)[:50]}...': {e}")
            return "error"


if __name__ == '__main__':
    # Script entry point: run the full pipeline, print the metric summary and
    # then exercise the newest saved model on a few sample messages.
    classifier = SpamClassifier()
    pipeline_success = classifier.run_pipeline()

    if not pipeline_success:
        logging.critical("Spam classification pipeline failed during execution. Please review the log file for details.")
        sys.exit(1)

    logging.info("\n=== Spam Classification Pipeline Completed Successfully ===")
    logging.info("Overall Model Performance Summary (Sorted by F1-Score on Spam):")
    summary_columns = ['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']
    print(classifier.performance_df[summary_columns].to_string())
    logging.info(f"\nBest Performing Model Identified: {classifier.best_model_name}")
    logging.info(f"Check '{Config.PLOTS_DIR}' for EDA and Confusion Matrix plots.")
    logging.info(f"Check '{Config.MODELS_DIR}' for the saved best model.")

    try:
        logging.info("\n--- Demonstrating Model Inference from Saved Model ---")
        model_files = [
            f for f in os.listdir(Config.MODELS_DIR)
            if f.startswith('best_model_') and f.endswith('.pkl')
        ]
        if not model_files:
            logging.warning("No model files found in the 'models' directory to demonstrate inference. Run the pipeline first.")
        else:
            # The most recently modified pickle is treated as the latest model.
            latest_model_file = max(model_files, key=lambda f: os.path.getmtime(os.path.join(Config.MODELS_DIR, f)))
            latest_model_path = os.path.join(Config.MODELS_DIR, latest_model_file)
            logging.info(f"Attempting to load the latest best model from: {latest_model_path}")
            loaded_classifier = SpamClassifier.load_for_inference(latest_model_path)

            test_spam_text_1 = "WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply."
            test_spam_text_2 = "URGENT! Your bank account has been locked due to suspicious activity. Verify immediately at http://bit.ly/malicious-site to avoid closure."
            test_ham_text_1 = "Hey, just checking in. How are you doing today? Let's catch up soon for coffee!"
            test_ham_text_2 = "Hi mom, can you pick up milk and bread on your way home? Thanks, love you!"
            test_empty_text = "???!!!#@%"

            demo_messages = (
                ("\nPrediction for SPAM text 1", test_spam_text_1),
                ("Prediction for SPAM text 2", test_spam_text_2),
                ("Prediction for HAM text 1", test_ham_text_1),
                ("Prediction for HAM text 2", test_ham_text_2),
                ("Prediction for EMPTY/NOISY text", test_empty_text),
            )
            for heading, message in demo_messages:
                print(f"{heading}: '{message}' -> {loaded_classifier.predict(message)}")
    except Exception as e:
        logging.error(f"An error occurred during the inference demonstration: {e}")
        sys.exit(1)

  from .autonotebook import tqdm as notebook_tqdm


Running in a notebook environment. Base directory set to CWD: '/home/dev/spam_classifier_project'
2025-07-31 19:11:33,663 - INFO - NLTK punkt resource found.
2025-07-31 19:11:33,664 - INFO - NLTK stopwords resource found.
2025-07-31 19:11:33,665 - INFO - NLTK punkt_tab resource found.
2025-07-31 19:11:33,668 - INFO - SpamClassifier initialized successfully.
2025-07-31 19:11:33,669 - INFO - 
--- Starting Pipeline Step: Data Loading ---
2025-07-31 19:11:33,686 - INFO - Loaded 5572 records from /home/dev/spam_classifier_project/spam.csv.
2025-07-31 19:11:33,688 - INFO - --- Completed Pipeline Step: Data Loading ---

2025-07-31 19:11:33,689 - INFO - 
--- Starting Pipeline Step: Data Cleaning ---
2025-07-31 19:11:33,692 - INFO - Selected 'v1' and 'v2' columns from the dataset.
2025-07-31 19:11:33,704 - INFO - Cleaned dataset. Removed 403 duplicates/nulls. Remaining: 5169 records.
2025-07-31 19:11:33,705 - INFO - --- Completed Pipeline Step: Data Cleaning ---

2025-07-31 19:11:33,706 - INFO 

Batches: 100%|██████████| 81/81 [00:38<00:00,  2.08it/s]

2025-07-31 19:12:35,460 - INFO - SentenceTransformer embedding complete. X shape: (5169, 384), Y shape: (5169,).
2025-07-31 19:12:35,461 - INFO - --- Completed Pipeline Step: Text Vectorization (Embeddings) ---

2025-07-31 19:12:35,462 - INFO - 
--- Starting Pipeline Step: Data Splitting ---
2025-07-31 19:12:35,475 - INFO - Data split: Train 4135 samples, Test 1034 samples.
2025-07-31 19:12:35,476 - INFO - Train target distribution: [3613  522]
2025-07-31 19:12:35,478 - INFO - Test target distribution: [903 131]
2025-07-31 19:12:35,479 - INFO - --- Completed Pipeline Step: Data Splitting ---

2025-07-31 19:12:35,481 - INFO - 
--- Starting Pipeline Step: Hyperparameter Tuning ---
2025-07-31 19:12:35,482 - INFO - Starting hyperparameter tuning with Optuna for selected models...
2025-07-31 19:12:35,483 - INFO - Tuning LR model with 15 trials...



[I 2025-07-31 19:12:35,487] A new study created in memory with name: LR_tuning_study
  0%|          | 0/15 [00:04<?, ?it/s]

[I 2025-07-31 19:12:39,583] Trial 0 finished with value: 0.8195201694380092 and parameters: {'C': 0.017670169402947963, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8195201694380092.


Best trial: 0. Best value: 0.81952:   7%|▋         | 1/15 [00:07<01:01,  4.41s/it]

[I 2025-07-31 19:12:43,218] Trial 1 finished with value: 0.9067637144074732 and parameters: {'C': 0.39079671568228835, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9067637144074732.


Best trial: 1. Best value: 0.906764:  13%|█▎        | 2/15 [00:08<00:51,  3.94s/it]

[I 2025-07-31 19:12:44,036] Trial 2 finished with value: 0.5636527533822596 and parameters: {'C': 0.00022310108018679258, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9067637144074732.


Best trial: 1. Best value: 0.906764:  20%|██        | 3/15 [00:11<00:30,  2.55s/it]

[I 2025-07-31 19:12:46,719] Trial 3 finished with value: 0.9252627284490291 and parameters: {'C': 1.7718847354806828, 'solver': 'saga'}. Best is trial 3 with value: 0.9252627284490291.


Best trial: 3. Best value: 0.925263:  27%|██▋       | 4/15 [00:13<00:28,  2.59s/it]

[I 2025-07-31 19:12:48,786] Trial 4 finished with value: 0.9358347576312023 and parameters: {'C': 9.877700294007917, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  33%|███▎      | 5/15 [00:15<00:23,  2.38s/it]

[I 2025-07-31 19:12:51,170] Trial 5 finished with value: 0.7982814551985044 and parameters: {'C': 0.0012601639723276807, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  40%|████      | 6/15 [00:18<00:21,  2.39s/it]

[I 2025-07-31 19:12:53,799] Trial 6 finished with value: 0.8533267901688955 and parameters: {'C': 0.039054412752107935, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  47%|████▋     | 7/15 [00:20<00:19,  2.46s/it]

[I 2025-07-31 19:12:56,423] Trial 7 finished with value: 0.7962869532134003 and parameters: {'C': 0.0006870101665590031, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  53%|█████▎    | 8/15 [00:22<00:17,  2.51s/it]

[I 2025-07-31 19:12:57,560] Trial 8 finished with value: 0.8553585616674987 and parameters: {'C': 0.054502936945582565, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  60%|██████    | 9/15 [00:23<00:12,  2.07s/it]

[I 2025-07-31 19:12:58,679] Trial 9 finished with value: 0.8757717445900015 and parameters: {'C': 0.12173252504194051, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  67%|██████▋   | 10/15 [00:25<00:08,  1.77s/it]

[I 2025-07-31 19:13:01,208] Trial 10 finished with value: 0.9347469338394623 and parameters: {'C': 73.7864208342295, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  73%|███████▎  | 11/15 [00:28<00:08,  2.04s/it]

[I 2025-07-31 19:13:03,799] Trial 11 finished with value: 0.9338781698536784 and parameters: {'C': 65.64817611753449, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  80%|████████  | 12/15 [00:30<00:06,  2.21s/it]

[I 2025-07-31 19:13:06,362] Trial 12 finished with value: 0.9357473287321831 and parameters: {'C': 78.72383571224226, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  87%|████████▋ | 13/15 [00:32<00:04,  2.31s/it]

[I 2025-07-31 19:13:08,255] Trial 13 finished with value: 0.9331448068701527 and parameters: {'C': 8.224108054741553, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  93%|█████████▎| 14/15 [00:34<00:02,  2.20s/it]

[I 2025-07-31 19:13:10,175] Trial 14 finished with value: 0.9331984783268465 and parameters: {'C': 6.611757606926467, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835: 100%|██████████| 15/15 [00:35<00:00,  2.34s/it]

2025-07-31 19:13:10,533 - INFO - Best parameters for LR: {'C': 9.877700294007917, 'solver': 'liblinear'}
2025-07-31 19:13:10,534 - INFO - Best cross-validated F1-score for LR: 0.9358
2025-07-31 19:13:10,536 - INFO - Tuning RF model with 15 trials...



[I 2025-07-31 19:13:10,538] A new study created in memory with name: RF_tuning_study
  0%|          | 0/15 [00:27<?, ?it/s]

[I 2025-07-31 19:13:38,057] Trial 0 finished with value: 0.9105635300372142 and parameters: {'n_estimators': 144, 'max_depth': 36, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9105635300372142.


Best trial: 0. Best value: 0.910564:   7%|▋         | 1/15 [00:38<06:30, 27.89s/it]

[I 2025-07-31 19:13:48,618] Trial 1 finished with value: 0.9127467132964885 and parameters: {'n_estimators': 89, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.9127467132964885.


Best trial: 1. Best value: 0.912747:  13%|█▎        | 2/15 [01:13<03:49, 17.68s/it]

[I 2025-07-31 19:14:23,876] Trial 2 finished with value: 0.9091251939576921 and parameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.9127467132964885.


Best trial: 1. Best value: 0.912747:  20%|██        | 3/15 [01:46<05:08, 25.70s/it]

[I 2025-07-31 19:14:56,692] Trial 3 finished with value: 0.9170959825132309 and parameters: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  27%|██▋       | 4/15 [02:10<05:13, 28.52s/it]

[I 2025-07-31 19:15:20,769] Trial 4 finished with value: 0.9058311877181092 and parameters: {'n_estimators': 126, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  33%|███▎      | 5/15 [02:32<04:28, 26.90s/it]

[I 2025-07-31 19:15:43,129] Trial 5 finished with value: 0.9090006738787435 and parameters: {'n_estimators': 203, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  40%|████      | 6/15 [03:04<03:48, 25.39s/it]

[I 2025-07-31 19:16:14,604] Trial 6 finished with value: 0.9097470973538184 and parameters: {'n_estimators': 164, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  47%|████▋     | 7/15 [03:23<03:39, 27.38s/it]

[I 2025-07-31 19:16:33,826] Trial 7 finished with value: 0.9093436919014714 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  53%|█████▎    | 8/15 [03:35<02:53, 24.75s/it]

[I 2025-07-31 19:16:45,923] Trial 8 finished with value: 0.9052117927047891 and parameters: {'n_estimators': 66, 'max_depth': 36, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  60%|██████    | 9/15 [03:49<02:04, 20.81s/it]

[I 2025-07-31 19:17:00,375] Trial 9 finished with value: 0.9089022907441778 and parameters: {'n_estimators': 126, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  67%|██████▋   | 10/15 [04:35<01:34, 18.84s/it]

[I 2025-07-31 19:17:46,214] Trial 10 finished with value: 0.9116850200695618 and parameters: {'n_estimators': 287, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  73%|███████▎  | 11/15 [05:14<01:48, 27.08s/it]

[I 2025-07-31 19:18:25,532] Trial 11 finished with value: 0.9108888736746182 and parameters: {'n_estimators': 277, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  80%|████████  | 12/15 [05:23<01:32, 30.82s/it]

[I 2025-07-31 19:18:33,825] Trial 12 finished with value: 0.9084614524269989 and parameters: {'n_estimators': 59, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  87%|████████▋ | 13/15 [06:07<00:48, 24.02s/it]

[I 2025-07-31 19:19:17,725] Trial 13 finished with value: 0.911159221076747 and parameters: {'n_estimators': 250, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  93%|█████████▎| 14/15 [06:19<00:29, 29.98s/it]

[I 2025-07-31 19:19:29,836] Trial 14 finished with value: 0.9118333987728944 and parameters: {'n_estimators': 90, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096: 100%|██████████| 15/15 [06:19<00:00, 25.31s/it]

2025-07-31 19:19:30,194 - INFO - Best parameters for RF: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}
2025-07-31 19:19:30,196 - INFO - Best cross-validated F1-score for RF: 0.9171
2025-07-31 19:19:30,197 - INFO - Tuning XGB model with 15 trials...



[I 2025-07-31 19:19:30,199] A new study created in memory with name: XGB_tuning_study
  0%|          | 0/15 [00:22<?, ?it/s]

[I 2025-07-31 19:19:52,357] Trial 0 finished with value: 0.922533456170615 and parameters: {'n_estimators': 144, 'max_depth': 12, 'learning_rate': 0.14553179565665345, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'gamma': 1.7699302940633311e-07}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:   7%|▋         | 1/15 [00:41<05:14, 22.48s/it]

[I 2025-07-31 19:20:11,456] Trial 1 finished with value: 0.8968282232826492 and parameters: {'n_estimators': 64, 'max_depth': 11, 'learning_rate': 0.07965261308120507, 'subsample': 0.8832290311184181, 'colsample_bytree': 0.608233797718321, 'gamma': 0.574485163632042}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  13%|█▎        | 2/15 [01:16<04:26, 20.52s/it]

[I 2025-07-31 19:20:46,564] Trial 2 finished with value: 0.8411553008352888 and parameters: {'n_estimators': 258, 'max_depth': 5, 'learning_rate': 0.011551009439226469, 'subsample': 0.6733618039413735, 'colsample_bytree': 0.7216968971838151, 'gamma': 0.00015777981883364995}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  20%|██        | 3/15 [01:34<05:25, 27.16s/it]

[I 2025-07-31 19:21:04,211] Trial 3 finished with value: 0.924591148223439 and parameters: {'n_estimators': 158, 'max_depth': 5, 'learning_rate': 0.08369042894376064, 'subsample': 0.6557975442608167, 'colsample_bytree': 0.7168578594140873, 'gamma': 8.528933855762793e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  27%|██▋       | 4/15 [02:36<04:17, 23.39s/it]

[I 2025-07-31 19:22:06,410] Trial 4 finished with value: 0.8409469883992775 and parameters: {'n_estimators': 164, 'max_depth': 10, 'learning_rate': 0.01254057843022616, 'subsample': 0.8056937753654446, 'colsample_bytree': 0.836965827544817, 'gamma': 2.3528990899815284e-08}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  33%|███▎      | 5/15 [02:57<06:13, 37.38s/it]

[I 2025-07-31 19:22:27,943] Trial 5 finished with value: 0.6888456060996102 and parameters: {'n_estimators': 202, 'max_depth': 4, 'learning_rate': 0.006746417134006626, 'subsample': 0.9795542149013333, 'colsample_bytree': 0.9862528132298237, 'gamma': 0.02932100047183291}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  40%|████      | 6/15 [03:05<04:48, 32.01s/it]

[I 2025-07-31 19:22:36,198] Trial 6 finished with value: 0.909142632349273 and parameters: {'n_estimators': 126, 'max_depth': 3, 'learning_rate': 0.11679817513130797, 'subsample': 0.7760609974958406, 'colsample_bytree': 0.6488152939379115, 'gamma': 9.149877525022172e-05}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  47%|████▋     | 7/15 [03:28<03:13, 24.22s/it]

[I 2025-07-31 19:22:58,569] Trial 7 finished with value: 0.7659710875806484 and parameters: {'n_estimators': 58, 'max_depth': 12, 'learning_rate': 0.01646379567211809, 'subsample': 0.8650089137415928, 'colsample_bytree': 0.7246844304357644, 'gamma': 0.00014472520367197597}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  53%|█████▎    | 8/15 [03:37<02:45, 23.65s/it]

[I 2025-07-31 19:23:07,769] Trial 8 finished with value: 0.9134719551686679 and parameters: {'n_estimators': 187, 'max_depth': 4, 'learning_rate': 0.43464957555697725, 'subsample': 0.9100531293444458, 'colsample_bytree': 0.9757995766256756, 'gamma': 0.14408501080722544}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  60%|██████    | 9/15 [04:42<01:54, 19.13s/it]

[I 2025-07-31 19:24:12,551] Trial 9 finished with value: 0.8332102676585844 and parameters: {'n_estimators': 200, 'max_depth': 12, 'learning_rate': 0.007515450322528414, 'subsample': 0.6783931449676581, 'colsample_bytree': 0.6180909155642152, 'gamma': 4.005370050283172e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  67%|██████▋   | 10/15 [05:29<02:46, 33.23s/it]

[I 2025-07-31 19:24:59,969] Trial 10 finished with value: 0.9143035711477321 and parameters: {'n_estimators': 287, 'max_depth': 7, 'learning_rate': 0.03621799474202481, 'subsample': 0.6071847502459278, 'colsample_bytree': 0.8391524267229545, 'gamma': 0.0033264162114920023}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  73%|███████▎  | 11/15 [05:43<02:30, 37.56s/it]

[I 2025-07-31 19:25:13,337] Trial 11 finished with value: 0.9243616144942959 and parameters: {'n_estimators': 121, 'max_depth': 8, 'learning_rate': 0.22955406185548316, 'subsample': 0.7518416973680894, 'colsample_bytree': 0.7248996679248748, 'gamma': 1.3465901496770342e-07}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  80%|████████  | 12/15 [05:53<01:30, 30.20s/it]

[I 2025-07-31 19:25:23,746] Trial 12 finished with value: 0.9179095040995241 and parameters: {'n_estimators': 106, 'max_depth': 8, 'learning_rate': 0.32976584052032165, 'subsample': 0.7273145000499233, 'colsample_bytree': 0.7624979833916741, 'gamma': 1.307420395434413e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  87%|████████▋ | 13/15 [06:05<00:48, 24.21s/it]

[I 2025-07-31 19:25:36,054] Trial 13 finished with value: 0.9265312829611185 and parameters: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.2197325710169943, 'subsample': 0.610547233762323, 'colsample_bytree': 0.7874820875551369, 'gamma': 6.7169034639277175e-06}. Best is trial 13 with value: 0.9265312829611185.


Best trial: 13. Best value: 0.926531:  93%|█████████▎| 14/15 [06:22<00:20, 20.61s/it]

[I 2025-07-31 19:25:52,978] Trial 14 finished with value: 0.8645482549698402 and parameters: {'n_estimators': 78, 'max_depth': 6, 'learning_rate': 0.04374146076402921, 'subsample': 0.6041792656687146, 'colsample_bytree': 0.8955427636018558, 'gamma': 9.146410590181663e-06}. Best is trial 13 with value: 0.9265312829611185.


Best trial: 13. Best value: 0.926531: 100%|██████████| 15/15 [06:23<00:00, 25.54s/it]

2025-07-31 19:25:53,285 - INFO - Best parameters for XGB: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.2197325710169943, 'subsample': 0.610547233762323, 'colsample_bytree': 0.7874820875551369, 'gamma': 6.7169034639277175e-06}
2025-07-31 19:25:53,287 - INFO - Best cross-validated F1-score for XGB: 0.9265
2025-07-31 19:25:53,288 - INFO - Tuning SVC model with 15 trials...



[I 2025-07-31 19:25:53,290] A new study created in memory with name: SVC_tuning_study
  0%|          | 0/15 [04:47<?, ?it/s]

[I 2025-07-31 19:30:40,911] Trial 0 finished with value: 0.48185640015644654 and parameters: {'C': 0.31489116479568624, 'gamma': 6.351221010640703, 'kernel': 'rbf'}. Best is trial 0 with value: 0.48185640015644654.


Best trial: 0. Best value: 0.481856:   7%|▋         | 1/15 [11:20<1:07:10, 287.87s/it]

[I 2025-07-31 19:37:13,868] Trial 1 finished with value: 0.8040040979459049 and parameters: {'C': 0.04207988669606638, 'gamma': 0.004207053950287938, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.8040040979459049.


Best trial: 1. Best value: 0.804004:  13%|█▎        | 2/15 [11:47<1:15:46, 349.70s/it]

[I 2025-07-31 19:37:41,022] Trial 2 finished with value: 0.9221662225082857 and parameters: {'C': 2.5378155082656657, 'gamma': 0.679657809075816, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.9221662225082857.


Best trial: 2. Best value: 0.922166:  20%|██        | 3/15 [12:54<40:29, 202.43s/it]  

[I 2025-07-31 19:38:48,164] Trial 3 finished with value: 0.9274468578122226 and parameters: {'C': 21.368329072358772, 'gamma': 0.0070689749506246055, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  27%|██▋       | 4/15 [14:41<27:19, 149.02s/it]

[I 2025-07-31 19:40:35,088] Trial 4 finished with value: 0.9000954051681628 and parameters: {'C': 0.1648044642797898, 'gamma': 0.12561043700013558, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  33%|███▎      | 5/15 [18:22<22:18, 133.84s/it]

[I 2025-07-31 19:44:15,714] Trial 5 finished with value: 0.8466772244638058 and parameters: {'C': 2.801635158716261, 'gamma': 0.003613894271216527, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  40%|████      | 6/15 [18:58<24:30, 163.34s/it]

[I 2025-07-31 19:44:52,123] Trial 6 finished with value: 0.8754548416250257 and parameters: {'C': 0.6672367170464207, 'gamma': 1.382623217936987, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  47%|████▋     | 7/15 [22:38<16:14, 121.85s/it]

[I 2025-07-31 19:48:31,802] Trial 7 finished with value: 0.839863591318843 and parameters: {'C': 2.342384984711291, 'gamma': 0.0015339162591163618, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  53%|█████▎    | 8/15 [28:01<17:50, 153.00s/it]

[I 2025-07-31 19:53:54,938] Trial 8 finished with value: 0.08042653953868908 and parameters: {'C': 0.018205657658407266, 'gamma': 6.245139574743075, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  60%|██████    | 9/15 [33:43<20:37, 206.18s/it]

[I 2025-07-31 19:59:36,923] Trial 9 finished with value: 0.8047218721060352 and parameters: {'C': 0.1653693718282443, 'gamma': 0.002458603276328005, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  67%|██████▋   | 10/15 [34:10<20:40, 248.10s/it]

[I 2025-07-31 20:00:03,323] Trial 10 finished with value: 0.9415302848927896 and parameters: {'C': 60.33178530661243, 'gamma': 0.028504320627871515, 'kernel': 'sigmoid'}. Best is trial 10 with value: 0.9415302848927896.


Best trial: 10. Best value: 0.94153:  73%|███████▎  | 11/15 [34:35<12:01, 180.25s/it]

[I 2025-07-31 20:00:28,976] Trial 11 finished with value: 0.9424880088028337 and parameters: {'C': 64.64947866087911, 'gamma': 0.024218157448679556, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  80%|████████  | 12/15 [34:55<06:39, 133.22s/it]

[I 2025-07-31 20:00:49,082] Trial 12 finished with value: 0.9392053629128879 and parameters: {'C': 88.19429776626716, 'gamma': 0.03713740624438133, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  87%|████████▋ | 13/15 [35:20<03:17, 98.98s/it] 

[I 2025-07-31 20:01:13,319] Trial 13 finished with value: 0.9385834226508812 and parameters: {'C': 97.65296156943181, 'gamma': 0.025774482038992817, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  93%|█████████▎| 14/15 [35:45<01:16, 76.40s/it]

[I 2025-07-31 20:01:38,086] Trial 14 finished with value: 0.9365904399909507 and parameters: {'C': 15.59864319752155, 'gamma': 0.13883438990307442, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488: 100%|██████████| 15/15 [35:45<00:00, 143.01s/it]

2025-07-31 20:01:38,378 - INFO - Best parameters for SVC: {'C': 64.64947866087911, 'gamma': 0.024218157448679556, 'kernel': 'sigmoid'}
2025-07-31 20:01:38,380 - INFO - Best cross-validated F1-score for SVC: 0.9425
2025-07-31 20:01:38,382 - INFO - Tuning ETC model with 15 trials...



[I 2025-07-31 20:01:38,384] A new study created in memory with name: ETC_tuning_study
  0%|          | 0/15 [00:04<?, ?it/s]

[I 2025-07-31 20:01:42,769] Trial 0 finished with value: 0.905345439765127 and parameters: {'n_estimators': 144, 'max_depth': 36, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:   7%|▋         | 1/15 [00:06<01:05,  4.70s/it]

[I 2025-07-31 20:01:45,348] Trial 1 finished with value: 0.9033105221386787 and parameters: {'n_estimators': 89, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:  13%|█▎        | 2/15 [00:12<00:44,  3.40s/it]

[I 2025-07-31 20:01:51,360] Trial 2 finished with value: 0.9048320961326024 and parameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:  20%|██        | 3/15 [00:19<00:55,  4.65s/it]

[I 2025-07-31 20:01:57,838] Trial 3 finished with value: 0.9191989621106709 and parameters: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  27%|██▋       | 4/15 [00:23<00:59,  5.37s/it]

[I 2025-07-31 20:02:02,118] Trial 4 finished with value: 0.9037124762642886 and parameters: {'n_estimators': 126, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  33%|███▎      | 5/15 [00:28<00:49,  4.97s/it]

[I 2025-07-31 20:02:06,948] Trial 5 finished with value: 0.9055210896400828 and parameters: {'n_estimators': 203, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  40%|████      | 6/15 [00:33<00:44,  4.91s/it]

[I 2025-07-31 20:02:12,189] Trial 6 finished with value: 0.8996331187012563 and parameters: {'n_estimators': 164, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  47%|████▋     | 7/15 [00:38<00:40,  5.03s/it]

[I 2025-07-31 20:02:16,560] Trial 7 finished with value: 0.8992309400726045 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  53%|█████▎    | 8/15 [00:40<00:33,  4.82s/it]

[I 2025-07-31 20:02:18,987] Trial 8 finished with value: 0.8987381998441906 and parameters: {'n_estimators': 66, 'max_depth': 36, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  60%|██████    | 9/15 [00:44<00:24,  4.10s/it]

[I 2025-07-31 20:02:22,464] Trial 9 finished with value: 0.9065322904908435 and parameters: {'n_estimators': 126, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  67%|██████▋   | 10/15 [00:52<00:19,  3.90s/it]

[I 2025-07-31 20:02:30,826] Trial 10 finished with value: 0.9112263682741635 and parameters: {'n_estimators': 287, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  73%|███████▎  | 11/15 [01:00<00:20,  5.24s/it]

[I 2025-07-31 20:02:39,024] Trial 11 finished with value: 0.907783505879283 and parameters: {'n_estimators': 290, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  80%|████████  | 12/15 [01:08<00:18,  6.17s/it]

[I 2025-07-31 20:02:47,376] Trial 12 finished with value: 0.9131610037820834 and parameters: {'n_estimators': 293, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  87%|████████▋ | 13/15 [01:15<00:13,  6.80s/it]

[I 2025-07-31 20:02:54,332] Trial 13 finished with value: 0.9150964771322074 and parameters: {'n_estimators': 250, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  93%|█████████▎| 14/15 [01:22<00:06,  6.85s/it]

[I 2025-07-31 20:03:00,892] Trial 14 finished with value: 0.9171971586951543 and parameters: {'n_estimators': 242, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199: 100%|██████████| 15/15 [01:22<00:00,  5.52s/it]

2025-07-31 20:03:01,224 - INFO - Best parameters for ETC: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}
2025-07-31 20:03:01,225 - INFO - Best cross-validated F1-score for ETC: 0.9192
2025-07-31 20:03:01,226 - INFO - Hyperparameter tuning completed for all selected models.
2025-07-31 20:03:01,228 - INFO - --- Completed Pipeline Step: Hyperparameter Tuning ---

2025-07-31 20:03:01,229 - INFO - 
--- Starting Pipeline Step: Final Model Training & Evaluation ---
2025-07-31 20:03:01,231 - INFO - Applying SMOTE to the entire training data for final model training...
2025-07-31 20:03:01,329 - INFO - SMOTE applied. Original train: 4135 samples. Resampled train: 7226 samples.
2025-07-31 20:03:01,331 - INFO - Training final LR model on resampled data and evaluating...





2025-07-31 20:03:02,759 - INFO - 
--- Performance for LR ---
2025-07-31 20:03:02,760 - INFO - Accuracy: 0.9787
2025-07-31 20:03:02,762 - INFO - Precision (Spam): 0.8978
2025-07-31 20:03:02,763 - INFO - Recall (Spam): 0.9389
2025-07-31 20:03:02,763 - INFO - F1-Score (Spam): 0.9179
2025-07-31 20:03:02,778 - INFO - 
Full Classification Report for LR:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       903
        spam       0.90      0.94      0.92       131

    accuracy                           0.98      1034
   macro avg       0.94      0.96      0.95      1034
weighted avg       0.98      0.98      0.98      1034

2025-07-31 20:03:02,783 - INFO - 
Raw Confusion Matrix for LR:
[[889  14]
 [  8 123]]
2025-07-31 20:03:02,964 - INFO - Confusion matrix plot for LR saved to /home/dev/spam_classifier_project/plots/confusion_matrix_LR_20250731_200301.png.
2025-07-31 20:03:02,965 - INFO - Training final RF model on resampled data and evalua

Batches: 100%|██████████| 1/1 [00:00<00:00,  9.98it/s]



Prediction for SPAM text 1: 'WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply.' -> spam


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.37it/s]


Prediction for SPAM text 2: 'URGENT! Your bank account has been locked due to suspicious activity. Verify immediately at http://bit.ly/malicious-site to avoid closure.' -> ham


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.60it/s]


Prediction for HAM text 1: 'Hey, just checking in. How are you doing today? Let's catch up soon for coffee!' -> ham


Batches: 100%|██████████| 1/1 [00:00<00:00, 52.31it/s]


Prediction for HAM text 2: 'Hi mom, can you pick up milk and bread on your way home? Thanks, love you!' -> ham


Batches: 100%|██████████| 1/1 [00:00<00:00, 48.32it/s]

Prediction for EMPTY/NOISY text: '???!!!#@%' -> spam





In [2]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import Counter
from wordcloud import WordCloud
import pickle
import warnings
import logging
import os
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             BaggingClassifier, ExtraTreesClassifier,
                             GradientBoostingClassifier, VotingClassifier,
                             StackingClassifier)
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna
from datetime import datetime

# --- Determine Base Directory for Notebook/Script ---
try:
    # __file__ is only defined when this code is executed from a file.
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # No __file__ (e.g. a notebook kernel): fall back to the working directory.
    base_directory = os.getcwd()
    print(f"Running in a notebook environment. Base directory set to CWD: '{base_directory}'")
else:
    base_directory = current_script_dir
    print(f"Running as a script. Base directory set to: '{base_directory}'")


# --- Configuration (Externalize for production) ---
class Config:
    """Central settings for the spam-classification pipeline.

    Every file-system location is built by joining a fixed name onto the
    module-level ``base_directory``.
    """

    # Experiment settings
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    N_TRIALS_OPTUNA = 15

    # Sentence-transformer model identifier
    SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'

    # Input data and log file
    DATA_PATH = os.path.join(base_directory, 'spam.csv')
    LOG_FILE = os.path.join(base_directory, 'spam_classifier.log')

    # Output folders
    PLOTS_DIR = os.path.join(base_directory, 'plots')
    MODELS_DIR = os.path.join(base_directory, 'models')

# Create the plot and model folders at startup; exist_ok keeps re-runs harmless.
for _output_dir in (Config.PLOTS_DIR, Config.MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
del _output_dir  # keep the loop variable out of the module namespace


class SpamClassifier:
    def __init__(self):
        """Configure logging, NLTK data and plotting, then create empty pipeline state.

        All data attributes start as ``None`` (or empty containers) and are filled
        in by the individual pipeline steps. The classifier registry is populated
        at the end via ``_initialize_classifiers``.
        """
        # Environment setup runs first so the remaining steps can log.
        self._configure_logging()
        self._verify_nltk_resources()
        self._configure_matplotlib()

        # Raw data and text-processing helpers.
        self.df = None
        self.encoder = LabelEncoder()
        self.ps = PorterStemmer()
        self.sentence_transformer_model = None

        # Feature matrix, labels and their train/test partitions.
        self.X = self.y = None
        self.X_train = self.X_test = self.y_train = self.y_test = None

        # Model registry, tuning results and evaluation output.
        self.clfs = {}
        self.best_tuned_models_params = {}
        self.best_model = self.best_model_name = None
        self.performance_df = pd.DataFrame()
        self._initialize_classifiers()
        logging.info("SpamClassifier initialized successfully.")

    def _configure_logging(self) -> None:
        """Call ``logging.basicConfig`` with a file and a stdout handler, and silence warnings.

        The file handler writes to ``Config.LOG_FILE``; both handlers share the
        same timestamp/level/message format at INFO level.
        """
        log_handlers = [
            logging.FileHandler(Config.LOG_FILE),
            logging.StreamHandler(sys.stdout),
        ]
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO,
            handlers=log_handlers,
        )
        # Blanket filter: suppresses every warning category for the whole process.
        warnings.filterwarnings('ignore')

    def _verify_nltk_resources(self) -> None:
        """Ensure the NLTK data packages used by this class are available.

        Each resource is looked up locally first and downloaded only when the
        lookup fails. The process exits with status 1 if a download raises or
        reports failure, because later tokenization and stop-word steps depend
        on these packages.
        """
        resources = (
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt_tab', 'punkt_tab'),
        )
        for path, package in resources:
            try:
                nltk.data.find(path)
            except LookupError:
                logging.warning(f"NLTK {package} not found. Attempting to download...")
            else:
                logging.info(f"NLTK {package} resource found.")
                continue

            try:
                downloaded = nltk.download(package, quiet=True)
            except Exception as e:
                logging.critical(f"Failed to download NLTK {package}. Error: {e}")
                sys.exit(1)

            # nltk.download() reports failures (e.g. no network) by returning False
            # instead of raising, so the return value has to be checked as well.
            if not downloaded:
                logging.critical(f"Failed to download NLTK {package}. Error: downloader reported failure")
                sys.exit(1)
            logging.info(f"NLTK {package} downloaded successfully.")

    def _configure_matplotlib(self) -> None:
        """Switch pyplot to non-interactive mode and apply the seaborn whitegrid/viridis theme."""
        plt.ioff()
        theme_options = {'style': 'whitegrid', 'palette': 'viridis'}
        sns.set(**theme_options)

    def _initialize_classifiers(self) -> None:
        """Build the default (untuned) estimators and store them in ``self.clfs``.

        Ten individual classifiers are registered under short keys, followed by a
        soft-voting ensemble ('Voting') that wraps the XGB, SVC and RF entries
        with weights 0.3 / 0.4 / 0.3.
        """
        seed = Config.RANDOM_STATE
        models = {}
        models['LR'] = LogisticRegression(
            penalty='l1',
            solver='liblinear',
            class_weight='balanced',
            max_iter=1000,
            random_state=seed,
        )
        models['RF'] = RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            n_jobs=-1,
            random_state=seed,
        )
        models['XGB'] = XGBClassifier(
            n_estimators=100,
            eval_metric='logloss',
            scale_pos_weight=1,
            random_state=seed,
        )
        models['SVC'] = SVC(
            kernel='sigmoid', gamma=1.0, probability=True,
            class_weight='balanced', random_state=seed,
        )
        models['KN'] = KNeighborsClassifier()
        models['AdaBoost'] = AdaBoostClassifier(n_estimators=100, random_state=seed)
        models['BgC'] = BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=seed)
        models['ETC'] = ExtraTreesClassifier(
            n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=seed,
        )
        models['GBDT'] = GradientBoostingClassifier(n_estimators=100, random_state=seed)
        models['DT'] = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=seed)

        # The ensemble receives the very same estimator objects registered above.
        ensemble_members = [(key.lower(), models[key]) for key in ('XGB', 'SVC', 'RF')]
        models['Voting'] = VotingClassifier(
            estimators=ensemble_members,
            voting='soft',  # average predicted probabilities rather than hard votes
            weights=[0.3, 0.4, 0.3],  # example weights (can be tuned)
            n_jobs=-1,
        )
        self.clfs = models
        logging.info("Initialized all individual and ensemble classifiers.")

    def load_data(self) -> None:
        """Read the CSV at ``Config.DATA_PATH`` (latin-1) into ``self.df``.

        Logs a critical message and exits with status 1 when the file is missing,
        cannot be read, or contains fewer than 100 rows.
        """
        try:
            csv_path = Config.DATA_PATH
            if not os.path.exists(csv_path):
                raise FileNotFoundError(f"Data file not found at {os.path.abspath(csv_path)}")

            self.df = pd.read_csv(csv_path, encoding='latin-1')
            row_count = len(self.df)
            if row_count < 100:
                raise ValueError(f"Dataset too small ({row_count} samples). Minimum 100 samples required for robust analysis.")

            logging.info(f"Loaded {row_count} records from {csv_path}.")
        except Exception as err:
            logging.critical(f"Data loading failed: {err}")
            sys.exit(1)

    def clean_data(self) -> None:
        try:
            if 'v1' in self.df.columns and 'v2' in self.df.columns:
                self.df = self.df[['v1', 'v2']].copy()
                logging.info("Selected 'v1' and 'v2' columns from the dataset.")
            else:
                found_v1 = next((col for col in self.df.columns if 'target' in col.lower() or 'label' in col.lower() or 'type' in col.lower()), None)
                found_v2 = next((col for col in self.df.columns if 'text' in col.lower() or 'message' in col.lower() or 'sms' in col.lower()), None)
                if found_v1 and found_v2:
                    self.df = self.df[[found_v1, found_v2]].copy()
                    logging.info(f"Mapped columns '{found_v1}' to 'target' and '{found_v2}' to 'text' using heuristics.")
                else:
                    raise ValueError(f"Could not find required 'target' and 'text' columns (v1/v2 or equivalents) in dataset. Found columns: {self.df.columns.tolist()}")

            self.df.columns = ['target', 'text']
            valid_targets = {'ham', 'spam'}
            invalid_targets = set(self.df['target'].unique()) - valid_targets
            if invalid_targets:
                logging.warning(f"Invalid target values found: {invalid_targets}. Filtering out rows with these values.")
                self.df = self.df[self.df['target'].isin(valid_targets)]
                if self.df.empty:
                    raise ValueError("No valid 'ham' or 'spam' records remaining after filtering invalid targets. Dataset is empty.")

            self.df['target'] = self.encoder.fit_transform(self.df['target'])
            initial_rows = len(self.df)
            self.df.drop_duplicates(inplace=True)
            self.df.dropna(inplace=True)

            logging.info(f"Cleaned dataset. Removed {initial_rows - len(self.df)} duplicates/nulls. Remaining: {len(self.df)} records.")
            if self.df.empty:
                raise ValueError("Dataset became empty after cleaning steps. Check data quality or initial loading.")
        except Exception as e:
            logging.critical(f"Data cleaning failed: {e}")
            sys.exit(1)

    def _safe_tokenize(self, text: str) -> list[str]:
        """Lower-case and word-tokenize ``text``, keeping only alphanumeric tokens.

        Non-string input is converted with ``str()`` first. If tokenization raises,
        the error is logged as a warning and an empty list is returned.
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for tokenization: {text[:50]}...")
        try:
            kept = []
            for token in nltk.word_tokenize(text.lower()):
                if not token.isalnum():
                    continue
                if token in string.punctuation:
                    continue
                kept.append(token)
            return kept
        except Exception as err:
            logging.warning(f"Tokenization failed for text (first 50 chars: '{text[:50]}...'). Returning empty list. Error: {err}")
            return []

    def eda(self) -> None:
        """Derive length features, set XGBoost's class weight and save the EDA plots.

        Adds ``num_words``, ``num_chars`` and ``num_sentences`` columns to
        ``self.df``, sets ``scale_pos_weight`` on the 'XGB' classifier to the
        ham/spam row ratio, writes four PNGs (class pie, word-count and
        character-count histograms, correlation heatmap) to ``Config.PLOTS_DIR``
        and logs per-class descriptive statistics.

        Raises:
            Exception: any failure is logged as an error and re-raised unchanged.
        """
        try:
            # Coerce once so len() and sent_tokenize accept the same non-string
            # values that _safe_tokenize already tolerates.
            text = self.df['text'].astype(str)
            self.df['num_words'] = text.apply(lambda x: len(self._safe_tokenize(x)))
            self.df['num_chars'] = text.apply(len)
            self.df['num_sentences'] = text.apply(lambda x: len(nltk.sent_tokenize(x)))

            # Look the encoded labels up once instead of on every filter expression.
            ham_label, spam_label = self.encoder.transform(['ham', 'spam'])
            ham_df = self.df[self.df['target'] == ham_label]
            spam_df = self.df[self.df['target'] == spam_label]
            ham_count, spam_count = ham_df.shape[0], spam_df.shape[0]

            if spam_count > 0:
                scale_pos_weight_val = ham_count / spam_count
                if 'XGB' in self.clfs:
                    self.clfs['XGB'].set_params(scale_pos_weight=scale_pos_weight_val)
                logging.info(f"Set XGBoost scale_pos_weight to: {scale_pos_weight_val:.2f} (Ham:{ham_count}, Spam:{spam_count})")
            else:
                logging.warning("No spam samples found to calculate scale_pos_weight for XGBoost. Defaulting to 1.")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            class_counts = self.df['target'].value_counts()
            fig1, ax1 = plt.subplots(figsize=(8, 8))
            class_counts.plot(
                kind='pie', ax=ax1, autopct='%1.1f%%',
                labels=self.encoder.inverse_transform(class_counts.index),
                colors=sns.color_palette('pastel')[0:2],
                # One offset per slice: a fixed two-element list makes the pie
                # call fail when only one class is left in the data.
                explode=[0 if i == 0 else 0.1 for i in range(len(class_counts))],
            )
            ax1.set_title('Target Class Distribution')
            ax1.set_ylabel('')
            fig1_filename = os.path.join(Config.PLOTS_DIR, f'target_distribution_{timestamp}.png')
            fig1.savefig(fig1_filename, bbox_inches='tight')
            plt.close(fig1)
            logging.info(f"Target distribution plot saved to {fig1_filename}.")

            fig2_filename = os.path.join(Config.PLOTS_DIR, f'word_count_distribution_{timestamp}.png')
            self._save_class_histogram(
                ham_df, spam_df, column='num_words',
                title='Word Count Distribution by Target Class',
                xlabel='Number of Words', filepath=fig2_filename,
            )
            logging.info(f"Word count distribution plot saved to {fig2_filename}.")

            fig3_filename = os.path.join(Config.PLOTS_DIR, f'char_count_distribution_{timestamp}.png')
            self._save_class_histogram(
                ham_df, spam_df, column='num_chars',
                title='Character Count Distribution by Target Class',
                xlabel='Number of Characters', filepath=fig3_filename,
            )
            logging.info(f"Character count distribution plot saved to {fig3_filename}.")

            fig4, ax4 = plt.subplots(figsize=(8, 6))
            sns.heatmap(self.df[['num_chars', 'num_words', 'num_sentences', 'target']].corr(), annot=True, cmap='coolwarm', ax=ax4)
            ax4.set_title('Correlation Matrix of Text Features and Target')
            fig4_filename = os.path.join(Config.PLOTS_DIR, f'correlation_heatmap_{timestamp}.png')
            fig4.savefig(fig4_filename, bbox_inches='tight')
            plt.close(fig4)
            logging.info(f"Correlation heatmap plot saved to {fig4_filename}.")

            feature_columns = ['num_chars', 'num_words', 'num_sentences']
            logging.info(f"Descriptive statistics for Ham emails:\n{ham_df[feature_columns].describe()}")
            logging.info(f"Descriptive statistics for Spam emails:\n{spam_df[feature_columns].describe()}")

        except Exception as e:
            logging.error(f"EDA process failed: {e}")
            raise

    def _save_class_histogram(self, ham_df, spam_df, column: str, title: str, xlabel: str, filepath: str) -> None:
        """Overlay ham (blue) and spam (red) histograms of ``column`` and save the figure to ``filepath``."""
        fig, ax = plt.subplots(figsize=(14, 6))
        sns.histplot(data=ham_df, x=column, ax=ax, bins=50, kde=True, color='blue', label='Ham')
        sns.histplot(data=spam_df, x=column, ax=ax, bins=50, kde=True, color='red', label='Spam')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')
        ax.legend()
        fig.savefig(filepath, bbox_inches='tight')
        plt.close(fig)

    def transform_text(self, text: str) -> str:
        """Normalize a message into a space-joined string of stemmed content tokens.

        Steps: lower-case, word-tokenize, keep alphanumeric tokens, drop English
        stop words, Porter-stem, then drop single-character tokens unless they
        are digits. Non-string input is converted with ``str()`` first.

        Args:
            text: Raw message text.

        Returns:
            The processed tokens joined by single spaces (may be empty).
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for transform_text: {text[:50]}...")

        # This method runs once per row; load the stop-word list a single time per
        # instance instead of rebuilding the set for every message.
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            stop_words = self._stop_words = frozenset(stopwords.words('english'))

        tokens = nltk.word_tokenize(text.lower())
        # isalnum() already excludes punctuation-only tokens, so no separate check is needed.
        content_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
        stemmed_tokens = [self.ps.stem(token) for token in content_tokens]
        return " ".join(token for token in stemmed_tokens if len(token) > 1 or token.isdigit())

    def preprocess_text(self) -> None:
        """Add ``transformed_text`` to ``self.df`` and save word-cloud / top-word plots per class.

        Spam is processed before ham for both plot types. Any failure is logged as
        critical and the process exits with status 1.
        """
        try:
            logging.info("\n--- Text Preprocessing for EDA and Visualizations ---")
            self.df['transformed_text'] = self.df['text'].apply(self.transform_text)
            logging.info("Text transformation for EDA complete. Example:")
            logging.info(f"\n{self.df[['text', 'transformed_text']].head().to_string()}")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            logging.info("\nGenerating Word Clouds (saved to plots directory):")

            class_names = ('spam', 'ham')

            # First pass: one word cloud per class.
            for class_name in class_names:
                display_name = class_name.capitalize()
                class_code = self.encoder.transform([class_name])[0]
                class_text = self.df[self.df['target'] == class_code]['transformed_text'].str.cat(sep=" ")
                cloud = WordCloud(width=800, height=400, min_font_size=10, background_color='white').generate(class_text)
                plt.figure(figsize=(10, 5))
                plt.imshow(cloud)
                plt.title(f'{display_name} Word Cloud')
                plt.axis('off')
                cloud_path = os.path.join(Config.PLOTS_DIR, f'{class_name}_wordcloud_{timestamp}.png')
                plt.savefig(cloud_path, bbox_inches='tight')
                plt.close()
                logging.info(f"{display_name} word cloud saved to {cloud_path}.")

            # Second pass: bar chart of the most frequent tokens per class.
            for class_name in class_names:
                display_name = class_name.capitalize()
                logging.info(f"\nMost common words in {display_name} (saved as plot):")
                class_code = self.encoder.transform([class_name])[0]
                class_corpus = ' '.join(self.df[self.df['target'] == class_code]['transformed_text']).split()
                self._plot_most_common_words(
                    class_corpus,
                    title=f'Top 30 {display_name} Words',
                    filename=f'top_{class_name}_words_{timestamp}.png',
                )

        except Exception as err:
            logging.critical(f"Text preprocessing for EDA failed: {err}")
            sys.exit(1)

    def _plot_most_common_words(self, corpus: list[str], title: str, n: int = 30, filename: str = "common_words.png") -> None:
        """Save a bar chart of the ``n`` most frequent tokens in ``corpus``.

        Args:
            corpus: Flat list of tokens to count.
            title: Chart title (also used in the log message).
            n: Number of top tokens to show.
            filename: File name, created inside ``Config.PLOTS_DIR``.
        """
        top_counts = pd.DataFrame(Counter(corpus).most_common(n), columns=['Word', 'Count'])

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=top_counts, x='Word', y='Count', palette='viridis', ax=ax)
        ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical')
        ax.set_title(title)

        plot_filepath = os.path.join(Config.PLOTS_DIR, filename)
        plt.savefig(plot_filepath, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Plot '{title}' saved to {plot_filepath}.")

    def vectorize_text_with_embeddings(self) -> None:
        """Encode the raw ``text`` column with the sentence-transformer into ``self.X``; set ``self.y``.

        The transformer named by ``Config.SENTENCE_TRANSFORMER_MODEL`` is created on
        first use and kept on the instance. Any failure is logged as critical and
        the process exits with status 1.
        """
        try:
            logging.info(f"\n--- Text Vectorization (SentenceTransformer: {Config.SENTENCE_TRANSFORMER_MODEL}) ---")
            embedder = self.sentence_transformer_model
            if embedder is None:
                embedder = SentenceTransformer(Config.SENTENCE_TRANSFORMER_MODEL)
                self.sentence_transformer_model = embedder

            messages = self.df['text'].tolist()
            self.X = embedder.encode(
                messages,
                batch_size=64,
                show_progress_bar=True,
                convert_to_tensor=False,
            )
            self.y = self.df['target'].values
            logging.info(f"SentenceTransformer embedding complete. X shape: {self.X.shape}, Y shape: {self.y.shape}.")
        except Exception as err:
            logging.critical(f"Text vectorization failed: {err}")
            sys.exit(1)

    def split_data(self) -> None:
        """Create a stratified train/test split of ``self.X`` / ``self.y``.

        Uses ``Config.TEST_SIZE`` and ``Config.RANDOM_STATE``, logs the split sizes
        and per-class counts, and exits with status 1 on any failure.
        """
        try:
            partitions = train_test_split(
                self.X,
                self.y,
                test_size=Config.TEST_SIZE,
                random_state=Config.RANDOM_STATE,
                stratify=self.y,
            )
            self.X_train, self.X_test, self.y_train, self.y_test = partitions
            logging.info(f"Data split: Train {len(self.X_train)} samples, Test {len(self.X_test)} samples.")
            logging.info(f"Train target distribution: {np.bincount(self.y_train)}")
            logging.info(f"Test target distribution: {np.bincount(self.y_test)}")
        except Exception as err:
            logging.critical(f"Data splitting failed: {err}")
            sys.exit(1)

    def _objective(self, trial: optuna.trial.Trial, model_name: str) -> float:
        if model_name == 'LR':
            c_param = trial.suggest_loguniform('C', 1e-4, 1e2)
            solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
            model = LogisticRegression(C=c_param, solver=solver, random_state=Config.RANDOM_STATE,
                                       class_weight='balanced', max_iter=2000,
                                       n_jobs=-1 if solver == 'saga' else None)
        elif model_name == 'RF':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'XGB':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 3, 12)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.005, 0.5)
            subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
            colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
            gamma = trial.suggest_loguniform('gamma', 1e-8, 1.0)
            current_scale_pos_weight = self.clfs['XGB'].get_params().get('scale_pos_weight', 1)
            model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  learning_rate=learning_rate, subsample=subsample,
                                  colsample_bytree=colsample_bytree, gamma=gamma,
                                  random_state=Config.RANDOM_STATE,
                                  eval_metric='logloss',
                                  scale_pos_weight=current_scale_pos_weight)
        elif model_name == 'SVC':
            C_param = trial.suggest_loguniform('C', 1e-2, 1e2)
            gamma_param = trial.suggest_loguniform('gamma', 1e-3, 1e1)
            kernel = trial.suggest_categorical('kernel', ['rbf', 'sigmoid'])
            model = SVC(C=C_param, gamma=gamma_param, kernel=kernel, probability=True,
                        random_state=Config.RANDOM_STATE, class_weight='balanced')
        elif model_name == 'KN':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, n_jobs=-1)
        elif model_name == 'AdaBoost':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 1.0)
            model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=Config.RANDOM_STATE)
        elif model_name == 'BgC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            model = BaggingClassifier(n_estimators=n_estimators, random_state=Config.RANDOM_STATE, n_jobs=-1)
        elif model_name == 'ETC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'GBDT':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 1.0)
            max_depth = trial.suggest_int('max_depth', 3, 10)
            model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=Config.RANDOM_STATE)
        elif model_name == 'DT':
            max_depth = trial.suggest_int('max_depth', 3, 20)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf, criterion=criterion,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced')
        else:
            raise ValueError(f"Model '{model_name}' is not configured for Optuna tuning.")

        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=Config.RANDOM_STATE)),
            ('classifier', model)
        ])
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.RANDOM_STATE)
        scores = cross_val_score(pipeline, self.X_train, self.y_train, cv=cv, scoring='f1', n_jobs=-1)
        return scores.mean()

    def tune_models(self) -> None:
        try:
            if self.X_train is None or self.y_train is None:
                logging.error("Data not split for tuning. Calling split_data().")
                self.split_data()

            logging.info("Starting hyperparameter tuning with Optuna for selected models...")
            models_to_tune = ['LR', 'RF', 'XGB', 'SVC', 'ETC']

            for name in models_to_tune:
                if name not in self.clfs:
                    logging.warning(f"Model '{name}' not found in initialized classifiers, skipping tuning.")
                    continue

                logging.info(f"Tuning {name} model with {Config.N_TRIALS_OPTUNA} trials...")
                study = optuna.create_study(direction='maximize',
                                            sampler=optuna.samplers.TPESampler(seed=Config.RANDOM_STATE),
                                            study_name=f"{name}_tuning_study")

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    study.optimize(lambda trial: self._objective(trial, name),
                                   n_trials=Config.N_TRIALS_OPTUNA,
                                   show_progress_bar=True,
                                   gc_after_trial=True)

                self.best_tuned_models_params[name] = study.best_trial.params
                logging.info(f"Best parameters for {name}: {study.best_trial.params}")
                logging.info(f"Best cross-validated F1-score for {name}: {study.best_trial.value:.4f}")

                self.clfs[name].set_params(**study.best_trial.params)
                if name == 'XGB':
                    current_scale_pos_weight = self.clfs[name].get_params().get('scale_pos_weight', 1)
                    self.clfs[name].set_params(scale_pos_weight=current_scale_pos_weight)

            logging.info("Hyperparameter tuning completed for all selected models.")
        except Exception as e:
            logging.critical(f"Model tuning failed: {e}")
            sys.exit(1)

    def train_final_models(self) -> None:
        try:
            if self.X_train is None or self.X_test is None:
                 logging.error("Data not split for final training. Calling split_data().")
                 self.split_data()

            logging.info("Applying SMOTE to the entire training data for final model training...")
            smote = SMOTE(random_state=Config.RANDOM_STATE)
            X_train_resampled, y_train_resampled = smote.fit_resample(self.X_train, self.y_train)
            logging.info(f"SMOTE applied. Original train: {len(self.X_train)} samples. Resampled train: {len(X_train_resampled)} samples.")

            results = []
            best_f1_overall = -1
            self.best_model = None
            self.best_model_name = None
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Training loop now includes the VotingClassifier
            for name, model in self.clfs.items():
                logging.info(f"Training final {name} model on resampled data and evaluating...")
                try:
                    # To use the ImbPipeline, we need to pass the model, not just the classifier
                    pipeline = ImbPipeline([('smote', SMOTE(random_state=Config.RANDOM_STATE)), ('classifier', model)])
                    pipeline.fit(self.X_train, self.y_train)
                    y_pred = pipeline.predict(self.X_test)

                    accuracy = accuracy_score(self.y_test, y_pred)
                    precision = precision_score(self.y_test, y_pred, pos_label=self.encoder.transform(['spam'])[0], zero_division=0)
                    recall = recall_score(self.y_test, y_pred, pos_label=self.encoder.transform(['spam'])[0], zero_division=0)
                    f1 = f1_score(self.y_test, y_pred, pos_label=self.encoder.transform(['spam'])[0], zero_division=0)

                    report_dict = classification_report(self.y_test, y_pred, target_names=self.encoder.classes_, output_dict=True)

                    results.append({
                        'Model': name,
                        'Accuracy': accuracy,
                        'Precision (Spam)': precision,
                        'Recall (Spam)': recall,
                        'F1-Score (Spam)': f1,
                        'Full Classification Report': report_dict
                    })

                    logging.info(f"\n--- Performance for {name} ---")
                    logging.info(f"Accuracy: {accuracy:.4f}")
                    logging.info(f"Precision (Spam): {precision:.4f}")
                    logging.info(f"Recall (Spam): {recall:.4f}")
                    logging.info(f"F1-Score (Spam): {f1:.4f}")
                    logging.info(f"\nFull Classification Report for {name}:\n{classification_report(self.y_test, y_pred, target_names=self.encoder.classes_)}")

                    cm = confusion_matrix(self.y_test, y_pred)
                    logging.info(f"\nRaw Confusion Matrix for {name}:\n{cm}")

                    fig_cm, ax_cm = plt.subplots(figsize=(7, 6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                                xticklabels=self.encoder.classes_,
                                yticklabels=self.encoder.classes_,
                                linecolor='gray', linewidths=0.5,
                                annot_kws={"size": 14})
                    ax_cm.set_xlabel('Predicted Label', fontsize=12)
                    ax_cm.set_ylabel('True Label', fontsize=12)
                    ax_cm.set_title(f'Confusion Matrix for {name}', fontsize=14)
                    cm_filename = os.path.join(Config.PLOTS_DIR, f'confusion_matrix_{name}_{timestamp}.png')
                    plt.savefig(cm_filename, bbox_inches='tight')
                    plt.close(fig_cm)
                    logging.info(f"Confusion matrix plot for {name} saved to {cm_filename}.")

                    if f1 > best_f1_overall:
                        best_f1_overall = f1
                        self.best_model_name = name
                        self.best_model = pipeline # Store the entire pipeline
                except Exception as model_e:
                    logging.error(f"Error training or evaluating model {name}: {model_e}")
                    results.append({
                        'Model': name,
                        'Accuracy': np.nan,
                        'Precision (Spam)': np.nan,
                        'Recall (Spam)': np.nan,
                        'F1-Score (Spam)': np.nan,
                        'Full Classification Report': {'error': str(model_e)}
                    })

            self.performance_df = pd.DataFrame(results)
            self.performance_df = self.performance_df.sort_values(by='F1-Score (Spam)', ascending=False).reset_index(drop=True)
            logging.info(f"\n--- Overall Best Model Identified: {self.best_model_name} (F1-Score on Spam: {best_f1_overall:.4f}) ---")
            logging.info("All model evaluations completed.")
            self._save_best_model()
            self._plot_performance_comparison(timestamp)

        except Exception as e:
            logging.critical(f"Final model training and evaluation failed: {e}")
            sys.exit(1)

    def _save_best_model(self) -> None:
        """Saves the best performing model and related components to a pickle file."""
        try:
            if self.best_model is None or self.best_model_name is None:
                logging.warning("No best model identified or stored. Skipping model save operation.")
                return
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = os.path.join(Config.MODELS_DIR, f'best_model_{self.best_model_name}_{timestamp}.pkl')
            with open(model_filename, 'wb') as f:
                pickle.dump({
                    'model': self.best_model,
                    'transformer': Config.SENTENCE_TRANSFORMER_MODEL,
                    'encoder': self.encoder,
                    'model_name': self.best_model_name,
                    'performance_summary': self.performance_df.to_dict('records')
                }, f)
            logging.info(f"Best performing model ({self.best_model_name}) saved to {model_filename}")
        except Exception as e:
            logging.error(f"Failed to save the best model: {e}")

    def _plot_performance_comparison(self, timestamp: str) -> None:
        if self.performance_df.empty:
            logging.warning("Performance DataFrame is empty, cannot plot comparison.")
            return
        plot_df = self.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].copy()
        plot_df_melted = plot_df.melt(id_vars="Model", var_name="Metric", value_name="Score")
        fig, ax = plt.subplots(figsize=(14, 7))
        sns.barplot(x='Model', y='Score', hue='Metric', data=plot_df_melted, palette='tab10', ax=ax)
        ax.set_ylim(0.5, 1.0)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title('Model Performance Comparison (Test Set)')
        ax.set_ylabel('Score')
        ax.set_xlabel('Model')
        ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plot_filename = os.path.join(Config.PLOTS_DIR, f'model_performance_comparison_{timestamp}.png')
        plt.savefig(plot_filename, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Model performance comparison plot saved to {plot_filename}.")

    def run_pipeline(self) -> bool:
        steps = [
            ('Data Loading', self.load_data),
            ('Data Cleaning', self.clean_data),
            ('EDA and Feature Engineering', self.eda),
            ('Text Preprocessing for EDA', self.preprocess_text),
            ('Text Vectorization (Embeddings)', self.vectorize_text_with_embeddings),
            ('Data Splitting', self.split_data),
            ('Hyperparameter Tuning', self.tune_models),
            ('Final Model Training & Evaluation', self.train_final_models)
        ]
        for name, step in steps:
            try:
                logging.info(f"\n--- Starting Pipeline Step: {name} ---")
                step()
                logging.info(f"--- Completed Pipeline Step: {name} ---\n")
            except SystemExit:
                logging.critical(f"Pipeline stopped due to critical error in step: '{name}'.")
                return False
            except Exception as e:
                logging.critical(f"Pipeline failed unexpectedly in step '{name}': {e}")
                return False
        logging.info("Spam classification pipeline completed successfully.")
        return True

    @staticmethod
    def load_for_inference(model_path: str) -> 'SpamClassifier':
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found at {os.path.abspath(model_path)}")
            with open(model_path, 'rb') as f:
                data = pickle.load(f)
            classifier = SpamClassifier()
            classifier.best_model = data['model']
            classifier.encoder = data['encoder']
            classifier.best_model_name = data.get('model_name', 'Unknown_Model')
            transformer_data = data['transformer']
            if isinstance(transformer_data, str):
                logging.info(f"Loading SentenceTransformer by name: '{transformer_data}'")
                classifier.sentence_transformer_model = SentenceTransformer(transformer_data)
            else:
                logging.warning("Loaded SentenceTransformer object directly from pickle.")
                classifier.sentence_transformer_model = transformer_data
            classifier.ps = PorterStemmer()
            logging.info(f"Model '{classifier.best_model_name}' loaded successfully from {model_path} for inference.")
            return classifier
        except Exception as e:
            logging.critical(f"Failed to load model for inference from {model_path}: {e}")
            raise

    def predict(self, text: str) -> str:
        """
        Predicts the label for a given text.
        This method is now a simplified wrapper for predict_with_confidence.
        """
        prediction_label, _, _ = self.predict_with_confidence(text)
        return prediction_label

    def predict_with_confidence(self, text: str) -> tuple[str, float, float]:
        """
        Predicts the label and returns the confidence score for spam/ham.
        Returns: (prediction_label, spam_confidence, ham_confidence)
        """
        if self.best_model is None or self.sentence_transformer_model is None or self.encoder is None:
            logging.error("Model components not loaded. Please load model using load_for_inference() before calling predict().")
            raise RuntimeError("Model components not available for prediction.")
        try:
            vector = self.sentence_transformer_model.encode([text], convert_to_tensor=False)
            prediction_encoded = self.best_model.predict(vector)[0]
            prediction_label = self.encoder.inverse_transform([prediction_encoded])[0]

            # Get probabilities and confidence
            prediction_proba = self.best_model.predict_proba(vector)[0]
            classes = self.best_model.named_steps['classifier'].classes_
            
            spam_prob_idx = np.where(classes == self.encoder.transform(['spam'])[0])[0]
            ham_prob_idx = np.where(classes == self.encoder.transform(['ham'])[0])[0]
            
            spam_confidence = prediction_proba[spam_prob_idx][0] if spam_prob_idx.size > 0 else 0.0
            ham_confidence = prediction_proba[ham_prob_idx][0] if ham_prob_idx.size > 0 else 0.0
            
            return prediction_label, spam_confidence, ham_confidence

        except Exception as e:
            logging.error(f"Prediction failed for text '{text[:50]}...': {e}")
            return "error", 0.0, 0.0

if __name__ == '__main__':
    # Script entry point: run the full training pipeline, print the summary,
    # then reload the most recently written model file to demonstrate inference.
    classifier = SpamClassifier()
    pipeline_success = classifier.run_pipeline()

    if not pipeline_success:
        logging.critical("Spam classification pipeline failed during execution. Please review the log file for details.")
        # sys.exit must be *called*; a bare `sys.exit` is a no-op expression and
        # would let a failed run finish with exit status 0.
        sys.exit(1)

    logging.info("\n=== Spam Classification Pipeline Completed Successfully ===")
    logging.info("Overall Model Performance Summary (Sorted by F1-Score on Spam):")
    print(classifier.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].to_string())
    logging.info(f"\nBest Performing Model Identified: {classifier.best_model_name}")
    logging.info(f"Check '{Config.PLOTS_DIR}' for EDA and Confusion Matrix plots.")
    logging.info(f"Check '{Config.MODELS_DIR}' for the saved best model.")

    try:
        logging.info("\n--- Demonstrating Model Inference from Saved Model ---")
        model_files = [f for f in os.listdir(Config.MODELS_DIR) if f.startswith('best_model_') and f.endswith('.pkl')]
        if model_files:
            # Pick the newest pickle by modification time.
            latest_model_file = max(model_files, key=lambda f: os.path.getmtime(os.path.join(Config.MODELS_DIR, f)))
            latest_model_path = os.path.join(Config.MODELS_DIR, latest_model_file)
            logging.info(f"Attempting to load the latest best model from: {latest_model_path}")
            loaded_classifier = SpamClassifier.load_for_inference(latest_model_path)

            test_spam_text_1 = "WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply."
            test_ham_text_1 = "Hey, just checking in. How are you doing today? Let's catch up soon for coffee!"

            label, spam_conf, ham_conf = loaded_classifier.predict_with_confidence(test_spam_text_1)
            print(f"Prediction for SPAM text: '{test_spam_text_1}' -> Label: {label}, Spam Confidence: {spam_conf:.4f}")

            label, spam_conf, ham_conf = loaded_classifier.predict_with_confidence(test_ham_text_1)
            print(f"Prediction for HAM text: '{test_ham_text_1}' -> Label: {label}, Ham Confidence: {ham_conf:.4f}")

        else:
            logging.warning("No model files found in the 'models' directory to demonstrate inference. Run the pipeline first.")
    except Exception as e:
        logging.error(f"An error occurred during the inference demonstration: {e}")
        sys.exit(1)

Running in a notebook environment. Base directory set to CWD: '/home/dev/spam_classifier_project'
2025-08-01 14:36:27,208 - INFO - NLTK punkt resource found.
2025-08-01 14:36:27,209 - INFO - NLTK stopwords resource found.
2025-08-01 14:36:27,210 - INFO - NLTK punkt_tab resource found.
2025-08-01 14:36:27,212 - INFO - Initialized all individual and ensemble classifiers.
2025-08-01 14:36:27,213 - INFO - SpamClassifier initialized successfully.
2025-08-01 14:36:27,216 - INFO - 
--- Starting Pipeline Step: Data Loading ---
2025-08-01 14:36:27,236 - INFO - Loaded 5572 records from /home/dev/spam_classifier_project/spam.csv.
2025-08-01 14:36:27,237 - INFO - --- Completed Pipeline Step: Data Loading ---

2025-08-01 14:36:27,239 - INFO - 
--- Starting Pipeline Step: Data Cleaning ---
2025-08-01 14:36:27,241 - INFO - Selected 'v1' and 'v2' columns from the dataset.
2025-08-01 14:36:27,256 - INFO - Cleaned dataset. Removed 403 duplicates/nulls. Remaining: 5169 records.
2025-08-01 14:36:27,258 - 

Batches: 100%|██████████| 81/81 [00:39<00:00,  2.07it/s]

2025-08-01 14:37:25,677 - INFO - SentenceTransformer embedding complete. X shape: (5169, 384), Y shape: (5169,).
2025-08-01 14:37:25,678 - INFO - --- Completed Pipeline Step: Text Vectorization (Embeddings) ---

2025-08-01 14:37:25,679 - INFO - 
--- Starting Pipeline Step: Data Splitting ---
2025-08-01 14:37:25,687 - INFO - Data split: Train 4135 samples, Test 1034 samples.
2025-08-01 14:37:25,688 - INFO - Train target distribution: [3613  522]
2025-08-01 14:37:25,689 - INFO - Test target distribution: [903 131]
2025-08-01 14:37:25,691 - INFO - --- Completed Pipeline Step: Data Splitting ---

2025-08-01 14:37:25,692 - INFO - 
--- Starting Pipeline Step: Hyperparameter Tuning ---
2025-08-01 14:37:25,693 - INFO - Starting hyperparameter tuning with Optuna for selected models...
2025-08-01 14:37:25,695 - INFO - Tuning LR model with 15 trials...



[I 2025-08-01 14:37:25,698] A new study created in memory with name: LR_tuning_study
  0%|          | 0/15 [00:03<?, ?it/s]

[I 2025-08-01 14:37:29,397] Trial 0 finished with value: 0.8195201694380092 and parameters: {'C': 0.017670169402947963, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8195201694380092.


Best trial: 0. Best value: 0.81952:   7%|▋         | 1/15 [00:06<00:57,  4.12s/it]

[I 2025-08-01 14:37:32,645] Trial 1 finished with value: 0.9067637144074732 and parameters: {'C': 0.39079671568228835, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9067637144074732.


Best trial: 1. Best value: 0.906764:  13%|█▎        | 2/15 [00:07<00:46,  3.58s/it]

[I 2025-08-01 14:37:33,566] Trial 2 finished with value: 0.5636527533822596 and parameters: {'C': 0.00022310108018679258, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9067637144074732.


Best trial: 1. Best value: 0.906764:  20%|██        | 3/15 [00:10<00:28,  2.34s/it]

[I 2025-08-01 14:37:36,188] Trial 3 finished with value: 0.9252627284490291 and parameters: {'C': 1.7718847354806828, 'solver': 'saga'}. Best is trial 3 with value: 0.9252627284490291.


Best trial: 3. Best value: 0.925263:  27%|██▋       | 4/15 [00:12<00:26,  2.44s/it]

[I 2025-08-01 14:37:38,020] Trial 4 finished with value: 0.9358347576312023 and parameters: {'C': 9.877700294007917, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  33%|███▎      | 5/15 [00:14<00:22,  2.22s/it]

[I 2025-08-01 14:37:40,565] Trial 5 finished with value: 0.7982814551985044 and parameters: {'C': 0.0012601639723276807, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  40%|████      | 6/15 [00:17<00:20,  2.33s/it]

[I 2025-08-01 14:37:42,914] Trial 6 finished with value: 0.8533267901688955 and parameters: {'C': 0.039054412752107935, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  47%|████▋     | 7/15 [00:19<00:18,  2.33s/it]

[I 2025-08-01 14:37:45,304] Trial 7 finished with value: 0.7962869532134003 and parameters: {'C': 0.0006870101665590031, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  53%|█████▎    | 8/15 [00:20<00:16,  2.36s/it]

[I 2025-08-01 14:37:46,423] Trial 8 finished with value: 0.8553585616674987 and parameters: {'C': 0.054502936945582565, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  60%|██████    | 9/15 [00:21<00:11,  1.98s/it]

[I 2025-08-01 14:37:47,553] Trial 9 finished with value: 0.8757717445900015 and parameters: {'C': 0.12173252504194051, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  67%|██████▋   | 10/15 [00:24<00:08,  1.72s/it]

[I 2025-08-01 14:37:49,990] Trial 10 finished with value: 0.9347469338394623 and parameters: {'C': 73.7864208342295, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  73%|███████▎  | 11/15 [00:26<00:07,  1.94s/it]

[I 2025-08-01 14:37:52,463] Trial 11 finished with value: 0.9338781698536784 and parameters: {'C': 65.64817611753449, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  80%|████████  | 12/15 [00:29<00:06,  2.09s/it]

[I 2025-08-01 14:37:54,906] Trial 12 finished with value: 0.9357473287321831 and parameters: {'C': 78.72383571224226, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  87%|████████▋ | 13/15 [00:30<00:04,  2.21s/it]

[I 2025-08-01 14:37:56,664] Trial 13 finished with value: 0.9331448068701527 and parameters: {'C': 8.224108054741553, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  93%|█████████▎| 14/15 [00:32<00:02,  2.09s/it]

[I 2025-08-01 14:37:58,481] Trial 14 finished with value: 0.9331984783268465 and parameters: {'C': 6.611757606926467, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835: 100%|██████████| 15/15 [00:33<00:00,  2.21s/it]

2025-08-01 14:37:58,805 - INFO - Best parameters for LR: {'C': 9.877700294007917, 'solver': 'liblinear'}
2025-08-01 14:37:58,807 - INFO - Best cross-validated F1-score for LR: 0.9358
2025-08-01 14:37:58,808 - INFO - Tuning RF model with 15 trials...



[I 2025-08-01 14:37:58,810] A new study created in memory with name: RF_tuning_study
  0%|          | 0/15 [00:27<?, ?it/s]

[I 2025-08-01 14:38:25,816] Trial 0 finished with value: 0.9105635300372142 and parameters: {'n_estimators': 144, 'max_depth': 36, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9105635300372142.


Best trial: 0. Best value: 0.910564:   7%|▋         | 1/15 [00:37<06:22, 27.33s/it]

[I 2025-08-01 14:38:35,840] Trial 1 finished with value: 0.9127467132964885 and parameters: {'n_estimators': 89, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.9127467132964885.


Best trial: 1. Best value: 0.912747:  13%|█▎        | 2/15 [01:11<03:43, 17.16s/it]

[I 2025-08-01 14:39:10,618] Trial 2 finished with value: 0.9091251939576921 and parameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.9127467132964885.


Best trial: 1. Best value: 0.912747:  20%|██        | 3/15 [01:44<05:02, 25.22s/it]

[I 2025-08-01 14:39:43,773] Trial 3 finished with value: 0.9170959825132309 and parameters: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  27%|██▋       | 4/15 [02:08<05:11, 28.33s/it]

[I 2025-08-01 14:40:07,779] Trial 4 finished with value: 0.9058311877181092 and parameters: {'n_estimators': 126, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  33%|███▎      | 5/15 [02:31<04:27, 26.77s/it]

[I 2025-08-01 14:40:30,234] Trial 5 finished with value: 0.9090006738787435 and parameters: {'n_estimators': 203, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  40%|████      | 6/15 [03:03<03:47, 25.30s/it]

[I 2025-08-01 14:41:01,815] Trial 6 finished with value: 0.9097470973538184 and parameters: {'n_estimators': 164, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  47%|████▋     | 7/15 [03:22<03:39, 27.39s/it]

[I 2025-08-01 14:41:21,295] Trial 7 finished with value: 0.9093436919014714 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  53%|█████▎    | 8/15 [03:34<02:53, 24.85s/it]

[I 2025-08-01 14:41:33,507] Trial 8 finished with value: 0.9052117927047891 and parameters: {'n_estimators': 66, 'max_depth': 36, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  60%|██████    | 9/15 [03:48<02:05, 20.89s/it]

[I 2025-08-01 14:41:47,690] Trial 9 finished with value: 0.9089022907441778 and parameters: {'n_estimators': 126, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  67%|██████▋   | 10/15 [04:34<01:34, 18.84s/it]

[I 2025-08-01 14:42:33,644] Trial 10 finished with value: 0.9116850200695618 and parameters: {'n_estimators': 287, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  73%|███████▎  | 11/15 [05:14<01:48, 27.13s/it]

[I 2025-08-01 14:43:13,229] Trial 11 finished with value: 0.9108888736746182 and parameters: {'n_estimators': 277, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  80%|████████  | 12/15 [05:22<01:32, 30.91s/it]

[I 2025-08-01 14:43:21,616] Trial 12 finished with value: 0.9084614524269989 and parameters: {'n_estimators': 59, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  87%|████████▋ | 13/15 [06:05<00:48, 24.08s/it]

[I 2025-08-01 14:44:04,212] Trial 13 finished with value: 0.911159221076747 and parameters: {'n_estimators': 250, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  93%|█████████▎| 14/15 [06:17<00:29, 29.66s/it]

[I 2025-08-01 14:44:15,888] Trial 14 finished with value: 0.9118333987728944 and parameters: {'n_estimators': 90, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096: 100%|██████████| 15/15 [06:17<00:00, 25.16s/it]

2025-08-01 14:44:16,197 - INFO - Best parameters for RF: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}
2025-08-01 14:44:16,198 - INFO - Best cross-validated F1-score for RF: 0.9171
2025-08-01 14:44:16,201 - INFO - Tuning XGB model with 15 trials...



[I 2025-08-01 14:44:16,203] A new study created in memory with name: XGB_tuning_study
  0%|          | 0/15 [00:25<?, ?it/s]

[I 2025-08-01 14:44:41,555] Trial 0 finished with value: 0.922533456170615 and parameters: {'n_estimators': 144, 'max_depth': 12, 'learning_rate': 0.14553179565665345, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'gamma': 1.7699302940633311e-07}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:   7%|▋         | 1/15 [00:47<05:58, 25.64s/it]

[I 2025-08-01 14:45:03,264] Trial 1 finished with value: 0.8968282232826492 and parameters: {'n_estimators': 64, 'max_depth': 11, 'learning_rate': 0.07965261308120507, 'subsample': 0.8832290311184181, 'colsample_bytree': 0.608233797718321, 'gamma': 0.574485163632042}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  13%|█▎        | 2/15 [01:23<05:02, 23.29s/it]

[I 2025-08-01 14:45:40,023] Trial 2 finished with value: 0.8411553008352888 and parameters: {'n_estimators': 258, 'max_depth': 5, 'learning_rate': 0.011551009439226469, 'subsample': 0.6733618039413735, 'colsample_bytree': 0.7216968971838151, 'gamma': 0.00015777981883364995}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  20%|██        | 3/15 [01:41<05:53, 29.47s/it]

[I 2025-08-01 14:45:57,453] Trial 3 finished with value: 0.924591148223439 and parameters: {'n_estimators': 158, 'max_depth': 5, 'learning_rate': 0.08369042894376064, 'subsample': 0.6557975442608167, 'colsample_bytree': 0.7168578594140873, 'gamma': 8.528933855762793e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  27%|██▋       | 4/15 [02:45<04:31, 24.69s/it]

[I 2025-08-01 14:47:01,524] Trial 4 finished with value: 0.8409469883992775 and parameters: {'n_estimators': 164, 'max_depth': 10, 'learning_rate': 0.01254057843022616, 'subsample': 0.8056937753654446, 'colsample_bytree': 0.836965827544817, 'gamma': 2.3528990899815284e-08}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  33%|███▎      | 5/15 [03:05<06:29, 38.91s/it]

[I 2025-08-01 14:47:21,577] Trial 5 finished with value: 0.6888456060996102 and parameters: {'n_estimators': 202, 'max_depth': 4, 'learning_rate': 0.006746417134006626, 'subsample': 0.9795542149013333, 'colsample_bytree': 0.9862528132298237, 'gamma': 0.02932100047183291}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  40%|████      | 6/15 [03:13<04:52, 32.50s/it]

[I 2025-08-01 14:47:29,925] Trial 6 finished with value: 0.909142632349273 and parameters: {'n_estimators': 126, 'max_depth': 3, 'learning_rate': 0.11679817513130797, 'subsample': 0.7760609974958406, 'colsample_bytree': 0.6488152939379115, 'gamma': 9.149877525022172e-05}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  47%|████▋     | 7/15 [03:36<03:16, 24.61s/it]

[I 2025-08-01 14:47:52,719] Trial 7 finished with value: 0.7659710875806484 and parameters: {'n_estimators': 58, 'max_depth': 12, 'learning_rate': 0.01646379567211809, 'subsample': 0.8650089137415928, 'colsample_bytree': 0.7246844304357644, 'gamma': 0.00014472520367197597}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  53%|█████▎    | 8/15 [03:49<02:48, 24.03s/it]

[I 2025-08-01 14:48:05,600] Trial 8 finished with value: 0.9134719551686679 and parameters: {'n_estimators': 187, 'max_depth': 4, 'learning_rate': 0.43464957555697725, 'subsample': 0.9100531293444458, 'colsample_bytree': 0.9757995766256756, 'gamma': 0.14408501080722544}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  60%|██████    | 9/15 [04:58<02:03, 20.55s/it]

[I 2025-08-01 14:49:14,397] Trial 9 finished with value: 0.8332102676585844 and parameters: {'n_estimators': 200, 'max_depth': 12, 'learning_rate': 0.007515450322528414, 'subsample': 0.6783931449676581, 'colsample_bytree': 0.6180909155642152, 'gamma': 4.005370050283172e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  67%|██████▋   | 10/15 [05:48<02:57, 35.44s/it]

[I 2025-08-01 14:50:04,321] Trial 10 finished with value: 0.9143035711477321 and parameters: {'n_estimators': 287, 'max_depth': 7, 'learning_rate': 0.03621799474202481, 'subsample': 0.6071847502459278, 'colsample_bytree': 0.8391524267229545, 'gamma': 0.0033264162114920023}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  73%|███████▎  | 11/15 [06:01<02:39, 39.87s/it]

[I 2025-08-01 14:50:17,796] Trial 11 finished with value: 0.9243616144942959 and parameters: {'n_estimators': 121, 'max_depth': 8, 'learning_rate': 0.22955406185548316, 'subsample': 0.7518416973680894, 'colsample_bytree': 0.7248996679248748, 'gamma': 1.3465901496770342e-07}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  80%|████████  | 12/15 [06:12<01:35, 31.85s/it]

[I 2025-08-01 14:50:28,504] Trial 12 finished with value: 0.9179095040995241 and parameters: {'n_estimators': 106, 'max_depth': 8, 'learning_rate': 0.32976584052032165, 'subsample': 0.7273145000499233, 'colsample_bytree': 0.7624979833916741, 'gamma': 1.307420395434413e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  87%|████████▋ | 13/15 [06:24<00:50, 25.45s/it]

[I 2025-08-01 14:50:40,875] Trial 13 finished with value: 0.9265312829611185 and parameters: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.2197325710169943, 'subsample': 0.610547233762323, 'colsample_bytree': 0.7874820875551369, 'gamma': 6.7169034639277175e-06}. Best is trial 13 with value: 0.9265312829611185.


Best trial: 13. Best value: 0.926531:  93%|█████████▎| 14/15 [06:41<00:21, 21.49s/it]

[I 2025-08-01 14:50:58,077] Trial 14 finished with value: 0.8645482549698402 and parameters: {'n_estimators': 78, 'max_depth': 6, 'learning_rate': 0.04374146076402921, 'subsample': 0.6041792656687146, 'colsample_bytree': 0.8955427636018558, 'gamma': 9.146410590181663e-06}. Best is trial 13 with value: 0.9265312829611185.


Best trial: 13. Best value: 0.926531: 100%|██████████| 15/15 [06:42<00:00, 26.81s/it]

2025-08-01 14:50:58,386 - INFO - Best parameters for XGB: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.2197325710169943, 'subsample': 0.610547233762323, 'colsample_bytree': 0.7874820875551369, 'gamma': 6.7169034639277175e-06}
2025-08-01 14:50:58,389 - INFO - Best cross-validated F1-score for XGB: 0.9265
2025-08-01 14:50:58,392 - INFO - Tuning SVC model with 15 trials...



[I 2025-08-01 14:50:58,394] A new study created in memory with name: SVC_tuning_study
  0%|          | 0/15 [04:48<?, ?it/s]

[I 2025-08-01 14:55:46,416] Trial 0 finished with value: 0.48185640015644654 and parameters: {'C': 0.31489116479568624, 'gamma': 6.351221010640703, 'kernel': 'rbf'}. Best is trial 0 with value: 0.48185640015644654.


Best trial: 0. Best value: 0.481856:   7%|▋         | 1/15 [10:37<1:07:16, 288.31s/it]

[I 2025-08-01 15:01:36,066] Trial 1 finished with value: 0.8040040979459049 and parameters: {'C': 0.04207988669606638, 'gamma': 0.004207053950287938, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.8040040979459049.


Best trial: 1. Best value: 0.804004:  13%|█▎        | 2/15 [11:01<1:10:17, 324.40s/it]

[I 2025-08-01 15:02:00,274] Trial 2 finished with value: 0.9221662225082857 and parameters: {'C': 2.5378155082656657, 'gamma': 0.679657809075816, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.9221662225082857.


Best trial: 2. Best value: 0.922166:  20%|██        | 3/15 [11:59<37:27, 187.32s/it]  

[I 2025-08-01 15:02:57,912] Trial 3 finished with value: 0.9274468578122226 and parameters: {'C': 21.368329072358772, 'gamma': 0.0070689749506246055, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  27%|██▋       | 4/15 [13:37<24:57, 136.10s/it]

[I 2025-08-01 15:04:36,354] Trial 4 finished with value: 0.9000954051681628 and parameters: {'C': 0.1648044642797898, 'gamma': 0.12561043700013558, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  33%|███▎      | 5/15 [17:12<20:25, 122.55s/it]

[I 2025-08-01 15:08:11,166] Trial 5 finished with value: 0.8466772244638058 and parameters: {'C': 2.801635158716261, 'gamma': 0.003613894271216527, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  40%|████      | 6/15 [17:48<23:05, 153.91s/it]

[I 2025-08-01 15:08:47,211] Trial 6 finished with value: 0.8754548416250257 and parameters: {'C': 0.6672367170464207, 'gamma': 1.382623217936987, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  47%|████▋     | 7/15 [21:26<15:23, 115.38s/it]

[I 2025-08-01 15:12:24,950] Trial 7 finished with value: 0.839863591318843 and parameters: {'C': 2.342384984711291, 'gamma': 0.0015339162591163618, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  53%|█████▎    | 8/15 [26:37<17:15, 147.97s/it]

[I 2025-08-01 15:17:35,938] Trial 8 finished with value: 0.08042653953868908 and parameters: {'C': 0.018205657658407266, 'gamma': 6.245139574743075, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  60%|██████    | 9/15 [32:06<19:53, 198.94s/it]

[I 2025-08-01 15:23:04,863] Trial 9 finished with value: 0.8047218721060352 and parameters: {'C': 0.1653693718282443, 'gamma': 0.002458603276328005, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  67%|██████▋   | 10/15 [32:31<19:55, 239.06s/it]

[I 2025-08-01 15:23:29,869] Trial 10 finished with value: 0.9415302848927896 and parameters: {'C': 60.33178530661243, 'gamma': 0.028504320627871515, 'kernel': 'sigmoid'}. Best is trial 10 with value: 0.9415302848927896.


Best trial: 10. Best value: 0.94153:  73%|███████▎  | 11/15 [32:58<11:34, 173.59s/it]

[I 2025-08-01 15:23:56,538] Trial 11 finished with value: 0.9424880088028337 and parameters: {'C': 64.64947866087911, 'gamma': 0.024218157448679556, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  80%|████████  | 12/15 [33:18<06:26, 128.84s/it]

[I 2025-08-01 15:24:16,963] Trial 12 finished with value: 0.9392053629128879 and parameters: {'C': 88.19429776626716, 'gamma': 0.03713740624438133, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  87%|████████▋ | 13/15 [33:41<03:12, 96.02s/it] 

[I 2025-08-01 15:24:40,353] Trial 13 finished with value: 0.9385834226508812 and parameters: {'C': 97.65296156943181, 'gamma': 0.025774482038992817, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  93%|█████████▎| 14/15 [34:05<01:14, 74.11s/it]

[I 2025-08-01 15:25:04,240] Trial 14 finished with value: 0.9365904399909507 and parameters: {'C': 15.59864319752155, 'gamma': 0.13883438990307442, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488: 100%|██████████| 15/15 [34:06<00:00, 136.41s/it]

2025-08-01 15:25:04,497 - INFO - Best parameters for SVC: {'C': 64.64947866087911, 'gamma': 0.024218157448679556, 'kernel': 'sigmoid'}
2025-08-01 15:25:04,499 - INFO - Best cross-validated F1-score for SVC: 0.9425
2025-08-01 15:25:04,501 - INFO - Tuning ETC model with 15 trials...



[I 2025-08-01 15:25:04,505] A new study created in memory with name: ETC_tuning_study
  0%|          | 0/15 [00:04<?, ?it/s]

[I 2025-08-01 15:25:09,290] Trial 0 finished with value: 0.905345439765127 and parameters: {'n_estimators': 144, 'max_depth': 36, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:   7%|▋         | 1/15 [00:07<01:11,  5.13s/it]

[I 2025-08-01 15:25:11,984] Trial 1 finished with value: 0.9033105221386787 and parameters: {'n_estimators': 89, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:  13%|█▎        | 2/15 [00:14<00:48,  3.72s/it]

[I 2025-08-01 15:25:18,918] Trial 2 finished with value: 0.9048320961326024 and parameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:  20%|██        | 3/15 [00:21<01:02,  5.18s/it]

[I 2025-08-01 15:25:25,989] Trial 3 finished with value: 0.9191989621106709 and parameters: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  27%|██▋       | 4/15 [00:26<01:05,  5.93s/it]

[I 2025-08-01 15:25:30,924] Trial 4 finished with value: 0.9037124762642886 and parameters: {'n_estimators': 126, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  33%|███▎      | 5/15 [00:31<00:55,  5.53s/it]

[I 2025-08-01 15:25:36,302] Trial 5 finished with value: 0.9055210896400828 and parameters: {'n_estimators': 203, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  40%|████      | 6/15 [00:38<00:49,  5.53s/it]

[I 2025-08-01 15:25:42,666] Trial 6 finished with value: 0.8996331187012563 and parameters: {'n_estimators': 164, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  47%|████▋     | 7/15 [00:42<00:46,  5.75s/it]

[I 2025-08-01 15:25:47,358] Trial 7 finished with value: 0.8992309400726045 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  53%|█████▎    | 8/15 [00:45<00:38,  5.45s/it]

[I 2025-08-01 15:25:50,126] Trial 8 finished with value: 0.8987381998441906 and parameters: {'n_estimators': 66, 'max_depth': 36, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  60%|██████    | 9/15 [00:49<00:27,  4.63s/it]

[I 2025-08-01 15:25:53,725] Trial 9 finished with value: 0.9065322904908435 and parameters: {'n_estimators': 126, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  67%|██████▋   | 10/15 [00:58<00:21,  4.29s/it]

[I 2025-08-01 15:26:02,552] Trial 10 finished with value: 0.9112263682741635 and parameters: {'n_estimators': 287, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  73%|███████▎  | 11/15 [01:07<00:22,  5.68s/it]

[I 2025-08-01 15:26:11,536] Trial 11 finished with value: 0.907783505879283 and parameters: {'n_estimators': 290, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  80%|████████  | 12/15 [01:16<00:20,  6.68s/it]

[I 2025-08-01 15:26:20,908] Trial 12 finished with value: 0.9131610037820834 and parameters: {'n_estimators': 293, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  87%|████████▋ | 13/15 [01:24<00:14,  7.48s/it]

[I 2025-08-01 15:26:28,432] Trial 13 finished with value: 0.9150964771322074 and parameters: {'n_estimators': 250, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  93%|█████████▎| 14/15 [01:30<00:07,  7.49s/it]

[I 2025-08-01 15:26:35,162] Trial 14 finished with value: 0.9171971586951543 and parameters: {'n_estimators': 242, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199: 100%|██████████| 15/15 [01:31<00:00,  6.07s/it]

2025-08-01 15:26:35,515 - INFO - Best parameters for ETC: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}
2025-08-01 15:26:35,517 - INFO - Best cross-validated F1-score for ETC: 0.9192
2025-08-01 15:26:35,518 - INFO - Hyperparameter tuning completed for all selected models.
2025-08-01 15:26:35,519 - INFO - --- Completed Pipeline Step: Hyperparameter Tuning ---

2025-08-01 15:26:35,520 - INFO - 
--- Starting Pipeline Step: Final Model Training & Evaluation ---
2025-08-01 15:26:35,521 - INFO - Applying SMOTE to the entire training data for final model training...
2025-08-01 15:26:35,563 - INFO - SMOTE applied. Original train: 4135 samples. Resampled train: 7226 samples.
2025-08-01 15:26:35,565 - INFO - Training final LR model on resampled data and evaluating...





2025-08-01 15:26:37,092 - INFO - 
--- Performance for LR ---
2025-08-01 15:26:37,093 - INFO - Accuracy: 0.9787
2025-08-01 15:26:37,094 - INFO - Precision (Spam): 0.8978
2025-08-01 15:26:37,095 - INFO - Recall (Spam): 0.9389
2025-08-01 15:26:37,096 - INFO - F1-Score (Spam): 0.9179
2025-08-01 15:26:37,108 - INFO - 
Full Classification Report for LR:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       903
        spam       0.90      0.94      0.92       131

    accuracy                           0.98      1034
   macro avg       0.94      0.96      0.95      1034
weighted avg       0.98      0.98      0.98      1034

2025-08-01 15:26:37,112 - INFO - 
Raw Confusion Matrix for LR:
[[889  14]
 [  8 123]]
2025-08-01 15:26:37,396 - INFO - Confusion matrix plot for LR saved to /home/dev/spam_classifier_project/plots/confusion_matrix_LR_20250801_152635.png.
2025-08-01 15:26:37,397 - INFO - Training final RF model on resampled data and evalua

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.71it/s]


Prediction for SPAM text: 'WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply.' -> Label: spam, Spam Confidence: 0.9887


Batches: 100%|██████████| 1/1 [00:00<00:00, 42.78it/s]


Prediction for HAM text: 'Hey, just checking in. How are you doing today? Let's catch up soon for coffee!' -> Label: ham, Ham Confidence: 0.9926
