# In [None]:
# Spam Classification Pipeline with EDA, Text Preprocessing, and Model Training
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import Counter
from wordcloud import WordCloud
import pickle
import warnings
import logging
import os
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             BaggingClassifier, ExtraTreesClassifier,
                             GradientBoostingClassifier, VotingClassifier,
                             StackingClassifier)
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna
from datetime import datetime

# --- Determine Base Directory for Notebook/Script ---
# `__file__` is only defined when this code runs as a script or module; when
# it is missing the lookup raises NameError and the current working
# directory is used instead.
try:
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    base_directory = os.getcwd()
    print(f"Running in a notebook environment. Base directory set to CWD: '{base_directory}'")
else:
    base_directory = current_script_dir
    print(f"Running as a script. Base directory set to: '{base_directory}'")


# --- Configuration (Externalize for production) ---
class Config:
    """Static settings shared by the whole pipeline."""

    # Seed, hold-out fraction and Optuna trial budget.
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    N_TRIALS_OPTUNA = 15

    # Model name handed to SentenceTransformer when embeddings are built.
    SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'

    # Input and output locations, all resolved against base_directory.
    DATA_PATH = os.path.join(base_directory, 'spam.csv')
    LOG_FILE = os.path.join(base_directory, 'spam_classifier.log')
    PLOTS_DIR = os.path.join(base_directory, 'plots')
    MODELS_DIR = os.path.join(base_directory, 'models')

# Ensure plot and model directories exist at startup
for _output_dir in (Config.PLOTS_DIR, Config.MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)


class SpamClassifier:
    def __init__(self):
        """Configure logging, NLTK data and plotting, then create empty pipeline state."""
        # Logging is configured first because the following setup steps log.
        self._configure_logging()
        self._verify_nltk_resources()
        self._configure_matplotlib()

        # Raw/cleaned data and the derived feature matrices.
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Text-processing helpers; the embedding model is created lazily.
        self.encoder = LabelEncoder()
        self.ps = PorterStemmer()
        self.sentence_transformer_model = None

        # Tuning and evaluation results.
        self.best_tuned_models_params = {}
        self.best_model = None
        self.best_model_name = None
        self.performance_df = pd.DataFrame()

        # Baseline estimators keyed by short name.
        self.clfs = {}
        self._initialize_classifiers()
        logging.info("SpamClassifier initialized successfully.")

    def _configure_logging(self) -> None:
        """Send INFO-level root logging to the log file and stdout; silence warnings."""
        log_handlers = [
            logging.FileHandler(Config.LOG_FILE),
            logging.StreamHandler(sys.stdout),
        ]
        logging.basicConfig(
            handlers=log_handlers,
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO,
        )
        # NOTE: this suppresses every Python warning for the whole process.
        warnings.filterwarnings('ignore')

    def _verify_nltk_resources(self) -> None:
        """Make sure the NLTK data packages used by the pipeline are available.

        Each resource is looked up locally first and downloaded only when the
        lookup fails. If a download raises or reports failure, the error is
        logged and the process exits with status 1.
        """
        resources = [
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt_tab', 'punkt_tab')
        ]
        for path, package in resources:
            try:
                nltk.data.find(path)
            except LookupError:
                logging.warning(f"NLTK {package} not found. Attempting to download...")
            else:
                logging.info(f"NLTK {package} resource found.")
                continue

            try:
                downloaded = nltk.download(package, quiet=True)
            except Exception as e:
                logging.critical(f"Failed to download NLTK {package}. Error: {e}")
                sys.exit(1)
            # nltk.download() signals most failures (network, unknown package)
            # by returning False instead of raising, so the result is checked.
            if not downloaded:
                logging.critical(f"Failed to download NLTK {package}. Error: nltk.download() returned False")
                sys.exit(1)
            logging.info(f"NLTK {package} downloaded successfully.")

    def _configure_matplotlib(self) -> None:
        """Switch pyplot to non-interactive mode and apply the seaborn theme."""
        plt.ioff()
        sns.set(palette='viridis', style='whitegrid')

    def _initialize_classifiers(self) -> None:
        """Fill ``self.clfs`` with the untuned baseline estimators, keyed by short name."""
        seed = Config.RANDOM_STATE
        # Insertion order is kept identical because later stages iterate this dict.
        registry = {}
        registry['LR'] = LogisticRegression(
            penalty='l1', solver='liblinear', class_weight='balanced',
            max_iter=1000, random_state=seed,
        )
        registry['RF'] = RandomForestClassifier(
            n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=seed,
        )
        # scale_pos_weight starts at 1 and is replaced in eda() from class counts.
        registry['XGB'] = XGBClassifier(
            n_estimators=100, eval_metric='logloss', scale_pos_weight=1, random_state=seed,
        )
        registry['SVC'] = SVC(
            kernel='sigmoid', gamma=1.0, probability=True,
            class_weight='balanced', random_state=seed,
        )
        registry['KN'] = KNeighborsClassifier()
        registry['AdaBoost'] = AdaBoostClassifier(n_estimators=100, random_state=seed)
        registry['BgC'] = BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=seed)
        registry['ETC'] = ExtraTreesClassifier(
            n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=seed,
        )
        registry['GBDT'] = GradientBoostingClassifier(n_estimators=100, random_state=seed)
        registry['DT'] = DecisionTreeClassifier(
            max_depth=5, class_weight='balanced', random_state=seed,
        )
        self.clfs = registry

    def load_data(self) -> None:
        """Read the CSV at ``Config.DATA_PATH`` into ``self.df``.

        The process exits with status 1 when the file is missing, cannot be
        read, or contains fewer than 100 rows.
        """
        csv_path = Config.DATA_PATH
        try:
            if not os.path.exists(csv_path):
                raise FileNotFoundError(f"Data file not found at {os.path.abspath(csv_path)}")
            self.df = pd.read_csv(csv_path, encoding='latin-1')
            row_count = len(self.df)
            if row_count < 100:
                raise ValueError(f"Dataset too small ({row_count} samples). Minimum 100 samples required for robust analysis.")
            logging.info(f"Loaded {row_count} records from {csv_path}.")
        except Exception as e:
            # Failures raised above are reported here and end the run.
            logging.critical(f"Data loading failed: {e}")
            sys.exit(1)

    def clean_data(self) -> None:
        """Reduce ``self.df`` to an encoded ``target`` column and a ``text`` column.

        Steps:
          1. Select the label/text columns: ``v1``/``v2`` when present,
             otherwise the first columns whose names contain
             target/label/type and text/message/sms respectively.
          2. Rename them to ``target`` and ``text``.
          3. Drop rows whose target is not ``'ham'`` or ``'spam'``.
          4. Encode the target with ``self.encoder``.
          5. Remove duplicate rows and rows containing nulls.

        Any failure, including an empty result, is logged and ends the
        process with exit status 1.
        """
        try:
            if 'v1' in self.df.columns and 'v2' in self.df.columns:
                self.df = self.df[['v1', 'v2']].copy()
                logging.info("Selected 'v1' and 'v2' columns from the dataset.")
            else:
                found_v1 = next((col for col in self.df.columns if 'target' in col.lower() or 'label' in col.lower() or 'type' in col.lower()), None)
                found_v2 = next((col for col in self.df.columns if 'text' in col.lower() or 'message' in col.lower() or 'sms' in col.lower()), None)
                if found_v1 and found_v2:
                    self.df = self.df[[found_v1, found_v2]].copy()
                    logging.info(f"Mapped columns '{found_v1}' to 'target' and '{found_v2}' to 'text' using heuristics.")
                else:
                    raise ValueError(f"Could not find required 'target' and 'text' columns (v1/v2 or equivalents) in dataset. Found columns: {self.df.columns.tolist()}")

            self.df.columns = ['target', 'text']
            valid_targets = {'ham', 'spam'}
            invalid_targets = set(self.df['target'].unique()) - valid_targets
            if invalid_targets:
                logging.warning(f"Invalid target values found: {invalid_targets}. Filtering out rows with these values.")
                # Copy the filtered rows so the column assignment below writes
                # to an independent frame rather than to a slice of the old one.
                self.df = self.df[self.df['target'].isin(valid_targets)].copy()
                if self.df.empty:
                    raise ValueError("No valid 'ham' or 'spam' records remaining after filtering invalid targets. Dataset is empty.")

            self.df['target'] = self.encoder.fit_transform(self.df['target'])
            initial_rows = len(self.df)
            self.df.drop_duplicates(inplace=True)
            self.df.dropna(inplace=True)

            logging.info(f"Cleaned dataset. Removed {initial_rows - len(self.df)} duplicates/nulls. Remaining: {len(self.df)} records.")
            if self.df.empty:
                raise ValueError("Dataset became empty after cleaning steps. Check data quality or initial loading.")
        except Exception as e:
            logging.critical(f"Data cleaning failed: {e}")
            sys.exit(1)

    def _safe_tokenize(self, text: str) -> list[str]:
        """Tokenize lower-cased ``text`` and keep only alphanumeric tokens.

        Non-string input is converted with ``str()``. If tokenization raises,
        a warning is logged and an empty list is returned.
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for tokenization: {text[:50]}...")
        try:
            kept = []
            for tok in nltk.word_tokenize(text.lower()):
                if tok.isalnum() and tok not in string.punctuation:
                    kept.append(tok)
            return kept
        except Exception as e:
            logging.warning(f"Tokenization failed for text (first 50 chars: '{text[:50]}...'). Returning empty list. Error: {e}")
            return []

    def eda(self) -> None:
        """Add length features to ``self.df``, set XGBoost's class weight, save EDA plots.

        Adds ``num_words``, ``num_chars`` and ``num_sentences`` columns, sets
        ``scale_pos_weight`` on the XGB estimator to the ham/spam ratio, writes
        four plots to ``Config.PLOTS_DIR`` and logs per-class statistics.

        Raises:
            Exception: Any failure is logged and then re-raised.
        """
        try:
            frame = self.df
            frame['num_words'] = frame['text'].apply(lambda x: len(self._safe_tokenize(x)))
            frame['num_chars'] = frame['text'].apply(len)
            frame['num_sentences'] = frame['text'].apply(lambda x: len(nltk.sent_tokenize(x)))

            # Encoded label values and per-class subsets, reused by everything below.
            ham_label = self.encoder.transform(['ham'])[0]
            spam_label = self.encoder.transform(['spam'])[0]
            ham_rows = frame[frame['target'] == ham_label]
            spam_rows = frame[frame['target'] == spam_label]
            ham_count, spam_count = ham_rows.shape[0], spam_rows.shape[0]

            if not spam_count:
                logging.warning("No spam samples found to calculate scale_pos_weight for XGBoost. Defaulting to 1.")
            else:
                scale_pos_weight_val = ham_count / spam_count
                self.clfs['XGB'].set_params(scale_pos_weight=scale_pos_weight_val)
                logging.info(f"Set XGBoost scale_pos_weight to: {scale_pos_weight_val:.2f} (Ham:{ham_count}, Spam:{spam_count})")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            length_columns = ['num_chars', 'num_words', 'num_sentences']

            def save_class_histograms(column, title, xlabel, file_stem, log_name):
                # Overlay the ham and spam histograms of one feature and save the figure.
                fig, ax = plt.subplots(figsize=(14, 6))
                sns.histplot(data=ham_rows, x=column, ax=ax, bins=50, kde=True, color='blue', label='Ham')
                sns.histplot(data=spam_rows, x=column, ax=ax, bins=50, kde=True, color='red', label='Spam')
                ax.set_title(title)
                ax.set_xlabel(xlabel)
                ax.set_ylabel('Count')
                ax.legend()
                out_path = os.path.join(Config.PLOTS_DIR, f'{file_stem}_{timestamp}.png')
                plt.savefig(out_path, bbox_inches='tight')
                plt.close(fig)
                logging.info(f"{log_name} distribution plot saved to {out_path}.")

            # Plot 1: class balance pie chart.
            fig1, ax1 = plt.subplots(figsize=(8, 8))
            class_counts = frame['target'].value_counts()
            class_counts.plot(
                kind='pie', ax=ax1, autopct='%1.1f%%',
                labels=self.encoder.inverse_transform(class_counts.index),
                colors=sns.color_palette('pastel')[0:2],
                explode=[0, 0.1]
            )
            ax1.set_title('Target Class Distribution')
            ax1.set_ylabel('')
            fig1_filename = os.path.join(Config.PLOTS_DIR, f'target_distribution_{timestamp}.png')
            plt.savefig(fig1_filename, bbox_inches='tight')
            plt.close(fig1)
            logging.info(f"Target distribution plot saved to {fig1_filename}.")

            # Plots 2 and 3: per-class word and character count histograms.
            save_class_histograms('num_words', 'Word Count Distribution by Target Class',
                                  'Number of Words', 'word_count_distribution', 'Word count')
            save_class_histograms('num_chars', 'Character Count Distribution by Target Class',
                                  'Number of Characters', 'char_count_distribution', 'Character count')

            # Plot 4: correlation of the length features with the target.
            fig4, ax4 = plt.subplots(figsize=(8, 6))
            correlations = frame[length_columns + ['target']].corr()
            sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax4)
            ax4.set_title('Correlation Matrix of Text Features and Target')
            fig4_filename = os.path.join(Config.PLOTS_DIR, f'correlation_heatmap_{timestamp}.png')
            plt.savefig(fig4_filename, bbox_inches='tight')
            plt.close(fig4)
            logging.info(f"Correlation heatmap plot saved to {fig4_filename}.")

            logging.info(f"Descriptive statistics for Ham emails:\n{ham_rows[length_columns].describe()}")
            logging.info(f"Descriptive statistics for Spam emails:\n{spam_rows[length_columns].describe()}")

        except Exception as e:
            logging.error(f"EDA process failed: {e}")
            raise

    def transform_text(self, text: str) -> str:
        """Normalize a message into a space-separated string of stemmed tokens.

        The text is lower-cased and tokenized; non-alphanumeric tokens and
        English stop words are dropped; the rest are Porter-stemmed; stems of
        length one are kept only when they are digits.

        Args:
            text: Raw message. Non-string values are converted with ``str()``.

        Returns:
            The remaining stems joined by single spaces (possibly empty).
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for transform_text: {text[:50]}...")

        # Load the stop-word list once per instance: this method is applied to
        # every row, and rebuilding the set on each call re-reads the corpus.
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            stop_words = self._stop_words = frozenset(stopwords.words('english'))

        kept_stems = []
        for token in nltk.word_tokenize(text.lower()):
            # isalnum() already excludes punctuation-only tokens.
            if not token.isalnum() or token in stop_words:
                continue
            stem = self.ps.stem(token)
            if len(stem) > 1 or stem.isdigit():
                kept_stems.append(stem)
        return " ".join(kept_stems)

    def preprocess_text(self) -> None:
        """Add ``transformed_text`` to ``self.df`` and save word-cloud and top-word plots.

        Spam is processed before ham for both plot types. Any failure is
        logged and ends the process with exit status 1.
        """
        try:
            logging.info("\n--- Text Preprocessing for EDA and Visualizations ---")
            self.df['transformed_text'] = self.df['text'].apply(self.transform_text)
            logging.info("Text transformation for EDA complete. Example:")
            logging.info(f"\n{self.df[['text', 'transformed_text']].head().to_string()}")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            logging.info("\nGenerating Word Clouds (saved to plots directory):")
            cloud_specs = (
                ('spam', 'Spam Word Cloud', 'spam_wordcloud', 'Spam'),
                ('ham', 'Ham Word Cloud', 'ham_wordcloud', 'Ham'),
            )
            for class_name, plot_title, file_stem, log_name in cloud_specs:
                class_value = self.encoder.transform([class_name])[0]
                class_text = self.df[self.df['target'] == class_value]['transformed_text'].str.cat(sep=" ")
                cloud = WordCloud(width=800, height=400, min_font_size=10, background_color='white').generate(class_text)
                plt.figure(figsize=(10, 5))
                plt.imshow(cloud)
                plt.title(plot_title)
                plt.axis('off')
                out_path = os.path.join(Config.PLOTS_DIR, f'{file_stem}_{timestamp}.png')
                plt.savefig(out_path, bbox_inches='tight')
                plt.close()
                logging.info(f"{log_name} word cloud saved to {out_path}.")

            word_specs = (
                ('spam', "\nMost common words in Spam (saved as plot):", 'Top 30 Spam Words', 'top_spam_words'),
                ('ham', "\nMost common words in Ham (saved as plot):", 'Top 30 Ham Words', 'top_ham_words'),
            )
            for class_name, heading, plot_title, file_stem in word_specs:
                logging.info(heading)
                class_value = self.encoder.transform([class_name])[0]
                corpus = ' '.join(self.df[self.df['target'] == class_value]['transformed_text']).split()
                self._plot_most_common_words(corpus, title=plot_title, filename=f'{file_stem}_{timestamp}.png')

        except Exception as e:
            logging.critical(f"Text preprocessing for EDA failed: {e}")
            sys.exit(1)

    def _plot_most_common_words(self, corpus: list[str], title: str, n: int = 30, filename: str = "common_words.png") -> None:
        """Save a bar chart of the ``n`` most frequent words in ``corpus``.

        Args:
            corpus: Flat list of tokens to count.
            title: Plot title, also used in the log message.
            n: Number of top words to show.
            filename: File name created inside ``Config.PLOTS_DIR``.
        """
        top_words = pd.DataFrame(Counter(corpus).most_common(n), columns=['Word', 'Count'])
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=top_words, x='Word', y='Count', palette='viridis', ax=ax)
        # Rotate the word labels so they do not overlap.
        ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical')
        ax.set_title(title)
        out_path = os.path.join(Config.PLOTS_DIR, filename)
        plt.savefig(out_path, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Plot '{title}' saved to {out_path}.")

    def vectorize_text_with_embeddings(self) -> None:
        """Embed the raw ``text`` column into ``self.X`` and copy targets into ``self.y``.

        The SentenceTransformer named in ``Config`` is created on first use
        and reused afterwards. Any failure is logged and ends the process
        with exit status 1.
        """
        try:
            logging.info(f"\n--- Text Vectorization (SentenceTransformer: {Config.SENTENCE_TRANSFORMER_MODEL}) ---")
            embedder = self.sentence_transformer_model
            if embedder is None:
                embedder = SentenceTransformer(Config.SENTENCE_TRANSFORMER_MODEL)
                self.sentence_transformer_model = embedder

            messages = self.df['text'].tolist()
            self.X = embedder.encode(
                messages,
                batch_size=64,
                convert_to_tensor=False,
                show_progress_bar=True,
            )
            self.y = self.df['target'].values
            logging.info(f"SentenceTransformer embedding complete. X shape: {self.X.shape}, Y shape: {self.y.shape}.")
        except Exception as e:
            logging.critical(f"Text vectorization failed: {e}")
            sys.exit(1)

    def split_data(self) -> None:
        """Create a stratified train/test split of ``self.X`` and ``self.y``.

        The split size and seed come from ``Config``. Any failure is logged
        and ends the process with exit status 1.
        """
        try:
            split_parts = train_test_split(
                self.X,
                self.y,
                test_size=Config.TEST_SIZE,
                random_state=Config.RANDOM_STATE,
                stratify=self.y,
            )
            self.X_train, self.X_test, self.y_train, self.y_test = split_parts
            train_size, test_size = len(self.X_train), len(self.X_test)
            logging.info(f"Data split: Train {train_size} samples, Test {test_size} samples.")
            # Per-class counts confirm the stratification in the log.
            logging.info(f"Train target distribution: {np.bincount(self.y_train)}")
            logging.info(f"Test target distribution: {np.bincount(self.y_test)}")
        except Exception as e:
            logging.critical(f"Data splitting failed: {e}")
            sys.exit(1)

    def _objective(self, trial: optuna.trial.Trial, model_name: str) -> float:
        """Optuna objective: mean cross-validated F1 of a SMOTE + classifier pipeline.

        Hyperparameters for the requested model family are sampled from the
        trial, the classifier is wrapped in a pipeline that oversamples with
        SMOTE inside each fold, and the pipeline is scored with 5-fold
        stratified cross-validation on the training split.

        Args:
            trial: Optuna trial used to sample hyperparameters.
            model_name: Short key of the model family ('LR', 'RF', 'XGB',
                'SVC', 'KN', 'AdaBoost', 'BgC', 'ETC', 'GBDT' or 'DT').

        Returns:
            Mean F1 score over the folds.

        Raises:
            ValueError: If ``model_name`` has no search space defined here.
        """
        # suggest_float(..., log=True) / suggest_float(...) replace the
        # deprecated suggest_loguniform / suggest_uniform; parameter names are
        # unchanged so study.best_trial.params keeps the same keys.
        if model_name == 'LR':
            c_param = trial.suggest_float('C', 1e-4, 1e2, log=True)
            solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
            # The tuned values are later applied to self.clfs['LR'] with
            # set_params(), so they are evaluated with that estimator's penalty
            # instead of silently tuning a different (default L2) model.
            penalty = self.clfs['LR'].get_params().get('penalty', 'l2')
            model = LogisticRegression(C=c_param, solver=solver, penalty=penalty,
                                       random_state=Config.RANDOM_STATE,
                                       class_weight='balanced', max_iter=2000,
                                       n_jobs=-1 if solver == 'saga' else None)
        elif model_name == 'RF':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'XGB':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 3, 12)
            learning_rate = trial.suggest_float('learning_rate', 0.005, 0.5, log=True)
            subsample = trial.suggest_float('subsample', 0.6, 1.0)
            colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
            gamma = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
            # Reuse the class weight already stored on the base XGB estimator.
            current_scale_pos_weight = self.clfs['XGB'].get_params().get('scale_pos_weight', 1)
            model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  learning_rate=learning_rate, subsample=subsample,
                                  colsample_bytree=colsample_bytree, gamma=gamma,
                                  random_state=Config.RANDOM_STATE,
                                  eval_metric='logloss',
                                  scale_pos_weight=current_scale_pos_weight)
        elif model_name == 'SVC':
            C_param = trial.suggest_float('C', 1e-2, 1e2, log=True)
            gamma_param = trial.suggest_float('gamma', 1e-3, 1e1, log=True)
            kernel = trial.suggest_categorical('kernel', ['rbf', 'sigmoid'])
            model = SVC(C=C_param, gamma=gamma_param, kernel=kernel, probability=True,
                        random_state=Config.RANDOM_STATE, class_weight='balanced')
        elif model_name == 'KN':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, n_jobs=-1)
        elif model_name == 'AdaBoost':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
            model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=Config.RANDOM_STATE)
        elif model_name == 'BgC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            model = BaggingClassifier(n_estimators=n_estimators, random_state=Config.RANDOM_STATE, n_jobs=-1)
        elif model_name == 'ETC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'GBDT':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
            max_depth = trial.suggest_int('max_depth', 3, 10)
            model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=Config.RANDOM_STATE)
        elif model_name == 'DT':
            max_depth = trial.suggest_int('max_depth', 3, 20)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf, criterion=criterion,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced')
        else:
            raise ValueError(f"Model '{model_name}' is not configured for Optuna tuning.")

        # SMOTE sits inside the pipeline so only each fold's training part is
        # oversampled; the validation part is scored as-is.
        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=Config.RANDOM_STATE)),
            ('classifier', model)
        ])
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.RANDOM_STATE)
        # NOTE(review): scoring='f1' presumably scores the class encoded as 1
        # (spam, given the alphabetical LabelEncoder order) - confirm.
        scores = cross_val_score(pipeline, self.X_train, self.y_train, cv=cv, scoring='f1', n_jobs=-1)
        return scores.mean()

    def tune_models(self) -> None:
        """Tune a fixed subset of the classifiers with Optuna and apply the best parameters.

        For each of LR, RF, XGB, SVC and ETC a seeded TPE study maximizes the
        cross-validated F1 returned by ``_objective``. The best parameters are
        stored in ``self.best_tuned_models_params`` and applied to the matching
        estimator in ``self.clfs``. If the data has not been split yet,
        ``split_data()`` is called first.

        Any failure is logged and ends the process with exit status 1.
        """
        try:
            if self.X_train is None or self.y_train is None:
                logging.error("Data not split for tuning. Calling split_data().")
                self.split_data()

            logging.info("Starting hyperparameter tuning with Optuna for selected models...")
            models_to_tune = ['LR', 'RF', 'XGB', 'SVC', 'ETC']

            for name in models_to_tune:
                if name not in self.clfs:
                    logging.warning(f"Model '{name}' not found in initialized classifiers, skipping tuning.")
                    continue

                logging.info(f"Tuning {name} model with {Config.N_TRIALS_OPTUNA} trials...")
                study = optuna.create_study(direction='maximize',
                                            sampler=optuna.samplers.TPESampler(seed=Config.RANDOM_STATE),
                                            study_name=f"{name}_tuning_study")

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    # Bind the current name as a default so the callback does
                    # not depend on the loop variable.
                    study.optimize(lambda trial, model_name=name: self._objective(trial, model_name),
                                   n_trials=Config.N_TRIALS_OPTUNA,
                                   show_progress_bar=True,
                                   gc_after_trial=True)

                best_params = study.best_trial.params
                self.best_tuned_models_params[name] = best_params
                logging.info(f"Best parameters for {name}: {best_params}")
                logging.info(f"Best cross-validated F1-score for {name}: {study.best_trial.value:.4f}")

                # set_params() only touches the tuned keys, so settings that
                # are not tuned (e.g. XGB's scale_pos_weight) stay as they were.
                self.clfs[name].set_params(**best_params)

            logging.info("Hyperparameter tuning completed for all selected models.")
        except Exception as e:
            logging.critical(f"Model tuning failed: {e}")
            sys.exit(1)

    def train_final_models(self) -> None:
        """Train every classifier in ``self.clfs`` behind SMOTE and evaluate it on the test split.

        For each model a SMOTE + classifier pipeline is fitted on the training
        split and scored on the test split (accuracy plus precision, recall
        and F1 for spam); its confusion matrix is saved as a plot. The
        pipeline with the highest spam F1 is stored in ``self.best_model`` and
        pickled, and all scores are collected in ``self.performance_df``.

        A failure of a single model is logged and recorded with NaN scores.
        Any other failure is logged and ends the process with exit status 1.

        NOTE(review): the winner is selected by its test-set F1, so the score
        reported for it is an optimistic estimate.
        """
        try:
            if self.X_train is None or self.X_test is None:
                logging.error("Data not split for final training. Calling split_data().")
                self.split_data()

            # SMOTE runs inside each pipeline's fit(); a separate up-front
            # fit_resample() would be computed and then thrown away.
            logging.info(f"Training final models on {len(self.X_train)} training samples; SMOTE is applied inside each pipeline.")

            # Encoded spam label and class names are the same for every model.
            spam_label = self.encoder.transform(['spam'])[0]
            class_names = self.encoder.classes_

            results = []
            best_f1_overall = -1
            self.best_model = None
            self.best_model_name = None
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            for name, model in self.clfs.items():
                logging.info(f"Training final {name} model on resampled data and evaluating...")
                try:
                    pipeline = ImbPipeline([('smote', SMOTE(random_state=Config.RANDOM_STATE)), ('classifier', model)])
                    pipeline.fit(self.X_train, self.y_train)
                    y_pred = pipeline.predict(self.X_test)

                    accuracy = accuracy_score(self.y_test, y_pred)
                    precision = precision_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    recall = recall_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    f1 = f1_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)

                    report_dict = classification_report(self.y_test, y_pred, target_names=class_names, output_dict=True)

                    results.append({
                        'Model': name,
                        'Accuracy': accuracy,
                        'Precision (Spam)': precision,
                        'Recall (Spam)': recall,
                        'F1-Score (Spam)': f1,
                        'Full Classification Report': report_dict
                    })

                    logging.info(f"\n--- Performance for {name} ---")
                    logging.info(f"Accuracy: {accuracy:.4f}")
                    logging.info(f"Precision (Spam): {precision:.4f}")
                    logging.info(f"Recall (Spam): {recall:.4f}")
                    logging.info(f"F1-Score (Spam): {f1:.4f}")
                    logging.info(f"\nFull Classification Report for {name}:\n{classification_report(self.y_test, y_pred, target_names=class_names)}")

                    cm = confusion_matrix(self.y_test, y_pred)
                    logging.info(f"\nRaw Confusion Matrix for {name}:\n{cm}")
                    cm_filename = self._save_confusion_matrix_plot(cm, name, timestamp)
                    logging.info(f"Confusion matrix plot for {name} saved to {cm_filename}.")

                    if f1 > best_f1_overall:
                        best_f1_overall = f1
                        self.best_model_name = name
                        self.best_model = pipeline  # keep the whole fitted pipeline
                except Exception as model_e:
                    logging.error(f"Error training or evaluating model {name}: {model_e}")
                    results.append({
                        'Model': name,
                        'Accuracy': np.nan,
                        'Precision (Spam)': np.nan,
                        'Recall (Spam)': np.nan,
                        'F1-Score (Spam)': np.nan,
                        'Full Classification Report': {'error': str(model_e)}
                    })

            self.performance_df = pd.DataFrame(results)
            self.performance_df = self.performance_df.sort_values(by='F1-Score (Spam)', ascending=False).reset_index(drop=True)
            if self.best_model is None:
                logging.warning("No model was trained successfully; no best model identified.")
            else:
                logging.info(f"\n--- Overall Best Model Identified: {self.best_model_name} (F1-Score on Spam: {best_f1_overall:.4f}) ---")
            logging.info("All model evaluations completed.")
            self._save_best_model()
            self._plot_performance_comparison(timestamp)

        except Exception as e:
            logging.critical(f"Final model training and evaluation failed: {e}")
            sys.exit(1)

    def _save_confusion_matrix_plot(self, cm, name: str, timestamp: str) -> str:
        """Save a labelled heatmap of confusion matrix ``cm`` and return the file path."""
        fig_cm, ax_cm = plt.subplots(figsize=(7, 6))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                        xticklabels=self.encoder.classes_,
                        yticklabels=self.encoder.classes_,
                        linecolor='gray', linewidths=0.5,
                        annot_kws={"size": 14}, ax=ax_cm)
            ax_cm.set_xlabel('Predicted Label', fontsize=12)
            ax_cm.set_ylabel('True Label', fontsize=12)
            ax_cm.set_title(f'Confusion Matrix for {name}', fontsize=14)
            cm_filename = os.path.join(Config.PLOTS_DIR, f'confusion_matrix_{name}_{timestamp}.png')
            fig_cm.savefig(cm_filename, bbox_inches='tight')
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig_cm)
        return cm_filename

    def _save_best_model(self) -> None:
        """Saves the best performing model and related components to a pickle file."""
        try:
            if self.best_model is None or self.best_model_name is None:
                logging.warning("No best model identified or stored. Skipping model save operation.")
                return
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = os.path.join(Config.MODELS_DIR, f'best_model_{self.best_model_name}_{timestamp}.pkl')
            with open(model_filename, 'wb') as f:
                pickle.dump({
                    'model': self.best_model,
                    'transformer': Config.SENTENCE_TRANSFORMER_MODEL, # THIS IS THE FIX
                    'encoder': self.encoder,
                    'model_name': self.best_model_name,
                    'performance_summary': self.performance_df.to_dict('records')
                }, f)
            logging.info(f"Best performing model ({self.best_model_name}) saved to {model_filename}")
        except Exception as e:
            logging.error(f"Failed to save the best model: {e}")

    def _plot_performance_comparison(self, timestamp: str) -> None:
        if self.performance_df.empty:
            logging.warning("Performance DataFrame is empty, cannot plot comparison.")
            return
        plot_df = self.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].copy()
        plot_df_melted = plot_df.melt(id_vars="Model", var_name="Metric", value_name="Score")
        fig, ax = plt.subplots(figsize=(14, 7))
        sns.barplot(x='Model', y='Score', hue='Metric', data=plot_df_melted, palette='tab10', ax=ax)
        ax.set_ylim(0.5, 1.0)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title('Model Performance Comparison (Test Set)')
        ax.set_ylabel('Score')
        ax.set_xlabel('Model')
        ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plot_filename = os.path.join(Config.PLOTS_DIR, f'model_performance_comparison_{timestamp}.png')
        plt.savefig(plot_filename, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Model performance comparison plot saved to {plot_filename}.")

    def run_pipeline(self) -> bool:
        """Run every pipeline stage in order and report overall success.

        Returns:
            ``True`` when all stages finish; ``False`` as soon as a stage
            raises ``SystemExit`` or any other ``Exception`` (the failure is
            logged at CRITICAL level and later stages are not run).
        """
        stages = (
            ('Data Loading', self.load_data),
            ('Data Cleaning', self.clean_data),
            ('EDA and Feature Engineering', self.eda),
            ('Text Preprocessing for EDA', self.preprocess_text),
            ('Text Vectorization (Embeddings)', self.vectorize_text_with_embeddings),
            ('Data Splitting', self.split_data),
            ('Hyperparameter Tuning', self.tune_models),
            ('Final Model Training & Evaluation', self.train_final_models),
        )

        for label, action in stages:
            logging.info(f"\n--- Starting Pipeline Step: {label} ---")
            try:
                action()
            except SystemExit:
                # Stages signal fatal errors with sys.exit(); turn that into a
                # False return instead of terminating the interpreter.
                logging.critical(f"Pipeline stopped due to critical error in step: '{label}'.")
                return False
            except Exception as stage_err:
                logging.critical(f"Pipeline failed unexpectedly in step '{label}': {stage_err}")
                return False
            logging.info(f"--- Completed Pipeline Step: {label} ---\n")

        logging.info("Spam classification pipeline completed successfully.")
        return True

    @staticmethod
    def load_for_inference(model_path: str) -> 'SpamClassifier':
        """Rebuild a ready-to-predict classifier from a pickled model bundle.

        Args:
            model_path: Path to the pickle file holding the bundle.

        Returns:
            A new ``SpamClassifier`` whose model, label encoder, model name and
            sentence transformer are populated from the bundle.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
            Exception: Any other loading failure is logged at CRITICAL level
                and re-raised unchanged.
        """
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found at {os.path.abspath(model_path)}")

            # NOTE(review): pickle.load can execute arbitrary code from the
            # file - only load bundles from a trusted location.
            with open(model_path, 'rb') as bundle_file:
                bundle = pickle.load(bundle_file)

            restored = SpamClassifier()
            restored.best_model = bundle['model']
            restored.encoder = bundle['encoder']
            restored.best_model_name = bundle.get('model_name', 'Unknown_Model')

            # The bundle may carry either the transformer's name (a string) or
            # a pickled transformer object; a name is re-instantiated.
            embedder_ref = bundle['transformer']
            if not isinstance(embedder_ref, str):
                logging.warning("Loaded SentenceTransformer object directly from pickle.")
                restored.sentence_transformer_model = embedder_ref
            else:
                logging.info(f"Loading SentenceTransformer by name: '{embedder_ref}'")
                restored.sentence_transformer_model = SentenceTransformer(embedder_ref)

            restored.ps = PorterStemmer()
            logging.info(f"Model '{restored.best_model_name}' loaded successfully from {model_path} for inference.")
            return restored
        except Exception as load_err:
            logging.critical(f"Failed to load model for inference from {model_path}: {load_err}")
            raise

    def predict(self, text: str) -> str:
        """Classify a single message and return its decoded label.

        Args:
            text: The raw message to classify.

        Returns:
            The label produced by ``self.encoder.inverse_transform`` for the
            model's prediction, or the string ``"error"`` when embedding or
            prediction fails.

        Raises:
            RuntimeError: If the model, sentence transformer or encoder has
                not been set on this instance.
        """
        if self.best_model is None or self.sentence_transformer_model is None or self.encoder is None:
            logging.error("Model components not loaded. Please run run_pipeline() or load model using load_for_inference() before calling predict().")
            raise RuntimeError("Model components not available for prediction.")
        try:
            vector = self.sentence_transformer_model.encode([text], convert_to_tensor=False)
            prediction_encoded = self.best_model.predict(vector)[0]
            return self.encoder.inverse_transform([prediction_encoded])[0]
        except Exception as e:
            # Build the preview defensively: slicing a non-string input (e.g.
            # None) here would raise inside the handler and mask the original
            # failure instead of returning "error".
            preview = text[:50] if isinstance(text, str) else repr(text)[:50]
            logging.error(f"Prediction failed for text '{preview}...': {e}")
            return "error"


if __name__ == '__main__':
    # Train, evaluate and persist; then reload the newest saved bundle and
    # classify a handful of sample messages as a smoke test.
    classifier = SpamClassifier()
    pipeline_success = classifier.run_pipeline()

    if not pipeline_success:
        logging.critical("Spam classification pipeline failed during execution. Please review the log file for details.")
        sys.exit(1)

    summary_columns = ['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']
    logging.info("\n=== Spam Classification Pipeline Completed Successfully ===")
    logging.info("Overall Model Performance Summary (Sorted by F1-Score on Spam):")
    print(classifier.performance_df[summary_columns].to_string())
    logging.info(f"\nBest Performing Model Identified: {classifier.best_model_name}")
    logging.info(f"Check '{Config.PLOTS_DIR}' for EDA and Confusion Matrix plots.")
    logging.info(f"Check '{Config.MODELS_DIR}' for the saved best model.")

    try:
        logging.info("\n--- Demonstrating Model Inference from Saved Model ---")
        model_files = [
            entry for entry in os.listdir(Config.MODELS_DIR)
            if entry.startswith('best_model_') and entry.endswith('.pkl')
        ]
        if not model_files:
            logging.warning("No model files found in the 'models' directory to demonstrate inference. Run the pipeline first.")
        else:
            # Newest bundle by modification time.
            latest_model_file = max(
                model_files,
                key=lambda entry: os.path.getmtime(os.path.join(Config.MODELS_DIR, entry)),
            )
            latest_model_path = os.path.join(Config.MODELS_DIR, latest_model_file)
            logging.info(f"Attempting to load the latest best model from: {latest_model_path}")
            loaded_classifier = SpamClassifier.load_for_inference(latest_model_path)

            demo_messages = [
                ("SPAM text 1", "WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply."),
                ("SPAM text 2", "URGENT! Your bank account has been locked due to suspicious activity. Verify immediately at http://bit.ly/malicious-site to avoid closure."),
                ("HAM text 1", "Hey, just checking in. How are you doing today? Let's catch up soon for coffee!"),
                ("HAM text 2", "Hi mom, can you pick up milk and bread on your way home? Thanks, love you!"),
                ("EMPTY/NOISY text", "???!!!#@%"),
            ]
            for position, (kind, message) in enumerate(demo_messages):
                # Only the first line is preceded by a blank line.
                lead = "\n" if position == 0 else ""
                verdict = loaded_classifier.predict(message)
                print(f"{lead}Prediction for {kind}: '{message}' -> {verdict}")
    except Exception as demo_err:
        logging.error(f"An error occurred during the inference demonstration: {demo_err}")
        sys.exit(1)

In [1]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import Counter
from wordcloud import WordCloud
import pickle
import warnings
import logging
import os
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             BaggingClassifier, ExtraTreesClassifier,
                             GradientBoostingClassifier, VotingClassifier,
                             StackingClassifier)
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna
from datetime import datetime

# --- Determine Base Directory for Notebook/Script ---
# `__file__` only exists when this code runs as a script; referencing it in a
# notebook raises NameError, in which case the current working directory is used.
try:
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    base_directory = os.getcwd()
    print(f"Running in a notebook environment. Base directory set to CWD: '{base_directory}'")
else:
    base_directory = current_script_dir
    print(f"Running as a script. Base directory set to: '{base_directory}'")


# --- Configuration (Externalize for production) ---
class Config:
    """Paths and tunables shared by the whole pipeline."""

    # Seed passed as `random_state` to estimators and the train/test split.
    RANDOM_STATE = 42
    # Fraction of the data held out as the test set.
    TEST_SIZE = 0.2
    # Presumably the number of Optuna trials per model - confirm against tune_models.
    N_TRIALS_OPTUNA = 15

    # Model name handed to SentenceTransformer.
    SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'

    # Input and output locations, all anchored at base_directory.
    DATA_PATH = os.path.join(base_directory, 'spam.csv')
    LOG_FILE = os.path.join(base_directory, 'spam_classifier.log')
    PLOTS_DIR = os.path.join(base_directory, 'plots')
    MODELS_DIR = os.path.join(base_directory, 'models')

# Create the plot and model output directories up front (no-op if they exist).
for _output_dir in (Config.PLOTS_DIR, Config.MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)


class SpamClassifier:
    def __init__(self):
        """Configure the environment and create empty pipeline state."""
        # Environment setup runs first so later log calls go through the
        # handlers installed by _configure_logging.
        self._configure_logging()
        self._verify_nltk_resources()
        self._configure_matplotlib()

        # Data containers, filled in by the pipeline steps.
        self.df = None

        # Text tooling.
        self.encoder = LabelEncoder()
        self.ps = PorterStemmer()
        self.sentence_transformer_model = None

        # Feature matrix / labels and their train/test partitions.
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        # Model registry, tuning results and evaluation outcome.
        self.clfs = {}
        self.best_tuned_models_params = {}
        self.best_model = None
        self.best_model_name = None
        self.performance_df = pd.DataFrame()

        self._initialize_classifiers()
        logging.info("SpamClassifier initialized successfully.")

    def _configure_logging(self) -> None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(Config.LOG_FILE),
                logging.StreamHandler(sys.stdout)
            ]
        )
        warnings.filterwarnings('ignore')

    def _verify_nltk_resources(self) -> None:
        """Ensure the NLTK data packages used by the pipeline are available.

        Each resource is looked up locally; a missing one is downloaded. The
        process exits with status 1 when a download fails.
        """
        resources = (
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('tokenizers/punkt_tab', 'punkt_tab'),
        )
        for path, package in resources:
            try:
                nltk.data.find(path)
            except LookupError:
                logging.warning(f"NLTK {package} not found. Attempting to download...")
            else:
                logging.info(f"NLTK {package} resource found.")
                continue

            try:
                # nltk.download() reports failures (unknown package, network
                # error) by returning False rather than raising, so the return
                # value must be checked or a failed download is logged as a
                # success.
                if not nltk.download(package, quiet=True):
                    raise RuntimeError(f"nltk.download('{package}') reported failure")
            except Exception as e:
                logging.critical(f"Failed to download NLTK {package}. Error: {e}")
                sys.exit(1)
            logging.info(f"NLTK {package} downloaded successfully.")

    def _configure_matplotlib(self) -> None:
        """Turn off pyplot interactive mode and apply the seaborn theme used for all plots."""
        plt.ioff()
        theme = {'style': 'whitegrid', 'palette': 'viridis'}
        sns.set(**theme)

    def _initialize_classifiers(self) -> None:
        """Populate ``self.clfs`` with the base estimators plus a soft-voting ensemble."""
        seed = Config.RANDOM_STATE

        logistic = LogisticRegression(
            solver='liblinear', penalty='l1', random_state=seed,
            class_weight='balanced', max_iter=1000,
        )
        forest = RandomForestClassifier(
            n_estimators=100, random_state=seed, class_weight='balanced', n_jobs=-1,
        )
        # scale_pos_weight starts at 1 here; eda() overwrites it with the
        # ham/spam ratio.
        boosted = XGBClassifier(
            n_estimators=100, random_state=seed, eval_metric='logloss', scale_pos_weight=1,
        )
        svm = SVC(kernel='sigmoid', gamma=1.0, probability=True, random_state=seed, class_weight='balanced')

        # Insertion order is kept exactly: it defines the order in which
        # callers iterating over self.clfs see the models.
        self.clfs = {
            'LR': logistic,
            'RF': forest,
            'XGB': boosted,
            'SVC': svm,
            'KN': KNeighborsClassifier(),
            'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=seed),
            'BgC': BaggingClassifier(n_estimators=100, random_state=seed, n_jobs=-1),
            'ETC': ExtraTreesClassifier(n_estimators=100, random_state=seed, class_weight='balanced', n_jobs=-1),
            'GBDT': GradientBoostingClassifier(n_estimators=100, random_state=seed),
            'DT': DecisionTreeClassifier(max_depth=5, random_state=seed, class_weight='balanced'),
        }

        # Soft-voting ensemble built from the same estimator objects that are
        # registered above (probability-based, with example weights).
        ensemble_members = [
            ('xgb', boosted),
            ('svc', svm),
            ('rf', forest),
        ]
        self.clfs['Voting'] = VotingClassifier(
            estimators=ensemble_members,
            voting='soft',
            weights=[0.3, 0.4, 0.3],
            n_jobs=-1,
        )
        logging.info("Initialized all individual and ensemble classifiers.")

    def load_data(self) -> None:
        """Read the CSV at ``Config.DATA_PATH`` into ``self.df``.

        The file is decoded as latin-1. The process exits with status 1 when
        the file is missing, cannot be read, or holds fewer than 100 rows.
        """
        try:
            source = Config.DATA_PATH
            if not os.path.exists(source):
                raise FileNotFoundError(f"Data file not found at {os.path.abspath(source)}")

            self.df = pd.read_csv(source, encoding='latin-1')
            row_count = len(self.df)
            if row_count < 100:
                raise ValueError(f"Dataset too small ({row_count} samples). Minimum 100 samples required for robust analysis.")
            logging.info(f"Loaded {row_count} records from {source}.")
        except Exception as load_err:
            logging.critical(f"Data loading failed: {load_err}")
            sys.exit(1)

    def clean_data(self) -> None:
        """Reduce ``self.df`` to encoded ``target`` / ``text`` columns without duplicates or nulls.

        Column selection prefers ``v1``/``v2``; otherwise the first column
        whose name contains target/label/type and the first whose name
        contains text/message/sms are used. Rows whose label is not ``ham`` or
        ``spam`` are dropped, labels are encoded with ``self.encoder``, and
        duplicate and null rows are removed.

        The process exits with status 1 when the columns cannot be found or
        the frame becomes empty.
        """
        try:
            if 'v1' in self.df.columns and 'v2' in self.df.columns:
                self.df = self.df[['v1', 'v2']].copy()
                logging.info("Selected 'v1' and 'v2' columns from the dataset.")
            else:
                found_v1 = next((col for col in self.df.columns if 'target' in col.lower() or 'label' in col.lower() or 'type' in col.lower()), None)
                found_v2 = next((col for col in self.df.columns if 'text' in col.lower() or 'message' in col.lower() or 'sms' in col.lower()), None)
                if found_v1 and found_v2:
                    self.df = self.df[[found_v1, found_v2]].copy()
                    logging.info(f"Mapped columns '{found_v1}' to 'target' and '{found_v2}' to 'text' using heuristics.")
                else:
                    raise ValueError(f"Could not find required 'target' and 'text' columns (v1/v2 or equivalents) in dataset. Found columns: {self.df.columns.tolist()}")

            self.df.columns = ['target', 'text']
            valid_targets = {'ham', 'spam'}
            invalid_targets = set(self.df['target'].unique()) - valid_targets
            if invalid_targets:
                logging.warning(f"Invalid target values found: {invalid_targets}. Filtering out rows with these values.")
                # Take an explicit copy: the frame is assigned into and
                # modified in place below, and doing that on a boolean-mask
                # slice is chained assignment (SettingWithCopyWarning, which
                # the global warning filter would otherwise hide).
                self.df = self.df[self.df['target'].isin(valid_targets)].copy()
                if self.df.empty:
                    raise ValueError("No valid 'ham' or 'spam' records remaining after filtering invalid targets. Dataset is empty.")

            self.df['target'] = self.encoder.fit_transform(self.df['target'])
            initial_rows = len(self.df)
            self.df.drop_duplicates(inplace=True)
            self.df.dropna(inplace=True)

            logging.info(f"Cleaned dataset. Removed {initial_rows - len(self.df)} duplicates/nulls. Remaining: {len(self.df)} records.")
            if self.df.empty:
                raise ValueError("Dataset became empty after cleaning steps. Check data quality or initial loading.")
        except Exception as e:
            logging.critical(f"Data cleaning failed: {e}")
            sys.exit(1)

    def _safe_tokenize(self, text: str) -> list[str]:
        """Tokenize ``text`` into lower-cased alphanumeric tokens, never raising.

        Non-string input is coerced with ``str()``. If tokenization fails for
        any reason, a warning is logged and an empty list is returned.
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for tokenization: {text[:50]}...")

        try:
            kept = []
            for token in nltk.word_tokenize(text.lower()):
                if token.isalnum() and token not in string.punctuation:
                    kept.append(token)
            return kept
        except Exception as tok_err:
            logging.warning(f"Tokenization failed for text (first 50 chars: '{text[:50]}...'). Returning empty list. Error: {tok_err}")
            return []

    def eda(self) -> None:
        """Derive length features, set the XGBoost class weight and save the EDA plots.

        Adds ``num_words``, ``num_chars`` and ``num_sentences`` columns to
        ``self.df``, sets ``scale_pos_weight`` on the ``'XGB'`` classifier to
        the ham/spam row ratio, writes four timestamped PNG files to
        ``Config.PLOTS_DIR`` (class pie chart, word-count and character-count
        histograms, correlation heatmap) and logs per-class descriptive
        statistics.

        Raises:
            Exception: Any failure is logged at ERROR level and re-raised.
        """
        try:
            self.df['num_words'] = self.df['text'].apply(lambda msg: len(self._safe_tokenize(msg)))
            self.df['num_chars'] = self.df['text'].apply(len)
            self.df['num_sentences'] = self.df['text'].apply(lambda msg: len(nltk.sent_tokenize(msg)))

            # Encoded label values and the per-class row subsets, reused below.
            ham_code = self.encoder.transform(['ham'])[0]
            spam_code = self.encoder.transform(['spam'])[0]
            ham_rows = self.df[self.df['target'] == ham_code]
            spam_rows = self.df[self.df['target'] == spam_code]
            length_columns = ['num_chars', 'num_words', 'num_sentences']

            ham_count = ham_rows.shape[0]
            spam_count = spam_rows.shape[0]
            if spam_count > 0:
                scale_pos_weight_val = ham_count / spam_count
                if 'XGB' in self.clfs:
                    self.clfs['XGB'].set_params(scale_pos_weight=scale_pos_weight_val)
                logging.info(f"Set XGBoost scale_pos_weight to: {scale_pos_weight_val:.2f} (Ham:{ham_count}, Spam:{spam_count})")
            else:
                logging.warning("No spam samples found to calculate scale_pos_weight for XGBoost. Defaulting to 1.")

            # One timestamp shared by every file written in this call.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # --- Class balance pie chart ---
            fig_pie, ax_pie = plt.subplots(figsize=(8, 8))
            class_counts = self.df['target'].value_counts()
            class_counts.plot(
                kind='pie', ax=ax_pie, autopct='%1.1f%%',
                labels=self.encoder.inverse_transform(class_counts.index),
                colors=sns.color_palette('pastel')[0:2],
                explode=[0, 0.1],
            )
            ax_pie.set_title('Target Class Distribution')
            ax_pie.set_ylabel('')
            pie_path = os.path.join(Config.PLOTS_DIR, f'target_distribution_{timestamp}.png')
            plt.savefig(pie_path, bbox_inches='tight')
            plt.close(fig_pie)
            logging.info(f"Target distribution plot saved to {pie_path}.")

            # --- Overlaid ham/spam histograms for each length feature ---
            histogram_specs = (
                ('num_words', 'Word Count Distribution by Target Class', 'Number of Words',
                 'word_count_distribution', 'Word count distribution'),
                ('num_chars', 'Character Count Distribution by Target Class', 'Number of Characters',
                 'char_count_distribution', 'Character count distribution'),
            )
            for column, plot_title, x_label, file_stem, log_label in histogram_specs:
                fig_hist, ax_hist = plt.subplots(figsize=(14, 6))
                sns.histplot(data=ham_rows, x=column, ax=ax_hist, bins=50, kde=True, color='blue', label='Ham')
                sns.histplot(data=spam_rows, x=column, ax=ax_hist, bins=50, kde=True, color='red', label='Spam')
                ax_hist.set_title(plot_title)
                ax_hist.set_xlabel(x_label)
                ax_hist.set_ylabel('Count')
                ax_hist.legend()
                hist_path = os.path.join(Config.PLOTS_DIR, f'{file_stem}_{timestamp}.png')
                plt.savefig(hist_path, bbox_inches='tight')
                plt.close(fig_hist)
                logging.info(f"{log_label} plot saved to {hist_path}.")

            # --- Correlation between the length features and the label ---
            fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
            sns.heatmap(self.df[length_columns + ['target']].corr(), annot=True, cmap='coolwarm', ax=ax_corr)
            ax_corr.set_title('Correlation Matrix of Text Features and Target')
            corr_path = os.path.join(Config.PLOTS_DIR, f'correlation_heatmap_{timestamp}.png')
            plt.savefig(corr_path, bbox_inches='tight')
            plt.close(fig_corr)
            logging.info(f"Correlation heatmap plot saved to {corr_path}.")

            logging.info(f"Descriptive statistics for Ham emails:\n{ham_rows[length_columns].describe()}")
            logging.info(f"Descriptive statistics for Spam emails:\n{spam_rows[length_columns].describe()}")

        except Exception as eda_err:
            logging.error(f"EDA process failed: {eda_err}")
            raise

    def transform_text(self, text: str) -> str:
        """Normalise a message into a space-joined string of stemmed tokens.

        The text is lower-cased and tokenized; only alphanumeric tokens that
        are not English stop words are kept and stemmed with ``self.ps``;
        stems of length one are dropped unless they are digits.

        Args:
            text: Message to transform; non-strings are coerced with ``str()``.

        Returns:
            The surviving stems joined by single spaces (possibly empty).
        """
        if not isinstance(text, str):
            text = str(text)
            logging.debug(f"Coerced non-string text to string for transform_text: {text[:50]}...")

        # This method is applied once per row, so the stop-word list is loaded
        # from the NLTK corpus only on first use and cached on the instance
        # instead of being re-read and rebuilt on every call.
        stop_words = getattr(self, '_stop_words_cache', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            self._stop_words_cache = stop_words

        # An alphanumeric token can never be a punctuation string, so no
        # separate punctuation check is needed after isalnum().
        kept_tokens = [
            token for token in nltk.word_tokenize(text.lower())
            if token.isalnum() and token not in stop_words
        ]
        stems = (self.ps.stem(token) for token in kept_tokens)
        return " ".join(stem for stem in stems if len(stem) > 1 or stem.isdigit())

    def preprocess_text(self) -> None:
        """Add a ``transformed_text`` column and save word-cloud and top-word plots per class.

        For spam and then ham, a word cloud PNG is written to
        ``Config.PLOTS_DIR``; afterwards a top-30 word bar chart is produced
        for each class via ``_plot_most_common_words``. Any failure is logged
        at CRITICAL level and the process exits with status 1.
        """
        try:
            logging.info("\n--- Text Preprocessing for EDA and Visualizations ---")
            self.df['transformed_text'] = self.df['text'].apply(self.transform_text)
            logging.info("Text transformation for EDA complete. Example:")
            logging.info(f"\n{self.df[['text', 'transformed_text']].head().to_string()}")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            # (raw label understood by the encoder, display name); spam first.
            class_specs = (('spam', 'Spam'), ('ham', 'Ham'))

            logging.info("\nGenerating Word Clouds (saved to plots directory):")
            for raw_label, display_name in class_specs:
                class_code = self.encoder.transform([raw_label])[0]
                class_texts = self.df[self.df['target'] == class_code]['transformed_text']
                cloud = WordCloud(width=800, height=400, min_font_size=10, background_color='white').generate(
                    class_texts.str.cat(sep=" ")
                )
                plt.figure(figsize=(10, 5))
                plt.imshow(cloud)
                plt.title(f'{display_name} Word Cloud')
                plt.axis('off')
                cloud_path = os.path.join(Config.PLOTS_DIR, f'{raw_label}_wordcloud_{timestamp}.png')
                plt.savefig(cloud_path, bbox_inches='tight')
                plt.close()
                logging.info(f"{display_name} word cloud saved to {cloud_path}.")

            for raw_label, display_name in class_specs:
                logging.info(f"\nMost common words in {display_name} (saved as plot):")
                class_code = self.encoder.transform([raw_label])[0]
                class_corpus = ' '.join(self.df[self.df['target'] == class_code]['transformed_text']).split()
                self._plot_most_common_words(
                    class_corpus,
                    title=f'Top 30 {display_name} Words',
                    filename=f'top_{raw_label}_words_{timestamp}.png',
                )

        except Exception as prep_err:
            logging.critical(f"Text preprocessing for EDA failed: {prep_err}")
            sys.exit(1)

    def _plot_most_common_words(self, corpus: list[str], title: str, n: int = 30, filename: str = "common_words.png") -> None:
        """Save a bar chart of the ``n`` most frequent tokens in ``corpus``.

        Args:
            corpus: Flat list of tokens to count.
            title: Chart title, also used in the log message.
            n: Number of top tokens to plot.
            filename: File name (inside ``Config.PLOTS_DIR``) for the PNG.
        """
        top_words = pd.DataFrame(Counter(corpus).most_common(n), columns=['Word', 'Count'])

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x='Word', y='Count', data=top_words, ax=ax, palette='viridis')
        ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical')
        ax.set_title(title)

        destination = os.path.join(Config.PLOTS_DIR, filename)
        plt.savefig(destination, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Plot '{title}' saved to {destination}.")

    def vectorize_text_with_embeddings(self) -> None:
        """Encode the raw ``text`` column into embeddings and store features and labels.

        The sentence transformer is created on first use from
        ``Config.SENTENCE_TRANSFORMER_MODEL``. Results are stored in
        ``self.X`` (embeddings) and ``self.y`` (the ``target`` column values).
        Any failure is logged at CRITICAL level and the process exits with
        status 1.
        """
        try:
            logging.info(f"\n--- Text Vectorization (SentenceTransformer: {Config.SENTENCE_TRANSFORMER_MODEL}) ---")

            embedder = self.sentence_transformer_model
            if embedder is None:
                embedder = SentenceTransformer(Config.SENTENCE_TRANSFORMER_MODEL)
                self.sentence_transformer_model = embedder

            # The untransformed message text is embedded, not 'transformed_text'.
            messages = self.df['text'].tolist()
            self.X = embedder.encode(
                messages,
                show_progress_bar=True,
                convert_to_tensor=False,
                batch_size=64,
            )
            self.y = self.df['target'].values
            logging.info(f"SentenceTransformer embedding complete. X shape: {self.X.shape}, Y shape: {self.y.shape}.")
        except Exception as embed_err:
            logging.critical(f"Text vectorization failed: {embed_err}")
            sys.exit(1)

    def split_data(self) -> None:
        """Create a stratified train/test split of ``self.X`` / ``self.y``.

        Uses ``Config.TEST_SIZE`` and ``Config.RANDOM_STATE`` and logs the
        size and label counts of each partition. Any failure is logged at
        CRITICAL level and the process exits with status 1.
        """
        try:
            partitions = train_test_split(
                self.X,
                self.y,
                test_size=Config.TEST_SIZE,
                random_state=Config.RANDOM_STATE,
                stratify=self.y,
            )
            self.X_train, self.X_test, self.y_train, self.y_test = partitions

            logging.info(f"Data split: Train {len(self.X_train)} samples, Test {len(self.X_test)} samples.")
            for split_name, labels in (("Train", self.y_train), ("Test", self.y_test)):
                logging.info(f"{split_name} target distribution: {np.bincount(labels)}")
        except Exception as split_err:
            logging.critical(f"Data splitting failed: {split_err}")
            sys.exit(1)

    def _objective(self, trial: optuna.trial.Trial, model_name: str) -> float:
        if model_name == 'LR':
            c_param = trial.suggest_loguniform('C', 1e-4, 1e2)
            solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
            model = LogisticRegression(C=c_param, solver=solver, random_state=Config.RANDOM_STATE,
                                       class_weight='balanced', max_iter=2000,
                                       n_jobs=-1 if solver == 'saga' else None)
        elif model_name == 'RF':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'XGB':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 3, 12)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.005, 0.5)
            subsample = trial.suggest_uniform('subsample', 0.6, 1.0)
            colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
            gamma = trial.suggest_loguniform('gamma', 1e-8, 1.0)
            current_scale_pos_weight = self.clfs['XGB'].get_params().get('scale_pos_weight', 1)
            model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  learning_rate=learning_rate, subsample=subsample,
                                  colsample_bytree=colsample_bytree, gamma=gamma,
                                  random_state=Config.RANDOM_STATE,
                                  eval_metric='logloss',
                                  scale_pos_weight=current_scale_pos_weight)
        elif model_name == 'SVC':
            C_param = trial.suggest_loguniform('C', 1e-2, 1e2)
            gamma_param = trial.suggest_loguniform('gamma', 1e-3, 1e1)
            kernel = trial.suggest_categorical('kernel', ['rbf', 'sigmoid'])
            model = SVC(C=C_param, gamma=gamma_param, kernel=kernel, probability=True,
                        random_state=Config.RANDOM_STATE, class_weight='balanced')
        elif model_name == 'KN':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
            model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, n_jobs=-1)
        elif model_name == 'AdaBoost':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 1.0)
            model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=Config.RANDOM_STATE)
        elif model_name == 'BgC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            model = BaggingClassifier(n_estimators=n_estimators, random_state=Config.RANDOM_STATE, n_jobs=-1)
        elif model_name == 'ETC':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 5, 40, log=True)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            model = ExtraTreesClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         random_state=Config.RANDOM_STATE, class_weight='balanced', n_jobs=-1)
        elif model_name == 'GBDT':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 1.0)
            max_depth = trial.suggest_int('max_depth', 3, 10)
            model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=Config.RANDOM_STATE)
        elif model_name == 'DT':
            max_depth = trial.suggest_int('max_depth', 3, 20)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
            criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
            model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                           min_samples_leaf=min_samples_leaf, criterion=criterion,
                                           random_state=Config.RANDOM_STATE, class_weight='balanced')
        else:
            raise ValueError(f"Model '{model_name}' is not configured for Optuna tuning.")

        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=Config.RANDOM_STATE)),
            ('classifier', model)
        ])
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.RANDOM_STATE)
        scores = cross_val_score(pipeline, self.X_train, self.y_train, cv=cv, scoring='f1', n_jobs=-1)
        return scores.mean()

    def tune_models(self) -> None:
        """Tune a fixed subset of classifiers with Optuna and apply the best parameters.

        For each of 'LR', 'RF', 'XGB', 'SVC' and 'ETC' that is present in
        ``self.clfs``, a TPE-sampled Optuna study maximizes the value returned
        by ``self._objective`` over ``Config.N_TRIALS_OPTUNA`` trials. The best
        trial's parameters are stored in ``self.best_tuned_models_params`` and
        applied in place to the matching estimator in ``self.clfs``.

        If the training split is missing, ``self.split_data()`` is called first.

        Raises:
            SystemExit: with status 1 if any part of the tuning procedure fails
                (the failure is logged at CRITICAL level with its traceback).
        """
        try:
            if self.X_train is None or self.y_train is None:
                logging.error("Data not split for tuning. Calling split_data().")
                self.split_data()

            logging.info("Starting hyperparameter tuning with Optuna for selected models...")
            models_to_tune = ['LR', 'RF', 'XGB', 'SVC', 'ETC']

            for name in models_to_tune:
                if name not in self.clfs:
                    logging.warning(f"Model '{name}' not found in initialized classifiers, skipping tuning.")
                    continue

                logging.info(f"Tuning {name} model with {Config.N_TRIALS_OPTUNA} trials...")
                study = optuna.create_study(direction='maximize',
                                            sampler=optuna.samplers.TPESampler(seed=Config.RANDOM_STATE),
                                            study_name=f"{name}_tuning_study")

                # UserWarnings raised while trials run are silenced only for the
                # duration of the optimization.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    # The lambda is consumed inside this iteration, so closing
                    # over the loop variable `name` is safe here.
                    study.optimize(lambda trial: self._objective(trial, name),
                                   n_trials=Config.N_TRIALS_OPTUNA,
                                   show_progress_bar=True,
                                   gc_after_trial=True)

                best_params = study.best_trial.params
                self.best_tuned_models_params[name] = best_params
                logging.info(f"Best parameters for {name}: {best_params}")
                logging.info(f"Best cross-validated F1-score for {name}: {study.best_trial.value:.4f}")

                # set_params() only changes the keys it is given, so parameters
                # that were not tuned (e.g. XGB's scale_pos_weight) keep their
                # current value; no read-and-write-back step is needed.
                self.clfs[name].set_params(**best_params)

            logging.info("Hyperparameter tuning completed for all selected models.")
        except Exception as e:
            # exc_info keeps the traceback in the log; the message alone is
            # rarely enough to locate a failure inside a trial.
            logging.critical(f"Model tuning failed: {e}", exc_info=True)
            sys.exit(1)

    def train_final_models(self) -> None:
        """Fit every classifier in ``self.clfs`` and evaluate it on the test split.

        Each classifier is wrapped in an imbalanced-learn pipeline with a SMOTE
        step, fitted on the training split and scored on the test split
        (accuracy, plus precision/recall/F1 for the 'spam' class). A confusion
        matrix plot per model is written under ``Config.PLOTS_DIR``.

        Results:
            * ``self.best_model`` / ``self.best_model_name``: the fitted
              pipeline with the highest spam F1 and its key in ``self.clfs``
              (both stay ``None`` if no model could be evaluated).
            * ``self.performance_df``: one row per model, sorted by spam F1
              in descending order.

        Afterwards ``self._save_best_model()`` and
        ``self._plot_performance_comparison()`` are called.

        A failure while training or evaluating a single model is logged and
        recorded as a row of NaN metrics; the remaining models still run.

        Raises:
            SystemExit: with status 1 if anything outside the per-model
                handling fails (including resolving the 'spam' label).
        """
        try:
            if self.X_train is None or self.X_test is None:
                logging.error("Data not split for final training. Calling split_data().")
                self.split_data()

            # Diagnostic only: this resample is used for the size log below.
            # It is not used for fitting; every pipeline in the loop runs its
            # own SMOTE step on the original training split.
            logging.info("Applying SMOTE to the entire training data for final model training...")
            smote = SMOTE(random_state=Config.RANDOM_STATE)
            X_train_resampled, _ = smote.fit_resample(self.X_train, self.y_train)
            logging.info(f"SMOTE applied. Original train: {len(self.X_train)} samples. Resampled train: {len(X_train_resampled)} samples.")

            results = []
            best_f1_overall = -1
            self.best_model = None
            self.best_model_name = None
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # The encoded spam label does not depend on the model, so resolve
            # it once instead of three times per iteration.
            spam_label = self.encoder.transform(['spam'])[0]

            # Training loop now includes the VotingClassifier
            for name, model in self.clfs.items():
                logging.info(f"Training final {name} model on resampled data and evaluating...")
                try:
                    # SMOTE lives inside the pipeline so it is applied at fit
                    # time only; prediction passes data straight to the model.
                    pipeline = ImbPipeline([('smote', SMOTE(random_state=Config.RANDOM_STATE)), ('classifier', model)])
                    pipeline.fit(self.X_train, self.y_train)
                    y_pred = pipeline.predict(self.X_test)

                    accuracy = accuracy_score(self.y_test, y_pred)
                    precision = precision_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    recall = recall_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)
                    f1 = f1_score(self.y_test, y_pred, pos_label=spam_label, zero_division=0)

                    report_dict = classification_report(self.y_test, y_pred, target_names=self.encoder.classes_, output_dict=True)

                    results.append({
                        'Model': name,
                        'Accuracy': accuracy,
                        'Precision (Spam)': precision,
                        'Recall (Spam)': recall,
                        'F1-Score (Spam)': f1,
                        'Full Classification Report': report_dict
                    })

                    logging.info(f"\n--- Performance for {name} ---")
                    logging.info(f"Accuracy: {accuracy:.4f}")
                    logging.info(f"Precision (Spam): {precision:.4f}")
                    logging.info(f"Recall (Spam): {recall:.4f}")
                    logging.info(f"F1-Score (Spam): {f1:.4f}")
                    logging.info(f"\nFull Classification Report for {name}:\n{classification_report(self.y_test, y_pred, target_names=self.encoder.classes_)}")

                    cm = confusion_matrix(self.y_test, y_pred)
                    logging.info(f"\nRaw Confusion Matrix for {name}:\n{cm}")

                    fig_cm, ax_cm = plt.subplots(figsize=(7, 6))
                    try:
                        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                                    xticklabels=self.encoder.classes_,
                                    yticklabels=self.encoder.classes_,
                                    linecolor='gray', linewidths=0.5,
                                    annot_kws={"size": 14}, ax=ax_cm)
                        ax_cm.set_xlabel('Predicted Label', fontsize=12)
                        ax_cm.set_ylabel('True Label', fontsize=12)
                        ax_cm.set_title(f'Confusion Matrix for {name}', fontsize=14)
                        cm_filename = os.path.join(Config.PLOTS_DIR, f'confusion_matrix_{name}_{timestamp}.png')
                        fig_cm.savefig(cm_filename, bbox_inches='tight')
                    finally:
                        # Always release the figure, even when drawing or
                        # saving fails, so figures do not pile up across models.
                        plt.close(fig_cm)
                    logging.info(f"Confusion matrix plot for {name} saved to {cm_filename}.")

                    if f1 > best_f1_overall:
                        best_f1_overall = f1
                        self.best_model_name = name
                        self.best_model = pipeline # Store the entire pipeline
                except Exception as model_e:
                    logging.error(f"Error training or evaluating model {name}: {model_e}", exc_info=True)
                    results.append({
                        'Model': name,
                        'Accuracy': np.nan,
                        'Precision (Spam)': np.nan,
                        'Recall (Spam)': np.nan,
                        'F1-Score (Spam)': np.nan,
                        'Full Classification Report': {'error': str(model_e)}
                    })

            self.performance_df = pd.DataFrame(results)
            self.performance_df = self.performance_df.sort_values(by='F1-Score (Spam)', ascending=False).reset_index(drop=True)
            logging.info(f"\n--- Overall Best Model Identified: {self.best_model_name} (F1-Score on Spam: {best_f1_overall:.4f}) ---")
            logging.info("All model evaluations completed.")
            self._save_best_model()
            self._plot_performance_comparison(timestamp)

        except Exception as e:
            logging.critical(f"Final model training and evaluation failed: {e}", exc_info=True)
            sys.exit(1)

    def _save_best_model(self) -> None:
        """Saves the best performing model and related components to a pickle file."""
        try:
            if self.best_model is None or self.best_model_name is None:
                logging.warning("No best model identified or stored. Skipping model save operation.")
                return
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = os.path.join(Config.MODELS_DIR, f'best_model_{self.best_model_name}_{timestamp}.pkl')
            with open(model_filename, 'wb') as f:
                pickle.dump({
                    'model': self.best_model,
                    'transformer': Config.SENTENCE_TRANSFORMER_MODEL,
                    'encoder': self.encoder,
                    'model_name': self.best_model_name,
                    'performance_summary': self.performance_df.to_dict('records')
                }, f)
            logging.info(f"Best performing model ({self.best_model_name}) saved to {model_filename}")
        except Exception as e:
            logging.error(f"Failed to save the best model: {e}")

    def _plot_performance_comparison(self, timestamp: str) -> None:
        if self.performance_df.empty:
            logging.warning("Performance DataFrame is empty, cannot plot comparison.")
            return
        plot_df = self.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].copy()
        plot_df_melted = plot_df.melt(id_vars="Model", var_name="Metric", value_name="Score")
        fig, ax = plt.subplots(figsize=(14, 7))
        sns.barplot(x='Model', y='Score', hue='Metric', data=plot_df_melted, palette='tab10', ax=ax)
        ax.set_ylim(0.5, 1.0)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title('Model Performance Comparison (Test Set)')
        ax.set_ylabel('Score')
        ax.set_xlabel('Model')
        ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plot_filename = os.path.join(Config.PLOTS_DIR, f'model_performance_comparison_{timestamp}.png')
        plt.savefig(plot_filename, bbox_inches='tight')
        plt.close(fig)
        logging.info(f"Model performance comparison plot saved to {plot_filename}.")

    def run_pipeline(self) -> bool:
        steps = [
            ('Data Loading', self.load_data),
            ('Data Cleaning', self.clean_data),
            ('EDA and Feature Engineering', self.eda),
            ('Text Preprocessing for EDA', self.preprocess_text),
            ('Text Vectorization (Embeddings)', self.vectorize_text_with_embeddings),
            ('Data Splitting', self.split_data),
            ('Hyperparameter Tuning', self.tune_models),
            ('Final Model Training & Evaluation', self.train_final_models)
        ]
        for name, step in steps:
            try:
                logging.info(f"\n--- Starting Pipeline Step: {name} ---")
                step()
                logging.info(f"--- Completed Pipeline Step: {name} ---\n")
            except SystemExit:
                logging.critical(f"Pipeline stopped due to critical error in step: '{name}'.")
                return False
            except Exception as e:
                logging.critical(f"Pipeline failed unexpectedly in step '{name}': {e}")
                return False
        logging.info("Spam classification pipeline completed successfully.")
        return True

    @staticmethod
    def load_for_inference(model_path: str) -> 'SpamClassifier':
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found at {os.path.abspath(model_path)}")
            with open(model_path, 'rb') as f:
                data = pickle.load(f)
            classifier = SpamClassifier()
            classifier.best_model = data['model']
            classifier.encoder = data['encoder']
            classifier.best_model_name = data.get('model_name', 'Unknown_Model')
            transformer_data = data['transformer']
            if isinstance(transformer_data, str):
                logging.info(f"Loading SentenceTransformer by name: '{transformer_data}'")
                classifier.sentence_transformer_model = SentenceTransformer(transformer_data)
            else:
                logging.warning("Loaded SentenceTransformer object directly from pickle.")
                classifier.sentence_transformer_model = transformer_data
            classifier.ps = PorterStemmer()
            logging.info(f"Model '{classifier.best_model_name}' loaded successfully from {model_path} for inference.")
            return classifier
        except Exception as e:
            logging.critical(f"Failed to load model for inference from {model_path}: {e}")
            raise

    def predict(self, text: str) -> str:
        """
        Predicts the label for a given text.
        This method is now a simplified wrapper for predict_with_confidence.
        """
        prediction_label, _, _ = self.predict_with_confidence(text)
        return prediction_label

    def predict_with_confidence(self, text: str) -> tuple[str, float, float]:
        """
        Predicts the label and returns the confidence score for spam/ham.
        Returns: (prediction_label, spam_confidence, ham_confidence)
        """
        if self.best_model is None or self.sentence_transformer_model is None or self.encoder is None:
            logging.error("Model components not loaded. Please load model using load_for_inference() before calling predict().")
            raise RuntimeError("Model components not available for prediction.")
        try:
            vector = self.sentence_transformer_model.encode([text], convert_to_tensor=False)
            prediction_encoded = self.best_model.predict(vector)[0]
            prediction_label = self.encoder.inverse_transform([prediction_encoded])[0]

            # Get probabilities and confidence
            prediction_proba = self.best_model.predict_proba(vector)[0]
            classes = self.best_model.named_steps['classifier'].classes_
            
            spam_prob_idx = np.where(classes == self.encoder.transform(['spam'])[0])[0]
            ham_prob_idx = np.where(classes == self.encoder.transform(['ham'])[0])[0]
            
            spam_confidence = prediction_proba[spam_prob_idx][0] if spam_prob_idx.size > 0 else 0.0
            ham_confidence = prediction_proba[ham_prob_idx][0] if ham_prob_idx.size > 0 else 0.0
            
            return prediction_label, spam_confidence, ham_confidence

        except Exception as e:
            logging.error(f"Prediction failed for text '{text[:50]}...': {e}")
            return "error", 0.0, 0.0

# Script entry point: run the full training pipeline, print the summary and
# then demonstrate inference with the most recently written model file.
# Exits with status 1 if the pipeline or the inference demonstration fails.
if __name__ == '__main__':
    classifier = SpamClassifier()
    pipeline_success = classifier.run_pipeline()

    if pipeline_success:
        logging.info("\n=== Spam Classification Pipeline Completed Successfully ===")
        logging.info("Overall Model Performance Summary (Sorted by F1-Score on Spam):")
        print(classifier.performance_df[['Model', 'Accuracy', 'Precision (Spam)', 'Recall (Spam)', 'F1-Score (Spam)']].to_string())
        logging.info(f"\nBest Performing Model Identified: {classifier.best_model_name}")
        logging.info(f"Check '{Config.PLOTS_DIR}' for EDA and Confusion Matrix plots.")
        logging.info(f"Check '{Config.MODELS_DIR}' for the saved best model.")

        try:
            logging.info("\n--- Demonstrating Model Inference from Saved Model ---")
            model_files = [f for f in os.listdir(Config.MODELS_DIR) if f.startswith('best_model_') and f.endswith('.pkl')]
            if model_files:
                # Pick the newest saved model by file modification time.
                latest_model_file = max(model_files, key=lambda f: os.path.getmtime(os.path.join(Config.MODELS_DIR, f)))
                latest_model_path = os.path.join(Config.MODELS_DIR, latest_model_file)
                logging.info(f"Attempting to load the latest best model from: {latest_model_path}")
                loaded_classifier = SpamClassifier.load_for_inference(latest_model_path)

                test_spam_text_1 = "WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply."
                test_ham_text_1 = "Hey, just checking in. How are you doing today? Let's catch up soon for coffee!"

                label, spam_conf, ham_conf = loaded_classifier.predict_with_confidence(test_spam_text_1)
                print(f"Prediction for SPAM text: '{test_spam_text_1}' -> Label: {label}, Spam Confidence: {spam_conf:.4f}")

                label, spam_conf, ham_conf = loaded_classifier.predict_with_confidence(test_ham_text_1)
                print(f"Prediction for HAM text: '{test_ham_text_1}' -> Label: {label}, Ham Confidence: {ham_conf:.4f}")

            else:
                logging.warning("No model files found in the 'models' directory to demonstrate inference. Run the pipeline first.")
        except Exception as e:
            logging.error(f"An error occurred during the inference demonstration: {e}")
            sys.exit(1)
    else:
        logging.critical("Spam classification pipeline failed during execution. Please review the log file for details.")
        # sys.exit must be *called*; a bare `sys.exit` is a no-op expression
        # and would let a failed run finish with exit status 0.
        sys.exit(1)

  from .autonotebook import tqdm as notebook_tqdm


Running in a notebook environment. Base directory set to CWD: '/home/dev/spam_classifier_project'
2025-08-11 15:27:02,647 - INFO - NLTK punkt resource found.
2025-08-11 15:27:02,649 - INFO - NLTK stopwords resource found.
2025-08-11 15:27:02,650 - INFO - NLTK punkt_tab resource found.
2025-08-11 15:27:02,652 - INFO - Initialized all individual and ensemble classifiers.
2025-08-11 15:27:02,652 - INFO - SpamClassifier initialized successfully.
2025-08-11 15:27:02,654 - INFO - 
--- Starting Pipeline Step: Data Loading ---
2025-08-11 15:27:02,667 - INFO - Loaded 5572 records from /home/dev/spam_classifier_project/spam.csv.
2025-08-11 15:27:02,668 - INFO - --- Completed Pipeline Step: Data Loading ---

2025-08-11 15:27:02,669 - INFO - 
--- Starting Pipeline Step: Data Cleaning ---
2025-08-11 15:27:02,674 - INFO - Selected 'v1' and 'v2' columns from the dataset.
2025-08-11 15:27:02,869 - INFO - Cleaned dataset. Removed 403 duplicates/nulls. Remaining: 5169 records.
2025-08-11 15:27:02,870 - 

Batches: 100%|██████████| 81/81 [00:32<00:00,  2.47it/s]

2025-08-11 15:27:49,719 - INFO - SentenceTransformer embedding complete. X shape: (5169, 384), Y shape: (5169,).
2025-08-11 15:27:49,720 - INFO - --- Completed Pipeline Step: Text Vectorization (Embeddings) ---

2025-08-11 15:27:49,721 - INFO - 
--- Starting Pipeline Step: Data Splitting ---
2025-08-11 15:27:49,727 - INFO - Data split: Train 4135 samples, Test 1034 samples.
2025-08-11 15:27:49,728 - INFO - Train target distribution: [3613  522]
2025-08-11 15:27:49,729 - INFO - Test target distribution: [903 131]
2025-08-11 15:27:49,730 - INFO - --- Completed Pipeline Step: Data Splitting ---

2025-08-11 15:27:49,731 - INFO - 
--- Starting Pipeline Step: Hyperparameter Tuning ---
2025-08-11 15:27:49,731 - INFO - Starting hyperparameter tuning with Optuna for selected models...
2025-08-11 15:27:49,732 - INFO - Tuning LR model with 15 trials...



[I 2025-08-11 15:27:49,734] A new study created in memory with name: LR_tuning_study
  0%|          | 0/15 [00:03<?, ?it/s]

[I 2025-08-11 15:27:52,899] Trial 0 finished with value: 0.8195201694380092 and parameters: {'C': 0.017670169402947963, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8195201694380092.


Best trial: 0. Best value: 0.81952:   7%|▋         | 1/15 [00:06<00:47,  3.41s/it]

[I 2025-08-11 15:27:55,737] Trial 1 finished with value: 0.9067637144074732 and parameters: {'C': 0.39079671568228835, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9067637144074732.


Best trial: 1. Best value: 0.906764:  13%|█▎        | 2/15 [00:06<00:39,  3.05s/it]

[I 2025-08-11 15:27:56,480] Trial 2 finished with value: 0.5636527533822596 and parameters: {'C': 0.00022310108018679258, 'solver': 'liblinear'}. Best is trial 1 with value: 0.9067637144074732.


Best trial: 1. Best value: 0.906764:  20%|██        | 3/15 [00:09<00:24,  2.04s/it]

[I 2025-08-11 15:27:58,931] Trial 3 finished with value: 0.9252627284490291 and parameters: {'C': 1.7718847354806828, 'solver': 'saga'}. Best is trial 3 with value: 0.9252627284490291.


Best trial: 3. Best value: 0.925263:  27%|██▋       | 4/15 [00:11<00:24,  2.19s/it]

[I 2025-08-11 15:28:00,923] Trial 4 finished with value: 0.9358347576312023 and parameters: {'C': 9.877700294007917, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  33%|███▎      | 5/15 [00:13<00:21,  2.13s/it]

[I 2025-08-11 15:28:03,010] Trial 5 finished with value: 0.7982814551985044 and parameters: {'C': 0.0012601639723276807, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  40%|████      | 6/15 [00:15<00:18,  2.09s/it]

[I 2025-08-11 15:28:04,878] Trial 6 finished with value: 0.8533267901688955 and parameters: {'C': 0.039054412752107935, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  53%|█████▎    | 8/15 [00:17<00:13,  1.98s/it]

[I 2025-08-11 15:28:06,796] Trial 7 finished with value: 0.7962869532134003 and parameters: {'C': 0.0006870101665590031, 'solver': 'saga'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  53%|█████▎    | 8/15 [00:17<00:13,  1.98s/it]

[I 2025-08-11 15:28:07,708] Trial 8 finished with value: 0.8553585616674987 and parameters: {'C': 0.054502936945582565, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  60%|██████    | 9/15 [00:19<00:10,  1.67s/it]

[I 2025-08-11 15:28:08,756] Trial 9 finished with value: 0.8757717445900015 and parameters: {'C': 0.12173252504194051, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  67%|██████▋   | 10/15 [00:21<00:07,  1.48s/it]

[I 2025-08-11 15:28:11,044] Trial 10 finished with value: 0.9347469338394623 and parameters: {'C': 73.7864208342295, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  73%|███████▎  | 11/15 [00:23<00:06,  1.73s/it]

[I 2025-08-11 15:28:13,419] Trial 11 finished with value: 0.9338781698536784 and parameters: {'C': 65.64817611753449, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  80%|████████  | 12/15 [00:25<00:05,  1.93s/it]

[I 2025-08-11 15:28:15,571] Trial 12 finished with value: 0.9357473287321831 and parameters: {'C': 78.72383571224226, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  87%|████████▋ | 13/15 [00:27<00:03,  1.99s/it]

[I 2025-08-11 15:28:17,163] Trial 13 finished with value: 0.9331448068701527 and parameters: {'C': 8.224108054741553, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835:  93%|█████████▎| 14/15 [00:28<00:01,  1.87s/it]

[I 2025-08-11 15:28:18,708] Trial 14 finished with value: 0.9331984783268465 and parameters: {'C': 6.611757606926467, 'solver': 'liblinear'}. Best is trial 4 with value: 0.9358347576312023.


Best trial: 4. Best value: 0.935835: 100%|██████████| 15/15 [00:29<00:00,  1.95s/it]

2025-08-11 15:28:18,978 - INFO - Best parameters for LR: {'C': 9.877700294007917, 'solver': 'liblinear'}
2025-08-11 15:28:18,980 - INFO - Best cross-validated F1-score for LR: 0.9358
2025-08-11 15:28:18,981 - INFO - Tuning RF model with 15 trials...



[I 2025-08-11 15:28:18,983] A new study created in memory with name: RF_tuning_study
  0%|          | 0/15 [00:23<?, ?it/s]

[I 2025-08-11 15:28:42,567] Trial 0 finished with value: 0.9105635300372142 and parameters: {'n_estimators': 144, 'max_depth': 36, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9105635300372142.


Best trial: 0. Best value: 0.910564:   7%|▋         | 1/15 [00:32<05:33, 23.85s/it]

[I 2025-08-11 15:28:51,273] Trial 1 finished with value: 0.9127467132964885 and parameters: {'n_estimators': 89, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.9127467132964885.


Best trial: 1. Best value: 0.912747:  13%|█▎        | 2/15 [01:03<03:14, 14.93s/it]

[I 2025-08-11 15:29:22,101] Trial 2 finished with value: 0.9091251939576921 and parameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.9127467132964885.


Best trial: 1. Best value: 0.912747:  20%|██        | 3/15 [01:31<04:26, 22.18s/it]

[I 2025-08-11 15:29:50,938] Trial 3 finished with value: 0.9170959825132309 and parameters: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  33%|███▎      | 5/15 [01:52<03:52, 23.26s/it]

[I 2025-08-11 15:30:11,503] Trial 4 finished with value: 0.9058311877181092 and parameters: {'n_estimators': 126, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  33%|███▎      | 5/15 [02:12<03:52, 23.26s/it]

[I 2025-08-11 15:30:31,200] Trial 5 finished with value: 0.9090006738787435 and parameters: {'n_estimators': 203, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  40%|████      | 6/15 [02:39<03:18, 22.08s/it]

[I 2025-08-11 15:30:58,411] Trial 6 finished with value: 0.9097470973538184 and parameters: {'n_estimators': 164, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  47%|████▋     | 7/15 [02:56<03:10, 23.75s/it]

[I 2025-08-11 15:31:15,212] Trial 7 finished with value: 0.9093436919014714 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  53%|█████▎    | 8/15 [03:07<02:30, 21.55s/it]

[I 2025-08-11 15:31:26,967] Trial 8 finished with value: 0.9052117927047891 and parameters: {'n_estimators': 66, 'max_depth': 36, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  60%|██████    | 9/15 [03:20<01:50, 18.47s/it]

[I 2025-08-11 15:31:39,392] Trial 9 finished with value: 0.9089022907441778 and parameters: {'n_estimators': 126, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  67%|██████▋   | 10/15 [04:00<01:23, 16.61s/it]

[I 2025-08-11 15:32:19,378] Trial 10 finished with value: 0.9116850200695618 and parameters: {'n_estimators': 287, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  73%|███████▎  | 11/15 [04:34<01:35, 23.76s/it]

[I 2025-08-11 15:32:53,242] Trial 11 finished with value: 0.9108888736746182 and parameters: {'n_estimators': 277, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  80%|████████  | 12/15 [04:41<01:20, 26.83s/it]

[I 2025-08-11 15:33:00,271] Trial 12 finished with value: 0.9084614524269989 and parameters: {'n_estimators': 59, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  87%|████████▋ | 13/15 [05:18<00:41, 20.84s/it]

[I 2025-08-11 15:33:37,383] Trial 13 finished with value: 0.911159221076747 and parameters: {'n_estimators': 250, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096:  93%|█████████▎| 14/15 [05:27<00:25, 25.75s/it]

[I 2025-08-11 15:33:46,808] Trial 14 finished with value: 0.9118333987728944 and parameters: {'n_estimators': 90, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9170959825132309.


Best trial: 3. Best value: 0.917096: 100%|██████████| 15/15 [05:28<00:00, 21.87s/it]

2025-08-11 15:33:47,071 - INFO - Best parameters for RF: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}
2025-08-11 15:33:47,072 - INFO - Best cross-validated F1-score for RF: 0.9171
2025-08-11 15:33:47,073 - INFO - Tuning XGB model with 15 trials...



[I 2025-08-11 15:33:47,075] A new study created in memory with name: XGB_tuning_study
  0%|          | 0/15 [00:19<?, ?it/s]

[I 2025-08-11 15:34:06,641] Trial 0 finished with value: 0.922533456170615 and parameters: {'n_estimators': 144, 'max_depth': 12, 'learning_rate': 0.14553179565665345, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'gamma': 1.7699302940633311e-07}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  13%|█▎        | 2/15 [00:40<04:24, 20.38s/it]

[I 2025-08-11 15:34:27,499] Trial 1 finished with value: 0.8968282232826492 and parameters: {'n_estimators': 64, 'max_depth': 11, 'learning_rate': 0.07965261308120507, 'subsample': 0.8832290311184181, 'colsample_bytree': 0.608233797718321, 'gamma': 0.574485163632042}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  13%|█▎        | 2/15 [01:08<04:24, 20.38s/it]

[I 2025-08-11 15:34:55,911] Trial 2 finished with value: 0.8411553008352888 and parameters: {'n_estimators': 258, 'max_depth': 5, 'learning_rate': 0.011551009439226469, 'subsample': 0.6733618039413735, 'colsample_bytree': 0.7216968971838151, 'gamma': 0.00015777981883364995}. Best is trial 0 with value: 0.922533456170615.


Best trial: 0. Best value: 0.922533:  20%|██        | 3/15 [01:27<04:48, 24.06s/it]

[I 2025-08-11 15:35:14,348] Trial 3 finished with value: 0.924591148223439 and parameters: {'n_estimators': 158, 'max_depth': 5, 'learning_rate': 0.08369042894376064, 'subsample': 0.6557975442608167, 'colsample_bytree': 0.7168578594140873, 'gamma': 8.528933855762793e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  33%|███▎      | 5/15 [02:23<05:41, 34.13s/it]

[I 2025-08-11 15:36:10,288] Trial 4 finished with value: 0.8409469883992775 and parameters: {'n_estimators': 164, 'max_depth': 10, 'learning_rate': 0.01254057843022616, 'subsample': 0.8056937753654446, 'colsample_bytree': 0.836965827544817, 'gamma': 2.3528990899815284e-08}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  33%|███▎      | 5/15 [02:45<05:41, 34.13s/it]

[I 2025-08-11 15:36:32,339] Trial 5 finished with value: 0.6888456060996102 and parameters: {'n_estimators': 202, 'max_depth': 4, 'learning_rate': 0.006746417134006626, 'subsample': 0.9795542149013333, 'colsample_bytree': 0.9862528132298237, 'gamma': 0.02932100047183291}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  40%|████      | 6/15 [02:53<04:30, 30.03s/it]

[I 2025-08-11 15:36:40,556] Trial 6 finished with value: 0.909142632349273 and parameters: {'n_estimators': 126, 'max_depth': 3, 'learning_rate': 0.11679817513130797, 'subsample': 0.7760609974958406, 'colsample_bytree': 0.6488152939379115, 'gamma': 9.149877525022172e-05}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  47%|████▋     | 7/15 [03:12<03:03, 22.90s/it]

[I 2025-08-11 15:36:59,236] Trial 7 finished with value: 0.7659710875806484 and parameters: {'n_estimators': 58, 'max_depth': 12, 'learning_rate': 0.01646379567211809, 'subsample': 0.8650089137415928, 'colsample_bytree': 0.7246844304357644, 'gamma': 0.00014472520367197597}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  53%|█████▎    | 8/15 [03:19<02:30, 21.56s/it]

[I 2025-08-11 15:37:07,026] Trial 8 finished with value: 0.9134719551686679 and parameters: {'n_estimators': 187, 'max_depth': 4, 'learning_rate': 0.43464957555697725, 'subsample': 0.9100531293444458, 'colsample_bytree': 0.9757995766256756, 'gamma': 0.14408501080722544}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  67%|██████▋   | 10/15 [04:18<02:29, 29.83s/it]

[I 2025-08-11 15:38:05,061] Trial 9 finished with value: 0.8332102676585844 and parameters: {'n_estimators': 200, 'max_depth': 12, 'learning_rate': 0.007515450322528414, 'subsample': 0.6783931449676581, 'colsample_bytree': 0.6180909155642152, 'gamma': 4.005370050283172e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  73%|███████▎  | 11/15 [05:02<02:17, 34.42s/it]

[I 2025-08-11 15:38:49,863] Trial 10 finished with value: 0.9143035711477321 and parameters: {'n_estimators': 287, 'max_depth': 7, 'learning_rate': 0.03621799474202481, 'subsample': 0.6071847502459278, 'colsample_bytree': 0.8391524267229545, 'gamma': 0.0033264162114920023}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  73%|███████▎  | 11/15 [05:13<02:17, 34.42s/it]

[I 2025-08-11 15:39:01,055] Trial 11 finished with value: 0.9243616144942959 and parameters: {'n_estimators': 121, 'max_depth': 8, 'learning_rate': 0.22955406185548316, 'subsample': 0.7518416973680894, 'colsample_bytree': 0.7248996679248748, 'gamma': 1.3465901496770342e-07}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  80%|████████  | 12/15 [05:23<01:22, 27.36s/it]

[I 2025-08-11 15:39:10,395] Trial 12 finished with value: 0.9179095040995241 and parameters: {'n_estimators': 106, 'max_depth': 8, 'learning_rate': 0.32976584052032165, 'subsample': 0.7273145000499233, 'colsample_bytree': 0.7624979833916741, 'gamma': 1.307420395434413e-06}. Best is trial 3 with value: 0.924591148223439.


Best trial: 3. Best value: 0.924591:  87%|████████▋ | 13/15 [05:34<00:43, 21.91s/it]

[I 2025-08-11 15:39:21,349] Trial 13 finished with value: 0.9265312829611185 and parameters: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.2197325710169943, 'subsample': 0.610547233762323, 'colsample_bytree': 0.7874820875551369, 'gamma': 6.7169034639277175e-06}. Best is trial 13 with value: 0.9265312829611185.


Best trial: 13. Best value: 0.926531:  93%|█████████▎| 14/15 [05:48<00:18, 18.59s/it]

[I 2025-08-11 15:39:35,308] Trial 14 finished with value: 0.8645482549698402 and parameters: {'n_estimators': 78, 'max_depth': 6, 'learning_rate': 0.04374146076402921, 'subsample': 0.6041792656687146, 'colsample_bytree': 0.8955427636018558, 'gamma': 9.146410590181663e-06}. Best is trial 13 with value: 0.9265312829611185.


Best trial: 13. Best value: 0.926531: 100%|██████████| 15/15 [05:48<00:00, 23.23s/it]

2025-08-11 15:39:35,529 - INFO - Best parameters for XGB: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.2197325710169943, 'subsample': 0.610547233762323, 'colsample_bytree': 0.7874820875551369, 'gamma': 6.7169034639277175e-06}
2025-08-11 15:39:35,531 - INFO - Best cross-validated F1-score for XGB: 0.9265
2025-08-11 15:39:35,532 - INFO - Tuning SVC model with 15 trials...



[I 2025-08-11 15:39:35,534] A new study created in memory with name: SVC_tuning_study
  0%|          | 0/15 [03:11<?, ?it/s]

[I 2025-08-11 15:42:47,160] Trial 0 finished with value: 0.48185640015644654 and parameters: {'C': 0.31489116479568624, 'gamma': 6.351221010640703, 'kernel': 'rbf'}. Best is trial 0 with value: 0.48185640015644654.


Best trial: 0. Best value: 0.481856:   7%|▋         | 1/15 [07:04<44:45, 191.84s/it]

[I 2025-08-11 15:46:40,260] Trial 1 finished with value: 0.8040040979459049 and parameters: {'C': 0.04207988669606638, 'gamma': 0.004207053950287938, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.8040040979459049.


Best trial: 1. Best value: 0.804004:  13%|█▎        | 2/15 [07:24<46:49, 216.11s/it]

[I 2025-08-11 15:46:59,615] Trial 2 finished with value: 0.9221662225082857 and parameters: {'C': 2.5378155082656657, 'gamma': 0.679657809075816, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.9221662225082857.


Best trial: 2. Best value: 0.922166:  20%|██        | 3/15 [08:08<25:15, 126.27s/it]

[I 2025-08-11 15:47:43,759] Trial 3 finished with value: 0.9274468578122226 and parameters: {'C': 21.368329072358772, 'gamma': 0.0070689749506246055, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  27%|██▋       | 4/15 [09:18<17:12, 93.84s/it] 

[I 2025-08-11 15:48:53,794] Trial 4 finished with value: 0.9000954051681628 and parameters: {'C': 0.1648044642797898, 'gamma': 0.12561043700013558, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  33%|███▎      | 5/15 [11:40<14:12, 85.26s/it]

[I 2025-08-11 15:51:15,568] Trial 5 finished with value: 0.8466772244638058 and parameters: {'C': 2.801635158716261, 'gamma': 0.003613894271216527, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  40%|████      | 6/15 [12:01<15:40, 104.48s/it]

[I 2025-08-11 15:51:37,106] Trial 6 finished with value: 0.8754548416250257 and parameters: {'C': 0.6672367170464207, 'gamma': 1.382623217936987, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  53%|█████▎    | 8/15 [14:29<11:38, 99.79s/it] 

[I 2025-08-11 15:54:04,947] Trial 7 finished with value: 0.839863591318843 and parameters: {'C': 2.342384984711291, 'gamma': 0.0015339162591163618, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  53%|█████▎    | 8/15 [18:20<11:38, 99.79s/it]

[I 2025-08-11 15:57:56,270] Trial 8 finished with value: 0.08042653953868908 and parameters: {'C': 0.018205657658407266, 'gamma': 6.245139574743075, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 3. Best value: 0.927447:  60%|██████    | 9/15 [22:10<14:05, 140.93s/it]

[I 2025-08-11 16:01:45,853] Trial 9 finished with value: 0.8047218721060352 and parameters: {'C': 0.1653693718282443, 'gamma': 0.002458603276328005, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9274468578122226.


Best trial: 10. Best value: 0.94153:  73%|███████▎  | 11/15 [22:26<08:06, 121.60s/it]

[I 2025-08-11 16:02:01,605] Trial 10 finished with value: 0.9415302848927896 and parameters: {'C': 60.33178530661243, 'gamma': 0.028504320627871515, 'kernel': 'sigmoid'}. Best is trial 10 with value: 0.9415302848927896.


Best trial: 10. Best value: 0.94153:  73%|███████▎  | 11/15 [22:42<08:06, 121.60s/it]

[I 2025-08-11 16:02:17,825] Trial 11 finished with value: 0.9424880088028337 and parameters: {'C': 64.64947866087911, 'gamma': 0.024218157448679556, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  80%|████████  | 12/15 [22:56<04:28, 89.56s/it] 

[I 2025-08-11 16:02:31,361] Trial 12 finished with value: 0.9392053629128879 and parameters: {'C': 88.19429776626716, 'gamma': 0.03713740624438133, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  93%|█████████▎| 14/15 [23:09<00:50, 50.57s/it]

[I 2025-08-11 16:02:45,101] Trial 13 finished with value: 0.9385834226508812 and parameters: {'C': 97.65296156943181, 'gamma': 0.025774482038992817, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488:  93%|█████████▎| 14/15 [23:24<00:50, 50.57s/it]

[I 2025-08-11 16:03:00,022] Trial 14 finished with value: 0.9365904399909507 and parameters: {'C': 15.59864319752155, 'gamma': 0.13883438990307442, 'kernel': 'sigmoid'}. Best is trial 11 with value: 0.9424880088028337.


Best trial: 11. Best value: 0.942488: 100%|██████████| 15/15 [23:24<00:00, 93.65s/it]

2025-08-11 16:03:00,243 - INFO - Best parameters for SVC: {'C': 64.64947866087911, 'gamma': 0.024218157448679556, 'kernel': 'sigmoid'}
2025-08-11 16:03:00,244 - INFO - Best cross-validated F1-score for SVC: 0.9425
2025-08-11 16:03:00,246 - INFO - Tuning ETC model with 15 trials...



[I 2025-08-11 16:03:00,248] A new study created in memory with name: ETC_tuning_study
  0%|          | 0/15 [00:04<?, ?it/s]

[I 2025-08-11 16:03:04,478] Trial 0 finished with value: 0.905345439765127 and parameters: {'n_estimators': 144, 'max_depth': 36, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:   7%|▋         | 1/15 [00:06<01:03,  4.51s/it]

[I 2025-08-11 16:03:06,811] Trial 1 finished with value: 0.9033105221386787 and parameters: {'n_estimators': 89, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:  13%|█▎        | 2/15 [00:12<00:41,  3.18s/it]

[I 2025-08-11 16:03:12,695] Trial 2 finished with value: 0.9048320961326024 and parameters: {'n_estimators': 200, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.905345439765127.


Best trial: 0. Best value: 0.905345:  20%|██        | 3/15 [00:18<00:53,  4.45s/it]

[I 2025-08-11 16:03:18,848] Trial 3 finished with value: 0.9191989621106709 and parameters: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  27%|██▋       | 4/15 [00:22<00:56,  5.12s/it]

[I 2025-08-11 16:03:22,832] Trial 4 finished with value: 0.9037124762642886 and parameters: {'n_estimators': 126, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  33%|███▎      | 5/15 [00:27<00:47,  4.71s/it]

[I 2025-08-11 16:03:27,333] Trial 5 finished with value: 0.9055210896400828 and parameters: {'n_estimators': 203, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  40%|████      | 6/15 [00:32<00:41,  4.64s/it]

[I 2025-08-11 16:03:32,376] Trial 6 finished with value: 0.8996331187012563 and parameters: {'n_estimators': 164, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  47%|████▋     | 7/15 [00:36<00:38,  4.78s/it]

[I 2025-08-11 16:03:36,376] Trial 7 finished with value: 0.8992309400726045 and parameters: {'n_estimators': 198, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  53%|█████▎    | 8/15 [00:38<00:31,  4.53s/it]

[I 2025-08-11 16:03:38,697] Trial 8 finished with value: 0.8987381998441906 and parameters: {'n_estimators': 66, 'max_depth': 36, 'min_samples_split': 20, 'min_samples_leaf': 9}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  60%|██████    | 9/15 [00:41<00:23,  3.84s/it]

[I 2025-08-11 16:03:41,800] Trial 9 finished with value: 0.9065322904908435 and parameters: {'n_estimators': 126, 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 5}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  67%|██████▋   | 10/15 [00:49<00:18,  3.61s/it]

[I 2025-08-11 16:03:49,510] Trial 10 finished with value: 0.9112263682741635 and parameters: {'n_estimators': 287, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  73%|███████▎  | 11/15 [00:57<00:19,  4.85s/it]

[I 2025-08-11 16:03:57,260] Trial 11 finished with value: 0.907783505879283 and parameters: {'n_estimators': 290, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  80%|████████  | 12/15 [01:04<00:17,  5.75s/it]

[I 2025-08-11 16:04:05,107] Trial 12 finished with value: 0.9131610037820834 and parameters: {'n_estimators': 293, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  87%|████████▋ | 13/15 [01:11<00:12,  6.38s/it]

[I 2025-08-11 16:04:11,632] Trial 13 finished with value: 0.9150964771322074 and parameters: {'n_estimators': 250, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199:  93%|█████████▎| 14/15 [01:17<00:06,  6.43s/it]

[I 2025-08-11 16:04:17,806] Trial 14 finished with value: 0.9171971586951543 and parameters: {'n_estimators': 242, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9191989621106709.


Best trial: 3. Best value: 0.919199: 100%|██████████| 15/15 [01:17<00:00,  5.19s/it]

2025-08-11 16:04:18,087 - INFO - Best parameters for ETC: {'n_estimators': 258, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 2}
2025-08-11 16:04:18,088 - INFO - Best cross-validated F1-score for ETC: 0.9192
2025-08-11 16:04:18,090 - INFO - Hyperparameter tuning completed for all selected models.
2025-08-11 16:04:18,091 - INFO - --- Completed Pipeline Step: Hyperparameter Tuning ---

2025-08-11 16:04:18,092 - INFO - 
--- Starting Pipeline Step: Final Model Training & Evaluation ---
2025-08-11 16:04:18,092 - INFO - Applying SMOTE to the entire training data for final model training...
2025-08-11 16:04:18,189 - INFO - SMOTE applied. Original train: 4135 samples. Resampled train: 7226 samples.
2025-08-11 16:04:18,190 - INFO - Training final LR model on resampled data and evaluating...





2025-08-11 16:04:19,046 - INFO - 
--- Performance for LR ---
2025-08-11 16:04:19,046 - INFO - Accuracy: 0.9787
2025-08-11 16:04:19,047 - INFO - Precision (Spam): 0.8978
2025-08-11 16:04:19,047 - INFO - Recall (Spam): 0.9389
2025-08-11 16:04:19,048 - INFO - F1-Score (Spam): 0.9179
2025-08-11 16:04:19,055 - INFO - 
Full Classification Report for LR:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       903
        spam       0.90      0.94      0.92       131

    accuracy                           0.98      1034
   macro avg       0.94      0.96      0.95      1034
weighted avg       0.98      0.98      0.98      1034

2025-08-11 16:04:19,058 - INFO - 
Raw Confusion Matrix for LR:
[[889  14]
 [  8 123]]
2025-08-11 16:04:19,169 - INFO - Confusion matrix plot for LR saved to /home/dev/spam_classifier_project/plots/confusion_matrix_LR_20250811_160418.png.
2025-08-11 16:04:19,170 - INFO - Training final RF model on resampled data and evalua

Batches: 100%|██████████| 1/1 [00:00<00:00, 16.45it/s]


Prediction for SPAM text: 'WINNER! You have been selected for a £1000 prize! Call 09061701300 now or claim at link.co.uk/prize. T&C's apply.' -> Label: spam, Spam Confidence: 0.9887


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.42it/s]


Prediction for HAM text: 'Hey, just checking in. How are you doing today? Let's catch up soon for coffee!' -> Label: ham, Ham Confidence: 0.9926
