**Claude Code**

In [12]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=e9c925410af9976cb19ac52bcdc9c5387a8127b8060c611839056b5bafe5ec50
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [13]:
# 1) Google Colab Drive Setup
import os
from google.colab import drive

# Mount Google Drive at /content/drive; the result folders below live on it.
drive.mount('/content/drive')

# Results root and a dedicated sub-folder for figures
results_dir = '/content/drive/MyDrive/Amir-Khan FYP/Cluade-Results'
viz_dir = os.path.join(results_dir, 'Visualizations')

# Create both folders; exist_ok=True keeps re-runs from failing
for output_dir in (results_dir, viz_dir):
    os.makedirs(output_dir, exist_ok=True)

# 2) Memory-Optimized Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (classification_report, confusion_matrix, ConfusionMatrixDisplay,
                            roc_curve, roc_auc_score, auc, precision_recall_curve, average_precision_score,
                            f1_score, accuracy_score, recall_score, precision_score)
from imblearn.over_sampling import SMOTE
import joblib
import gc
import psutil
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
import matplotlib.colors as mcolors
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, f_classif
import itertools
import warnings
warnings.filterwarnings('ignore')

# Configure memory-efficient settings
plt.style.use('ggplot')
sns.set_theme(style="whitegrid", font_scale=1.1)

# Resolution, font size and title/label padding applied to every figure
rc_overrides = {
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.titlesize': 14,
    'axes.titlepad': 15,
    'axes.labelpad': 10,
}
plt.rcParams.update(rc_overrides)

# Define consistent color palette for visualizations
colors = sns.color_palette("viridis", 10)

# Each model gets one fixed palette entry (the first three colors, in this order)
model_colors = dict(zip(('LightGBM', 'XGBoost', 'RandomForest'), colors))

# 3) Memory-Aware Data Loading
def load_data(path):
    """Load a CSV file and shrink its numeric columns to smaller dtypes.

    Every column is downcast on its own, so a mix of integer and float
    columns is handled correctly:

    * ``int64`` columns become the smallest unsigned integer type when they
      hold no negative values, otherwise the smallest signed integer type.
    * ``float64`` columns become the smallest float type ``pd.to_numeric``
      allows.

    Non-numeric columns are left untouched, and a file without any numeric
    column is returned as read.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded data with downcast numeric columns.
    """
    df = pd.read_csv(path)

    for col in df.select_dtypes(include=['int64']).columns:
        # 'unsigned' leaves a column with negative values at int64, so fall
        # back to a signed downcast for those columns.
        target = 'unsigned' if df[col].min() >= 0 else 'integer'
        df[col] = pd.to_numeric(df[col], downcast=target)

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    return df

# Location of the feature CSV on the mounted Drive
data_path = '/content/drive/MyDrive/Amir-Khan FYP/feature_vectors_syscallsbinders_frequency_5_Cat.csv'
# Read the CSV through load_data, which also downcasts numeric columns
df = load_data(data_path)
# memory_usage() reports bytes per column; dividing the total by 1024**2 converts it to megabytes
print(f"Initial memory usage: {df.memory_usage().sum()/1024**2:.2f} MB")

# 4) Data Exploration and Initial Visualizations
def plot_class_distribution(y, save_path):
    """Save a bar chart of samples per class, annotated with count and share.

    Args:
        y: Class labels, one entry per sample.
        save_path: Destination file for the rendered figure.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(x=y, palette='viridis', ax=ax)

    # Write "<count>\n(<share>%)" just above every bar
    total = len(y)
    for bar in ax.patches:
        height = bar.get_height()
        percentage = 100 * height / total
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, height + 0.1,
                f'{int(height)}\n({percentage:.1f}%)',
                ha="center", fontsize=9)

    ax.set_title('Class Distribution in Dataset', fontsize=16, pad=20)
    ax.set_xlabel('Class', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    plt.xticks(rotation=45)
    ax.grid(axis='y', alpha=0.3)

    fig.tight_layout()
    fig.savefig(save_path, bbox_inches='tight')
    plt.close(fig)

def plot_feature_correlations(X, save_path, n_features=20):
    """Save a lower-triangle correlation heatmap of the highest-variance features.

    When ``X`` has more than ``n_features`` numeric columns, only the
    ``n_features`` columns with the largest variance are shown; otherwise all
    numeric columns are used. Non-numeric columns are ignored because a
    correlation cannot be computed for them.

    Args:
        X: Feature DataFrame.
        save_path: Destination file for the rendered figure.
        n_features: Maximum number of features to include in the heatmap.
    """
    numeric = X.select_dtypes(include=['number', 'bool'])

    # Select top features by variance
    if numeric.shape[1] > n_features:
        top_columns = numeric.var().nlargest(n_features).index
        X_sample = numeric[top_columns]
    else:
        X_sample = numeric

    plt.figure(figsize=(14, 12))
    corr = X_sample.corr()

    # Hide the upper triangle (and the diagonal): the matrix is symmetric
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Diverging colormap centred on zero correlation
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, vmin=-1.0, center=0,
                square=True, linewidths=.5, annot=False, fmt=".2f",
                cbar_kws={"shrink": .8, "label": "Correlation Coefficient"})

    plt.title('Feature Correlation Matrix (Top Features)', fontsize=16, pad=20)
    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()

def plot_feature_importance(X, y, save_path, n_features=20):
    """Save a bar chart of the features ranked best by a univariate ANOVA F-test.

    Each feature is scored as ``-log10(p-value)`` using the p-values that
    ``SelectKBest(f_classif)`` computes. The ``min(n_features, n_columns)``
    highest-scoring features are plotted.

    Args:
        X: Feature DataFrame (column names are used as bar labels).
        y: Class labels aligned with the rows of ``X``.
        save_path: Destination file for the rendered figure.
        n_features: Maximum number of features to show.
    """
    k = min(n_features, X.shape[1])
    selector = SelectKBest(f_classif, k=k)
    selector.fit(X, y)

    pvalues = np.asarray(selector.pvalues_, dtype=float)
    # A NaN p-value (f_classif can produce one, e.g. for a constant column)
    # would sort to the *front* of the reversed argsort below and be shown as
    # a top feature; treat such features as uninformative (p = 1) instead.
    pvalues = np.where(np.isnan(pvalues), 1.0, pvalues)
    # A p-value that underflowed to 0 would give an infinite score; clip it to
    # the smallest positive float so every bar has a finite length.
    pvalues = np.clip(pvalues, np.finfo(float).tiny, 1.0)
    scores = -np.log10(pvalues)
    feature_names = X.columns

    # k never exceeds the number of columns, so one slice covers every case
    indices = np.argsort(scores)[::-1][:k]
    top_scores = scores[indices]
    top_features = [feature_names[i] for i in indices]

    # Plot
    plt.figure(figsize=(12, 8))
    plt.barh(range(len(top_features)), top_scores, align='center', color=colors[3:])
    plt.yticks(range(len(top_features)), top_features)
    # The bars show a transform of the F-test p-value, not the F statistic itself
    plt.title('Top Feature Importance (ANOVA F-test)', fontsize=16, pad=20)
    plt.xlabel('-log10(p-value)', fontsize=12)
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()

# 5) Balanced Sampling Strategy
# Separate the features from the 'Class' target once and reuse both below
X = df.drop('Class', axis=1)
y = df['Class']

# Execute initial visualizations
plot_class_distribution(y, os.path.join(viz_dir, 'class_distribution.png'))
plot_feature_correlations(X, os.path.join(viz_dir, 'feature_correlation.png'))
plot_feature_importance(X, y, os.path.join(viz_dir, 'feature_importance.png'))

# Initial split with smaller test size (reduced from 0.15), stratified on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=56, stratify=y
)

# Plot train-test split visualization
def plot_train_test_split(y_train, y_test, save_path):
    """Save side-by-side bar charts of class counts and class shares for train vs. test.

    Percentages are computed within each set (train counts over the train
    size, test counts over the test size). The raw numbers are also printed.

    Args:
        y_train: Labels of the training set.
        y_test: Labels of the test set.
        save_path: Destination file for the rendered figure.
    """
    train_counts = pd.Series(y_train).value_counts().sort_index()
    test_counts = pd.Series(y_test).value_counts().sort_index()

    # Share of each class inside its own set
    train_percentages = (train_counts / len(y_train)) * 100
    test_percentages = (test_counts / len(y_test)) * 100

    # Print actual values for verification
    for heading, values in (
        ("Train counts:", train_counts),
        ("Test counts:", test_counts),
        ("Train percentages:", train_percentages),
        ("Test percentages:", test_percentages),
    ):
        print(heading, dict(values))

    count_frame = pd.DataFrame({'Train': train_counts, 'Test': test_counts})
    share_frame = pd.DataFrame({'Train %': train_percentages, 'Test %': test_percentages})

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
    set_colors = [colors[0], colors[1]]

    # (axes, data, title, y-label, integer y ticks?, extra bar_label options)
    panels = (
        (ax1, count_frame, 'Class Distribution (Counts)', 'Count', True, {}),
        (ax2, share_frame, 'Class Distribution (Percentage)', 'Percentage (%)', False,
         {'fmt': '%.1f%%'}),
    )
    for ax, frame, panel_title, y_label, integer_ticks, label_options in panels:
        frame.plot(kind='bar', ax=ax, width=0.8, color=set_colors)
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel('Class', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        if integer_ticks:
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.grid(axis='y', alpha=0.3)

        # Value label on top of every bar
        for container in ax.containers:
            ax.bar_label(container, fontsize=9, **label_options)

    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()

split_plot_path = os.path.join(viz_dir, 'train_test_split.png')
plot_train_test_split(y_train, y_test, split_plot_path)

# Clean up memory: the full frame is no longer referenced by name after this point
del df
gc.collect()

# Apply SMOTE with sampling strategy
# ('auto' balances to the largest class; k_neighbors reduced from the default 5)
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Visualize SMOTE effect
def plot_smote_effect(y_orig, y_resampled, save_path):
    """Save bar charts comparing the class distribution before and after SMOTE.

    The left panel shows absolute counts, the right panel the share of each
    class within its own label set.

    Args:
        y_orig: Labels before resampling.
        y_resampled: Labels after resampling.
        save_path: Destination file for the rendered figure.
    """
    def _class_counts(labels):
        # Occurrences per class, ordered by class label
        return pd.Series(labels).value_counts().sort_index()

    def _draw_panel(ax, frame, panel_title, y_label, integer_ticks=False, label_fmt='%g'):
        # One grouped bar chart with a value label on every bar
        frame.plot(kind='bar', ax=ax, width=0.8, color=[colors[3], colors[4]])
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel('Class', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        if integer_ticks:
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.grid(axis='y', alpha=0.3)
        for container in ax.containers:
            ax.bar_label(container, fmt=label_fmt, fontsize=9)

    orig_counts = _class_counts(y_orig)
    resamp_counts = _class_counts(y_resampled)

    counts_frame = pd.DataFrame({
        'Original': orig_counts,
        'After SMOTE': resamp_counts
    })
    shares_frame = pd.DataFrame({
        'Original %': (orig_counts / len(y_orig)) * 100,
        'After SMOTE %': (resamp_counts / len(y_resampled)) * 100
    })

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
    _draw_panel(ax1, counts_frame, 'Effect of SMOTE (Counts)', 'Count', integer_ticks=True)
    _draw_panel(ax2, shares_frame, 'Effect of SMOTE (Percentage)', 'Percentage (%)',
                label_fmt='%.1f%%')

    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()

smote_plot_path = os.path.join(viz_dir, 'smote_effect.png')
plot_smote_effect(y_train, y_train_res, smote_plot_path)

# Clean up original training data (only the resampled copies are used from here on)
del X_train
del y_train
gc.collect()

# 6) Memory-Efficient Feature Scaling
# The scaler is fitted on the resampled training data only and then applied to both sets
scaler = StandardScaler().fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# 7) Feature Transformation and Dimensionality Reduction
def plot_pca_explained_variance(X_scaled, save_path, n_components=20):
    """Fit PCA and save per-component and cumulative explained-variance plots.

    Args:
        X_scaled: 2D array (samples x features) that PCA is fitted on.
        save_path: Destination file for the two-panel figure.
        n_components: Upper bound on the number of components to fit. It is
            lowered to ``min(n_samples, n_features)`` when the data is
            smaller, because PCA cannot extract more components than that.

    Returns:
        int: Number of leading components at which the cumulative explained
        variance ratio first reaches 0.95, or the number of fitted components
        if that level is not reached within them.
    """
    n_samples, n_features = X_scaled.shape

    # Print warning if very few features
    if n_features < 5:
        print(f"Warning: Input data has only {n_features} features. PCA analysis may be limited.")

    # Limit components to what the data can support (features AND samples)
    n_components = min(n_components, n_features, n_samples)

    pca = PCA(n_components=n_components)
    pca.fit(X_scaled)

    exp_var_ratio = pca.explained_variance_ratio_
    cum_exp_var_ratio = np.cumsum(exp_var_ratio)
    n_fitted = len(exp_var_ratio)

    # Label every component up to 10 of them, every second one beyond that
    component_ticks = range(1, n_fitted + 1, 2 if n_fitted > 10 else 1)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: variance explained by each individual component
    ax1.bar(range(1, n_fitted + 1), exp_var_ratio,
            alpha=0.7, color=colors[:n_fitted], edgecolor='black', linewidth=0.5)
    ax1.set_title('Explained Variance Ratio by Component', fontsize=14)
    ax1.set_xlabel('Principal Component', fontsize=12)
    ax1.set_ylabel('Explained Variance Ratio', fontsize=12)
    ax1.set_xticks(component_ticks)
    ax1.grid(axis='y', alpha=0.3)

    # Right panel: running total of explained variance
    ax2.plot(range(1, n_fitted + 1), cum_exp_var_ratio,
             marker='o', linestyle='-', color=colors[5], linewidth=2, markersize=6)
    ax2.set_title('Cumulative Explained Variance', fontsize=14)
    ax2.set_xlabel('Number of Components', fontsize=12)
    ax2.set_ylabel('Cumulative Explained Variance', fontsize=12)
    ax2.grid(alpha=0.3)
    ax2.set_xticks(component_ticks)

    # Add threshold line at 0.95
    ax2.axhline(y=0.95, color='r', linestyle='--', alpha=0.5)

    reaches_threshold = cum_exp_var_ratio >= 0.95
    if reaches_threshold.any():
        # argmax returns the first True position; +1 turns it into a count
        n_components_95 = int(np.argmax(reaches_threshold)) + 1
        ax2.text(min(n_components_95 + 0.5, n_fitted), 0.96,
                 f'{n_components_95} components for 95% variance',
                 fontsize=10, color='red')
    else:
        # The fitted components do not reach 95% explained variance
        n_components_95 = n_fitted
        ax2.text(n_fitted * 0.7, 0.96,
                 f'All {n_components_95} components explain {cum_exp_var_ratio[-1]:.2%} variance',
                 fontsize=10, color='red')

    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()

    return n_components_95

# Apply PCA and plot explained variance
variance_plot_path = os.path.join(viz_dir, 'pca_explained_variance.png')
n_components_95 = plot_pca_explained_variance(X_train_scaled, variance_plot_path)
print(f"Number of components needed for 95% variance: {n_components_95}")

# Apply PCA for visualization and dimensionality reduction
if X_train_scaled.shape[1] < 3:
    # Fewer than 3 features available: use what we have
    n_components_viz = X_train_scaled.shape[1]
    print(f"Warning: Input data has only {X_train_scaled.shape[1]} features. Will use all available features for PCA.")
else:
    # Keep at least 3 components so the 3D visualization is possible
    n_components_viz = max(3, min(n_components_95, X_train_scaled.shape[1]))

pca = PCA(n_components=n_components_viz)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"PCA transformation complete. Output shape: {X_train_pca.shape}")

# 8) Enhanced Visualizations
def plot_tsne(X, y, save_path, subsample=2000, perplexity=30, title="t-SNE Visualization",
              random_state=42):
    """Save a 2D t-SNE scatter plot of (a random subsample of) the data.

    Args:
        X: Feature matrix (NumPy array or DataFrame), one row per sample.
        y: Class labels aligned with the rows of ``X``.
        save_path: Destination file for the rendered figure.
        subsample: Maximum number of rows embedded; larger inputs are
            randomly subsampled without replacement to keep t-SNE affordable.
        perplexity: t-SNE perplexity. Lowered automatically when the embedded
            set has too few rows, since t-SNE requires it to be smaller than
            the number of samples.
        title: Figure title.
        random_state: Seed used for both the subsample draw and t-SNE so the
            figure is reproducible; pass ``None`` for a fresh draw each call.
    """
    labels = np.asarray(y)
    n_total = len(X)

    if n_total > subsample:
        # Seeded draw: the original global-RNG draw changed the plot on every run
        rng = np.random.default_rng(random_state)
        idx = rng.choice(n_total, subsample, replace=False)
        # Positional row selection must go through .iloc for pandas objects
        X_sub = X.iloc[idx] if hasattr(X, 'iloc') else X[idx]
        y_sub = labels[idx]
    else:
        X_sub = X
        y_sub = labels

    n_points = len(X_sub)
    if n_points <= perplexity:
        perplexity = max(n_points - 1, 1)
        print(f"Warning: only {n_points} samples available; t-SNE perplexity reduced to {perplexity}.")

    # Apply t-SNE
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
    X_tsne = tsne.fit_transform(X_sub)

    # Create a DataFrame for easier plotting
    df_tsne = pd.DataFrame({
        'x': X_tsne[:, 0],
        'y': X_tsne[:, 1],
        'class': y_sub
    })

    plt.figure(figsize=(12, 10))
    sns.scatterplot(
        x='x', y='y',
        hue='class',
        data=df_tsne,
        palette='viridis',
        alpha=0.8,
        s=60,
        edgecolor='w',
        linewidth=0.5
    )

    # Improve legend
    plt.legend(title='Class', title_fontsize=12, fontsize=10,
               loc='best', frameon=True, framealpha=0.7)

    # t-SNE axes carry no interpretable units, so hide ticks and labels
    plt.xticks([])
    plt.yticks([])
    plt.xlabel('')
    plt.ylabel('')

    plt.title(title, fontsize=16, pad=20)
    plt.grid(False)

    # Write each class name at the centroid of its points (classes with more
    # than 10 points only, so tiny groups do not clutter the figure)
    for cls in np.unique(y_sub):
        df_class = df_tsne[df_tsne['class'] == cls]
        if len(df_class) > 10:
            plt.text(df_class['x'].mean(), df_class['y'].mean(), str(cls),
                     fontsize=12, ha='center', va='center',
                     bbox=dict(boxstyle="round,pad=0.3",
                               fc='white', ec="gray", alpha=0.8))

    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()

def plot_pca_visualization(X_pca, y, save_path, three_dim=True):
    """Save a scatter view of PCA-projected samples coloured by class.

    The layout depends on how many components ``X_pca`` holds:
    one component gives a jittered 1D strip, two give a 2D scatter, and three
    or more (with ``three_dim`` true) give a 2D scatter next to a 3D scatter.

    Args:
        X_pca: 2D array of PCA scores (samples x components).
        y: Class labels aligned with the rows of ``X_pca``.
        save_path: Destination file for the rendered figure.
        three_dim: Whether to add the 3D panel when enough components exist.
    """
    # Marker styling shared by every seaborn scatter drawn here
    marker_style = dict(hue='class', palette='viridis', alpha=0.8, s=60,
                        edgecolor='w', linewidth=0.5)

    def _save_and_close():
        plt.tight_layout()
        plt.savefig(save_path, bbox_inches='tight')
        plt.close()

    # ---- one component only: strip plot with artificial vertical jitter ----
    if X_pca.shape[1] < 2:
        print(f"Warning: PCA output has only {X_pca.shape[1]} dimension(s). Creating 1D visualization instead.")

        plt.figure(figsize=(12, 8))
        df_pca = pd.DataFrame({'PC1': X_pca[:, 0], 'class': y})

        # The jitter only spreads the points vertically so they do not overlap
        jitter = np.random.normal(0, 0.05, size=len(df_pca))
        sns.scatterplot(x='PC1', y=jitter, data=df_pca, **marker_style)

        plt.title('PCA Visualization (1D with jitter)', fontsize=16, pad=20)
        plt.xlabel('Principal Component 1', fontsize=12)
        plt.ylabel('Jitter (for visualization only)', fontsize=12)
        plt.grid(alpha=0.3)
        plt.legend(title='Class', title_fontsize=12, fontsize=10)

        _save_and_close()
        return

    df_pca = pd.DataFrame({
        'PC1': X_pca[:, 0],
        'PC2': X_pca[:, 1],
        'class': y
    })

    # ---- two components (or 3D not requested): single 2D scatter ----
    if not (three_dim and X_pca.shape[1] >= 3):
        plt.figure(figsize=(12, 10))
        sns.scatterplot(x='PC1', y='PC2', data=df_pca, **marker_style)
        plt.title('PCA Visualization (PC1 vs PC2)', fontsize=16, pad=20)
        plt.xlabel('Principal Component 1', fontsize=12)
        plt.ylabel('Principal Component 2', fontsize=12)
        plt.grid(alpha=0.3)
        plt.legend(title='Class', title_fontsize=12, fontsize=10)

        _save_and_close()
        return

    # ---- three or more components: 2D panel beside a 3D panel ----
    df_pca['PC3'] = X_pca[:, 2]
    fig = plt.figure(figsize=(18, 9))

    ax1 = fig.add_subplot(121)
    sns.scatterplot(x='PC1', y='PC2', data=df_pca, ax=ax1, **marker_style)
    ax1.set_title('PCA Visualization (2D: PC1 vs PC2)', fontsize=14)
    ax1.set_xlabel('Principal Component 1', fontsize=12)
    ax1.set_ylabel('Principal Component 2', fontsize=12)
    ax1.grid(alpha=0.3)

    ax2 = fig.add_subplot(122, projection='3d')
    class_values = np.unique(y)
    class_palette = sns.color_palette("viridis", len(class_values))

    # One scatter call per class so each gets its own color and legend entry
    for cls, cls_color in zip(class_values, class_palette):
        subset = df_pca[df_pca['class'] == cls]
        ax2.scatter(
            subset['PC1'], subset['PC2'], subset['PC3'],
            label=cls,
            color=cls_color,
            alpha=0.8,
            s=60,
            edgecolor='w',
            linewidth=0.5
        )

    ax2.set_title('PCA Visualization (3D: PC1, PC2, PC3)', fontsize=14)
    ax2.set_xlabel('Principal Component 1', fontsize=10)
    ax2.set_ylabel('Principal Component 2', fontsize=10)
    ax2.set_zlabel('Principal Component 3', fontsize=10)
    ax2.legend(title='Class', title_fontsize=12, fontsize=10)

    _save_and_close()

# Plot t-SNE and PCA visualizations
tsne_plot_path = os.path.join(viz_dir, 'tsne_training_data.png')
pca_plot_path = os.path.join(viz_dir, 'pca_visualization.png')

plot_tsne(X_train_scaled, y_train_res, tsne_plot_path,
          title="t-SNE Visualization of Training Data (after SMOTE)")
plot_pca_visualization(X_train_pca, y_train_res, pca_plot_path)

# 9) Memory-Optimized Model Training
# All three estimators are stochastic (row/feature sampling, bootstrapping), so
# each gets a fixed seed; without it, repeated runs of the notebook give
# different models and metrics. 42 is the seed already used for SMOTE and t-SNE.
MODEL_SEED = 42

models = {
    'LightGBM': LGBMClassifier(verbose=-1, device='cpu', random_state=MODEL_SEED),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        tree_method='hist',  # More memory-efficient
        random_state=MODEL_SEED
    ),
    'RandomForest': RandomForestClassifier(
        max_depth=10,  # Restricted depth
        n_jobs=-1,
        random_state=MODEL_SEED
    )
}

# Hyper-parameter candidates per model, keyed by the same names as `models`
param_grids = {
    'LightGBM': {
        'n_estimators': [50, 100],
        'max_depth': [3, 5],
        'learning_rate': [0.05, 0.1]
    },
    'XGBoost': {
        'n_estimators': [50, 100],
        'max_depth': [3, 5],
        'learning_rate': [0.05, 0.1]
    },
    'RandomForest': {
        'n_estimators': [50, 100],
        'max_depth': [5, 10],
        'min_samples_split': [5, 10]  # More conservative splits
    }
}

# Memory monitoring function
def check_memory():
    """Return the current virtual-memory usage percentage reported by psutil."""
    memory_stats = psutil.virtual_memory()
    return memory_stats.percent

# Learning curve function
def plot_learning_curve(estimator, X, y, train_sizes, cv, save_path, title="Learning Curve"):
    """Save a learning curve (weighted F1 vs. number of training examples).

    Training and cross-validation scores are drawn as mean lines with a
    shaded band of one standard deviation across the CV folds.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target labels.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
        cv: Cross-validation strategy forwarded to ``learning_curve``.
        save_path: Destination file for the rendered figure.
        title: Figure title.
    """
    sizes_used, fit_scores, cv_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes, cv=cv,
        scoring='f1_weighted', n_jobs=-1
    )

    # (mean over folds, std over folds, color, legend label) for each curve
    curves = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         colors[0], "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         colors[2], "Cross-validation score"),
    )

    fig, ax = plt.subplots(figsize=(12, 8))

    # Mean lines first, then the standard-deviation bands
    for mean_score, _, curve_color, curve_label in curves:
        ax.plot(sizes_used, mean_score, 'o-', color=curve_color, label=curve_label, linewidth=2)
    for mean_score, score_std, curve_color, _ in curves:
        ax.fill_between(sizes_used, mean_score - score_std, mean_score + score_std,
                        alpha=0.1, color=curve_color)

    ax.set_title(title, fontsize=16, pad=20)
    ax.set_xlabel("Training examples", fontsize=12)
    ax.set_ylabel("F1 Score (weighted)", fontsize=12)
    ax.grid(alpha=0.3)
    ax.legend(loc="best", fontsize=12)

    fig.tight_layout()
    fig.savefig(save_path, bbox_inches='tight')
    plt.close(fig)

# Confusion matrix plotting function
def plot_confusion_matrix(y_true, y_pred, classes, save_path, title="Confusion Matrix"):
    """Save a confusion matrix as raw counts and as row-normalized percentages.

    Rows and columns cover every label that occurs in ``y_true`` or
    ``y_pred``, in sorted order, and the tick labels are built from that same
    list so they always line up with the matrix cells.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, in the same encoding as ``y_true``.
        classes: Kept for interface compatibility; not used. Tick labels are
            derived from the data itself.
        save_path: Destination file for the rendered figure.
        title: Overall figure title.
    """
    # Label set of the matrix: union of true and predicted labels (sorted).
    # Using only the labels of y_true would mislabel the axes whenever a
    # class is predicted that never occurs in y_true.
    labels = np.union1d(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Row-normalize; rows without true samples divide by zero and become 0
    with np.errstate(divide='ignore', invalid='ignore'):
        cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm_norm = np.nan_to_num(cm_norm)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
    tick_marks = np.arange(len(labels))

    # (axes, matrix, title, cell format, light/dark text threshold, imshow options)
    panels = (
        (ax1, cm, 'Confusion Matrix (Counts)', 'd', cm.max() / 2., {}),
        (ax2, cm_norm, 'Confusion Matrix (Normalized)', '.2%', 0.5, {'vmin': 0, 'vmax': 1}),
    )
    for ax, matrix, panel_title, cell_format, text_threshold, image_options in panels:
        image = ax.imshow(matrix, interpolation='nearest', cmap='Blues', **image_options)
        ax.set_title(panel_title, fontsize=14)
        # The active plotting theme may enable gridlines; keep them off the cells
        ax.grid(False)

        ax.set_xticks(tick_marks)
        ax.set_yticks(tick_marks)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_yticklabels(labels)

        # White text on dark cells, black text on light cells
        for i, j in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
            ax.text(j, i, format(matrix[i, j], cell_format),
                    horizontalalignment="center",
                    color="white" if matrix[i, j] > text_threshold else "black")

        fig.colorbar(image, ax=ax)
        ax.set_ylabel('True Label', fontsize=12)
        ax.set_xlabel('Predicted Label', fontsize=12)

    fig.suptitle(title, fontsize=16, y=1.05)
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.savefig(save_path, bbox_inches='tight')
    plt.close(fig)

# ROC curve plotting function
def plot_roc_curves(models_dict, X_test, y_test, classes, save_path):
    """Save ROC curves comparing several fitted classifiers.

    With exactly two classes a single plot is drawn from column 1 of each
    model's ``predict_proba``. Otherwise one one-vs-rest panel per class is
    drawn on a two-column grid.

    Args:
        models_dict: Mapping of display name -> fitted model.
        X_test: Feature matrix passed to each model.
        y_test: True labels; every value must be present in ``classes``.
        classes: All class labels. They are sorted by ``LabelEncoder`` and
            column ``i`` of ``predict_proba`` is matched to the ``i``-th
            sorted label. NOTE(review): this assumes the models order their
            classes the same way — confirm against how they were trained.
        save_path: Destination file for the rendered figure.
    """
    le = LabelEncoder().fit(classes)
    # Use the encoder's own (sorted) class order everywhere so panel titles
    # match the encoded labels even if `classes` was passed unsorted.
    class_labels = le.classes_
    n_classes = len(class_labels)
    y_test_enc = le.transform(y_test)

    if n_classes == 2:
        fig, ax = plt.subplots(figsize=(16, 12))
        for model_name, model in models_dict.items():
            y_score = model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test_enc, y_score)
            roc_auc = auc(fpr, tpr)
            ax.plot(fpr, tpr, lw=2,
                    label=f'{model_name} (AUC = {roc_auc:.3f})',
                    color=model_colors.get(model_name))

        # Chance-level diagonal
        ax.plot([0, 1], [0, 1], 'k--', lw=2)

        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate', fontsize=12)
        ax.set_ylabel('True Positive Rate', fontsize=12)
        ax.set_title('ROC Curve - Binary Classification', fontsize=16, pad=20)
        ax.legend(loc="lower right", fontsize=12)
        ax.grid(alpha=0.3)
    else:
        # Run every model once instead of once per class
        model_outputs = {}
        for model_name, model in models_dict.items():
            if hasattr(model, 'predict_proba'):
                model_outputs[model_name] = (True, np.asarray(model.predict_proba(X_test)))
            else:
                model_outputs[model_name] = (False, np.asarray(model.predict(X_test)))

        n_rows = max((n_classes + 1) // 2, 1)
        fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(16, 4 * n_rows))
        axes = np.atleast_1d(axes).ravel()

        for i, cls in enumerate(class_labels):
            ax = axes[i]

            # One-vs-rest target for this class
            y_test_bin = (y_test_enc == i).astype(int)

            for model_name, (has_proba, output) in model_outputs.items():
                if has_proba:
                    y_score = output[:, i]
                else:
                    # Hard predictions compared with the class index
                    # (presumes the model predicts encoded indices)
                    y_score = (output == i).astype(int)

                fpr, tpr, _ = roc_curve(y_test_bin, y_score)
                roc_auc = auc(fpr, tpr)
                ax.plot(fpr, tpr, lw=2,
                        label=f'{model_name} (AUC = {roc_auc:.3f})',
                        color=model_colors.get(model_name))

            # Chance-level diagonal
            ax.plot([0, 1], [0, 1], 'k--', lw=2)

            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate', fontsize=10)
            ax.set_ylabel('True Positive Rate', fontsize=10)
            ax.set_title(f'Class: {cls}', fontsize=12)
            ax.legend(loc="lower right", fontsize=8)
            ax.grid(alpha=0.3)

        # Remove grid cells that have no class assigned
        for spare_ax in axes[n_classes:]:
            fig.delaxes(spare_ax)

        fig.suptitle('ROC Curves - One-vs-Rest', fontsize=16, y=1.02)

    fig.tight_layout()
    fig.savefig(save_path, bbox_inches='tight')
    # Close this specific figure so no figure is left open between calls
    plt.close(fig)

# Precision-Recall Curve plotting function
def plot_precision_recall_curves(models_dict, X_test, y_test, classes, save_path):
    """Save precision-recall curves comparing several fitted classifiers.

    With exactly two classes a single plot is drawn from column 1 of each
    model's ``predict_proba``. Otherwise one one-vs-rest panel per class is
    drawn on a two-column grid.

    Args:
        models_dict: Mapping of display name -> fitted model.
        X_test: Feature matrix passed to each model.
        y_test: True labels; every value must be present in ``classes``.
        classes: All class labels. They are sorted by ``LabelEncoder`` and
            column ``i`` of ``predict_proba`` is matched to the ``i``-th
            sorted label. NOTE(review): this assumes the models order their
            classes the same way — confirm against how they were trained.
        save_path: Destination file for the rendered figure.
    """
    le = LabelEncoder().fit(classes)
    # Use the encoder's own (sorted) class order everywhere so panel titles
    # match the encoded labels even if `classes` was passed unsorted.
    class_labels = le.classes_
    n_classes = len(class_labels)
    y_test_enc = le.transform(y_test)

    if n_classes == 2:
        fig, ax = plt.subplots(figsize=(16, 12))
        for model_name, model in models_dict.items():
            y_score = model.predict_proba(X_test)[:, 1]
            precision, recall, _ = precision_recall_curve(y_test_enc, y_score)
            avg_precision = average_precision_score(y_test_enc, y_score)
            ax.plot(recall, precision, lw=2,
                    label=f'{model_name} (AP = {avg_precision:.3f})',
                    color=model_colors.get(model_name))

        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('Recall', fontsize=12)
        ax.set_ylabel('Precision', fontsize=12)
        ax.set_title('Precision-Recall Curve - Binary Classification', fontsize=16, pad=20)
        ax.legend(loc="best", fontsize=12)
        ax.grid(alpha=0.3)
    else:
        # Run every model once instead of once per class
        model_outputs = {}
        for model_name, model in models_dict.items():
            if hasattr(model, 'predict_proba'):
                model_outputs[model_name] = (True, np.asarray(model.predict_proba(X_test)))
            else:
                model_outputs[model_name] = (False, np.asarray(model.predict(X_test)))

        n_rows = max((n_classes + 1) // 2, 1)
        fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(16, 4 * n_rows))
        axes = np.atleast_1d(axes).ravel()

        for i, cls in enumerate(class_labels):
            ax = axes[i]

            # One-vs-rest target for this class
            y_test_bin = (y_test_enc == i).astype(int)

            for model_name, (has_proba, output) in model_outputs.items():
                if has_proba:
                    y_score = output[:, i]
                else:
                    # Hard predictions compared with the class index
                    # (presumes the model predicts encoded indices)
                    y_score = (output == i).astype(int)

                precision, recall, _ = precision_recall_curve(y_test_bin, y_score)
                avg_precision = average_precision_score(y_test_bin, y_score)
                ax.plot(recall, precision, lw=2,
                        label=f'{model_name} (AP = {avg_precision:.3f})',
                        color=model_colors.get(model_name))

            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('Recall', fontsize=10)
            ax.set_ylabel('Precision', fontsize=10)
            ax.set_title(f'Class: {cls}', fontsize=12)
            ax.legend(loc="best", fontsize=8)
            ax.grid(alpha=0.3)

        # Remove grid cells that have no class assigned
        for spare_ax in axes[n_classes:]:
            fig.delaxes(spare_ax)

        fig.suptitle('Precision-Recall Curves - One-vs-Rest', fontsize=16, y=1.02)

    fig.tight_layout()
    fig.savefig(save_path, bbox_inches='tight')
    # Close this specific figure so no figure is left open between calls
    plt.close(fig)

# Performance metrics visualization
def plot_model_comparison(results, save_path):
    """Plot a grouped bar chart of each model's weighted-average metrics.

    Parameters
    ----------
    results : dict
        Maps model name -> dict with a ``'report'`` entry whose
        ``'weighted avg'`` sub-dict provides ``'precision'``, ``'recall'``
        and ``'f1-score'``.
    save_path : str
        Path the figure is written to.

    Notes
    -----
    A metric missing from a model's report is plotted as 0.0 and a warning is
    printed. The figure is always closed, even when plotting or saving fails.
    """
    model_names = list(results.keys())

    # Weighted-average scores that are drawn for every model
    metrics = ['precision', 'recall', 'f1-score']

    # Long-format records: one row per (metric, model) pair
    comparison_data = []
    for metric in metrics:
        for model in model_names:
            weighted_avg = results[model]['report']['weighted avg']
            if metric in weighted_avg:
                value = weighted_avg[metric]
            else:
                print(f"Warning: Metric '{metric}' not found for model '{model}'. Using 0.")
                value = 0.0
            comparison_data.append({'Model': model, 'Metric': metric, 'Value': value})

    df_metrics = pd.DataFrame(comparison_data)

    fig, ax = plt.subplots(figsize=(14, 10))
    try:
        # Create grouped bar chart
        sns.barplot(x='Model', y='Value', hue='Metric', data=df_metrics,
                    palette='viridis', ax=ax)

        # Add value labels; get_height() never returns None, so the meaningful
        # guard is against non-finite heights (which would render as "nan").
        for container in ax.containers:
            for bar in container:
                height = bar.get_height()
                if not np.isfinite(height):
                    continue
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{height:.3f}', ha='center', va='bottom',
                        fontsize=8, rotation=0)

        # Set plot details
        ax.set_title('Model Performance Comparison (Weighted Metrics)', fontsize=16, pad=20)
        ax.set_xlabel('Model', fontsize=12)
        ax.set_ylabel('Score', fontsize=12)
        ax.set_ylim(0, 1.05)
        ax.grid(axis='y', alpha=0.3)
        ax.legend(title='Metric', fontsize=10, title_fontsize=12)

        fig.tight_layout()
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Release the figure even on failure so repeated runs do not pile up figures
        plt.close(fig)

# Feature importance visualization
def plot_feature_importance_model(model, feature_names, save_path, max_features=20, model_name="Model"):
    """Plot the top feature importances of a fitted model as a horizontal bar chart.

    Parameters
    ----------
    model : estimator
        Fitted model exposing ``feature_importances_`` or ``coef_``. For a
        multi-row ``coef_`` the mean absolute coefficient per feature is used.
    feature_names : sequence
        One label per feature. If its length differs from the number of
        importances, generic ``Feature <i>`` labels are used instead so the
        chart is never mislabelled and indexing cannot fail.
    save_path : str
        Path the figure is written to.
    max_features : int, default 20
        Maximum number of features shown.
    model_name : str, default "Model"
        Name used in the title and in printed messages.

    Returns
    -------
    None
        Returns early, without plotting, when the model exposes neither attribute.
    """
    # Get feature importances
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_).mean(axis=0) if model.coef_.ndim > 1 else np.abs(model.coef_)
    else:
        print(f"Model {model_name} does not have feature_importances_ or coef_ attribute")
        return

    # Flatten so len() and indexing always work on a 1-D array
    importances = np.asarray(importances).ravel()

    # The labels must describe the feature space the model was trained on
    # (e.g. it could have been fitted on transformed/reduced features).
    feature_names = list(feature_names)
    if len(feature_names) != len(importances):
        print(f"Warning: {len(feature_names)} feature names supplied but {model_name} "
              f"reports {len(importances)} importances; using generic feature labels.")
        feature_names = [f'Feature {i}' for i in range(len(importances))]

    # Sort descending and keep the top features
    indices = np.argsort(importances)[::-1][:max_features]
    top_importances = importances[indices]
    top_features = [feature_names[i] for i in indices]

    # Plot (`colors` is the notebook-level palette defined outside this function)
    fig = plt.figure(figsize=(12, 8))
    try:
        plt.barh(range(len(top_features)), top_importances, align='center', color=colors)
        plt.yticks(range(len(top_features)), top_features)
        plt.title(f'Feature Importance - {model_name}', fontsize=16, pad=20)
        plt.xlabel('Importance', fontsize=12)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.savefig(save_path, bbox_inches='tight')
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)

# Main training loop
# Tunes, saves and evaluates each candidate model in turn, collecting the
# fitted estimator and its test-set metrics in `results` (keyed by model name).
# Relies on names defined earlier in the notebook: `models`, `param_grids`,
# `check_memory`, `X_train_scaled`, `X_test_scaled`, `y_train_res`, `y_test`,
# `X`, `plot_learning_curve`, `plot_confusion_matrix`.
results = {}
# The encoder is fitted on the training labels; both training and test labels
# are then mapped into the same encoded label space.
le = LabelEncoder().fit(y_train_res)
y_train_encoded = le.transform(y_train_res)
y_test_encoded = le.transform(y_test)

for name in ['LightGBM', 'XGBoost', 'RandomForest']:  # Ordered by memory efficiency
    print(f"\nTraining {name} - Memory Usage: {check_memory()}%")

    # Stop the whole loop (not just this model) once memory use is above 85
    if check_memory() > 85:
        print("Memory threshold exceeded - skipping remaining models")
        break

    # Reduced search space
    search = RandomizedSearchCV(
        models[name],
        param_grids[name],
        n_iter=2,  # Reduced from 5
        cv=2,      # Reduced from 3
        scoring='f1_weighted',
        random_state=42,
        n_jobs=1   # Reduced parallelism
    )

    # Best-effort: a failing learning-curve plot only prints a warning
    try:
        # Plot learning curve before full training
        plot_learning_curve(
            models[name], X_train_scaled, y_train_encoded,
            train_sizes=np.linspace(0.1, 1.0, 5),
            cv=2,
            save_path=os.path.join(viz_dir, f'learning_curve_{name}.png'),
            title=f"Learning Curve - {name}"
        )
    except Exception as e:
        print(f"Warning: Could not create learning curve for {name}: {str(e)}")

    # Not wrapped in try/except: a failure here aborts the loop
    search.fit(X_train_scaled, y_train_encoded)

    # Capture parameters before cleanup
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Save model immediately
    model_path = os.path.join(results_dir, f'best_{name}.pkl')
    joblib.dump(best_model, model_path)
    print(f"Saved {name} model")

    # Clean up search object
    del search
    gc.collect()

    # Evaluate and store results
    # Predictions are in the encoded label space (the model was fitted on y_train_encoded)
    y_pred = best_model.predict(X_test_scaled)
    # NOTE(review): y_pred_proba is not used anywhere in the visible code - confirm it is needed
    y_pred_proba = best_model.predict_proba(X_test_scaled) if hasattr(best_model, 'predict_proba') else None

    # Calculate metrics
    report_dict = classification_report(y_test_encoded, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    f1 = f1_score(y_test_encoded, y_pred, average='weighted')
    precision = precision_score(y_test_encoded, y_pred, average='weighted')
    recall = recall_score(y_test_encoded, y_pred, average='weighted')

    try:
        # Create confusion matrix
        # NOTE(review): y_test holds the original labels while y_pred holds encoded
        # labels; confirm plot_confusion_matrix reconciles the two label spaces,
        # otherwise pass y_test_encoded (or le.inverse_transform(y_pred)) here.
        plot_confusion_matrix(
            y_test, y_pred,
            classes=np.unique(y_test),
            save_path=os.path.join(viz_dir, f'confusion_matrix_{name}.png'),
            title=f"Confusion Matrix - {name}"
        )
    except Exception as e:
        print(f"Warning: Could not create confusion matrix for {name}: {str(e)}")

    try:
        # Plot feature importance if available
        # NOTE(review): X.columns is assumed to match the columns of X_train_scaled;
        # if the training matrix was transformed (e.g. by PCA) the labels will not
        # correspond to the plotted importances - TODO confirm.
        if hasattr(best_model, 'feature_importances_') or hasattr(best_model, 'coef_'):
            plot_feature_importance_model(
                best_model, X.columns,
                save_path=os.path.join(viz_dir, f'feature_importance_{name}.png'),
                model_name=name
            )
    except Exception as e:
        print(f"Warning: Could not create feature importance plot for {name}: {str(e)}")

    # Add explicit report keys for all models
    # This ensures we always have the same metrics available
    results[name] = {
        'model': best_model,
        'report': report_dict,
        'params': best_params,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

    # Clean up temporary variables
    # Only the local names are removed; the estimator itself stays referenced by results[name]['model']
    del y_pred, best_model
    gc.collect()

# 10) Model Performance Comparison and Visualization
# Runs only when at least one model was trained. Plotting, persistence of the
# best model and the dashboard are isolated from each other so that a failure
# in one step cannot silently cancel the others.
if results:
    # Choose the winner first (highest weighted F1) so `best_model_name` is
    # always defined for the dashboard and PDF sections, even if a plot fails.
    best_model_name = max(results, key=lambda x: results[x]['report']['weighted avg']['f1-score'])

    # Collect all trained models for ROC and PR curve plotting
    trained_models = {name: results[name]['model'] for name in results}
    eval_classes = np.unique(y_test)

    # (label, zero-argument callable) pairs; each plot is attempted independently
    comparison_plots = [
        ('ROC curves', lambda: plot_roc_curves(
            trained_models, X_test_scaled, y_test,
            classes=eval_classes,
            save_path=os.path.join(viz_dir, 'roc_curves.png')
        )),
        ('Precision-Recall curves', lambda: plot_precision_recall_curves(
            trained_models, X_test_scaled, y_test,
            classes=eval_classes,
            save_path=os.path.join(viz_dir, 'precision_recall_curves.png')
        )),
        ('model comparison', lambda: plot_model_comparison(
            results, os.path.join(viz_dir, 'model_comparison.png')
        )),
    ]
    for plot_label, make_plot in comparison_plots:
        try:
            make_plot()
        except Exception as e:
            plt.close('all')  # discard any half-built figure left behind by the failed plot
            print(f"Error in visualization section ({plot_label}): {str(e)}")
            print("Continuing with remaining visualizations...")

    # Create learning rate comparison for LightGBM and XGBoost
    if 'LightGBM' in results and 'XGBoost' in results:
        try:
            # Evaluation history exists only if the models recorded it during fitting
            evals_lgbm = getattr(results['LightGBM']['model'], 'evals_result_', None)
            evals_xgb = getattr(results['XGBoost']['model'], 'evals_result_', None)
            has_lgbm_history = bool(evals_lgbm)
            has_xgb_history = bool(evals_xgb)

            if has_lgbm_history or has_xgb_history:
                plt.figure(figsize=(12, 6))

                # Check and plot LightGBM history if available
                if has_lgbm_history:
                    lgbm_keys = list(evals_lgbm.keys())
                    if lgbm_keys:
                        train_key = lgbm_keys[0]  # First key is typically training data
                        valid_key = lgbm_keys[1] if len(lgbm_keys) > 1 else None  # Second key is validation data if available

                        metric_keys = list(evals_lgbm[train_key].keys())
                        if metric_keys:
                            metric_key = metric_keys[0]  # Use the first metric

                            plt.plot(evals_lgbm[train_key][metric_key],
                                     label=f'LightGBM Train ({metric_key})', color=colors[0])

                            # Same guard as the XGBoost branch: the metric must exist for the validation set
                            if valid_key and metric_key in evals_lgbm[valid_key]:
                                plt.plot(evals_lgbm[valid_key][metric_key],
                                         label=f'LightGBM Validation ({metric_key})', color=colors[1])

                # Check and plot XGBoost history if available
                if has_xgb_history:
                    xgb_keys = list(evals_xgb.keys())
                    if xgb_keys:
                        train_key = next((k for k in xgb_keys if 'train' in k.lower()), xgb_keys[0])
                        valid_key = next((k for k in xgb_keys if 'val' in k.lower() or 'test' in k.lower()),
                                         xgb_keys[1] if len(xgb_keys) > 1 else None)

                        train_metrics = list(evals_xgb[train_key].keys()) if train_key else []
                        if train_metrics:
                            metric_key = train_metrics[0]  # Use the first metric

                            plt.plot(evals_xgb[train_key][metric_key],
                                     label=f'XGBoost Train ({metric_key})', color=colors[2])

                            if valid_key and metric_key in evals_xgb[valid_key]:
                                plt.plot(evals_xgb[valid_key][metric_key],
                                         label=f'XGBoost Validation ({metric_key})', color=colors[3])

                plt.title('Learning Curves Comparison', fontsize=16, pad=20)
                plt.xlabel('Boosting Round', fontsize=12)
                plt.ylabel('Loss', fontsize=12)
                plt.legend(fontsize=12)
                plt.grid(alpha=0.3)
                plt.tight_layout()
                plt.savefig(os.path.join(viz_dir, 'learning_curves_comparison.png'), bbox_inches='tight')
                plt.close()
            else:
                print("No evaluation results available for learning curve comparison")

        except Exception as e:
            plt.close('all')
            print(f"Could not create learning curves comparison: {str(e)}")
            print("Continuing with other visualizations...")

    # Save best model and its text summary (independent of any plotting outcome)
    try:
        final_model = results[best_model_name]['model']
        joblib.dump(final_model, os.path.join(results_dir, 'final_model.pkl'))

        # Create summary of best model
        with open(os.path.join(results_dir, 'best_model_summary.txt'), 'w') as f:
            f.write(f"Best Model: {best_model_name}\n")
            f.write(f"Parameters: {results[best_model_name]['params']}\n\n")
            f.write("Performance Metrics:\n")
            f.write(f"Accuracy: {results[best_model_name]['accuracy']:.4f}\n")
            f.write(f"F1 Score (weighted): {results[best_model_name]['f1']:.4f}\n")
            f.write(f"Precision (weighted): {results[best_model_name]['precision']:.4f}\n")
            f.write(f"Recall (weighted): {results[best_model_name]['recall']:.4f}\n\n")
            f.write("Classification Report:\n")

            # Format the classification report
            report = results[best_model_name]['report']
            for cls, cls_metrics in report.items():
                if cls in ('macro avg', 'weighted avg'):
                    continue  # averages are written after the per-class entries
                if not isinstance(cls_metrics, dict):
                    # classification_report(output_dict=True) also stores scalar
                    # entries such as 'accuracy'; they have no per-metric dict.
                    f.write(f"{cls}: {cls_metrics:.4f}\n\n")
                    continue
                f.write(f"Class {cls}:\n")
                for metric in ['precision', 'recall', 'f1-score', 'support']:
                    if metric in cls_metrics:
                        f.write(f"  {metric}: {cls_metrics[metric]:.4f}\n")
                f.write("\n")

            for avg in ['macro avg', 'weighted avg']:
                if avg in report:
                    f.write(f"{avg}:\n")
                    for metric in ['precision', 'recall', 'f1-score']:
                        if metric in report[avg]:
                            f.write(f"  {metric}: {report[avg][metric]:.4f}\n")
                    f.write("\n")
    except Exception as e:
        print(f"Error saving best model artifacts: {str(e)}")

    # Generate simplified report (one row per model, weighted-average metrics)
    try:
        final_report = pd.DataFrame({
            model: results[model]['report']['weighted avg']
            for model in results
        }).T
        final_report.to_csv(os.path.join(results_dir, 'performance_report.csv'))
    except Exception as e:
        print(f"Error saving performance report: {str(e)}")

    # 11) Summary Dashboard for Paper
    # Create a combined visualization dashboard for paper (4 x 2 grid of saved plots)
    try:
        plt.figure(figsize=(20, 24))
        gs = gridspec.GridSpec(4, 2, height_ratios=[1, 1, 1, 1])

        # Function to safely load and display an image
        def safe_load_image(ax, image_path, title):
            """Show the image at `image_path` on `ax`, or a red placeholder message if it is missing/unreadable."""
            try:
                if os.path.exists(image_path):
                    img = plt.imread(image_path)
                    ax.imshow(img)
                    ax.axis('off')
                    ax.set_title(title, fontsize=14, loc='left')
                else:
                    ax.text(0.5, 0.5, f"Image not found:\n{os.path.basename(image_path)}",
                            ha='center', va='center', fontsize=12, color='red')
                    ax.axis('off')
            except Exception as e:
                ax.text(0.5, 0.5, f"Error loading image:\n{str(e)}",
                        ha='center', va='center', fontsize=10, color='red')
                ax.axis('off')

        # Confusion Matrix of Best Model
        cm_path = os.path.join(viz_dir, f'confusion_matrix_{best_model_name}.png')
        # If best model confusion matrix doesn't exist, try using any existing one
        if not os.path.exists(cm_path):
            for model_name in results:
                alt_path = os.path.join(viz_dir, f'confusion_matrix_{model_name}.png')
                if os.path.exists(alt_path):
                    cm_path = alt_path
                    print(f"Using confusion matrix from {model_name} instead of best model")
                    break

        # (grid position, image path, panel title) in reading order
        dashboard_panels = [
            ((0, 0), os.path.join(viz_dir, 'class_distribution.png'), 'A) Class Distribution'),
            ((0, 1), os.path.join(viz_dir, 'smote_effect.png'), 'B) Effect of SMOTE'),
            ((1, 0), os.path.join(viz_dir, 'pca_visualization.png'), 'C) PCA Visualization'),
            ((1, 1), os.path.join(viz_dir, 'tsne_training_data.png'), 'D) t-SNE Visualization'),
            ((2, 0), os.path.join(viz_dir, 'roc_curves.png'), 'E) ROC Curves'),
            ((2, 1), os.path.join(viz_dir, 'precision_recall_curves.png'), 'F) Precision-Recall Curves'),
            ((3, 0), cm_path, 'G) Confusion Matrix'),
            ((3, 1), os.path.join(viz_dir, 'model_comparison.png'), 'H) Model Performance Comparison'),
        ]
        for (row, col), image_path, panel_title in dashboard_panels:
            safe_load_image(plt.subplot(gs[row, col]), image_path, panel_title)

        plt.suptitle('Comprehensive Analysis of Syscalls Classification', fontsize=20, y=0.995)
        plt.tight_layout(rect=[0, 0, 1, 0.98])
        plt.savefig(os.path.join(results_dir, 'paper_visualization_dashboard.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Successfully created summary dashboard at {os.path.join(results_dir, 'paper_visualization_dashboard.png')}")
    except Exception as e:
        plt.close('all')  # do not leave the large dashboard figure open on failure
        print(f"Error creating summary dashboard: {str(e)}")
        print("Continuing with the rest of the analysis...")

# 12) Create summary PDF with all visualizations for the paper
# Collects every saved figure that exists on disk into one PDF, followed by a
# metrics table. fpdf is optional: if it is not installed the section is skipped.
try:
    from fpdf import FPDF

    class PDF(FPDF):
        """FPDF subclass adding the report title as page header and a page counter as footer."""

        def header(self):
            # Logo
            # self.image('logo.png', 10, 8, 33)
            # Arial bold 15
            self.set_font('Arial', 'B', 15)
            # Move to the right
            self.cell(80)
            # Title
            self.cell(30, 10, 'Machine Learning Analysis for Syscalls Classification', 0, 0, 'C')
            # Line break
            self.ln(20)

        def footer(self):
            # Position at 1.5 cm from bottom
            self.set_y(-15)
            # Arial italic 8
            self.set_font('Arial', 'I', 8)
            # Page number ('{nb}' is replaced with the total page count via alias_nb_pages)
            self.cell(0, 10, 'Page ' + str(self.page_no()) + '/{nb}', 0, 0, 'C')

    # Create PDF
    pdf = PDF()
    pdf.alias_nb_pages()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 12)

    # Title
    pdf.cell(0, 10, 'Comprehensive Visualization Report', 0, 1, 'C')
    pdf.ln(5)

    # (full image path, description) pairs, in the order they appear in the PDF
    report_images = [
        (os.path.join(viz_dir, file_name), description)
        for file_name, description in [
            ('class_distribution.png', 'Class Distribution'),
            ('smote_effect.png', 'Effect of SMOTE Oversampling'),
            ('feature_correlation.png', 'Feature Correlation Matrix'),
            ('feature_importance.png', 'Feature Importance Analysis'),
            ('pca_explained_variance.png', 'PCA Explained Variance'),
            ('pca_visualization.png', 'PCA Visualization'),
            ('tsne_training_data.png', 't-SNE Visualization'),
            ('roc_curves.png', 'ROC Curves'),
            ('precision_recall_curves.png', 'Precision-Recall Curves'),
        ]
    ]

    # Add model-specific visualizations
    for model_name in results:
        report_images.append((os.path.join(viz_dir, f'confusion_matrix_{model_name}.png'),
                              f'Confusion Matrix - {model_name}'))
        report_images.append((os.path.join(viz_dir, f'learning_curve_{model_name}.png'),
                              f'Learning Curve - {model_name}'))
        report_images.append((os.path.join(viz_dir, f'feature_importance_{model_name}.png'),
                              f'Feature Importance - {model_name}'))

    # Add comparison visualizations
    report_images.append((os.path.join(viz_dir, 'model_comparison.png'), 'Model Performance Comparison'))
    report_images.append((os.path.join(viz_dir, 'learning_curves_comparison.png'), 'Learning Curves Comparison'))

    # Add dashboard: it is written to results_dir (not viz_dir) by the dashboard section
    report_images.append((os.path.join(results_dir, 'paper_visualization_dashboard.png'),
                          'Comprehensive Dashboard'))

    # Add visualizations to PDF, two per page; missing files are skipped and
    # do not count towards the page-break rhythm.
    images_placed = 0
    for image_path, viz_desc in report_images:
        if not os.path.exists(image_path):
            continue
        if images_placed > 0 and images_placed % 2 == 0:
            pdf.add_page()

        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 10, viz_desc, 0, 1)
        pdf.image(image_path, x=10, w=190)
        pdf.ln(5)
        images_placed += 1

    # Add performance summary
    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, 'Model Performance Summary', 0, 1, 'C')
    pdf.ln(5)

    # Add summary table
    col_width = 40
    row_height = 10

    # Table header
    pdf.set_font('Arial', 'B', 10)
    pdf.cell(col_width, row_height, 'Model', 1, 0, 'C')
    pdf.cell(col_width, row_height, 'Accuracy', 1, 0, 'C')
    pdf.cell(col_width, row_height, 'F1 Score', 1, 0, 'C')
    pdf.cell(col_width, row_height, 'Precision', 1, 1, 'C')

    # Table rows
    pdf.set_font('Arial', '', 10)
    for model_name in results:
        pdf.cell(col_width, row_height, model_name, 1, 0, 'L')
        pdf.cell(col_width, row_height, f"{results[model_name]['accuracy']:.4f}", 1, 0, 'C')
        pdf.cell(col_width, row_height, f"{results[model_name]['f1']:.4f}", 1, 0, 'C')
        pdf.cell(col_width, row_height, f"{results[model_name]['precision']:.4f}", 1, 1, 'C')

    # Best model highlight
    pdf.ln(5)
    if results:
        # Determined here (same criterion: highest weighted F1) instead of relying
        # on a variable that only exists if the comparison section ran to completion.
        pdf_best_model = max(results, key=lambda m: results[m]['report']['weighted avg']['f1-score'])
        pdf.set_font('Arial', 'B', 11)
        pdf.cell(0, 10, f"Best Model: {pdf_best_model}", 0, 1)
        pdf.set_font('Arial', '', 10)
        pdf.multi_cell(0, 10, f"Parameters: {results[pdf_best_model]['params']}")
    else:
        pdf.set_font('Arial', '', 10)
        pdf.cell(0, 10, 'No models were trained.', 0, 1)

    # Save PDF
    pdf.output(os.path.join(results_dir, 'visualization_report.pdf'))
    print(f"Generated visualization report PDF at {os.path.join(results_dir, 'visualization_report.pdf')}")
except ImportError:
    print("FPDF not installed. Skipping PDF generation.")
    print("To generate PDF, install fpdf: !pip install fpdf")

print("Pipeline completed successfully. Memory usage:", check_memory())
print(f"All visualizations saved to {viz_dir}")
print(f"Summary dashboard saved to {os.path.join(results_dir, 'paper_visualization_dashboard.png')}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial memory usage: 6.99 MB
Train counts: {1: np.int64(1128), 2: np.int64(1890), 3: np.int64(3514), 4: np.int64(2291), 5: np.int64(1615)}
Test counts: {1: np.int64(125), 2: np.int64(210), 3: np.int64(390), 4: np.int64(255), 5: np.int64(180)}
Train percentages: {1: np.float64(10.806667944050584), 2: np.float64(18.106917033914545), 3: np.float64(33.66545315194482), 4: np.float64(21.948649166506996), 5: np.float64(15.472312703583063)}
Test percentages: {1: np.float64(10.775862068965516), 2: np.float64(18.103448275862068), 3: np.float64(33.62068965517241), 4: np.float64(21.982758620689655), 5: np.float64(15.517241379310345)}
Number of components needed for 95% variance: 20
PCA transformation complete. Output shape: (17570, 20)

Training LightGBM - Memory Usage: 37.1%
Saved LightGBM model

Training XGBoost - Memory Usage: 42.7%
Saved XGBoost model

Training Rand

<Figure size 4800x3600 with 0 Axes>

<Figure size 4800x3600 with 0 Axes>