In [None]:
!pip install optuna xgboost scikit-learn pandas numpy matplotlib seaborn psutil

## Random Forest

### Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import os
import joblib
import time
import psutil
import json
from datetime import datetime

class SystemMonitor:
    """Collect CPU and memory samples around a training run.

    Sampling is manual: call :meth:`start` once, :meth:`update` whenever a
    sample should be recorded, and :meth:`stop` to obtain the summary.
    """

    def __init__(self, interval=1.0):
        # NOTE: ``interval`` is stored but not read by any method of this class.
        self.interval = interval
        self.cpu_percentages = []
        self.memory_usage = []
        self.start_time = None
        self.end_time = None

    def start(self):
        """Record the start timestamp and discard previously collected samples."""
        self.start_time = time.time()
        self.cpu_percentages = []
        self.memory_usage = []

    def update(self):
        """Append one CPU sample and one memory sample (RSS of this process, in MB).

        The CPU reading uses ``psutil.cpu_percent(interval=0.1)``, so every
        call waits for that 0.1 s measurement window.
        """
        self.cpu_percentages.append(psutil.cpu_percent(interval=0.1))
        self.memory_usage.append(psutil.Process().memory_info().rss / 1024 / 1024)  # MB

    @staticmethod
    def _mean(samples):
        """Return the mean of ``samples`` as a float, or 0.0 when it is empty."""
        return float(np.mean(samples)) if samples else 0.0

    @staticmethod
    def _peak(samples):
        """Return the largest value in ``samples``, or 0.0 when it is empty."""
        return max(samples) if samples else 0.0

    def stop(self):
        """Record the end timestamp and return the summary statistics.

        Returns:
            dict: ``training_time_seconds`` (elapsed time since ``start()``),
            plus the average and maximum of the recorded CPU percentages and
            memory readings. When ``update()`` was never called the four
            sample statistics are reported as 0.0 instead of raising, and
            when ``start()`` was never called the elapsed time is 0.0.
        """
        self.end_time = time.time()
        # Without a start timestamp there is nothing to measure against.
        started_at = self.end_time if self.start_time is None else self.start_time
        return {
            'training_time_seconds': self.end_time - started_at,
            'avg_cpu_percent': self._mean(self.cpu_percentages),
            'max_cpu_percent': self._peak(self.cpu_percentages),
            'avg_memory_mb': self._mean(self.memory_usage),
            'max_memory_mb': self._peak(self.memory_usage)
        }

class ExtendedAPIEncoder:
    """
    Label encoder for API-call names that is backed by a predefined vocabulary.

    The encoder is fitted on the union of the loaded vocabulary and the
    values passed to ``fit``. Values seen later that are not part of that
    union are encoded with the code of the first known class.
    """
    def __init__(self, unknown_value=-1):
        self.label_encoder = LabelEncoder()
        # NOTE: stored on the instance, but transform() does not emit this
        # value; unseen API calls fall back to the first known class instead.
        self.unknown_value = unknown_value
        self.vocabulary = set()

    def load_vocabulary(self, vocab_file):
        """Add every non-blank, stripped line of ``vocab_file`` to the vocabulary."""
        with open(vocab_file, 'r') as f:
            api_calls = {line.strip() for line in f if line.strip()}
        self.vocabulary.update(api_calls)

    def add_to_vocabulary(self, api_calls):
        """Add the given API call names to the vocabulary."""
        self.vocabulary.update(api_calls)

    def fit(self, api_calls):
        """Fit the label encoder on the vocabulary plus ``api_calls``; return self."""
        all_apis = list(self.vocabulary.union(set(api_calls)))
        self.label_encoder.fit(all_apis)
        return self

    def transform(self, api_calls):
        """Encode API calls as integer codes without modifying the input.

        A known value is encoded as its position in
        ``label_encoder.classes_``. Values that are not known are reported
        with a printed warning and encoded as 0, i.e. as the first class.

        The lookup uses a plain dict rather than writing the fallback label
        into a NumPy string array: such an array has a fixed item width, so a
        fallback label longer than the longest input string would be
        truncated and then rejected by the label encoder.

        Raises:
            ValueError: if unseen values are present but the encoder has no
                classes to fall back to.
        """
        classes = np.asarray(self.label_encoder.classes_).tolist()
        code_of = {label: code for code, label in enumerate(classes)}
        values = np.asarray(api_calls, dtype=object).tolist()

        unseen_apis = {value for value in values if value not in code_of}
        if unseen_apis:
            if not classes:
                raise ValueError("Cannot encode unseen API calls: the encoder has no classes")
            print(f"Warning: Found {len(unseen_apis)} unseen API calls not in vocabulary.")

        # 0 is the code of classes_[0], the stand-in for unseen values.
        return np.array([code_of.get(value, 0) for value in values], dtype=np.intp)

    def fit_transform(self, api_calls):
        """Fit on ``api_calls`` (plus the vocabulary) and return their codes."""
        self.fit(api_calls)
        return self.transform(api_calls)

    def inverse_transform(self, encoded_values):
        """Convert integer codes back to API call names."""
        return self.label_encoder.inverse_transform(encoded_values)

    def classes_(self):
        """Return the classes (API calls) known to the encoder.

        This is a method, so it has to be called: ``encoder.classes_()``.
        """
        return self.label_encoder.classes_

def load_and_combine_data(file_paths):
    """
    Load several CSV files, clean them and concatenate them into one DataFrame.

    Each file must contain the columns ``first_api``, ``last_api``,
    ``api_call_count``, ``api_sequence`` and ``malware_type``. Rows with a
    missing value in one of these columns are dropped; missing values in any
    other column do not cause a row to be discarded. A ``source_file`` column
    with the file's base name is added. Per-file and combined statistics are
    printed.

    Args:
        file_paths (list): List of paths to CSV files containing malware data

    Returns:
        pandas.DataFrame: The combined data with a fresh integer index.

    Raises:
        ValueError: if ``file_paths`` is empty or a file lacks a required column.
        Exception: any error raised while reading or cleaning a file is
            printed and re-raised unchanged.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # pd.concat would fail with a message that does not name the cause.
        raise ValueError("No input CSV files were provided")

    dfs = []
    required_columns = ['first_api', 'last_api', 'api_call_count',
                       'api_sequence', 'malware_type']

    for file_path in file_paths:
        try:
            df = pd.read_csv(file_path)

            # Validate required columns
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"File {file_path} is missing required columns: {missing_columns}")

            # Only the columns the pipeline uses decide whether a row is
            # complete; copy so the assignments below act on an own frame.
            df = df.dropna(subset=required_columns).copy()
            df['api_call_count'] = df['api_call_count'].astype(int)
            df['api_sequence'] = df['api_sequence'].astype(str)

            # Add source file information
            df['source_file'] = os.path.basename(file_path)

            dfs.append(df)

            print(f"\nLoaded data from {file_path}")
            print(f"Samples: {len(df)}")
            print("Class distribution:")
            for malware_type, count in df['malware_type'].value_counts().items():
                print(f"{malware_type}: {count} samples")

        except Exception as e:
            print(f"Error loading {file_path}: {str(e)}")
            raise

    # Combine all dataframes
    combined_df = pd.concat(dfs, ignore_index=True)

    # Print combined dataset statistics
    print("\nCombined Dataset Statistics:")
    print("-" * 50)
    print(f"Total samples: {len(combined_df)}")
    print("\nOverall class distribution:")
    class_dist = combined_df['malware_type'].value_counts()
    for malware_type, count in class_dist.items():
        print(f"{malware_type}: {count} samples")

    # API calls statistics
    print("\nAPI calls statistics:")
    api_stats = combined_df['api_call_count'].describe()
    print(f"Min API calls: {api_stats['min']:.0f}")
    print(f"Max API calls: {api_stats['max']:.0f}")
    print(f"Mean API calls: {api_stats['mean']:.0f}")
    print(f"Median API calls: {api_stats['50%']:.0f}")

    return combined_df

def prepare_data(df, api_vocab_file, max_features=2000, use_smote=True):
    """
    Build the feature matrix and encoded labels from the combined DataFrame.

    The encoded columns ``first_api_encoded``, ``last_api_encoded``,
    ``malware_type_encoded`` and ``api_call_count_norm`` are added to ``df``
    in place. The feature matrix is the two encoded API columns, the
    log1p-scaled call count and the TF-IDF features of ``api_sequence``,
    stacked column-wise.

    Args:
        df (pandas.DataFrame): Data with the columns ``first_api``,
            ``last_api``, ``api_call_count``, ``api_sequence`` and
            ``malware_type``.
        api_vocab_file (str): Text file with one API call name per line.
        max_features (int): Forwarded to ``TfidfVectorizer``. Per the
            scikit-learn documentation it is ignored when a fixed vocabulary
            is supplied, which is always the case here.
        use_smote (bool): Oversample with SMOTE so that classes are balanced.

    Returns:
        tuple: ``(X, y, malware_type_encoder, first_api_encoder,
        last_api_encoder, tfidf, feature_names)``
    """
    # Create extended API encoders
    first_api_encoder = ExtendedAPIEncoder()
    last_api_encoder = ExtendedAPIEncoder()
    malware_type_encoder = LabelEncoder()

    # Load API vocabulary
    print("Loading API vocabulary...")
    first_api_encoder.load_vocabulary(api_vocab_file)
    last_api_encoder.load_vocabulary(api_vocab_file)

    # Encode categorical variables
    print("Encoding API calls...")
    df['first_api_encoded'] = first_api_encoder.fit_transform(df['first_api'])
    df['last_api_encoded'] = last_api_encoder.fit_transform(df['last_api'])
    df['malware_type_encoded'] = malware_type_encoder.fit_transform(df['malware_type'])

    # Normalize api_call_count
    df['api_call_count_norm'] = np.log1p(df['api_call_count'])

    print("Creating TF-IDF features...")
    # TfidfVectorizer lowercases documents by default, so a vocabulary entry
    # that contains upper-case characters would never match any token.
    # Lowercasing the vocabulary keeps the matching case-insensitive; sorting
    # makes the feature column order explicit.
    with open(api_vocab_file, 'r') as f:
        vocabulary = sorted({line.strip().lower() for line in f if line.strip()})

    tfidf = TfidfVectorizer(
        max_features=max_features,
        sublinear_tf=True,
        # NOTE(review): only n-grams present in ``vocabulary`` are counted, so
        # bigrams contribute only if the vocabulary file lists them.
        ngram_range=(1, 2),
        min_df=1,  # Changed to 1 to keep rare API calls
        vocabulary=vocabulary  # Use predefined vocabulary
    )
    api_sequence_features = tfidf.fit_transform(df['api_sequence'])

    # Create feature matrix
    numeric_features = np.column_stack((
        df['first_api_encoded'],
        df['last_api_encoded'],
        df['api_call_count_norm']
    ))
    numeric_features_sparse = csr_matrix(numeric_features)

    # Request CSR explicitly: hstack returns COO by default, which does not
    # support the row indexing done during cross-validation.
    X = hstack([numeric_features_sparse, api_sequence_features], format='csr')
    y = df['malware_type_encoded']

    if use_smote:
        print("\nApplying SMOTE to balance classes...")
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
        print(f"Shape after SMOTE: {X.shape}")

    feature_names = (['First API', 'Last API', 'API Call Count'] +
                    [f'API_Seq_{i}' for i in range(api_sequence_features.shape[1])])

    return (X, y, malware_type_encoder, first_api_encoder, last_api_encoder,
            tfidf, feature_names)

def format_classification_report(y_true, y_pred, target_names):
    """
    Build a metric-major classification report with 4-decimal score strings.

    Returns:
        dict: Maps ``"precision"``, ``"recall"``, ``"f1-score"`` and
        ``"support"`` to a dict keyed by every entry of ``target_names``
        followed by ``'macro avg'`` and ``'weighted avg'``. Scores are
        strings formatted with four decimals; support values are ints.
    """
    raw_report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)

    score_keys = ("precision", "recall", "f1-score")
    formatted_report = {key: {} for key in (*score_keys, "support")}

    # Per-class rows first, then the two aggregate rows.
    for row_name in [*target_names, 'macro avg', 'weighted avg']:
        row = raw_report[row_name]
        for key in score_keys:
            formatted_report[key][row_name] = f"{row[key]:.4f}"
        formatted_report["support"][row_name] = int(row['support'])

    return formatted_report

def evaluate_model(model, X, y, malware_type_encoder, feature_names, output_dir, system_monitor):
    """
    Evaluate the classifier on a held-out split and write metrics and plots.

    ``model`` arrives fitted on the data it is evaluated with, so predicting
    a split of ``X`` with it would report training performance. A clone of
    ``model`` is therefore refitted on the 80% training part of the split and
    scored on the remaining 20%. ``model`` itself is left untouched and
    supplies the feature importances.

    Writes ``model_metrics.json``, ``confusion_matrix.png`` and
    ``feature_importance.png`` to ``output_dir`` (created if missing) and
    prints the metrics.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X: Feature matrix.
        y: Encoded labels belonging to ``X``.
        malware_type_encoder: Fitted encoder whose ``classes_`` name the labels.
        feature_names (list): One name per feature column.
        output_dir (str): Directory for the JSON report and the plots.
        system_monitor: Monitor whose ``stop()`` returns the resource statistics.
    """
    # Function-scope import: scikit-learn is already a dependency of this
    # file, and this keeps the file-level import block unchanged.
    from sklearn.base import clone

    # The directory may not exist yet when evaluation runs before the
    # artifacts are saved.
    os.makedirs(output_dir, exist_ok=True)

    # Freeze the resource statistics first so that the evaluation refit below
    # is not counted as training time.
    system_stats = system_monitor.stop()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE(review): if X was oversampled (e.g. with SMOTE) before this split,
    # synthetic neighbours of test rows can still be in the training part, so
    # the scores may remain optimistic - confirm against the data preparation.
    holdout_model = clone(model)
    holdout_model.fit(X_train, y_train)
    y_pred = holdout_model.predict(X_test)

    class_names = malware_type_encoder.classes_

    # Generate formatted classification report
    formatted_report = format_classification_report(y_test, y_pred, class_names)

    # Save metrics to JSON
    metrics_file = os.path.join(output_dir, 'model_metrics.json')

    metrics_data = {
        'classification_metrics': formatted_report,
        'system_stats': system_stats,
        'timestamp': datetime.now().isoformat()
    }

    with open(metrics_file, 'w') as f:
        json.dump(metrics_data, f, indent=2)

    # Print formatted results
    print("\nClassification Metrics (4 decimal precision):")
    print("-" * 80)
    for metric in ["precision", "recall", "f1-score"]:
        print(f"\n{metric.upper()}:")
        for class_name, value in formatted_report[metric].items():
            print(f"{class_name}: {value}")

    print("\nSystem Statistics:")
    print(f"Total training time: {system_stats['training_time_seconds']:.2f} seconds")
    print(f"Average CPU usage: {system_stats['avg_cpu_percent']:.1f}%")
    print(f"Peak CPU usage: {system_stats['max_cpu_percent']:.1f}%")
    print(f"Average memory usage: {system_stats['avg_memory_mb']:.1f} MB")
    print(f"Peak memory usage: {system_stats['max_memory_mb']:.1f} MB")

    # Confusion matrix of the held-out predictions
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

    # Feature importance plot, taken from the model that was passed in
    n_top_features = 20
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    feature_importance = feature_importance.head(n_top_features)

    plt.figure(figsize=(12, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title(f'Top {n_top_features} Most Important Features')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'feature_importance.png'))
    plt.close()

def train_random_forest(X, y, n_estimators=100, n_jobs=-1, system_monitor=None):
    """
    Cross-validate a Random Forest and then fit it on all of ``X``.

    Runs stratified 5-fold cross-validation, printing the accuracy of every
    fold and the mean and standard deviation, and records one system-monitor
    sample per fold when a monitor is supplied.

    Args:
        X: Feature matrix (dense array or any scipy sparse format).
        y: Labels, as an array or a pandas Series.
        n_estimators (int): Number of trees.
        n_jobs (int): Parallel jobs used by the forest.
        system_monitor: Optional monitor whose ``update()`` is called after
            every fold.

    Returns:
        RandomForestClassifier: The model fitted on the full ``X`` and ``y``.
    """
    # Fold indices are positional. Sparse formats such as COO (the default
    # result of scipy's hstack) cannot be row-indexed, and a pandas Series
    # would interpret the indices as index labels, so normalise both.
    if hasattr(X, "tocsr"):
        X = X.tocsr()
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        random_state=42,
        class_weight='balanced',
        max_features='sqrt'
    )

    cv_scores = []
    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        rf_model.fit(X_train, y_train)
        score = rf_model.score(X_val, y_val)
        cv_scores.append(score)
        print(f"Fold {fold} accuracy: {score:.4f}")

        if system_monitor is not None:
            system_monitor.update()

    print(f"\nMean CV accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

    # Final training on full dataset
    rf_model.fit(X, y)

    return rf_model

def save_artifacts(output_dir, model, tfidf, malware_type_encoder,
                  first_api_encoder, last_api_encoder, feature_names):
    """
    Persist the model and every preprocessing object with joblib.

    ``output_dir`` is created when it does not exist; one ``.joblib`` file is
    written per object and its path is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (file name, object) pairs, written in this order.
    artifacts = (
        ('malware_classifier.joblib', model),
        ('tfidf_vectorizer.joblib', tfidf),
        ('malware_type_encoder.joblib', malware_type_encoder),
        ('first_api_encoder.joblib', first_api_encoder),
        ('last_api_encoder.joblib', last_api_encoder),
        ('feature_names.joblib', feature_names),
    )

    for filename, artifact in artifacts:
        target = os.path.join(output_dir, filename)
        joblib.dump(artifact, target)
        print(f"Saved {filename} to {target}")

def main(csv_paths, output_dir, api_vocab_file):
    """
    Run the Random Forest pipeline: load, prepare, train, evaluate, save.

    Args:
        csv_paths (list): CSV files with the training data.
        output_dir (str): Directory for metrics, plots and saved artifacts.
        api_vocab_file (str): Text file holding the API vocabulary.

    Returns:
        The trained model.
    """
    monitor = SystemMonitor()
    monitor.start()

    # Load the raw data and turn it into features and labels.
    dataset = load_and_combine_data(csv_paths)
    prepared = prepare_data(dataset, api_vocab_file, max_features=2000, use_smote=True)
    (X, y, malware_type_encoder, first_api_encoder, last_api_encoder,
     tfidf, feature_names) = prepared

    # Train while sampling system resources.
    model = train_random_forest(X, y, n_estimators=200, system_monitor=monitor)

    # Evaluation stops the monitor and writes the metrics and plots.
    evaluate_model(model, X, y, malware_type_encoder, feature_names, output_dir, monitor)

    # Persist everything inference needs.
    save_artifacts(output_dir, model, tfidf, malware_type_encoder,
                   first_api_encoder, last_api_encoder, feature_names)

    print(f"\nTraining complete. All artifacts saved to {output_dir}")
    return model

if __name__ == "__main__":
    # Placeholder locations - point these at the real data before running.
    api_vocab_file = "/api/calls/path/windowsapicalls.txt"
    output_dir = "output/directory/"
    csv_paths = ["/csv/path/data.csv"]

    model = main(
        csv_paths=csv_paths,
        output_dir=output_dir,
        api_vocab_file=api_vocab_file,
    )

### Inference

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix
import joblib
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import time
import psutil
from datetime import datetime

class ExtendedAPIEncoder:
    """
    Label encoder for API-call names that is backed by a predefined vocabulary.

    The encoder is fitted on the union of the loaded vocabulary and the
    values passed to ``fit``. Values seen later that are not part of that
    union are encoded with the code of the first known class.
    """
    def __init__(self, unknown_value=-1):
        # Imported here because this cell's import block does not bring
        # LabelEncoder into scope.
        from sklearn.preprocessing import LabelEncoder

        self.label_encoder = LabelEncoder()
        # NOTE: stored on the instance, but transform() does not emit this
        # value; unseen API calls fall back to the first known class instead.
        self.unknown_value = unknown_value
        self.vocabulary = set()

    def load_vocabulary(self, vocab_file):
        """Add every non-blank, stripped line of ``vocab_file`` to the vocabulary."""
        with open(vocab_file, 'r') as f:
            api_calls = {line.strip() for line in f if line.strip()}
        self.vocabulary.update(api_calls)

    def add_to_vocabulary(self, api_calls):
        """Add the given API call names to the vocabulary."""
        self.vocabulary.update(api_calls)

    def fit(self, api_calls):
        """Fit the label encoder on the vocabulary plus ``api_calls``; return self."""
        all_apis = list(self.vocabulary.union(set(api_calls)))
        self.label_encoder.fit(all_apis)
        return self

    def transform(self, api_calls):
        """Encode API calls as integer codes without modifying the input.

        A known value is encoded as its position in
        ``label_encoder.classes_``. Values that are not known are reported
        with a printed warning and encoded as 0, i.e. as the first class.

        The lookup uses a plain dict rather than writing the fallback label
        into a NumPy string array: such an array has a fixed item width, so a
        fallback label longer than the longest input string would be
        truncated and then rejected by the label encoder.

        Raises:
            ValueError: if unseen values are present but the encoder has no
                classes to fall back to.
        """
        classes = np.asarray(self.label_encoder.classes_).tolist()
        code_of = {label: code for code, label in enumerate(classes)}
        values = np.asarray(api_calls, dtype=object).tolist()

        unseen_apis = {value for value in values if value not in code_of}
        if unseen_apis:
            if not classes:
                raise ValueError("Cannot encode unseen API calls: the encoder has no classes")
            print(f"Warning: Found {len(unseen_apis)} unseen API calls not in vocabulary.")

        # 0 is the code of classes_[0], the stand-in for unseen values.
        return np.array([code_of.get(value, 0) for value in values], dtype=np.intp)

    def fit_transform(self, api_calls):
        """Fit on ``api_calls`` (plus the vocabulary) and return their codes."""
        self.fit(api_calls)
        return self.transform(api_calls)

    def inverse_transform(self, encoded_values):
        """Convert integer codes back to API call names."""
        return self.label_encoder.inverse_transform(encoded_values)

    def classes_(self):
        """Return the classes (API calls) known to the encoder.

        This is a method, so it has to be called: ``encoder.classes_()``.
        """
        return self.label_encoder.classes_

def get_system_metrics():
    """
    Collect system metrics for the current process.

    The ``psutil.Process`` handle is created once and reused on later calls.
    psutil measures ``cpu_percent()`` relative to the previous call on the
    same handle, so a fresh handle per call would always report 0.0. With
    the shared handle the very first call still reports 0.0; every later
    call reports the CPU utilisation since the preceding call (which can
    exceed 100 on multi-core machines).

    Returns:
        dict: ``memory_usage_mb`` (resident set size in MB), ``cpu_percent``
        and ``threads`` (current thread count).
    """
    process = getattr(get_system_metrics, "_process", None)
    if process is None:
        process = psutil.Process()
        get_system_metrics._process = process

    return {
        'memory_usage_mb': process.memory_info().rss / 1024 / 1024,
        'cpu_percent': process.cpu_percent(),
        'threads': process.num_threads(),
    }

def create_run_directory(base_output_dir):
    """
    Create the timestamped output directory tree for one run.

    The run directory is named ``run_<YYYYmmdd_HHMMSS>`` and receives the
    subdirectories ``individual_results``, ``confusion_matrices`` and
    ``combined_results``. Directories that already exist are reused.

    Args:
        base_output_dir (str): Base directory for all runs

    Returns:
        str: Path to the run directory
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_dir = os.path.join(base_output_dir, f'run_{stamp}')
    os.makedirs(run_dir, exist_ok=True)

    # One subdirectory per kind of output.
    for subdir in ('individual_results', 'confusion_matrices', 'combined_results'):
        os.makedirs(os.path.join(run_dir, subdir), exist_ok=True)

    return run_dir

def load_model_artifacts(model_dir):
    """
    Load every saved model artifact from ``model_dir`` with joblib.

    Args:
        model_dir (str): Directory containing the saved model artifacts

    Returns:
        tuple: (model, tfidf, malware_encoder, first_api_encoder, last_api_encoder, feature_names)

    Raises:
        Exception: if any artifact cannot be loaded; the message carries the
            text of the underlying error.
    """
    # File names in the order of the returned tuple.
    artifact_files = (
        'malware_classifier.joblib',
        'tfidf_vectorizer.joblib',
        'malware_type_encoder.joblib',
        'first_api_encoder.joblib',
        'last_api_encoder.joblib',
        'feature_names.joblib',
    )
    try:
        return tuple(
            joblib.load(os.path.join(model_dir, filename))
            for filename in artifact_files
        )
    except Exception as e:
        raise Exception(f"Error loading model artifacts: {str(e)}")

def prepare_inference_data(df, first_api_encoder, last_api_encoder, tfidf):
    """
    Turn new samples into the feature matrix layout used for prediction.

    The result stacks, column-wise, the encoded ``first_api`` and
    ``last_api`` values, the log1p of ``api_call_count`` and the TF-IDF
    features of ``api_sequence`` computed by the fitted ``tfidf``.

    Returns:
        scipy sparse matrix with one row per row of ``df``.
    """
    # Dense part: the two encoded API columns and the scaled call count.
    dense_columns = (
        first_api_encoder.transform(df['first_api']),
        last_api_encoder.transform(df['last_api']),
        np.log1p(df['api_call_count']),
    )

    # Sparse part: TF-IDF weights of the API sequence text.
    sequence_features = tfidf.transform(df['api_sequence'])

    dense_block = csr_matrix(np.column_stack(dense_columns))
    return hstack([dense_block, sequence_features])

def plot_confusion_matrix(y_true, y_pred, class_names, output_path=None):
    """
    Create a row-normalised (percentage) confusion matrix plot.

    The matrix rows and columns follow ``class_names`` in the given order, so
    the tick labels always line up with the cells even when some classes do
    not occur in the data. Labels that occur in ``y_true`` or ``y_pred`` but
    are missing from ``class_names`` are appended in sorted order. Rows
    without any true sample are shown as 0% rather than dividing by zero.

    Args:
        y_true (array-like): True labels
        y_pred (array-like): Predicted labels
        class_names (array-like): List of class names
        output_path (str, optional): Path to save the confusion matrix plot;
            the plot is shown instead when omitted
    """
    labels = list(class_names)
    observed = (set(np.asarray(y_true, dtype=object).tolist())
                | set(np.asarray(y_pred, dtype=object).tolist()))
    labels.extend(sorted(observed.difference(labels), key=str))

    # Fix the label order explicitly so it matches the tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Calculate percentages per true class, leaving empty rows at zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm * 100.0,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    # Create figure and axes
    plt.figure(figsize=(12, 10))

    # Create heatmap
    sns.heatmap(cm_percent, annot=True, fmt='.1f',
                xticklabels=labels,
                yticklabels=labels,
                cmap='YlOrRd')

    plt.title('Confusion Matrix (%)')
    plt.xlabel('Predicted')
    plt.ylabel('True')

    # Rotate axis labels for better readability
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45)

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    if output_path:
        plt.savefig(output_path)
        print(f"Confusion matrix plot saved to {output_path}")
    else:
        plt.show()

    plt.close()

def process_single_file(model_dir, file_path, run_dir):
    """
    Run inference on one CSV file and store its predictions.

    Adds the predicted class, one probability column per class and a
    confidence column to the file's data, writes the result to
    ``individual_results`` and, when the file has a ``malware_type`` column,
    also records the accuracy and saves a confusion matrix plot.

    Args:
        model_dir (str): Directory containing model artifacts
        file_path (str): Path to input CSV file
        run_dir (str): Directory for current run results

    Returns:
        tuple: (DataFrame with predictions, dict with metrics), or
        ``(None, None)`` when any step fails (the error is printed).
    """
    start_time = time.time()
    initial_metrics = get_system_metrics()
    file_name = os.path.basename(file_path)

    try:
        print(f"\nProcessing {file_name}...")

        # Load the artifacts and the input data.
        (model, tfidf, malware_encoder,
         first_api_encoder, last_api_encoder, feature_names) = load_model_artifacts(model_dir)
        df = pd.read_csv(file_path)

        # Build the features and predict.
        features = prepare_inference_data(df, first_api_encoder, last_api_encoder, tfidf)
        predicted_codes = model.predict(features)
        class_probabilities = model.predict_proba(features)

        # Attach the predictions to the data.
        df['predicted_malware_type'] = malware_encoder.inverse_transform(predicted_codes)
        for column_index, class_name in enumerate(malware_encoder.classes_):
            df[f'prob_{class_name}'] = class_probabilities[:, column_index]
        df['confidence'] = class_probabilities.max(axis=1)

        # Metrics that do not need ground truth.
        file_metrics = {
            'filename': file_name,
            'processing_time': time.time() - start_time,
            'sample_count': len(df),
            'avg_confidence': df['confidence'].mean(),
            'low_confidence_count': sum(df['confidence'] < 0.5)
        }

        # Ground truth is optional; score and plot only when it is present.
        if 'malware_type' in df.columns:
            accuracy = np.mean(df['malware_type'] == df['predicted_malware_type'])
            file_metrics['accuracy'] = round(accuracy, 4)

            matrix_path = os.path.join(
                run_dir, 'confusion_matrices', f'confusion_matrix_{file_name}.png'
            )
            plot_confusion_matrix(
                df['malware_type'],
                df['predicted_malware_type'],
                malware_encoder.classes_,
                output_path=matrix_path
            )

        # Resource usage: the larger of the readings taken before and after.
        final_metrics = get_system_metrics()
        file_metrics['peak_memory_mb'] = max(
            initial_metrics['memory_usage_mb'], final_metrics['memory_usage_mb'])
        file_metrics['peak_cpu_percent'] = max(
            initial_metrics['cpu_percent'], final_metrics['cpu_percent'])
        file_metrics['peak_threads'] = max(
            initial_metrics['threads'], final_metrics['threads'])

        # Save the per-file predictions.
        output_path = os.path.join(run_dir, 'individual_results', f'predictions_{file_name}')
        df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

        return df, file_metrics

    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None, None

def process_multiple_files(model_dir, csv_paths, base_output_dir):
    """
    Process multiple CSV files and generate combined analysis.

    Every existing file is processed individually; files that are missing or
    fail to process are skipped. The successful results are concatenated,
    saved under ``combined_results`` and summarised in
    ``processing_metrics.csv``. When ground-truth labels are available, the
    combined accuracy and confusion matrix are computed from the labelled
    rows only, so files without a ``malware_type`` column do not count as
    misclassifications.

    Args:
        model_dir (str): Directory containing model artifacts
        csv_paths (list): List of paths to CSV files
        base_output_dir (str): Base directory for all runs

    Raises:
        ValueError: if no file could be processed successfully.
    """
    # Create directory for this run
    run_dir = create_run_directory(base_output_dir)
    print(f"Created new run directory: {run_dir}")

    # Process each file individually
    all_metrics = []
    all_dfs = []

    for file_path in csv_paths:
        if not os.path.exists(file_path):
            print(f"Warning: File not found - {file_path}")
            continue

        df, metrics = process_single_file(model_dir, file_path, run_dir)

        if df is not None and metrics is not None:
            all_dfs.append(df)
            all_metrics.append(metrics)

    if not all_dfs:
        raise ValueError("No files were processed successfully")

    # Process combined dataset
    print("\nProcessing combined dataset...")
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_metrics = {
        'filename': 'combined_dataset',
        'sample_count': len(combined_df),
        'avg_confidence': combined_df['confidence'].mean(),
        'low_confidence_count': int((combined_df['confidence'] < 0.5).sum())
    }

    if 'malware_type' in combined_df.columns:
        # Files without labels contribute NaN here after the concat.
        labeled = combined_df.dropna(subset=['malware_type'])
        if not labeled.empty:
            accuracy = np.mean(labeled['malware_type'] == labeled['predicted_malware_type'])
            combined_metrics['accuracy'] = round(accuracy, 4)

            # Sorted union of true and predicted labels: the axis labels then
            # cover every class in the matrix, in a deterministic order.
            class_names = sorted(
                set(labeled['malware_type']) | set(labeled['predicted_malware_type'])
            )
            plot_confusion_matrix(
                labeled['malware_type'],
                labeled['predicted_malware_type'],
                class_names,
                output_path=os.path.join(run_dir, 'combined_results', 'confusion_matrix_combined.png')
            )

    # Save combined results
    combined_output = os.path.join(run_dir, 'combined_results', 'combined_predictions.csv')
    combined_df.to_csv(combined_output, index=False)

    # Create summary report
    summary = pd.DataFrame(all_metrics + [combined_metrics])
    summary.to_csv(os.path.join(run_dir, 'processing_metrics.csv'), index=False)

    # Print summary
    print("\nProcessing Summary:")
    print("-" * 50)
    for metrics in all_metrics:
        print(f"\nFile: {metrics['filename']}")
        # Accuracy exists only for files with ground truth; a string
        # placeholder cannot go through the float format spec.
        accuracy = metrics.get('accuracy')
        if accuracy is None:
            print("Accuracy: N/A")
        else:
            print(f"Accuracy: {accuracy:.4f}")
        print(f"Processing Time: {metrics['processing_time']:.2f} seconds")
        print(f"Peak Memory Usage: {metrics['peak_memory_mb']:.2f} MB")
        print(f"Peak CPU Usage: {metrics['peak_cpu_percent']:.2f}%")

    print("\nCombined Dataset Results:")
    print(f"Total Samples: {combined_metrics['sample_count']}")
    if 'accuracy' in combined_metrics:
        print(f"Overall Accuracy: {combined_metrics['accuracy']:.4f}")
    print(f"Average Confidence: {combined_metrics['avg_confidence']:.4f}")

    print(f"\nAll results saved in: {run_dir}")


if __name__ == "__main__":
    # Placeholder locations - point these at the real model and data before running.
    model_dir = '/model/directory/'
    base_output_dir = '/output/directory/'

    # Input files to classify.
    csv_paths = ["/csv/path/dataset.csv"]

    process_multiple_files(
        model_dir=model_dir,
        csv_paths=csv_paths,
        base_output_dir=base_output_dir,
    )

## XGBoost

### Training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack, csr_matrix
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import os
import joblib
import csv
import sys
import time
import psutil
import json
from datetime import datetime

class ExtendedAPIEncoder:
    """
    Label encoder for API-call names that is backed by a predefined vocabulary.

    The encoder is fitted on the union of the loaded vocabulary and the
    values passed to ``fit``. Values seen later that are not part of that
    union are encoded with the code of the first known class.
    """
    def __init__(self, unknown_value=-1):
        self.label_encoder = LabelEncoder()
        # NOTE: stored on the instance, but transform() does not emit this
        # value; unseen API calls fall back to the first known class instead.
        self.unknown_value = unknown_value
        self.vocabulary = set()

    def load_vocabulary(self, vocab_file):
        """Add every non-blank, stripped line of ``vocab_file`` to the vocabulary."""
        with open(vocab_file, 'r') as f:
            api_calls = {line.strip() for line in f if line.strip()}
        self.vocabulary.update(api_calls)

    def add_to_vocabulary(self, api_calls):
        """Add the given API call names to the vocabulary."""
        self.vocabulary.update(api_calls)

    def fit(self, api_calls):
        """Fit the label encoder on the vocabulary plus ``api_calls``; return self."""
        all_apis = list(self.vocabulary.union(set(api_calls)))
        self.label_encoder.fit(all_apis)
        return self

    def transform(self, api_calls):
        """Encode API calls as integer codes without modifying the input.

        A known value is encoded as its position in
        ``label_encoder.classes_``. Values that are not known are reported
        with a printed warning and encoded as 0, i.e. as the first class.

        The lookup uses a plain dict rather than writing the fallback label
        into a NumPy string array: such an array has a fixed item width, so a
        fallback label longer than the longest input string would be
        truncated and then rejected by the label encoder.

        Raises:
            ValueError: if unseen values are present but the encoder has no
                classes to fall back to.
        """
        classes = np.asarray(self.label_encoder.classes_).tolist()
        code_of = {label: code for code, label in enumerate(classes)}
        values = np.asarray(api_calls, dtype=object).tolist()

        unseen_apis = {value for value in values if value not in code_of}
        if unseen_apis:
            if not classes:
                raise ValueError("Cannot encode unseen API calls: the encoder has no classes")
            print(f"Warning: Found {len(unseen_apis)} unseen API calls not in vocabulary.")

        # 0 is the code of classes_[0], the stand-in for unseen values.
        return np.array([code_of.get(value, 0) for value in values], dtype=np.intp)

    def fit_transform(self, api_calls):
        """Fit on ``api_calls`` (plus the vocabulary) and return their codes."""
        self.fit(api_calls)
        return self.transform(api_calls)

    def inverse_transform(self, encoded_values):
        """Convert integer codes back to API call names."""
        return self.label_encoder.inverse_transform(encoded_values)

    def classes_(self):
        """Return the classes (API calls) known to the encoder.

        This is a method, so it has to be called: ``encoder.classes_()``.
        """
        return self.label_encoder.classes_

class SystemMonitor:
    """Collect CPU and memory samples around a training run.

    Sampling is manual: call :meth:`start` once, :meth:`update` whenever a
    sample should be recorded, and :meth:`stop` to obtain the summary.
    """

    def __init__(self, interval=1.0):
        # NOTE: ``interval`` is stored but not read by any method of this class.
        self.interval = interval
        self.cpu_percentages = []
        self.memory_usage = []
        self.start_time = None
        self.end_time = None

    def start(self):
        """Record the start timestamp and discard previously collected samples."""
        self.start_time = time.time()
        self.cpu_percentages = []
        self.memory_usage = []

    def update(self):
        """Append one CPU sample and one memory sample (RSS of this process, in MB).

        The CPU reading uses ``psutil.cpu_percent(interval=0.1)``, so every
        call waits for that 0.1 s measurement window.
        """
        self.cpu_percentages.append(psutil.cpu_percent(interval=0.1))
        self.memory_usage.append(psutil.Process().memory_info().rss / 1024 / 1024)  # MB

    @staticmethod
    def _mean(samples):
        """Return the mean of ``samples`` as a float, or 0.0 when it is empty."""
        return float(np.mean(samples)) if samples else 0.0

    @staticmethod
    def _peak(samples):
        """Return the largest value in ``samples``, or 0.0 when it is empty."""
        return max(samples) if samples else 0.0

    def stop(self):
        """Record the end timestamp and return the summary statistics.

        Returns:
            dict: ``training_time_seconds`` (elapsed time since ``start()``),
            plus the average and maximum of the recorded CPU percentages and
            memory readings. When ``update()`` was never called the four
            sample statistics are reported as 0.0 instead of raising, and
            when ``start()`` was never called the elapsed time is 0.0.
        """
        self.end_time = time.time()
        # Without a start timestamp there is nothing to measure against.
        started_at = self.end_time if self.start_time is None else self.start_time
        return {
            'training_time_seconds': self.end_time - started_at,
            'avg_cpu_percent': self._mean(self.cpu_percentages),
            'max_cpu_percent': self._peak(self.cpu_percentages),
            'avg_memory_mb': self._mean(self.memory_usage),
            'max_memory_mb': self._peak(self.memory_usage)
        }

def load_and_combine_data(file_paths):
    """
    Read several malware CSV files and return them as one DataFrame.

    Each file must contain the columns ``first_api``, ``last_api``,
    ``api_call_count``, ``api_sequence`` and ``malware_type``. Rows with any
    missing value are dropped, ``api_call_count`` is cast to int,
    ``api_sequence`` to str, and a ``source_file`` column holding the file's
    base name is added. Per-file and combined statistics are printed.

    Args:
        file_paths (list): List of paths to CSV files containing malware data

    Returns:
        pandas.DataFrame: All cleaned rows, concatenated with a fresh index.

    Raises:
        ValueError: If a file lacks one of the required columns.
        Exception: Any read/parse error is printed and re-raised.
    """
    # Raise the csv module's field size limit as far as the platform allows;
    # the value is shrunk tenfold each time it overflows.
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
        except OverflowError:
            limit = int(limit/10)
        else:
            break

    expected = ['first_api', 'last_api', 'api_call_count',
                'api_sequence', 'malware_type']
    frames = []

    for path in file_paths:
        try:
            # The 'python' engine is used to handle potential parsing issues
            frame = pd.read_csv(path, engine='python')

            absent = [name for name in expected if name not in frame.columns]
            if absent:
                raise ValueError(f"File {path} is missing required columns: {absent}")

            # Basic cleaning: drop incomplete rows, then normalise dtypes
            frame = frame.dropna()
            frame['api_call_count'] = frame['api_call_count'].astype(int)
            frame['api_sequence'] = frame['api_sequence'].astype(str)

            # Remember which file each row came from
            frame['source_file'] = os.path.basename(path)
            frames.append(frame)

            print(f"\nLoaded data from {path}")
            print(f"Samples: {len(frame)}")
            print("Class distribution:")
            per_file_counts = frame['malware_type'].value_counts()
            for label, n_rows in per_file_counts.items():
                print(f"{label}: {n_rows} samples")

        except Exception as e:
            print(f"Error loading {path}: {str(e)}")
            raise

    combined_df = pd.concat(frames, ignore_index=True)

    print("\nCombined Dataset Statistics:")
    print("-" * 50)
    print(f"Total samples: {len(combined_df)}")
    print("\nOverall class distribution:")
    for label, n_rows in combined_df['malware_type'].value_counts().items():
        print(f"{label}: {n_rows} samples")

    print("\nAPI calls statistics:")
    summary = combined_df['api_call_count'].describe()
    for caption, key in (("Min", 'min'), ("Max", 'max'), ("Mean", 'mean'), ("Median", '50%')):
        print(f"{caption} API calls: {summary[key]:.0f}")

    return combined_df

def prepare_data(df, api_vocab_file, max_features=2000, use_smote=True):
    """
    Prepare the data using extended API vocabulary.

    Adds the columns ``first_api_encoded``, ``last_api_encoded``,
    ``malware_type_encoded`` and ``api_call_count_norm`` to ``df`` (the frame
    is modified in place), builds TF-IDF features from ``api_sequence`` and
    stacks everything into one sparse feature matrix.

    Args:
        df: DataFrame with ``first_api``, ``last_api``, ``api_call_count``,
            ``api_sequence`` and ``malware_type`` columns.
        api_vocab_file: Text file with one API name per line; used both for the
            first/last API encoders and as the fixed TF-IDF vocabulary.
        max_features: Forwarded to ``TfidfVectorizer``. scikit-learn ignores it
            when a fixed vocabulary is supplied, as is done here.
        use_smote: When true, oversample with SMOTE (``random_state=42``).

    Returns:
        tuple: ``(X, y, malware_type_encoder, first_api_encoder,
        last_api_encoder, tfidf, feature_names)``.

    Note:
        SMOTE is applied to the whole data set here, so any split a caller
        performs afterwards will contain synthetic samples on both sides.
    """
    first_api_encoder = ExtendedAPIEncoder()
    last_api_encoder = ExtendedAPIEncoder()
    malware_type_encoder = LabelEncoder()

    print("Loading API vocabulary...")
    first_api_encoder.load_vocabulary(api_vocab_file)
    last_api_encoder.load_vocabulary(api_vocab_file)

    print("Encoding API calls...")
    df['first_api_encoded'] = first_api_encoder.fit_transform(df['first_api'])
    df['last_api_encoded'] = last_api_encoder.fit_transform(df['last_api'])
    df['malware_type_encoded'] = malware_type_encoder.fit_transform(df['malware_type'])

    # log1p compresses the heavy right tail of raw call counts
    df['api_call_count_norm'] = np.log1p(df['api_call_count'])

    print("Creating TF-IDF features...")
    with open(api_vocab_file, 'r') as f:
        vocabulary = {line.strip() for line in f if line.strip()}

    # NOTE(review): TfidfVectorizer lowercases documents by default; if the
    # vocabulary file contains upper-case API names those entries would never
    # match - confirm the file's casing.
    tfidf = TfidfVectorizer(
        max_features=max_features,
        sublinear_tf=True,
        ngram_range=(1, 2),
        min_df=1,
        vocabulary=vocabulary
    )
    api_sequence_features = tfidf.fit_transform(df['api_sequence'])

    numeric_features = np.column_stack((
        df['first_api_encoded'],
        df['last_api_encoded'],
        df['api_call_count_norm']
    ))
    numeric_features_sparse = csr_matrix(numeric_features)

    # Request CSR explicitly: hstack's default output format is not fixed, and
    # downstream code selects rows with X[index_array], which needs a
    # subscriptable format even when SMOTE is skipped.
    X = hstack([numeric_features_sparse, api_sequence_features], format='csr')
    y = df['malware_type_encoded']

    if use_smote:
        print("\nApplying SMOTE to balance classes...")
        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)
        print(f"Shape after SMOTE: {X.shape}")

    feature_names = (['First API', 'Last API', 'API Call Count'] +
                    [f'API_Seq_{i}' for i in range(api_sequence_features.shape[1])])

    return (X, y, malware_type_encoder, first_api_encoder, last_api_encoder,
            tfidf, feature_names)


def format_classification_report(y_true, y_pred, target_names):
    """Build a per-metric classification report with 4-decimal string values.

    Returns a dict keyed by ``precision``, ``recall``, ``f1-score`` and
    ``support``. Each maps a row label (every entry of ``target_names``, then
    ``'macro avg'`` and ``'weighted avg'``) to the score formatted as a string
    with four decimals, or to the support as an int.
    """
    raw_report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)

    formatted = {"precision": {}, "recall": {}, "f1-score": {}, "support": {}}
    score_keys = ("precision", "recall", "f1-score")

    # Per-class rows first, then the two aggregate rows, in one pass
    row_labels = list(target_names) + ['macro avg', 'weighted avg']
    for label in row_labels:
        row = raw_report[label]
        for key in score_keys:
            formatted[key][label] = f"{row[key]:.4f}"
        formatted["support"][label] = int(row['support'])

    return formatted

def train_xgboost(X, y, num_class, system_monitor=None):
    """
    Enhanced XGBoost training with system monitoring and early stopping

    Runs a 5-fold stratified cross-validation (printing per-fold and mean
    accuracy) and then refits the same estimator on the full data.

    Args:
        X: Feature matrix (scipy sparse or array supporting row indexing).
        y: Integer class labels, one per row of ``X``.
        num_class: Number of classes, forwarded to ``XGBClassifier``.
        system_monitor: Optional object with an ``update()`` method, called
            once after each fold.

    Returns:
        The ``XGBClassifier`` fitted on the full data set.
    """
    # Folds are selected with X[index_array] / y[index_array]. Make sure the
    # matrix is in a subscriptable sparse format and the labels are indexed by
    # position rather than by a pandas index.
    if hasattr(X, 'tocsr'):
        X = X.tocsr()
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    xgb_model = XGBClassifier(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softprob',
        num_class=num_class,
        tree_method='hist',
        eval_metric=['mlogloss', 'merror'],
        early_stopping_rounds=20,
        random_state=42,
        n_jobs=-1
    )

    cv_scores = []
    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        fold_start_time = time.time()

        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # The held-out fold drives early stopping for this fit
        eval_set = [(X_val, y_val)]

        xgb_model.fit(
            X_train, y_train,
            eval_set=eval_set,
            verbose=100
        )

        score = xgb_model.score(X_val, y_val)
        cv_scores.append(score)

        fold_time = time.time() - fold_start_time
        print(f"Fold {fold} accuracy: {score:.4f} (Time: {fold_time:.2f}s)")

        if system_monitor is not None:
            system_monitor.update()

    print(f"\nMean CV accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

    # Final training on full dataset. The evaluation set here is the training
    # data itself, because the estimator is configured with early stopping and
    # therefore requires an eval_set.
    eval_set = [(X, y)]
    xgb_model.fit(X, y, eval_set=eval_set, verbose=100)

    return xgb_model

def evaluate_model(model, X, y, malware_type_encoder, feature_names, output_dir, system_monitor):
    """
    Enhanced evaluation with system metrics and precise formatting

    Scores ``model`` on a 20% split of ``(X, y)``, stops ``system_monitor``
    and writes three files into ``output_dir``: ``model_metrics.json``,
    ``confusion_matrix.png`` and ``feature_importance.png``.

    Args:
        model: Fitted XGBoost classifier exposing ``predict`` and ``get_booster``.
        X, y: Feature matrix and encoded labels.
        malware_type_encoder: Fitted encoder whose ``classes_`` name the labels.
        feature_names: Display names, one per column of ``X`` in column order.
        output_dir: Directory for the generated files (created if missing).
        system_monitor: Monitor whose ``stop()`` returns the resource stats.

    Note:
        The split is drawn from the data passed in. When ``model`` was fitted
        on that same data (as ``main`` does), the reported scores are not a
        held-out estimate.
    """
    # main() calls this before save_artifacts(), which is the only other place
    # that creates output_dir; without this the first open() below fails.
    os.makedirs(output_dir, exist_ok=True)

    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    y_pred = model.predict(X_test)

    class_names = malware_type_encoder.classes_

    # Generate formatted classification report
    formatted_report = format_classification_report(y_test, y_pred, class_names)

    # Save metrics to JSON
    metrics_file = os.path.join(output_dir, 'model_metrics.json')
    system_stats = system_monitor.stop()

    metrics_data = {
        'classification_metrics': formatted_report,
        'system_stats': system_stats,
        'timestamp': datetime.now().isoformat()
    }

    with open(metrics_file, 'w') as f:
        json.dump(metrics_data, f, indent=2)

    # Print formatted results
    print("\nClassification Metrics (4 decimal precision):")
    print("-" * 80)
    for metric in ["precision", "recall", "f1-score"]:
        print(f"\n{metric.upper()}:")
        for class_name, value in formatted_report[metric].items():
            print(f"{class_name}: {value}")

    print("\nSystem Statistics:")
    print(f"Total training time: {system_stats['training_time_seconds']:.2f} seconds")
    print(f"Average CPU usage: {system_stats['avg_cpu_percent']:.1f}%")
    print(f"Peak CPU usage: {system_stats['max_cpu_percent']:.1f}%")
    print(f"Average memory usage: {system_stats['avg_memory_mb']:.1f} MB")
    print(f"Peak memory usage: {system_stats['max_memory_mb']:.1f} MB")

    # Create confusion matrix
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

    # Feature importance plot
    n_top_features = 20
    importance_type = 'weight'
    importance_scores = model.get_booster().get_score(importance_type=importance_type)

    # A booster trained on a bare matrix reports features as 'f0', 'f1', ...
    # rather than by display name, so fall back to the positional key.
    # Features never used in a split are absent from the dict and score 0.
    feature_importance_list = [
        (feature, importance_scores.get(feature, importance_scores.get(f'f{index}', 0)))
        for index, feature in enumerate(feature_names)
    ]

    feature_importance = pd.DataFrame(feature_importance_list, columns=['feature', 'importance'])
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    feature_importance = feature_importance.head(n_top_features)

    plt.figure(figsize=(12, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title(f'Top {n_top_features} Most Important Features ({importance_type})')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'feature_importance.png'))
    plt.close()

def save_artifacts(output_dir, model, tfidf, malware_type_encoder,
                  first_api_encoder, last_api_encoder, feature_names):
    """
    Write every artifact needed for inference into ``output_dir``.

    The model is stored through its own ``save_model`` method; all other
    objects are dumped with joblib. The directory is created when missing.
    Saving is best-effort: a failure for one artifact is printed and the
    remaining artifacts are still attempted.
    """
    os.makedirs(output_dir, exist_ok=True)

    def via_model_api(obj, target):
        return obj.save_model(target)

    def via_joblib(obj, target):
        return joblib.dump(obj, target)

    # (file name, object to store, writer) in the order they are written
    save_plan = [
        ('malware_classifier.json', model, via_model_api),
        ('tfidf_vectorizer.joblib', tfidf, via_joblib),
        ('malware_type_encoder.joblib', malware_type_encoder, via_joblib),
        ('first_api_encoder.joblib', first_api_encoder, via_joblib),
        ('last_api_encoder.joblib', last_api_encoder, via_joblib),
        ('feature_names.joblib', feature_names, via_joblib),
    ]

    for filename, artifact, writer in save_plan:
        try:
            writer(artifact, os.path.join(output_dir, filename))
            print(f"Saved {filename}")
        except Exception as e:
            print(f"Error saving {filename}: {str(e)}")

def main(csv_paths, output_dir, api_vocab_file):
    """
    Run the full training pipeline under system monitoring.

    Steps: load and combine the CSV files, build features, train the XGBoost
    model, evaluate it, and save all artifacts to ``output_dir``.

    Returns:
        The trained model.
    """
    monitor = SystemMonitor()
    monitor.start()

    # Load the raw samples and turn them into features
    dataset = load_and_combine_data(csv_paths)
    prepared = prepare_data(dataset, api_vocab_file, max_features=2000, use_smote=True)
    (X, y, malware_type_encoder, first_api_encoder, last_api_encoder,
     tfidf, feature_names) = prepared

    # XGBoost needs the number of distinct classes up front
    class_count = len(np.unique(y))
    model = train_xgboost(X, y, class_count, monitor)

    # Evaluation also stops the monitor and reports its statistics
    evaluate_model(model, X, y, malware_type_encoder, feature_names, output_dir, monitor)

    save_artifacts(output_dir, model, tfidf, malware_type_encoder,
                   first_api_encoder, last_api_encoder, feature_names)

    print(f"\nTraining complete. All artifacts saved to {output_dir}")
    return model

# Training entry point. The paths below are placeholders and must be replaced
# with real locations before running.
if __name__ == "__main__":
    # One or more CSV files holding the training samples
    csv_paths = [
        "/csv/path/data.csv"
    ]
    # Directory that receives the metrics, plots and saved artifacts
    output_dir = "output/directory/"
    # Text file with one API name per line
    api_vocab_file = "/api/calls/path/windowsapicalls.txt"
    model = main(csv_paths, output_dir, api_vocab_file)

### Inference

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, Booster, DMatrix
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import os
import time
import psutil
from datetime import datetime

class ExtendedAPIEncoder:
    """
    Custom encoder for API calls that handles unseen values using a predefined vocabulary.

    The encoder is fitted on the union of the loaded vocabulary and the
    training values. At transform time, values unknown to the fitted encoder
    are replaced by the first known class before encoding.
    """
    def __init__(self, unknown_value=-1):
        self.label_encoder = LabelEncoder()
        # Stored only; transform() does not emit this value. Unseen inputs are
        # mapped to the first known class instead.
        self.unknown_value = unknown_value
        self.vocabulary = set()

    def load_vocabulary(self, vocab_file):
        """Load API vocabulary from a text file (one name per line, blanks skipped)"""
        with open(vocab_file, 'r') as f:
            api_calls = {line.strip() for line in f if line.strip()}
        self.vocabulary.update(api_calls)

    def add_to_vocabulary(self, api_calls):
        """Add additional API calls to vocabulary"""
        self.vocabulary.update(api_calls)

    def fit(self, api_calls):
        """Fit the encoder using both the vocabulary and training data"""
        all_apis = list(self.vocabulary.union(set(api_calls)))
        self.label_encoder.fit(all_apis)
        return self

    def transform(self, api_calls):
        """Transform API calls, handling unseen values gracefully.

        Values not known to the fitted encoder are reported with a warning and
        encoded as the first known class. The input is never modified.
        """
        values = np.asarray(api_calls)
        unseen_mask = ~np.isin(values, self.label_encoder.classes_)
        if unseen_mask.any():
            unseen_apis = set(values[unseen_mask])
            print(f"Warning: Found {len(unseen_apis)} unseen API calls not in vocabulary.")
            # Work on an object-dtype copy: a fixed-width unicode array would
            # silently truncate the replacement when it is longer than every
            # input string, and the truncated text is itself unknown to the
            # encoder.
            values = values.astype(object)
            values[unseen_mask] = self.label_encoder.classes_[0]

        return self.label_encoder.transform(values)

    def fit_transform(self, api_calls):
        """Fit and transform in one step"""
        self.fit(api_calls)
        return self.transform(api_calls)

    def inverse_transform(self, encoded_values):
        """Convert encoded values back to API calls"""
        return self.label_encoder.inverse_transform(encoded_values)

    def classes_(self):
        """Return the classes (API calls) known to the encoder"""
        return self.label_encoder.classes_

def create_run_directory(base_output_dir):
    """
    Create the timestamped output tree for one inference run.

    The run directory is named ``run_<YYYYmmdd_HHMMSS>`` and receives the
    sub-directories ``individual_results``, ``confusion_matrices``,
    ``combined_results`` and ``metrics``.

    Args:
        base_output_dir (str): Base directory for all runs

    Returns:
        str: Path to the newly created directory
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_dir = os.path.join(base_output_dir, 'run_' + stamp)

    # makedirs also creates run_dir itself on the first iteration
    for child in ('individual_results', 'confusion_matrices', 'combined_results', 'metrics'):
        child_path = os.path.join(run_dir, child)
        os.makedirs(child_path, exist_ok=True)

    return run_dir

def get_system_metrics():
    """
    Collect system metrics during model execution.

    Returns:
        dict: ``memory_usage_mb`` (resident set size in MB), ``cpu_percent``
        and ``threads`` for the current process.

    ``cpu_percent`` is the process CPU utilisation since the previous call to
    this function. psutil computes it relative to the last call on the same
    ``Process`` object and returns a meaningless 0.0 on the first one, so the
    object is created once and reused; a new object per call would report 0.0
    every time.
    """
    process = getattr(get_system_metrics, '_process', None)
    if process is None:
        process = psutil.Process()
        get_system_metrics._process = process

    return {
        'memory_usage_mb': process.memory_info().rss / 1024 / 1024,
        'cpu_percent': process.cpu_percent(),
        'threads': process.num_threads(),
    }

def load_model_artifacts(model_dir):
    """
    Load all saved model artifacts required for inference.

    Args:
        model_dir: Directory containing ``malware_classifier.json`` and the
            ``*.joblib`` files for the vectorizer and the three encoders.

    Returns:
        tuple: ``(model, tfidf, malware_type_encoder, first_api_encoder,
        last_api_encoder)``.

    Raises:
        Exception: Any loading error is printed and re-raised.

    Note:
        joblib files are pickles; only load them from a trusted directory.
    """
    try:
        # Load XGBoost model using lower-level API to avoid version issues
        booster = Booster()
        booster.load_model(os.path.join(model_dir, 'malware_classifier.json'))

        # Load other artifacts
        tfidf = joblib.load(os.path.join(model_dir, 'tfidf_vectorizer.joblib'))
        malware_type_encoder = joblib.load(os.path.join(model_dir, 'malware_type_encoder.joblib'))
        first_api_encoder = joblib.load(os.path.join(model_dir, 'first_api_encoder.joblib'))
        last_api_encoder = joblib.load(os.path.join(model_dir, 'last_api_encoder.joblib'))

        # Create XGBClassifier wrapper around the loaded booster
        model = XGBClassifier()
        model._Booster = booster
        # The class count comes from the label encoder. The booster dump has
        # one entry per tree, so its length is not the number of classes.
        model.n_classes_ = len(malware_type_encoder.classes_)

        # Set objective for XGBClassifier
        model.objective = 'multi:softprob'

        return model, tfidf, malware_type_encoder, first_api_encoder, last_api_encoder

    except Exception as e:
        print(f"Error loading model artifacts: {str(e)}")
        raise

def prepare_batch_samples(df, tfidf, first_api_encoder, last_api_encoder):
    """
    Build the sparse feature matrix for every row of ``df``.

    Column layout: encoded first API, encoded last API, ``log1p`` of the API
    call count, followed by the TF-IDF features of ``api_sequence``.

    Any failure is printed and re-raised.
    """
    try:
        # Three numeric columns, computed in the order they appear in the matrix
        numeric_columns = [
            first_api_encoder.transform(df['first_api']),
            last_api_encoder.transform(df['last_api']),
            np.log1p(df['api_call_count']),
        ]

        sequence_block = tfidf.transform(df['api_sequence'])
        numeric_block = csr_matrix(np.column_stack(numeric_columns))

        return hstack([numeric_block, sequence_block])
    except Exception as e:
        print(f"Error preparing features: {str(e)}")
        raise

def _save_confusion_heatmap(matrix, fmt, title, labels, output_path):
    """Render one annotated heatmap to ``output_path`` and always close the figure."""
    plt.figure(figsize=(12, 10))
    try:
        sns.heatmap(matrix, annot=True, fmt=fmt, cmap='Blues',
                    xticklabels=labels,
                    yticklabels=labels)
        plt.title(title)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Close even on failure so figures do not accumulate across files
        plt.close()


def plot_confusion_matrix(true_labels, predicted_labels, class_names, output_path_prefix):
    """
    Create and save confusion matrix visualization.

    Writes ``<prefix>_counts.png`` (raw counts) and ``<prefix>_normalized.png``
    (each row divided by its total).

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, same length as ``true_labels``.
        class_names: Labels that define the row/column order of the matrix.
            Labels that occur in the data but are missing here are appended
            (sorted by their string form) so no sample is dropped.
        output_path_prefix (str): Base path for saving confusion matrices (without extension)

    Raises:
        Exception: Any failure is printed and re-raised.
    """
    try:
        # Fix the matrix order to the tick labels. Without an explicit label
        # list sklearn orders rows/columns by the sorted labels present in the
        # data, which need not match class_names in order or length.
        labels = list(class_names)
        observed = set(true_labels) | set(predicted_labels)
        labels.extend(sorted(observed - set(labels), key=str))

        cm = confusion_matrix(true_labels, predicted_labels, labels=labels)

        # Classes with no true samples have a zero row total; leave those rows
        # at 0 instead of dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm, row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0
        )

        _save_confusion_heatmap(cm, 'd', 'Confusion Matrix (Raw Counts)',
                                labels, f"{output_path_prefix}_counts.png")
        _save_confusion_heatmap(cm_normalized, '.4%', 'Confusion Matrix (Normalized)',
                                labels, f"{output_path_prefix}_normalized.png")
    except Exception as e:
        print(f"Error creating confusion matrix: {str(e)}")
        raise

def process_single_file(model_dir, file_path, run_dir):
    """
    Process a single CSV file and save results.

    Loads the file, predicts a malware type for every row and writes the
    predictions, two confusion-matrix images and a classification report into
    the sub-directories of ``run_dir``.

    Args:
        model_dir: Directory holding the saved model artifacts.
        file_path: CSV file with ``first_api``, ``last_api``,
            ``api_call_count``, ``api_sequence`` and ``malware_type`` columns.
        run_dir: Run directory containing ``individual_results``,
            ``confusion_matrices`` and ``metrics``.

    Returns:
        tuple: ``(results_df, file_metrics)`` on success, ``(None, None)`` when
        anything fails (the error is printed, not raised, so a batch run can
        continue with the next file).
    """
    start_time = time.time()
    initial_metrics = get_system_metrics()
    file_name = os.path.basename(file_path)
    # Strip whatever extension the file has instead of assuming four characters
    file_stem = os.path.splitext(file_name)[0]

    try:
        print(f"\nProcessing {file_name}...")

        # Load data
        df = pd.read_csv(file_path)
        required_columns = ['first_api', 'last_api', 'api_call_count', 'api_sequence', 'malware_type']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Apply the same cleaning as at training time: incomplete rows are
        # dropped and the count/sequence columns get their expected dtypes.
        # A single missing api_sequence would otherwise fail the whole file.
        rows_read = len(df)
        df = df.dropna(subset=required_columns).reset_index(drop=True)
        rows_dropped = rows_read - len(df)
        if rows_dropped:
            print(f"Warning: Dropped {rows_dropped} rows with missing values in required columns.")
        df['api_call_count'] = df['api_call_count'].astype(int)
        df['api_sequence'] = df['api_sequence'].astype(str)

        # Load model and artifacts
        model, tfidf, malware_type_encoder, first_api_encoder, last_api_encoder = \
            load_model_artifacts(model_dir)

        # Prepare features
        X = prepare_batch_samples(df, tfidf, first_api_encoder, last_api_encoder)

        # Make predictions; the booster yields one probability column per class
        dmatrix = DMatrix(X)
        pred_proba = model.get_booster().predict(dmatrix)
        pred_classes = pred_proba.argmax(axis=1)
        predicted_labels = malware_type_encoder.inverse_transform(pred_classes)

        # Prepare results DataFrame
        results_df = df.copy()
        results_df['predicted_malware_type'] = predicted_labels
        results_df['confidence'] = pred_proba.max(axis=1)

        # Add class probabilities
        for i, class_name in enumerate(malware_type_encoder.classes_):
            results_df[f'prob_{class_name}'] = pred_proba[:, i]

        # Calculate metrics
        accuracy = (predicted_labels == df['malware_type']).mean()
        file_metrics = {
            'filename': file_name,
            'processing_time': time.time() - start_time,
            'sample_count': len(df),
            'accuracy': round(accuracy, 4),
            'avg_confidence': results_df['confidence'].mean(),
            'low_confidence_count': int((results_df['confidence'] < 0.5).sum())
        }

        # Create confusion matrix
        output_path_prefix = os.path.join(run_dir, 'confusion_matrices', f'confusion_matrix_{file_stem}')
        plot_confusion_matrix(
            df['malware_type'],
            predicted_labels,
            malware_type_encoder.classes_,
            output_path_prefix
        )

        # Save classification report
        report = classification_report(df['malware_type'], predicted_labels, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_csv(os.path.join(run_dir, 'metrics', f'classification_report_{file_name}'), index=True)

        # Add system metrics (larger of the readings taken before and after)
        final_metrics = get_system_metrics()
        file_metrics.update({
            'peak_memory_mb': max(initial_metrics['memory_usage_mb'], final_metrics['memory_usage_mb']),
            'peak_cpu_percent': max(initial_metrics['cpu_percent'], final_metrics['cpu_percent']),
            'peak_threads': max(initial_metrics['threads'], final_metrics['threads'])
        })

        # Save individual results
        output_path = os.path.join(run_dir, 'individual_results', f'predictions_{file_name}')
        results_df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

        return results_df, file_metrics

    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None, None

def process_multiple_files(model_dir, csv_paths, base_output_dir):
    """
    Process multiple CSV files and generate combined analysis.

    Creates a timestamped run directory, processes every existing file in
    ``csv_paths`` individually, then writes the concatenated predictions, a
    combined confusion matrix and a metrics summary.

    Args:
        model_dir: Directory holding the saved model artifacts.
        csv_paths: Paths of the CSV files to score; missing files are skipped
            with a warning.
        base_output_dir: Directory under which the run directory is created.

    Raises:
        ValueError: If no file could be processed successfully.
    """
    # Create directory for this run
    run_dir = create_run_directory(base_output_dir)
    print(f"Created new run directory: {run_dir}")

    # Process each file individually
    all_metrics = []
    all_dfs = []

    for file_path in csv_paths:
        if not os.path.exists(file_path):
            print(f"Warning: File not found - {file_path}")
            continue

        df, metrics = process_single_file(model_dir, file_path, run_dir)

        if df is not None and metrics is not None:
            all_dfs.append(df)
            all_metrics.append(metrics)

    if not all_dfs:
        raise ValueError("No files were processed successfully")

    # Process combined dataset
    print("\nProcessing combined dataset...")
    combined_df = pd.concat(all_dfs, ignore_index=True)

    # Calculate combined metrics
    combined_metrics = {
        'filename': 'combined_dataset',
        'sample_count': len(combined_df),
        'accuracy': round((combined_df['malware_type'] == combined_df['predicted_malware_type']).mean(), 4),
        'avg_confidence': combined_df['confidence'].mean(),
        'low_confidence_count': int((combined_df['confidence'] < 0.5).sum())
    }

    # Create combined confusion matrix. The axis labels are the sorted union of
    # true and predicted labels, which is the order sklearn uses for the matrix
    # by default. The predicted labels alone, in order of first appearance,
    # can be misordered and can miss classes that were never predicted.
    combined_labels = sorted(
        set(combined_df['malware_type']) | set(combined_df['predicted_malware_type'])
    )
    plot_confusion_matrix(
        combined_df['malware_type'],
        combined_df['predicted_malware_type'],
        combined_labels,
        os.path.join(run_dir, 'combined_results', 'confusion_matrix_combined')
    )

    # Save combined results
    combined_output = os.path.join(run_dir, 'combined_results', 'combined_predictions.csv')
    combined_df.to_csv(combined_output, index=False)

    # Create summary report
    summary = pd.DataFrame(all_metrics + [combined_metrics])
    summary.to_csv(os.path.join(run_dir, 'metrics', 'processing_metrics.csv'), index=False)

    # Print summary
    print("\nProcessing Summary:")
    print("-" * 50)
    for metrics in all_metrics:
        print(f"\nFile: {metrics['filename']}")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Processing Time: {metrics['processing_time']:.2f} seconds")
        print(f"Peak Memory Usage: {metrics['peak_memory_mb']:.2f} MB")
        print(f"Peak CPU Usage: {metrics['peak_cpu_percent']:.2f}%")

    print("\nCombined Dataset Results:")
    print(f"Total Samples: {combined_metrics['sample_count']}")
    print(f"Overall Accuracy: {combined_metrics['accuracy']:.4f}")
    print(f"Average Confidence: {combined_metrics['avg_confidence']:.4f}")

    print(f"\nAll results saved in: {run_dir}")

# Inference entry point. The paths below are placeholders and must be replaced
# with real locations before running.
if __name__ == "__main__":
    # CSV files to score; each needs the same columns as the training data
    csv_paths = [
        "/csv/path/dataset.csv"
    ]

    # Directory containing the artifacts written by the training step
    model_dir = '/model/directory/'
    # A timestamped run_* directory is created underneath this path
    base_output_dir = '/output/directory/'

    process_multiple_files(model_dir, csv_paths, base_output_dir)