In [1]:
import os
import pandas as pd
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

from sklearn.metrics import (
     accuracy_score, auc,
     classification_report,
     confusion_matrix)

In [2]:
def show_importances_feature(shape, mapped_df, model):
    """Plot the model's feature importances as a horizontal bar chart.

    Bars are sorted by importance in ascending order; ``barh`` draws the first
    entry at the bottom, so the most important feature ends up at the top.

    Args:
        shape: Number of bars to draw. Values larger than the number of
            features are clamped to it; smaller values keep only the ``shape``
            most important features. ``None`` draws every feature.
        mapped_df: DataFrame whose column ``i`` is used as the tick label for
            importance ``i``.
        model: Object exposing ``feature_importances_`` (array-like).
    """
    # np.asarray lets plain lists be indexed with a list of positions below.
    importances = np.asarray(model.feature_importances_)
    n_features = importances.shape[0]
    n_bars = n_features if shape is None else max(0, min(int(shape), n_features))

    # Ascending, stable sort (ties keep column order); keep the last n_bars
    # entries, i.e. the most important ones.
    ascending = sorted(range(n_features), key=lambda i: importances[i])
    indexes = ascending[n_features - n_bars:]
    feature_column_names = [f"Features name - {mapped_df.columns[i]} - index -  {i}" for i in indexes]

    plt.figure(figsize=(8, 14))
    plt.barh(range(n_bars), importances[indexes], align="center")
    plt.yticks(range(n_bars), feature_column_names)
    plt.xlabel("Importance")
    plt.title("Feature importances")
    plt.show()


def check_missing_values(data_frame):
    """Print a per-column report of missing values.

    For every column that has at least one null entry, prints the column
    name, the number of nulls and their share of all rows (in percent).
    Prints a single "nothing missing" line when no column has nulls.

    Args:
        data_frame: pandas DataFrame to inspect.
    """
    null_counts = data_frame.isnull().sum()
    affected = null_counts[null_counts > 0]
    n_rows = data_frame.shape[0]

    # Guard clause: nothing to report.
    if affected.empty:
        print("Brak brakujących wartości w zbiorze danych.")
        return

    print("=== Kolumny z brakującymi wartościami ===")
    for column, count in affected.items():
        percentage = count / n_rows * 100
        print(f"Kolumna: {column} - {count} brakujących wartości - ({percentage:.2f}% danych)")


def visualize_labels(data_frame, label_column='Label'):
    """Draw a bar chart with the number of rows per label value.

    Args:
        data_frame: pandas DataFrame containing the label column.
        label_column: Name of the column whose value counts are plotted.
            Defaults to ``'Label'``.
    """
    label_counts = data_frame[label_column].value_counts()

    # Note: this changes the seaborn theme for the whole session.
    sns.set(style="whitegrid")

    plt.figure(figsize=(10, 6))

    ax = sns.barplot(x=label_counts.index, y=label_counts.values, palette="viridis", hue=label_counts.index, legend=False)

    for p in ax.patches:
        # Bar heights may be floats; the values are counts, so print them
        # without a fractional part ("5000" instead of "5000.0").
        ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='bottom', fontsize=12)

    plt.title('Distribution of samples', fontsize=16)
    plt.xlabel('Attack type', fontsize=14)
    plt.ylabel('Number of Cases', fontsize=14)

    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

def time_execution(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, print how long it took, return its result.

    The elapsed time is printed in minutes with four decimal places.

    Args:
        func: Callable to run.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns. Exceptions raised by ``func`` propagate and
        no timing line is printed in that case.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # (or turn negative) when the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    execution_time = (time.perf_counter() - start_time) / 60
    print(f"time execution: {execution_time:.4f} minut")

    return result

def extract_important_features(model, mapped_df, threshold=0.010):
    """Select features whose importance reaches ``threshold`` and plot them.

    Args:
        model: Object exposing ``feature_importances_``.
        mapped_df: DataFrame whose column ``i`` names the feature with
            importance ``i``.
        threshold: Minimum importance (inclusive) for a feature to be kept.

    Returns:
        Array of positional indices of the selected features, in column
        order, or ``None`` (after printing a message) when no feature
        reaches the threshold.
    """
    importances = model.feature_importances_

    # Positions of every feature at or above the cut-off.
    indices = np.nonzero(importances >= threshold)[0]
    if indices.size == 0:
        print(f"No features with significance >= {threshold}.")
        return None

    selected_scores = importances[indices]
    selected_names = [mapped_df.columns[position] for position in indices]

    # Ascending order: barh draws the first entry at the bottom, so the most
    # important feature ends up at the top of the chart.
    order = np.argsort(selected_scores)
    names_in_order = [selected_names[k] for k in order]
    scores_in_order = selected_scores[order]

    plt.figure(figsize=(10, 6))
    plt.barh(names_in_order, scores_in_order, align='center')
    plt.xlabel('Importance')
    plt.title(f'Features with significance >= {threshold} in the DDoS model.')
    plt.show()

    return indices

 ## Generate and display a detailed confusion matrix
def plot_confusion_matrix(y_true, y_pred, classes, title='Confusion Matrix'):
    """Compute the confusion matrix and show it as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Tick labels for both axes. When every label occurring in
            ``y_true``/``y_pred`` is itself an element of ``classes``, the
            matrix rows/columns are computed in exactly this order so that
            ticks and cells line up even if ``classes`` is not sorted or
            lists a class that never occurs. Otherwise ``classes`` is treated
            as display names only and the matrix uses sklearn's default
            (sorted) label order.
        title: Figure title.
    """
    class_list = list(classes)
    observed = set(np.unique(y_true).tolist()) | set(np.unique(y_pred).tolist())

    if observed <= set(class_list):
        # `classes` holds the actual label values: pin the matrix order to it.
        cm = confusion_matrix(y_true, y_pred, labels=class_list)
    else:
        # `classes` are display names (e.g. decoded strings for integer
        # labels): keep the default ordering.
        cm = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                 xticklabels=classes, yticklabels=classes)

    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def visualizationMetrics(accuracy, f1, precision, recall):
    """Print the four scores as a two-column (Metric, Value) table.

    Args:
        accuracy: Accuracy score.
        f1: F1 score.
        precision: Precision score.
        recall: Recall score.
    """
    metric_names = ['Accuracy', 'F1 Score', 'Precision', 'Recall']
    metric_values = [accuracy, f1, precision, recall]

    summary = pd.DataFrame({'Metric': metric_names, 'Value': metric_values})
    print(summary)

def visualize_model_accuracies(accuracies, title='Model Accuracies Comparison', color='skyblue'):
    """Show a horizontal bar chart comparing model accuracies.

    Args:
        accuracies: Mapping of model name to accuracy. The x axis is fixed to
            the range 0..1.
        title: Figure title.
        color: Bar colour.
    """
    model_names = list(accuracies.keys())
    scores = list(accuracies.values())

    plt.figure(figsize=(10, 6))

    # Write each bar's value (four decimals) at the end of the bar,
    # vertically centred on it.
    for bar in plt.barh(model_names, scores, color=color):
        bar_length = bar.get_width()
        label_y = bar.get_y() + bar.get_height()/2
        plt.text(bar_length, label_y, f'{bar_length:.4f}', va='center')

    plt.xlabel('Accuracy')
    plt.title(title)
    plt.xlim(0, 1)
    plt.grid(axis='x')
    plt.show()


def visualize_model_metrics(metrics, title='Model Performance Metrics Comparison', color='lightcoral'):
    """Show grouped bars with F1 score, precision and recall for each model.

    Args:
        metrics: Mapping of model name to a mapping that contains the keys
            ``'F1 Score'``, ``'Precision'`` and ``'Recall'``.
        title: Figure title.
        color: Colour of the F1 bars (precision and recall use fixed colours).
    """
    model_names = list(metrics.keys())

    def _collect(metric_name):
        # One value per model, in the same order as model_names.
        return [metrics[name][metric_name] for name in model_names]

    f1_values = _collect('F1 Score')
    precision_values = _collect('Precision')
    recall_values = _collect('Recall')

    positions = np.arange(len(model_names))  # one group per model on the x axis
    bar_width = 0.2

    _fig, ax = plt.subplots(figsize=(12, 6))

    # Three bars per model: F1 to the left, precision centred, recall to the right.
    bar_groups = [
        ax.bar(positions - bar_width, f1_values, bar_width, label='F1 Score', color=color),
        ax.bar(positions, precision_values, bar_width, label='Precision', color='skyblue'),
        ax.bar(positions + bar_width, recall_values, bar_width, label='Recall', color='lightgreen'),
    ]

    ax.set_ylabel('Scores')
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names)
    ax.legend()

    # Print each value just above its bar.
    for group in bar_groups:
        for bar in group:
            value = bar.get_height()
            ax.annotate(
                f'{value:.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, value),
                xytext=(0, 3),  # shift the text 3 points upwards
                textcoords="offset points",
                ha='center',
                va='bottom',
            )

    plt.ylim(0, 1)  # y axis fixed to 0..1
    plt.grid(axis='y')

    # Leave some free space on the right-hand side of the figure.
    plt.subplots_adjust(right=0.85)
    plt.show()

def plot_roc_curve(model, X_test, Y_test, model_name):
    """Plot one-vs-rest ROC curves (with AUC) for every class of a classifier.

    Works for binary and multiclass problems. Each class is scored against
    all other classes using its own probability column.

    Args:
        model: Classifier exposing ``predict_proba``. If it also exposes
            ``classes_``, that is used to map probability columns to class
            labels; otherwise the sorted unique values of ``Y_test`` are used.
        X_test: Features passed to ``model.predict_proba``.
        Y_test: True labels for ``X_test``.
        model_name: Name shown in the figure title.

    Raises:
        ValueError: If the number of probability columns does not match the
            number of classes, so columns cannot be mapped to labels.
    """
    y_true = np.asarray(Y_test).ravel()
    proba = np.asarray(model.predict_proba(X_test))

    # Prefer the model's own class list: Y_test may lack a class that the
    # model was trained on, which would shift the column <-> class mapping.
    classes = getattr(model, "classes_", None)
    if classes is None:
        classes = np.unique(y_true)
    classes = np.asarray(classes)

    if proba.ndim != 2 or proba.shape[1] != len(classes):
        raise ValueError(
            "predict_proba returned shape {} but {} classes are known".format(
                proba.shape, len(classes)
            )
        )

    # Build the one-vs-rest indicator directly instead of label_binarize,
    # which yields a single column for two classes and breaks per-class
    # indexing in the binary case.
    curves = []
    for column, cls in enumerate(classes):
        is_cls = (y_true == cls)
        if not is_cls.any() or is_cls.all():
            # ROC needs both positive and negative samples; skip otherwise.
            continue
        fpr, tpr, _ = roc_curve(is_cls.astype(int), proba[:, column])
        curves.append((cls, fpr, tpr, auc(fpr, tpr)))

    plt.figure(figsize=(10, 8))

    for cls, fpr, tpr, roc_auc in curves:
        plt.plot(fpr, tpr, label=f'Class {cls} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], linestyle='--', color='black', label='Random Classifier (AUC - 0.50)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic (ROC) Curve for {model_name}')
    plt.legend()
    plt.grid()
    plt.show()


def visualize_model_probabilities(model_probabilities, title='Model Probability Metrics Comparison', color='lightblue'):
    """Show grouped bars with per-class probabilities for each model.

    Args:
        model_probabilities: Mapping of model name to a sequence with one
            probability per class (a single number per model is treated as
            one class).
        title: Figure title.
        color: Base bar colour. Classes share this colour and are told apart
            by decreasing opacity, so the legend entries are distinguishable.

    Raises:
        ValueError: If ``model_probabilities`` is empty.
    """
    models = list(model_probabilities.keys())
    if not models:
        raise ValueError("model_probabilities must contain at least one model")

    probabilities = np.array(list(model_probabilities.values()))
    if probabilities.ndim == 1:
        # One scalar per model -> a single class column.
        probabilities = probabilities[:, np.newaxis]
    n_classes = probabilities.shape[1]

    x = np.arange(len(models))  # group positions on the x axis
    # Groups are one unit apart: shrink the bars when there are many classes
    # so that neighbouring groups never overlap (0.35 is kept for <= 2 classes).
    width = min(0.35, 0.8 / max(n_classes, 1))

    fig, ax = plt.subplots(figsize=(12, 6))

    # One bar series per class, each with its own opacity.
    for i in range(n_classes):
        alpha = 1.0 - 0.6 * i / max(n_classes - 1, 1)
        ax.bar(x + i * width, probabilities[:, i], width, label=f'Class {i}', color=color, alpha=alpha)

    ax.set_ylabel('Probability')
    ax.set_title(title)
    # Centre each tick under its whole group of bars.
    ax.set_xticks(x + width * (n_classes - 1) / 2)
    ax.set_xticklabels(models)
    ax.legend()

    # Print each value just above its bar.
    for i in range(n_classes):
        for j in range(len(models)):
            ax.annotate(f'{probabilities[j, i]:.2f}',
                        xy=(x[j] + i * width, probabilities[j, i]),
                        xytext=(0, 3),  # shift the text 3 points upwards
                        textcoords="offset points",
                        ha='center', va='bottom')

    plt.ylim(0, 1)  # y axis fixed to 0..1
    plt.grid(axis='y')
    plt.show()