In [1]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    cohen_kappa_score,
    classification_report,
)
import warnings

# Silence every warning for the rest of the session so the training output stays readable.
# NOTE(review): this is a blanket filter, so it presumably also hides solver convergence
# warnings; the saved output of the training cell reports "max_iter reached" for several
# runs. Consider narrowing the filter to specific warning categories.
warnings.filterwarnings("ignore")

In [2]:
# Directory with the combined CSV datasets
combined_path = "combined_csvs"

# Dataset file names carry their packet count as "<N>_packets"
PACKETS_RE = re.compile(r"(\d+)_packets")


def packet_count(filename):
    """Return the integer N from an "<N>_packets" fragment of *filename*, or None if absent."""
    found = PACKETS_RE.search(filename)
    return int(found.group(1)) if found else None


csv_files = [f for f in os.listdir(combined_path) if f.endswith(".csv")]

# A CSV without the "<N>_packets" marker can be neither ordered nor labelled with a
# packet size, so it is reported and skipped instead of crashing the sort below.
unrecognised_files = [f for f in csv_files if packet_count(f) is None]
if unrecognised_files:
    print(f"  ! Пропущены файлы без '<N>_packets' в имени: {unrecognised_files}")

# Process the datasets in ascending order of packet count (e.g. _250_)
combined_files = sorted(
    (f for f in csv_files if packet_count(f) is not None), key=packet_count
)

# Models to compare; verbose output is enabled where the estimator supports it
models = {
    "Random Forest": RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42, verbose=1),
    "SVM": LinearSVC(verbose=1, random_state=42),
    "k-NN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "Logistic Regression": LogisticRegression(
        solver="saga",
        multi_class="ovr",
        class_weight="balanced",
        max_iter=1000,
        verbose=1,
        n_jobs=-1,
        random_state=42,
    ),
}

# Folder for the per-model intermediate files
intermediate_path = "binary_class_models_results"
os.makedirs(intermediate_path, exist_ok=True)

# Summary metrics that are reported as 0.0 when a model fails on a dataset
FAILED_RUN_METRICS = (
    "Accuracy",
    "Macro F1 Score",
    "Micro F1 Score",
    "Weighted F1 Score",
    "Macro Precision",
    "Micro Precision",
    "Cohen's Kappa",
)

# Every result row of this run (all models, all datasets)
all_results = []

# The same rows grouped by model; used to rewrite each model's intermediate CSV
results_by_model = {model_name: [] for model_name in models}

# Iterate over every dataset
for combined_file in tqdm(combined_files, desc="Обработка файлов"):
    df = pd.read_csv(os.path.join(combined_path, combined_file))
    X = df.drop(columns=["Label"])
    y = df["Label"]

    # Packet count taken from the file name
    packet_size = packet_count(combined_file)

    # Split and scale in a dedicated try block so one bad dataset does not stop the run
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )
        # The scaler is fitted on the training split only and reused for the test split
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    except ValueError as e:
        print(f"  ! Ошибка при подготовке данных для файла '{combined_file}': {e}")
        continue

    # Train and score every model on this dataset
    for model_name, model in models.items():
        results = {
            "Model": model_name,
            "File": combined_file,
            "Packet Size": packet_size,
        }

        print(f"  - Обучение модели {model_name}")
        try:
            # Fit (the estimators print their own progress) and predict on the held-out split
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Summary metrics
            results["Accuracy"] = accuracy_score(y_test, y_pred)
            results["Macro F1 Score"] = f1_score(y_test, y_pred, average="macro")
            results["Micro F1 Score"] = f1_score(y_test, y_pred, average="micro")
            results["Weighted F1 Score"] = f1_score(y_test, y_pred, average="weighted")
            results["Macro Precision"] = precision_score(
                y_test, y_pred, average="macro"
            )
            results["Micro Precision"] = precision_score(
                y_test, y_pred, average="micro"
            )
            results["Cohen's Kappa"] = cohen_kappa_score(y_test, y_pred)

            # Per-entry precision/recall/F1 from the classification report; only the
            # dict-valued entries are kept (scalar entries such as accuracy are skipped)
            class_report = classification_report(y_test, y_pred, output_dict=True)
            for label, metrics in class_report.items():
                if isinstance(metrics, dict):
                    results[f"Precision_{label}"] = metrics["precision"]
                    results[f"Recall_{label}"] = metrics["recall"]
                    results[f"F1_{label}"] = metrics["f1-score"]

        except ValueError as e:
            # On failure record zeros as real floats (not strings) so the columns stay
            # numeric and float_format applies to them like to every other value
            results.update(dict.fromkeys(FAILED_RUN_METRICS, 0.0))
            print(
                f"  ! Ошибка обучения модели '{model_name}' для файла '{combined_file}': {e}"
            )

        # Record the row
        all_results.append(results)
        results_by_model[model_name].append(results)

        # Rewrite this model's intermediate CSV from all rows collected so far.
        # Building the frame from the whole list lets pandas align rows whose column
        # sets differ (failed runs, differing per-class columns), and re-running the
        # cell replaces the file instead of appending duplicate rows under a stale header.
        model_results_df = pd.DataFrame(results_by_model[model_name])
        model_results_file = os.path.join(
            intermediate_path, f"{model_name}_multiclass_results.csv"
        )
        model_results_df.to_csv(model_results_file, index=False, float_format="%.3f")

# Final file with the results of all models
final_results_df = pd.DataFrame(all_results)
final_results_df.to_csv("final_multiclass_models_results.csv", index=False, float_format="%.3f")
print("Итоговый файл сохранён в 'final_multiclass_models_results.csv'")

Обработка файлов:   0%|          | 0/9 [00:00<?, ?it/s]

  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.8s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.9s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2981            1.34m
         2           1.2377            1.34m
         3           1.1881            1.31m
         4           1.1466            1.28m
         5           1.1120            1.26m
         6           1.0830            1.24m
         7           1.0586            1.24m
         8           1.0374            1.23m
         9           1.0195            1.21m
        10           1.0045            1.19m
        20           0.9301            1.04m
        30           0.9093           54.58s
        40           0.9012           46.92s
        50           0.8970           39.07s
        60           0.8945           31.21s
        70           0.8927           23.41s
        80           0.8909           15.67s
        90           0.8890            7.82s
       100           0.8879            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


max_iter reached after 247 seconds


Обработка файлов:  11%|█         | 1/9 [06:18<50:29, 378.67s/it]

  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2870           17.11s
         2           1.2178           16.99s
         3           1.1608           17.06s
         4           1.1134           16.86s
         5           1.0737           16.65s
         6           1.0400           16.36s
         7           1.0115           16.09s
         8           0.9872           15.90s
         9           0.9664           15.66s
        10           0.9486           15.43s
        20           0.8576           13.71s
        30           0.8314           12.03s
        40           0.8196           10.30s
        50           0.8125            8.58s
        60           0.8076            6.88s
        70           0.8043            5.15s
        80           0.8014            3.44s
        90           0.7991            1.72s
       100           0.7963            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


convergence after 140 epochs took 4 seconds


Обработка файлов:  22%|██▏       | 2/9 [06:48<20:12, 173.26s/it][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2803            9.20s
         2           1.2055            9.20s
         3           1.1440            9.18s
         4           1.0926            9.02s
         5           1.0493            8.87s
         6           1.0123            8.71s
         7           0.9808            8.52s
         8           0.9538            8.42s
         9           0.9304            8.31s
        10           0.9103            8.17s
        20           0.8062            7.24s
        30           0.7735            6.35s
        40           0.7607            5.44s
        50           0.7536            4.54s
        60           0.7485            3.63s
        70           0.7450            2.71s
        80           0.7418            1.81s
        90           0.7390            0.91s
       100           0.7373            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  33%|███▎      | 3/9 [07:14<10:36, 106.07s/it]

max_iter reached after 13 seconds


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2742            4.16s
         2           1.1956            4.11s
         3           1.1297            4.10s
         4           1.0750            4.03s
         5           1.0284            3.97s
         6           0.9889            3.88s
         7           0.9556            3.81s
         8           0.9268            3.76s
         9           0.9019            3.72s
        10           0.8798            3.65s
        20           0.7664            3.23s
        30           0.7313            2.83s
        40           0.7177            2.42s
        50           0.7100            2.01s
        60           0.7050            1.61s
        70           0.7013            1.20s
        80           0.6977            0.81s
        90           0.6949            0.41s
       100           0.6926            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  44%|████▍     | 4/9 [07:24<05:40, 68.06s/it] 

convergence after 787 epochs took 4 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2672            2.18s
         2           1.1814            2.20s
         3           1.1106            2.20s
         4           1.0513            2.16s
         5           1.0013            2.13s
         6           0.9587            2.10s
         7           0.9220            2.06s
         8           0.8907            2.03s
         9           0.8632            2.01s
        10           0.8387            1.98s
        20           0.7108            1.75s
        30           0.6678            1.53s
        40           0.6491            1.31s
        50           0.6386            1.09s
        60           0.6310            0.87s
        70           0.6248            0.65s
        80           0.6203            0.44s
        90           0.6159            0.22s
       100           0.6122            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  56%|█████▌    | 5/9 [07:29<03:02, 45.63s/it]

max_iter reached after 3 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2580            1.58s
         2           1.1652            1.57s
         3           1.0880            1.58s
         4           1.0228            1.56s
         5           0.9675            1.54s
         6           0.9207            1.52s
         7           0.8799            1.49s
         8           0.8451            1.47s
         9           0.8144            1.46s
        10           0.7870            1.43s
        20           0.6428            1.26s
        30           0.5934            1.10s
        40           0.5717            0.93s
        50           0.5599            0.78s
        60           0.5523            0.62s
        70           0.5457            0.46s
        80           0.5407            0.31s
        90           0.5359            0.15s
       100           0.5315            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  67%|██████▋   | 6/9 [07:33<01:34, 31.48s/it]

max_iter reached after 2 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2506            1.19s
         2           1.1511            1.22s
         3           1.0680            1.23s
         4           0.9987            1.22s
         5           0.9396            1.20s
         6           0.8894            1.19s
         7           0.8456            1.17s
         8           0.8076            1.15s
         9           0.7747            1.14s
        10           0.7459            1.12s
        20           0.5949            1.00s
        30           0.5416            0.87s
        40           0.5176            0.75s
        50           0.5041            0.62s
        60           0.4949            0.50s
        70           0.4873            0.37s
        80           0.4812            0.25s
        90           0.4762            0.12s
       100           0.4716            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  78%|███████▊  | 7/9 [07:37<00:44, 22.25s/it]

max_iter reached after 1 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2260            0.39s
         2           1.1063            0.39s
         3           1.0067            0.42s
         4           0.9224            0.41s
         5           0.8479            0.42s
         6           0.7825            0.41s
         7           0.7269            0.40s
         8           0.6801            0.40s
         9           0.6381            0.39s
        10           0.6003            0.40s
        20           0.3902            0.35s
        30           0.3120            0.30s
        40           0.2706            0.26s
        50           0.2386            0.21s
        60           0.2144            0.17s
        70           0.2019            0.12s
        80           0.1882            0.08s
        90           0.1751            0.04s
       100           0.1620            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  89%|████████▉ | 8/9 [07:38<00:15, 15.56s/it]

convergence after 913 epochs took 0 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2126            0.49s
         2           1.0819            0.39s
         3           0.9731            0.36s
         4           0.8810            0.31s
         5           0.8024            0.30s
         6           0.7349            0.28s
         7           0.6761            0.28s
         8           0.6236            0.26s
         9           0.5780            0.26s
        10           0.5381            0.26s
        20           0.3101            0.22s
        30           0.2194            0.19s
        40           0.1744            0.16s
        50           0.1356            0.13s
        60           0.1095            0.10s
        70           0.0924            0.08s
        80           0.0797            0.05s
        90           0.0667            0.03s
       100           0.0576            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов: 100%|██████████| 9/9 [07:39<00:00, 51.05s/it]

convergence after 722 epochs took 0 seconds
Итоговый файл сохранён в 'final_multiclass_models_results.csv'





In [3]:
def summarize_best_packet_sizes(results_df, metrics_columns):
    """Annotate one model's results table with the best packet sizes per metric.

    Parameters
    ----------
    results_df : pandas.DataFrame
        One row per evaluated dataset. Must contain a "Packet Size" column and
        every column named in ``metrics_columns``. It is not modified.
    metrics_columns : list of str
        Columns treated as metrics; the largest value in a column is its best.

    Returns
    -------
    annotated : pandas.DataFrame
        Copy of ``results_df`` with a "max 1" column (number of metrics for which
        the row holds the column maximum) and one extra final row whose
        "Packet Size" is "Best" and whose metric cells hold the list of packet
        sizes that reach the maximum of that metric.
    best : dict
        Metric name -> list of packet sizes reaching the maximum.
    counts : dict
        Metric name -> number of rows reaching the maximum.
    """
    # Column maxima are computed once instead of once per row and column
    column_max = {col: results_df[col].max() for col in metrics_columns}

    # Boolean table: True where a row holds the maximum of that metric.
    # The explicit index keeps the row count even when there are no metric columns.
    is_best = pd.DataFrame(
        {col: results_df[col] == column_max[col] for col in metrics_columns},
        index=results_df.index,
    )

    annotated = results_df.copy()
    annotated["max 1"] = is_best.sum(axis=1).astype(int)

    best = {
        col: results_df.loc[is_best[col], "Packet Size"].tolist()
        for col in metrics_columns
    }
    counts = {col: int(is_best[col].sum()) for col in metrics_columns}

    # Summary row listing the winning packet sizes of every metric
    best_row = {"Packet Size": "Best", **best}
    annotated = pd.concat([annotated, pd.DataFrame([best_row])], ignore_index=True)
    return annotated, best, counts


# Folder for the post-processed intermediate files
intermediate_path_modified = "binary_class_models_results_modified"
os.makedirs(intermediate_path_modified, exist_ok=True)

# Folder with the per-model intermediate files
intermediate_path = "binary_class_models_results"
model_results_files = (
    [f for f in os.listdir(intermediate_path) if f.endswith("_multiclass_results.csv")]
    if os.path.isdir(intermediate_path)
    else []
)
if not model_results_files:
    # Tell the reader what is missing instead of failing with a bare listdir error
    print(
        f"  ! В папке '{intermediate_path}' нет файлов '*_multiclass_results.csv' — "
        "сначала выполните ячейку обучения моделей"
    )

# Accumulator for the results of all models.
# NOTE(review): nothing in the visible code fills this list — presumably used further on.
final_results = []

# Post-process every per-model intermediate file
for model_results_file in model_results_files:
    # Read the intermediate CSV
    df = pd.read_csv(os.path.join(intermediate_path, model_results_file))

    # Every column except the identifying ones is a metric
    metrics_columns = [col for col in df.columns if col not in ['Packet Size', 'Model', 'File']]

    # Add the "max 1" column and the "Best" summary row
    df, best, counts = summarize_best_packet_sizes(df, metrics_columns)

    model_results_file_modified = os.path.join(
        intermediate_path_modified, f"{os.path.splitext(model_results_file)[0]}_modified.csv"
    )

    # Write the annotated table to a new CSV file
    df.to_csv(model_results_file_modified, index=False)