In [1]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    cohen_kappa_score,
    classification_report,
)
import warnings

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides library warnings such as
# non-convergence or undefined-metric notices — consider narrowing the filter
# to specific categories once the pipeline is stable.
warnings.filterwarnings("ignore")

In [3]:
# Directory containing the combined CSV files (one per packet-count setting).
combined_path = "combined_csvs"

# The packet count is encoded in every file name as "<N>_packets" (e.g. "_250_packets").
packet_count_pattern = re.compile(r"(\d+)_packets")

# Aggregate metric columns written for every (file, model) run, in output order.
AGGREGATE_METRICS = (
    "Accuracy",
    "Macro F1 Score",
    "Micro F1 Score",
    "Weighted F1 Score",
    "Macro Precision",
    "Micro Precision",
    "Cohen's Kappa",
)


def extract_packet_size(file_name):
    """Return the packet count encoded in ``file_name``, or ``None`` if it has none."""
    match = packet_count_pattern.search(file_name)
    return int(match.group(1)) if match else None


def evaluate_predictions(y_true, y_pred):
    """Compute aggregate and per-class metrics for one set of predictions.

    Returns a flat dict: the aggregate scores listed in ``AGGREGATE_METRICS``
    followed by ``Precision_<label>`` / ``Recall_<label>`` / ``F1_<label>`` for
    every dict-valued entry of ``classification_report`` (the scalar
    "accuracy" entry is skipped because it is not a dict).

    Raises whatever the underlying scikit-learn metric functions raise.
    """
    scores = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro F1 Score": f1_score(y_true, y_pred, average="macro"),
        "Micro F1 Score": f1_score(y_true, y_pred, average="micro"),
        "Weighted F1 Score": f1_score(y_true, y_pred, average="weighted"),
        "Macro Precision": precision_score(y_true, y_pred, average="macro"),
        "Micro Precision": precision_score(y_true, y_pred, average="micro"),
        "Cohen's Kappa": cohen_kappa_score(y_true, y_pred),
    }

    class_report = classification_report(y_true, y_pred, output_dict=True)
    for label, metrics in class_report.items():
        if isinstance(metrics, dict):
            scores[f"Precision_{label}"] = metrics["precision"]
            scores[f"Recall_{label}"] = metrics["recall"]
            scores[f"F1_{label}"] = metrics["f1-score"]
    return scores


# Collect the CSV files and order them by packet count. Files whose name does
# not carry a "<N>_packets" token are skipped instead of crashing the sort key.
csv_files = [f for f in os.listdir(combined_path) if f.endswith(".csv")]
for skipped_file in (f for f in csv_files if extract_packet_size(f) is None):
    print(f"  ! Файл '{skipped_file}' пропущен: в имени нет '<N>_packets'")
combined_files = sorted(
    (f for f in csv_files if extract_packet_size(f) is not None),
    key=extract_packet_size,
)

# Models to compare; verbose is enabled where the estimator supports it.
models = {
    "Random Forest": RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42, verbose=1),
    "SVM": LinearSVC(verbose=1, random_state=42),
    "k-NN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "Logistic Regression": LogisticRegression(
        solver="saga",
        multi_class="ovr",
        class_weight="balanced",
        max_iter=1000,
        verbose=1,
        n_jobs=-1,
        random_state=42,
    ),
}

# Folder for the per-model intermediate result files.
intermediate_path = "binary_class_models_results"
os.makedirs(intermediate_path, exist_ok=True)

# Rows for the final table (all models) and for each per-model table.
all_results = []
results_by_model = {model_name: [] for model_name in models}

# Numeric placeholders for a failed run: real floats keep the metric columns
# numeric instead of mixing strings and floats in one column.
FAILED_METRICS = {metric_name: 0.0 for metric_name in AGGREGATE_METRICS}

for combined_file in tqdm(combined_files, desc="Обработка файлов"):
    df = pd.read_csv(os.path.join(combined_path, combined_file))
    X = df.drop(columns=["Label"])
    y = df["Label"]

    packet_size = extract_packet_size(combined_file)

    # Split and scale; a file that cannot be prepared is reported and skipped.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )
        scaler = StandardScaler()
        # Fit the scaler on the training part only, then apply it to the test part.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    except ValueError as e:
        print(f"  ! Ошибка при подготовке данных для файла '{combined_file}': {e}")
        continue

    for model_name, model in models.items():
        results = {
            "Model": model_name,
            "File": combined_file,
            "Packet Size": packet_size,
        }

        print(f"  - Обучение модели {model_name}")
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results.update(evaluate_predictions(y_test, y_pred))
        except ValueError as e:
            # Record the failed run with placeholder scores and keep going.
            results.update(FAILED_METRICS)
            print(
                f"  ! Ошибка обучения модели '{model_name}' для файла '{combined_file}': {e}"
            )

        all_results.append(results)
        results_by_model[model_name].append(results)

        # Checkpoint: rewrite this model's file from all rows gathered so far.
        # Rewriting (rather than appending without a header) keeps the header in
        # sync with every row's columns and makes re-running the cell idempotent.
        model_results_file = os.path.join(
            intermediate_path, f"{model_name}_multiclass_results.csv"
        )
        pd.DataFrame(results_by_model[model_name]).to_csv(
            model_results_file, index=False, float_format="%.3f"
        )

# Final table with the results of every model on every file.
final_results_df = pd.DataFrame(all_results)
final_results_df.to_csv("final_multiclass_models_results.csv", index=False, float_format="%.3f")
print("Итоговый файл сохранён в 'final_multiclass_models_results.csv'")

Обработка файлов:   0%|          | 0/9 [00:00<?, ?it/s]

  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.8s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.4s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2126            1.38m
         2           1.0939            1.35m
         3           0.9959            1.33m
         4           0.9138            1.30m
         5           0.8444            1.28m
         6           0.7844            1.26m
         7           0.7331            1.23m
         8           0.6883            1.22m
         9           0.6494            1.20m
        10           0.6161            1.18m
        20           0.4371            1.05m
        30           0.3792           54.91s
        40           0.3514           46.90s
        50           0.3357           39.39s
        60           0.3242           31.44s
        70           0.3156           23.41s
        80           0.3095           15.58s
        90           0.3055            7.79s
       100           0.3018            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


max_iter reached after 206 seconds


Обработка файлов:  11%|█         | 1/9 [05:30<44:01, 330.24s/it]

  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.2001           18.70s
         2           1.0714           18.32s
         3           0.9650           17.97s
         4           0.8752           17.60s
         5           0.7991           17.35s
         6           0.7335           16.99s
         7           0.6770           16.73s
         8           0.6263           16.49s
         9           0.5837           16.24s
        10           0.5462           15.98s
        20           0.3407           14.19s
        30           0.2711           12.46s
        40           0.2405           10.67s
        50           0.2263            8.89s
        60           0.2173            7.11s
        70           0.2107            5.32s
        80           0.2060            3.55s
        90           0.2022            1.78s
       100           0.1993            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


max_iter reached after 25 seconds


Обработка файлов:  22%|██▏       | 2/9 [06:20<19:19, 165.62s/it][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1906            9.80s
         2           1.0551            9.75s
         3           0.9420            9.70s
         4           0.8474            9.55s
         5           0.7665            9.44s
         6           0.6974            9.27s
         7           0.6377            9.08s
         8           0.5855            8.98s
         9           0.5394            8.85s
        10           0.5000            8.69s
        20           0.2759            7.75s
        30           0.1989            6.80s
        40           0.1705            5.85s
        50           0.1569            4.87s
        60           0.1496            3.88s
        70           0.1451            2.91s
        80           0.1421            1.94s
        90           0.1394            0.98s
       100           0.1368            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  33%|███▎      | 3/9 [06:45<10:08, 101.45s/it]

max_iter reached after 11 seconds


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.


  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1856            4.45s
         2           1.0447            4.41s
         3           0.9286            4.39s
         4           0.8306            4.33s
         5           0.7475            4.22s
         6           0.6762            4.14s
         7           0.6137            4.07s
         8           0.5604            4.00s
         9           0.5136            3.95s
        10           0.4713            3.89s
        20           0.2439            3.45s
        30           0.1661            3.02s
        40           0.1369            2.59s
        50           0.1248            2.16s
        60           0.1180            1.72s
        70           0.1143            1.29s
        80           0.1115            0.86s
        90           0.1088            0.43s
       100           0.1060            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  44%|████▍     | 4/9 [06:56<05:28, 65.61s/it] 

max_iter reached after 5 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1838            2.28s
         2           1.0417            2.35s
         3           0.9238            2.30s
         4           0.8247            2.28s
         5           0.7399            2.24s
         6           0.6676            2.22s
         7           0.6045            2.18s
         8           0.5503            2.16s
         9           0.5027            2.13s
        10           0.4604            2.10s
        20           0.2303            1.85s
        30           0.1528            1.64s
        40           0.1216            1.41s
        50           0.1085            1.17s
        60           0.1007            0.94s
        70           0.0956            0.70s
        80           0.0915            0.47s
        90           0.0887            0.23s
       100           0.0845            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  56%|█████▌    | 5/9 [07:01<02:55, 43.83s/it]

convergence after 776 epochs took 1 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1822            2.67s
         2           1.0395            2.40s
         3           0.9209            2.26s
         4           0.8214            2.16s
         5           0.7367            2.11s
         6           0.6632            2.05s
         7           0.6004            2.01s
         8           0.5452            1.98s
         9           0.4968            1.96s
        10           0.4549            1.92s
        20           0.2219            1.54s
        30           0.1441            1.32s
        40           0.1125            1.10s
        50           0.0968            0.90s
        60           0.0886            0.72s
        70           0.0815            0.53s
        80           0.0746            0.35s
        90           0.0717            0.17s
       100           0.0622            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  67%|██████▋   | 6/9 [07:05<01:30, 30.33s/it]

convergence after 965 epochs took 1 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1829            1.68s
         2           1.0410            1.67s
         3           0.9234            1.65s
         4           0.8243            1.61s
         5           0.7403            1.58s
         6           0.6675            1.55s
         7           0.6047            1.53s
         8           0.5497            1.51s
         9           0.5020            1.49s
        10           0.4591            1.47s
        20           0.2298            1.21s
        30           0.1498            1.02s
        40           0.1181            0.86s
        50           0.0997            0.71s
        60           0.0905            0.56s
        70           0.0863            0.42s
        80           0.0821            0.28s
        90           0.0788            0.14s
       100           0.0755            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  78%|███████▊  | 7/9 [07:08<00:42, 21.43s/it]

convergence after 790 epochs took 1 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1818            0.49s
         2           1.0368            0.44s
         3           0.9187            0.45s
         4           0.8174            0.46s
         5           0.7341            0.46s
         6           0.6616            0.45s
         7           0.5967            0.45s
         8           0.5406            0.44s
         9           0.4922            0.43s
        10           0.4489            0.41s
        20           0.2091            0.35s
        30           0.1154            0.30s
        40           0.0748            0.25s
        50           0.0547            0.21s
        60           0.0431            0.17s
        70           0.0321            0.13s
        80           0.0262            0.08s
        90           0.0217            0.04s
       100           0.0187            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов:  89%|████████▉ | 8/9 [07:10<00:15, 15.00s/it]

convergence after 555 epochs took 0 seconds
  - Обучение модели Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.0s finished


  - Обучение модели Gradient Boosting
      Iter       Train Loss   Remaining Time 
         1           1.1759            0.70s
         2           1.0289            0.49s
         3           0.9084            0.39s
         4           0.8071            0.36s
         5           0.7211            0.32s
         6           0.6448            0.31s
         7           0.5807            0.29s
         8           0.5228            0.29s
         9           0.4737            0.28s
        10           0.4308            0.27s
        20           0.1801            0.22s
        30           0.0792            0.19s
        40           0.0379            0.16s
        50           0.0210            0.14s
        60           0.0114            0.11s
        70           0.0069            0.08s
        80           0.0043            0.05s
        90           0.0027            0.03s
       100           0.0019            0.00s
  - Обучение модели SVM
[LibLinear]  - Обучение модели k-NN
 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
Обработка файлов: 100%|██████████| 9/9 [07:10<00:00, 47.89s/it]

convergence after 425 epochs took 0 seconds
Итоговый файл сохранён в 'final_multiclass_models_results.csv'





In [4]:
# Folder for the annotated copies of the per-model result files.
intermediate_path_modified = "binary_class_models_results_modified"
os.makedirs(intermediate_path_modified, exist_ok=True)

# Folder with the per-model result files to annotate.
intermediate_path = "binary_class_models_results"

# Columns that identify a run; every other column is treated as a metric.
NON_METRIC_COLUMNS = ("Packet Size", "Model", "File")


def annotate_best_packet_sizes(results_df):
    """Return a copy of ``results_df`` with a "max 1" column and a "Best" row.

    * "max 1" holds, for each row, the number of metric columns in which that
      row equals the column maximum.
    * The appended row has "Packet Size" set to "Best" and, in every metric
      column, the list of packet sizes whose value equals that column's maximum.

    The input frame is not modified. It must contain a "Packet Size" column.
    """
    metrics_columns = [
        col for col in results_df.columns if col not in NON_METRIC_COLUMNS
    ]

    # One boolean mask per metric; the column maximum is computed once per
    # column instead of once per row.
    is_best = {
        col: (results_df[col] == results_df[col].max()).to_numpy()
        for col in metrics_columns
    }

    annotated = results_df.copy()
    # Starting the sum from an integer zero vector yields per-row counts and
    # also covers the case of a frame without any metric columns.
    annotated["max 1"] = sum(is_best.values(), np.zeros(len(results_df), dtype=int))

    best_row = {"Packet Size": "Best"}
    for col in metrics_columns:
        best_row[col] = results_df.loc[is_best[col], "Packet Size"].tolist()

    return pd.concat([annotated, pd.DataFrame([best_row])], ignore_index=True)


# Sorted so that the processing order does not depend on the file system.
if os.path.isdir(intermediate_path):
    model_results_files = sorted(
        f for f in os.listdir(intermediate_path) if f.endswith("_multiclass_results.csv")
    )
else:
    print(f"  ! Папка '{intermediate_path}' не найдена — сначала выполните ячейку обучения моделей")
    model_results_files = []

# NOTE(review): never filled in this cell; kept only so that any later cell
# referencing the name keeps working — confirm whether it is still needed.
final_results = []

for model_results_file in model_results_files:
    raw_results_df = pd.read_csv(os.path.join(intermediate_path, model_results_file))
    annotated_df = annotate_best_packet_sizes(raw_results_df)

    model_results_file_modified = os.path.join(
        intermediate_path_modified,
        f"{os.path.splitext(model_results_file)[0]}_modified.csv",
    )
    annotated_df.to_csv(model_results_file_modified, index=False)