# Create synthetic samples from UNSW dataset

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast
import gower
from pathlib import Path
import openpyxl
import itertools
import torch

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances, make_scorer, precision_score, f1_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, friedmanchisquare, wilcoxon
from scipy.spatial import distance
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from ctgan import CTGAN

  import datacompy


### Preprocessing

In [3]:
#df_unsw_features = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\NUSW-NB15_features.csv')
df_unsw_1 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_1.csv')
df_unsw_2 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_2.csv')
df_unsw_3 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_3.csv')
df_unsw_4 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_4.csv')

  df_unsw_1 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_1.csv')
  df_unsw_2 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_2.csv')


In [None]:
df_unsw = pd.concat([df_unsw_1, df_unsw_2, df_unsw_3, df_unsw_4], ignore_index=True)
df_unsw = df_unsw.drop(columns=['attack_cat'])

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0.0,0,3,7,1,3,1,1,1,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0.0,0,2,4,2,3,1,1,2,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0.0,0,12,8,1,2,2,1,1,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0.0,0,6,9,1,1,1,1,1,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0.0,0,7,9,1,1,1,1,1,0


In [None]:
# what to do with ports? src drop because is random, dst - keep because is related to service, port number >1000 is random
def to_int_or_hex(x):
    try:
        if isinstance(x, (int, float)) and not pd.isna(x):
            return int(x)
        elif isinstance(x, str) and x.startswith('0x'):
            return int(x, 16)
        elif isinstance(x, str):
            return int(x)
        else:
            return pd.NA
    except ValueError:
        return pd.NA

df_unsw['dsport'] = df_unsw['dsport'].apply(to_int_or_hex).astype('Int64')

mask_well_known = df_unsw['dsport'] <= 1000
df_well_known = df_unsw[mask_well_known].copy()
df_other_ports = df_unsw[~mask_well_known].copy()

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_ports = encoder.fit_transform(df_well_known[['dsport']])
encoded_df = pd.DataFrame(
    encoded_ports,
    columns=encoder.get_feature_names_out(['dsport']),
    index=df_well_known.index
)

df_well_known = pd.concat([encoded_df, df_well_known.drop(columns=['dsport'])], axis=1)

df_other_ports['dsport_other'] = 1

for col in df_well_known.columns:
    if col not in df_other_ports.columns:
        df_other_ports[col] = 0

df_unsw = pd.concat([df_well_known, df_other_ports], axis=0).sort_index().reset_index(drop=True)

  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0


In [None]:
# insert 'dsport_other' next to 'proto' and fill NaN with 0
col = df_unsw.pop('dsport_other')
dst_col = df_unsw.columns.get_loc('proto')
df_unsw.insert(dst_col, 'dsport_other', col)
df_unsw['dsport_other'] = df_unsw['dsport_other'].fillna(0)
df_unsw = df_unsw.drop(columns=['dsport', 'srcip', 'dstip', 'sport'])

In [None]:

df_unsw['proto_simplified'] = df_unsw['proto'].apply(
    lambda x: x if x in ['tcp', 'udp'] else 'other'
)

df_unsw = pd.get_dummies(df_unsw, columns=['proto_simplified'], prefix='proto')
#df_unsw = df_unsw.loc[:, ~df_unsw.columns.duplicated(keep='first')]

col = df_unsw.pop('proto_other')
col2 = df_unsw.pop('proto_tcp')
col3 = df_unsw.pop('proto_udp')
dst_col = df_unsw.columns.get_loc('state')
df_unsw.insert(dst_col, 'proto_other', col)
df_unsw.insert(dst_col + 1, 'proto_tcp', col2)
df_unsw.insert(dst_col + 2, 'proto_udp', col3)

df_unsw = df_unsw.drop(columns=['proto'])

df_unsw.loc[:, df_unsw.columns.str.startswith('proto')] = (
    df_unsw.loc[:, df_unsw.columns.str.startswith('proto')].astype(int)
)

" proto_index = df_unsw.columns.get_loc('proto')\n\nfor i, col in enumerate(df_proto.columns):\n    df_unsw.insert(proto_index + 1 + i, col, df_unsw[col]) "

In [None]:
encoded_ports = encoder.fit_transform(df_unsw[['proto']])
encoded_df = pd.DataFrame(
    encoded_ports,
    columns=encoder.get_feature_names_out(['proto']),
    index=df_unsw.index
)
proto_index = df_unsw.columns.get_loc('proto')

for i, col in enumerate(encoded_df.columns):
    df_unsw.insert(proto_index + 1 + i, col, encoded_df[col])

In [None]:
# reduced protocols to tcp, udp, other
df_unsw['proto'].value_counts()

In [None]:
df_unsw = pd.get_dummies(df_unsw, columns=['proto'], prefix='proto')

In [10]:
cols_to_normalize = [
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'
]

scaler = StandardScaler()
df_unsw[cols_to_normalize] = scaler.fit_transform(df_unsw[cols_to_normalize])

with open(f"D:\\ml\\undersampling_data\\models\\unsw\\scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)

In [16]:
df_unsw.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df_unsw['ct_ftp_cmd'] = df_unsw['ct_ftp_cmd'].fillna(0)

scaler2 = StandardScaler()
df_unsw[['ct_ftp_cmd']] = scaler2.fit_transform(df_unsw[['ct_ftp_cmd']])

with open(f"D:\\ml\\undersampling_data\\models\\unsw\\scaler_ct_ftp_cmd.pkl", "wb") as f:
            pickle.dump(scaler2, f)

In [20]:
valid_states = ['FIN', 'CON', 'INT', 'REQ', 'RST', 'CLO', 'ACC']
df_unsw = df_unsw[df_unsw['state'].isin(valid_states)].reset_index(drop=True)

In [24]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_state = encoder.fit_transform(df_unsw[['state']])
encoded_df = pd.DataFrame(
    encoded_state,
    columns=encoder.get_feature_names_out(['state']),
    index=df_unsw.index
)
state_index = df_unsw.columns.get_loc('state')

for i, col in enumerate(encoded_df.columns):
    df_unsw.insert(state_index + 1 + i, col, encoded_df[col])

In [2]:
df_unsw = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_processed_final.csv')

In [3]:
df_unsw['dsport_20.0'] = (df_unsw['service'] == 'ftp-data').astype(int)

col = df_unsw.pop('dsport_20.0')
dst_col = df_unsw.columns.get_loc('dsport_21.0')
df_unsw.insert(dst_col, 'dsport_20.0', col)

df_unsw = df_unsw[~df_unsw['service'].isin(['radius', 'irc'])].reset_index(drop=True)

In [None]:
df_unsw['ct_flw_http_mthd'] = df_unsw['ct_flw_http_mthd'].fillna(0)
df_unsw['is_ftp_login'] = df_unsw['is_ftp_login'].fillna(0)
df_unsw = df_unsw.drop_duplicates().reset_index(drop=True)

df_unsw = df_unsw.drop(columns=['service'])

In [5]:
X = df_unsw.drop(columns=['Label'])
y = df_unsw['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,        # 20% danych do testu
    random_state=42,      # dla powtarzalności
    stratify=y            # zachowaj proporcje klas
)

X_train.shape, X_test.shape

((1432298, 161), (613842, 161))

In [None]:
X_train = X_train.drop(columns=['service'])
X_test = X_test.drop(columns=['service'])

In [6]:
X_train.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\X_train.csv', index=False)
X_test.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\X_test.csv', index=False)
y_train.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\y_train.csv', index=False)
y_test.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\y_test.csv', index=False)
df_unsw.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_final.csv', index=False)

### Oversamling

In [3]:
X_train = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\X_train.csv')
y_train = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\y_train.csv')

In [4]:
df = pd.concat([X_train, y_train], axis=1)

In [7]:
df.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_train_final.csv', index=False)

In [None]:
file_path1 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\smote_data.csv"
file_path2 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\smote3_data.csv"
#zamiana jesli istnieje to wczytaj plik jesli nie to stworz
coun = Counter(y_train)
majority_class = max(coun, key=coun.get)
minority_class = min(coun, key=coun.get)
missing_samples = coun[majority_class] - coun[minority_class]
print("Before", coun)
smote = SMOTE()
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

smote2 = SMOTE(sampling_strategy={minority_class: coun[minority_class] + 2 * missing_samples}, random_state=42)
X_train_sm3, y_train_sm3 = smote2.fit_resample(X_train, y_train)

train_data_smote = pd.concat([X_train_sm, y_train_sm], axis=1)          #polaczenie danych wygenerowanych X_train oraz y_train
train_data_smote3 = pd.concat([X_train_sm3, y_train_sm3], axis=1)

#smote generated data
train_data_smote['generated_by_smote'] = ['original' if i < len(df) else 'smote' for i in range(len(train_data_smote))]
smote_data = train_data_smote[train_data_smote['generated_by_smote'] == 'smote'].drop('generated_by_smote', axis=1)
smote_data["source"] = "smote"
if not os.path.exists(file_path1):
    smote_data.to_csv(file_path1, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path1}')

#smote3 generated data
train_data_smote3['generated_by_smote'] = ['original' if i < len(df) else 'smote' for i in range(len(train_data_smote3))]
smote_data3 = train_data_smote3[train_data_smote3['generated_by_smote'] == 'smote'].drop('generated_by_smote', axis=1)
smote_data3["source"] = "smote"
if not os.path.exists(file_path2):
    smote_data3.to_csv(file_path2, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path2}')

con1 = Counter(y_train_sm)
print("After", con1)
con2 = Counter(y_train_sm3)
print("After generation 3x SMOTE", con2)
#pd.Series(y_train_sm).value_counts().plot.bar()
#pd.Series(y_train_sm3).value_counts().plot.bar()

Before Counter({0: 1370400, 1: 61898})
After Counter({0: 1370400, 1: 1370400})
After generation 3x SMOTE Counter({1: 2678902, 0: 1370400})


In [14]:
coun3 = Counter(y_train)
print(y_train.value_counts())

Label
0        1370400
1          61898
Name: count, dtype: int64


In [None]:
file_path2 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\borderline_data.csv"
file_path1 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\borderline3_data.csv"
#generate new data by borderLineSMOTE
coun3 = Counter(y_train['Label'])
majority_class = max(coun3, key=coun3.get)
minority_class = min(coun3, key=coun3.get)
missing_samples = coun3[majority_class] - coun3[minority_class]
print("Before", coun3)

brdsmote = BorderlineSMOTE(random_state=42)
X_train_bsm, y_train_bsm = brdsmote.fit_resample(X_train, y_train)

brdsmote3 = BorderlineSMOTE(sampling_strategy={minority_class: coun3[minority_class] + 2 * missing_samples}, random_state=42)
X_train_bsm3, y_train_bsm3 = brdsmote3.fit_resample(X_train, y_train)

con4 = Counter(y_train_bsm)
print("After", con4)
con5 = Counter(y_train_bsm3)
print("After generated 3x BorderlineSMOTE", con5)

train_data_borderline_smote = pd.concat([X_train_bsm, y_train_bsm], axis=1)          #polaczenie danych wygenerowanych X_train oraz y_train
train_data_borderline_smote3 = pd.concat([X_train_bsm3, y_train_bsm3], axis=1)

#borderline smote generated data
train_data_borderline_smote['generated_by_borderline_smote'] = ['original' if i < len(df) else 'brd smote' for i in range(len(train_data_borderline_smote))]
boarderline_smote_data = train_data_borderline_smote[train_data_borderline_smote['generated_by_borderline_smote'] == 'brd smote'].drop('generated_by_borderline_smote', axis=1)
boarderline_smote_data["source"]="borderline smote"
boarderline_smote_data = boarderline_smote_data[boarderline_smote_data['source'] != 'original']
if not os.path.exists(file_path2):
    boarderline_smote_data.to_csv(file_path2, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path2}')
    
#borderline smote3 generated data
train_data_borderline_smote3['generated_by_borderline_smote'] = ['original' if i < len(df) else 'brd smote' for i in range(len(train_data_borderline_smote3))]
boarderline_smote_data3 = train_data_borderline_smote3[train_data_borderline_smote3['generated_by_borderline_smote'] == 'brd smote'].drop('generated_by_borderline_smote', axis=1)
boarderline_smote_data3["source"]="borderline smote"
boarderline_smote_data3 = boarderline_smote_data3[boarderline_smote_data3['source'] != 'original']
if not os.path.exists(file_path1):
    boarderline_smote_data3.to_csv(file_path1, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path1}')


Before Counter({0: 1370400, 1: 61898})
After Counter({'Label': 1})
After generated 3x BorderlineSMOTE Counter({'Label': 1})


ValueError: Data must be 1-dimensional, got ndarray of shape (2740800, 1) instead

In [5]:
file_path3 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\GAN_data.csv"        #sciezka wraz z nazwa pod jaka wygenerowac plik
file_path4 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\GAN3_data.csv"        #sciezka wraz z nazwa pod jaka wygenerowac plik
#generate new data by GAN

#data preparation
df_gan = df

#GAN
columns_list = df_gan.columns
target_num = df_gan['Label'].value_counts()
ctgan = CTGAN(epochs=100, batch_size=128, cuda=True)        #model

if target_num[0] > target_num[1]:
    data_y1 = df_gan[df_gan['Label']==1]
    ctgan.fit(data_y1, columns_list)
    sample = abs(target_num[0]-target_num[1])
    df_GAN = ctgan.sample(sample)
    print('Dane wygenerowane: ', df_GAN['Label'].value_counts())
    balanced_data = pd.concat([df_gan, df_GAN], ignore_index=False)
else:
    data_y0 = df_gan[df_gan['Label']==0]
    ctgan.fit(data_y0, columns_list)
    sample = abs(target_num[0]-target_num[1])
    df_GAN = ctgan.sample(sample)
    print('Dane wygenerowane: ', df_GAN['Label'].value_counts())
    balanced_data = pd.concat([df_gan, df_GAN], ignore_index=False)

#GAN3
if target_num[0] > target_num[1]:
    data_y1 = df_gan[df_gan['Label']==1]
    ctgan.fit(data_y1, columns_list)
    sample = abs(target_num[0]-target_num[1])
    df_GAN3 = ctgan.sample(sample*2)
    print('Dane wygenerowane: ', df_GAN3['Label'].value_counts())
    balanced_data3 = pd.concat([df_gan, df_GAN3], ignore_index=False)
else:
    data_y0 = df_gan[df_gan['Label']==0]
    ctgan.fit(data_y0, columns_list)
    sample = abs(target_num[0]-target_num[1])
    df_GAN3 = ctgan.sample(sample*2)
    print('Dane wygenerowane: ', df_GAN3['Label'].value_counts())
    balanced_data3 = pd.concat([df_gan, df_GAN3], ignore_index=False)

#balanced_data = balanced_data.drop(columns=["source"])  
y_train_gan = balanced_data["Label"]
X_train_gan = balanced_data.drop(columns=["Label"])

#GAN3
y_train_gan3 = balanced_data3["Label"]
X_train_gan3 = balanced_data3.drop(columns=["Label"])
con5 = Counter(y_train)
print("Before", con5)
con6 = Counter(y_train_gan)
print("After", con6)
gan_data = df_GAN
con7 = Counter(y_train_gan3)
print("After", con7)
gan_data3 = df_GAN3

gan_data["source"] = "gan"
if not os.path.exists(file_path3):
    gan_data.to_csv(file_path3, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path3}')
    
gan_data3["source"] = "gan"
if not os.path.exists(file_path4):
    gan_data3.to_csv(file_path4, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path4}')

PicklingError: Could not pickle the task to send it to the workers.

In [10]:
# balance_with_ctgan.py
import os
import pandas as pd
from ctgan import CTGAN  # możesz użyć też: from ctgan import CTGANSynthesizer
import multiprocessing as mp

# --- KONFIG ---
EPOCHS = 100
BATCH_SIZE = 64          # jeśli OOM: 128 -> 64
RANDOM_STATE = 42         # powtarzalność
USE_CUDA = True           # GPU

def infer_discrete_columns(df: pd.DataFrame):
    """Zbuduj listę kolumn dyskretnych dla CTGAN (kategorie/boole/label)."""
    discrete = []
    for c in df.columns:
        dt = str(df[c].dtype)
        if dt in ("object", "bool") or "category" in dt:
            discrete.append(c)
    # Upewnij się, że 'Label' jest traktowana jako dyskretna
    if "Label" in df.columns and "Label" not in discrete:
        discrete.append("Label")
    return discrete

def main():
    # 1) Wczytaj dane
    # Podmień na swoją ścieżkę:
    df_gan = pd.read_parquet("data.parquet") if os.path.exists("data.parquet") else pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_train_final.csv')

    # Upewnij się, że Label jest int/kat.
    if df_gan["Label"].dtype != "int64" and df_gan["Label"].dtype != "int32":
        try:
            df_gan["Label"] = df_gan["Label"].astype("int32")
        except Exception:
            df_gan["Label"] = df_gan["Label"].astype("category")

    # 2) Wyznacz klasy mniejszości/większości poprawnie
    counts = df_gan["Label"].value_counts()
    minority = counts.idxmin()
    majority = counts.idxmax()
    n_min, n_maj = counts[minority], counts[majority]
    gap = int(n_maj - n_min)

    if gap <= 0:
        print("Zbiór już zbalansowany:", counts.to_dict())
        return

    # 3) Dane klasy mniejszości do treningu CTGAN
    data_min = df_gan[df_gan["Label"] == minority].copy()

    # 4) Lista dyskretnych kolumn (NIE wszystkie kolumny!)
    discrete_cols = infer_discrete_columns(df_gan)
    # Opcjonalnie doprecyzuj: jeśli masz kategorie zakodowane liczbowo, dołóż je ręcznie:
    # discrete_cols += ["kolumna_kat_1", "kolumna_kat_2"]

    # 5) Model CTGAN na GPU
    ctgan = CTGAN(
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        generator_dim=(256, 256),
        discriminator_dim=(256, 256),
        cuda=USE_CUDA,
        verbose=True
    )
    # Alternatywnie:
    # from ctgan import CTGANSynthesizer
    # ctgan = CTGANSynthesizer(epochs=EPOCHS, batch_size=BATCH_SIZE, cuda=USE_CUDA)

    # 6) Trening tylko na klasie mniejszości (by generować próbki tej klasy)
    ctgan.set_random_state(RANDOM_STATE) if hasattr(ctgan, "set_random_state") else None
    ctgan.fit(data_min, discrete_columns=discrete_cols)

    # 7) Generowanie brakujących próbek klasy mniejszości
    df_synth = ctgan.sample(gap)
    # Upewnij się, że wygenerowana etykieta to mniejszość (przy treningu na jednej klasie powinna być stała)
    if "Label" in df_synth.columns:
        df_synth["Label"] = minority

    print("Oryginalne klasy:", counts.to_dict())
    print("Wygenerowane (Label counts):", df_synth["Label"].value_counts().to_dict())

    # 8) Sklejenie i wynik
    balanced = pd.concat([df_gan, df_synth], ignore_index=True)
    print("Po zbalansowaniu:", balanced["Label"].value_counts().to_dict())

    balanced.to_parquet("balanced.parquet", index=False)
    print("Zapisano: balanced.parquet")

if __name__ == "__main__":
    # Na Windows błąd picklowania znika dzięki spawn + guard:
    try:
        mp.set_start_method("spawn", force=True)
    except RuntimeError:
        pass
    main()


Gen. (0.00) | Discrim. (0.00):   0%|          | 0/100 [00:00<?, ?it/s]


AssertionError: 