# Create synthetic samples from UNSW dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np
import re
import ast
import gower
from pathlib import Path
import openpyxl
import itertools

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score, pairwise_distances, make_scorer, precision_score, f1_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, friedmanchisquare, wilcoxon
from scipy.spatial import distance
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, HDBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier, NearestNeighbors
from sklearn.inspection import permutation_importance
from scipy.spatial.distance import euclidean
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
#from ctgan import CTGAN

  import datacompy


### Preprocessing

In [3]:
#df_unsw_features = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\NUSW-NB15_features.csv')
df_unsw_1 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_1.csv')
df_unsw_2 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_2.csv')
df_unsw_3 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_3.csv')
df_unsw_4 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_4.csv')

  df_unsw_1 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_1.csv')
  df_unsw_2 = pd.read_csv('D:\\ml\\undersampling_data\\data\\unsw\\UNSW-NB15_2.csv')


In [None]:
df_unsw = pd.concat([df_unsw_1, df_unsw_2, df_unsw_3, df_unsw_4], ignore_index=True)
df_unsw = df_unsw.drop(columns=['attack_cat'])

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0.0,0,3,7,1,3,1,1,1,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0.0,0,2,4,2,3,1,1,2,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0.0,0,12,8,1,2,2,1,1,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0.0,0,6,9,1,1,1,1,1,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0.0,0,7,9,1,1,1,1,1,0


In [None]:
# what to do with ports? src drop because is random, dst - keep because is related to service, port number >1000 is random
def to_int_or_hex(x):
    try:
        if isinstance(x, (int, float)) and not pd.isna(x):
            return int(x)
        elif isinstance(x, str) and x.startswith('0x'):
            return int(x, 16)
        elif isinstance(x, str):
            return int(x)
        else:
            return pd.NA
    except ValueError:
        return pd.NA

df_unsw['dsport'] = df_unsw['dsport'].apply(to_int_or_hex).astype('Int64')

mask_well_known = df_unsw['dsport'] <= 1000
df_well_known = df_unsw[mask_well_known].copy()
df_other_ports = df_unsw[~mask_well_known].copy()

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_ports = encoder.fit_transform(df_well_known[['dsport']])
encoded_df = pd.DataFrame(
    encoded_ports,
    columns=encoder.get_feature_names_out(['dsport']),
    index=df_well_known.index
)

df_well_known = pd.concat([encoded_df, df_well_known.drop(columns=['dsport'])], axis=1)

df_other_ports['dsport_other'] = 1

for col in df_well_known.columns:
    if col not in df_other_ports.columns:
        df_other_ports[col] = 0

df_unsw = pd.concat([df_well_known, df_other_ports], axis=0).sort_index().reset_index(drop=True)

  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0
  df_other_ports[col] = 0


In [None]:
# insert 'dsport_other' next to 'proto' and fill NaN with 0
col = df_unsw.pop('dsport_other')
dst_col = df_unsw.columns.get_loc('proto')
df_unsw.insert(dst_col, 'dsport_other', col)
df_unsw['dsport_other'] = df_unsw['dsport_other'].fillna(0)
df_unsw = df_unsw.drop(columns=['dsport', 'srcip', 'dstip', 'sport'])

In [None]:

df_unsw['proto_simplified'] = df_unsw['proto'].apply(
    lambda x: x if x in ['tcp', 'udp'] else 'other'
)

df_unsw = pd.get_dummies(df_unsw, columns=['proto_simplified'], prefix='proto')
#df_unsw = df_unsw.loc[:, ~df_unsw.columns.duplicated(keep='first')]

col = df_unsw.pop('proto_other')
col2 = df_unsw.pop('proto_tcp')
col3 = df_unsw.pop('proto_udp')
dst_col = df_unsw.columns.get_loc('state')
df_unsw.insert(dst_col, 'proto_other', col)
df_unsw.insert(dst_col + 1, 'proto_tcp', col2)
df_unsw.insert(dst_col + 2, 'proto_udp', col3)

df_unsw = df_unsw.drop(columns=['proto'])

df_unsw.loc[:, df_unsw.columns.str.startswith('proto')] = (
    df_unsw.loc[:, df_unsw.columns.str.startswith('proto')].astype(int)
)

" proto_index = df_unsw.columns.get_loc('proto')\n\nfor i, col in enumerate(df_proto.columns):\n    df_unsw.insert(proto_index + 1 + i, col, df_unsw[col]) "

In [None]:
encoded_ports = encoder.fit_transform(df_unsw[['proto']])
encoded_df = pd.DataFrame(
    encoded_ports,
    columns=encoder.get_feature_names_out(['proto']),
    index=df_unsw.index
)
proto_index = df_unsw.columns.get_loc('proto')

for i, col in enumerate(encoded_df.columns):
    df_unsw.insert(proto_index + 1 + i, col, encoded_df[col])

In [None]:
# reduced protocols to tcp, udp, other
df_unsw['proto'].value_counts()

In [None]:
df_unsw = pd.get_dummies(df_unsw, columns=['proto'], prefix='proto')

In [10]:
cols_to_normalize = [
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'
]

scaler = StandardScaler()
df_unsw[cols_to_normalize] = scaler.fit_transform(df_unsw[cols_to_normalize])

with open(f"D:\\ml\\undersampling_data\\models\\unsw\\scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)

In [16]:
df_unsw.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df_unsw['ct_ftp_cmd'] = df_unsw['ct_ftp_cmd'].fillna(0)

scaler2 = StandardScaler()
df_unsw[['ct_ftp_cmd']] = scaler2.fit_transform(df_unsw[['ct_ftp_cmd']])

with open(f"D:\\ml\\undersampling_data\\models\\unsw\\scaler_ct_ftp_cmd.pkl", "wb") as f:
            pickle.dump(scaler2, f)

In [20]:
valid_states = ['FIN', 'CON', 'INT', 'REQ', 'RST', 'CLO', 'ACC']
df_unsw = df_unsw[df_unsw['state'].isin(valid_states)].reset_index(drop=True)

In [24]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_state = encoder.fit_transform(df_unsw[['state']])
encoded_df = pd.DataFrame(
    encoded_state,
    columns=encoder.get_feature_names_out(['state']),
    index=df_unsw.index
)
state_index = df_unsw.columns.get_loc('state')

for i, col in enumerate(encoded_df.columns):
    df_unsw.insert(state_index + 1 + i, col, encoded_df[col])

In [28]:
X = df_unsw.drop(columns=['Label'])
y = df_unsw['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,        # 20% danych do testu
    random_state=42,      # dla powtarzalności
    stratify=y            # zachowaj proporcje klas
)

X_train.shape, X_test.shape

((1777666, 160), (761857, 160))

In [29]:
X_train.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\X_train.csv', index=False)
X_test.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\X_test.csv', index=False)
y_train.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\y_train.csv', index=False)
y_test.to_csv('D:\\ml\\undersampling_data\\data\\unsw\\y_test.csv', index=False)

### Oversamling

In [36]:
df = pd.concat([X_train, y_train], axis=1)

df = df.drop_duplicates()

In [37]:
df['source'] = 'original' 

In [38]:
file_path1 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\smote_data.csv"
file_path2 = "D:\\ml\\undersampling_data\\data\\unsw\\oversampling\\smote3_data.csv"
#zamiana jesli istnieje to wczytaj plik jesli nie to stworz
coun = Counter(y_train)
majority_class = max(coun, key=coun.get)
minority_class = min(coun, key=coun.get)
missing_samples = coun[majority_class] - coun[minority_class]
print("Before", coun)
smote = SMOTE()
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

smote2 = SMOTE(sampling_strategy={minority_class: coun[minority_class] + 2 * missing_samples}, random_state=42)
X_train_sm3, y_train_sm3 = smote2.fit_resample(X_train, y_train)

train_data_smote = pd.concat([X_train_sm, y_train_sm], axis=1)          #polaczenie danych wygenerowanych X_train oraz y_train
train_data_smote3 = pd.concat([X_train_sm3, y_train_sm3], axis=1)

#smote generated data
train_data_smote['generated_by_smote'] = ['original' if i < len(df) else 'smote' for i in range(len(train_data_smote))]
smote_data = train_data_smote[train_data_smote['generated_by_smote'] == 'smote'].drop('generated_by_smote', axis=1)
smote_data["source"] = "smote"
if not os.path.exists(file_path1):
    smote_data.to_csv(file_path1, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path1}')

#smote3 generated data
train_data_smote3['generated_by_smote'] = ['original' if i < len(df) else 'smote' for i in range(len(train_data_smote3))]
smote_data3 = train_data_smote3[train_data_smote3['generated_by_smote'] == 'smote'].drop('generated_by_smote', axis=1)
smote_data3["source"] = "smote"
if not os.path.exists(file_path2):
    smote_data3.to_csv(file_path2, index=False)
else:
    print(f'Plik istnieje pod ścieżką: {file_path2}')

con1 = Counter(y_train_sm)
print("After", con1)
con2 = Counter(y_train_sm3)
print("After generation 3x SMOTE", con2)
pd.Series(y_train_sm).value_counts().plot.bar()
pd.Series(y_train_sm3).value_counts().plot.bar()

Before Counter({0: 1552768, 1: 224898})


ValueError: could not convert string to float: 'dns'