In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import SelectFromModel

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids

# from pomegranate import BayesianNetwork
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.sampling import BayesianModelSampling

#from sdv.single_table import TVAE, CTGAN
from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

from scipy.stats import entropy
from scipy.stats import wasserstein_distance
from scipy.stats import ks_2samp

In [None]:
# Load dataset
data = pd.read_csv('imp_final_df.csv')

# Report the size, then preview the first rows to understand the structure
print('original shape:', data.shape)

data.head()


In [3]:

# Separate the label column from the feature columns
y = data['target']
X = data.drop(columns=['target'])

# Hold out 30% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


In [None]:

# Random Oversampling of the training split
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

# Keep the resampled features and label together in one dataframe
df_ros = pd.DataFrame(X_ros, columns=X_train.columns).assign(target=y_ros)

# Size and first rows of the resampled dataset
print(df_ros.shape)
df_ros.head()


In [5]:
# Persist the randomly oversampled dataset (row index not written)
df_ros.to_csv('df_ros.csv', index=False)

In [None]:

# SMOTE resampling of the training split
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Keep the resampled features and label together in one dataframe
df_smote = pd.DataFrame(X_smote, columns=X_train.columns).assign(target=y_smote)

print(df_smote.shape)
df_smote.head()

In [7]:
# Persist the SMOTE dataset (row index not written)
df_smote.to_csv('df_smote.csv', index=False)

In [None]:

# ADASYN resampling of the training split
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)

# Keep the resampled features and label together in one dataframe
df_adasyn = pd.DataFrame(X_adasyn, columns=X_train.columns).assign(target=y_adasyn)

print(df_adasyn.shape)
df_adasyn.head()

In [9]:
# Persist the ADASYN dataset (row index not written)
df_adasyn.to_csv('df_adasyn.csv', index=False)

In [None]:

# Cluster Centroids resampling of the training split
cc = ClusterCentroids(random_state=42)
X_cc, y_cc = cc.fit_resample(X_train, y_train)

# Keep the resampled features and label together in one dataframe
df_cc = pd.DataFrame(X_cc, columns=X_train.columns).assign(target=y_cc)

print(df_cc.shape)
df_cc.head()

In [11]:
# Persist the Cluster Centroids dataset (row index not written)
df_cc.to_csv('df_cc.csv', index=False)

In [None]:
# Gaussian Mixture Model
# `GaussianMixture.sample` returns (X, component_index): its second value says
# which mixture component produced a row, NOT which class the row belongs to.
# To get rows that carry a real class label, fit one mixture per class on that
# class's training rows and sample as many rows as the class has in y_train.
gmm_frames = []
for class_label, class_size in y_train.value_counts().sort_index().items():
    # A mixture cannot have more components than training rows
    gmm = GaussianMixture(n_components=min(2, class_size), random_state=42)
    gmm.fit(X_train[y_train == class_label])
    X_class, _ = gmm.sample(n_samples=class_size)

    class_frame = pd.DataFrame(X_class, columns=X_train.columns)
    class_frame['target'] = class_label
    gmm_frames.append(class_frame)

# Same total number of rows as X_train, with the class proportions of y_train
df_gmm = pd.concat(gmm_frames, ignore_index=True)

# Kept for any later cell that still refers to the array forms
X_gmm = df_gmm.drop(columns='target').values
y_gmm = df_gmm['target'].values

print(df_gmm.shape)
df_gmm.head()


In [13]:
# Persist the Gaussian Mixture dataset (row index not written)
df_gmm.to_csv('df_gmm.csv', index=False)

In [None]:
# Number of rows per value of the target column in `data`
data['target'].value_counts()

In [None]:
# Bayesian Network
# NOTE(review): the network is learned from the full `data` frame, whereas the
# resampling cells use only the training split — confirm this is intended.

# Learn the structure of the Bayesian Network
hc = HillClimbSearch(data)
best_model = hc.estimate(scoring_method=BicScore(data))

# Create a BayesianNetwork model from the learned structure
bn = BayesianNetwork(best_model.edges())

# Columns that take part in no learned edge are absent from the edge list
all_variables = set(data.columns)
learned_variables = {var for edge in best_model.edges() for var in edge}
missing_variables = all_variables - learned_variables

# Add them as independent nodes; sorted so the node order does not depend on
# set iteration order
for variable in sorted(missing_variables):
    bn.add_node(variable)

# Now fit the Bayesian Network with all variables included
bn.fit(data, estimator=MaximumLikelihoodEstimator)

# Generating synthetic data
num_samples = data.shape[0]  # Number of synthetic data points to generate
# Pass the seed value itself: np.random.seed(42) returns None, which would
# leave `seed` unset
synthetic_data = bn.simulate(num_samples, seed=42)

# Convert the generated data into a DataFrame
df_bn = pd.DataFrame(synthetic_data)
df_bn.to_csv('df_bn.csv', index=False)
df_bn.head()


In [None]:
# TVAE

# Stratified 70/30 split of the full frame (label column included)
train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['target'],
)

# Column metadata is detected from the full dataframe
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Fit the synthesizer on the training rows, then draw as many rows as `data` has
tvae = TVAESynthesizer(metadata)
tvae.fit(train_data)
synthetic_data = tvae.sample(data.shape[0])

# Split the sample into feature and label arrays
X_tvae = synthetic_data.drop('target', axis=1).values
y_tvae = synthetic_data['target'].values

# Rebuild a dataframe with the label as the last column and save it
df_tvae = pd.DataFrame(X_tvae, columns=X.columns).assign(target=y_tvae)
df_tvae.to_csv('df_tvae.csv', index=False)

print(df_tvae.shape)
df_tvae.head()


In [None]:
# CTGAN

# Stratified 70/30 split of the full frame (label column included)
train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['target'],
)

# Column metadata is detected from the full dataframe
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Fit the synthesizer on the training rows, then draw as many rows as `data` has
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(train_data)
synthetic_data = ctgan.sample(data.shape[0])

# Split the sample into feature and label arrays
X_ctgan = synthetic_data.drop('target', axis=1).values
y_ctgan = synthetic_data['target'].values

# Rebuild a dataframe with the label as the last column and save it
df_ctgan = pd.DataFrame(X_ctgan, columns=X_train.columns).assign(target=y_ctgan)
df_ctgan.to_csv('df_ctgan.csv', index=False)

df_ctgan.head()


## CTGAN execution time 3511m 55.6 s = 58.53 hours = 2.5 days

In [7]:
# data loading: the original dataset followed by each saved synthetic dataset
# (df_bn.csv is not reloaded here)
df, df_ros, df_smote, df_adasyn, df_cc, df_gmm, df_tvae, df_ctgan = [
    pd.read_csv(csv_name)
    for csv_name in (
        "imp_final_df.csv",
        "df_ros.csv",
        "df_smote.csv",
        "df_adasyn.csv",
        "df_cc.csv",
        "df_gmm.csv",
        "df_tvae.csv",
        "df_ctgan.csv",
    )
]

In [None]:
# Plot the correlation matrix of the original and all oversampled data.
# Each entry maps a (row, column) position in the 4x3 grid to a panel title
# and the dataframe shown there. Positions (2, 0) and (3, 0) are not listed:
# they were used by the Bayesian Network and TABDDMP panels, which are
# currently disabled.
corr_panels = {
    (0, 0): ("Original Data Correlation Matrix", data),
    (0, 1): ("Random Oversampling Correlation Matrix", df_ros),
    (0, 2): ("SMOTE Correlation Matrix", df_smote),
    (1, 0): ("ADASYN Correlation Matrix", df_adasyn),
    (1, 1): ("Cluster Centroids Correlation Matrix", df_cc),
    (1, 2): ("Gaussian Mixture Model Correlation Matrix", df_gmm),
    (2, 1): ("TVAE Correlation Matrix", df_tvae),
    (2, 2): ("CTGAN Correlation Matrix", df_ctgan),
}

fig, axes = plt.subplots(4, 3, figsize=(40, 40))
for panel_pos, (panel_title, panel_frame) in corr_panels.items():
    # Compute the correlation matrix once and reuse it for data and tick labels
    panel_corr = panel_frame.corr()
    sns.heatmap(
        panel_corr,
        ax=axes[panel_pos],
        cmap="coolwarm",
        annot=False,
        xticklabels=panel_corr.columns,
        yticklabels=panel_corr.columns,
    )
    axes[panel_pos].set_title(panel_title)

# Hide the grid cells that hold no heatmap so they do not render as empty axes
for panel_pos in np.ndindex(axes.shape):
    if panel_pos not in corr_panels:
        axes[panel_pos].set_visible(False)

plt.show()

In [None]:
# Correlation matrix of the original data and of every synthetic dataset
corr_original = data.corr()
corr_ros = df_ros.corr()
corr_smote = df_smote.corr()
corr_adasyn = df_adasyn.corr()
corr_cc = df_cc.corr()
corr_gmm = df_gmm.corr()
corr_tvae = df_tvae.corr()
corr_ctgan = df_ctgan.corr()

# Element-wise absolute deviation of each synthetic matrix from the original
abs_diff_ros = (corr_original - corr_ros).abs()
abs_diff_smote = (corr_original - corr_smote).abs()
abs_diff_adasyn = (corr_original - corr_adasyn).abs()
abs_diff_cc = (corr_original - corr_cc).abs()
abs_diff_gmm = (corr_original - corr_gmm).abs()
abs_diff_tvae = (corr_original - corr_tvae).abs()
abs_diff_ctgan = (corr_original - corr_ctgan).abs()

# (grid position, panel title, difference matrix) for each heatmap in the 3x3
# grid. Positions (1, 2) and (2, 2) stay empty: they belonged to the Bayesian
# Network and TABDDMP panels, which are currently disabled.
diff_panels = [
    ((0, 0), "Absolute Difference with Random Oversampling", abs_diff_ros),
    ((0, 1), "Absolute Difference with SMOTE", abs_diff_smote),
    ((0, 2), "Absolute Difference with ADASYN", abs_diff_adasyn),
    ((1, 0), "Absolute Difference with Cluster Centroids", abs_diff_cc),
    ((1, 1), "Absolute Difference with Gaussian Mixture Model", abs_diff_gmm),
    ((2, 0), "Absolute Difference with TVAE", abs_diff_tvae),
    ((2, 1), "Absolute Difference with CTGAN", abs_diff_ctgan),
]

# Plot the absolute difference heatmaps
fig, axes = plt.subplots(3, 3, figsize=(40, 40))
for diff_pos, diff_title, diff_matrix in diff_panels:
    diff_ax = axes[diff_pos]
    sns.heatmap(
        diff_matrix,
        ax=diff_ax,
        cmap="coolwarm",
        annot=False,
        xticklabels=diff_matrix.columns,
        yticklabels=diff_matrix.columns,
    )
    diff_ax.set_title(diff_title)
plt.show()


In [None]:

# Display name paired with its DataFrame; the original data comes first
labelled_frames = [
    ("Original Data", data),
    ("Random Oversampling", df_ros),
    ("SMOTE", df_smote),
    ("ADASYN", df_adasyn),
    ("Cluster Centroids", df_cc),
    ("Gaussian Mixture Model", df_gmm),
    ("TVAE", df_tvae),
    ("CTGAN", df_ctgan),
]
dfs = [frame for _, frame in labelled_frames]
titles = [label for label, _ in labelled_frames]

# Function to compare distributions and calculate KS test results
def compare_distributions(real_data, synthetic_datasets, titles, target_column='target', alpha=0.05):
    """Run a two-sample Kolmogorov-Smirnov test per feature for each dataset.

    Every column of ``real_data`` except ``target_column`` is compared with
    the column of the same name in each synthetic dataset. A summary for each
    dataset is printed.

    Parameters
    ----------
    real_data : pandas.DataFrame
        Reference dataset.
    synthetic_datasets : iterable of pandas.DataFrame
        Datasets to compare with ``real_data``; each must contain every
        feature column of ``real_data``.
    titles : iterable of str
        Display name for each dataset, paired positionally with
        ``synthetic_datasets``.
    target_column : str, default 'target'
        Column excluded from the comparison.
    alpha : float, default 0.05
        A feature is counted as "equal" when its KS p-value is strictly
        greater than ``alpha`` and as "different" otherwise.

    Returns
    -------
    results : dict
        ``title -> list`` of ``{'Feature', 'KS Statistic', 'KS P-value'}``
        dicts, one per feature column.
    average_results : dict
        ``title -> {'Equal Distributions', 'Different Distributions',
        'Average KS P-value'}``. The average is NaN when there are no
        feature columns.
    """
    feature_columns = [column for column in real_data.columns if column != target_column]
    results = {}
    average_results = {}

    for synthetic_data, title in zip(synthetic_datasets, titles):
        ks_results = []
        for column in feature_columns:
            ks_statistic, ks_p_value = ks_2samp(
                real_data[column].values, synthetic_data[column].values
            )
            ks_results.append({
                'Feature': column,
                'KS Statistic': ks_statistic,
                'KS P-value': ks_p_value
            })

        p_values = [result['KS P-value'] for result in ks_results]
        # A NaN p-value fails the comparison and therefore counts as "different"
        equal_count = int(sum(p_value > alpha for p_value in p_values))

        results[title] = ks_results
        average_results[title] = {
            'Equal Distributions': equal_count,
            'Different Distributions': len(p_values) - equal_count,
            # np.mean([]) would emit a RuntimeWarning; return NaN directly
            'Average KS P-value': np.mean(p_values) if p_values else float('nan')
        }

    # Print average results
    print("\n--- Average Results ---")
    for title, avg_res in average_results.items():
        print(f"{title}:")
        print(f"Equal Distributions: {avg_res['Equal Distributions']}")
        print(f"Different Distributions: {avg_res['Different Distributions']}")
        print(f"Average KS P-value: {avg_res['Average KS P-value']}")
        print("-----------------------------")

    return results, average_results

# Function to plot KDE plots for all features across all datasets
def plot_kde_comparison(real_data, synthetic_datasets, titles, ks_results):
    """Draw one KDE figure per feature, overlaying every dataset.

    For each column of ``real_data`` except 'target', the density of that
    column is drawn for every synthetic dataset, with ``real_data`` drawn on
    top as a thick black line labelled 'Original Data'.

    Parameters
    ----------
    real_data : pandas.DataFrame
        Reference dataset.
    synthetic_datasets : iterable of pandas.DataFrame
        Datasets to overlay; each must contain every feature column of
        ``real_data``.
    titles : iterable of str
        Legend label for each dataset, paired positionally with
        ``synthetic_datasets``.
    ks_results : list of dict
        Not used by the current implementation; kept so existing calls
        continue to work.
    """
    for column in real_data.columns:
        if column == 'target':
            continue

        fig, ax = plt.subplots(figsize=(12, 8))
        for synthetic_data, title in zip(synthetic_datasets, titles):
            sns.kdeplot(synthetic_data[column], label=title, ax=ax)

        sns.kdeplot(real_data[column], label='Original Data', linewidth=3, color='black', ax=ax)

        ax.set_title(f'Probability Distribution of {column}')
        ax.legend()
        plt.show()
        # One figure is created per feature; release it once it has been shown
        plt.close(fig)

# Example usage
# Assumes `data`, `dfs` and `titles` are defined as above
# dfs[0] is the original frame itself, so its row in the summary is a self-comparison baseline
results, average_results = compare_distributions(data, dfs, titles)

# plot_kde_comparison already draws `data` as the black 'Original Data' curve,
# so pass only the synthetic datasets; otherwise it is plotted twice per figure
plot_kde_comparison(data, dfs[1:], titles[1:], results[titles[1]])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on a hold-out set.

    Predicts labels for ``X_test`` with ``model.predict`` and returns the
    tuple ``(accuracy, weighted F1)`` measured against ``y_test``.
    """
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
    )

# Utility evaluation: a random forest trained on the real training split (TRTR)
# versus one trained on each synthetic dataset (TSTR). Every model is scored on
# the same real hold-out set (X_test, y_test).

# Initialize results dictionary (one row per synthetic dataset)
utility_results = {title: {'TRTR_accuracy': 0, 'TRTR_f1': 0, 'TSTR_accuracy': 0, 'TSTR_f1': 0} for title in titles[1:]}

# Evaluate TRTR (Training on Real, Testing on Real)
model_trtr = RandomForestClassifier(random_state=42)
model_trtr.fit(X_train, y_train)
trtr_accuracy, trtr_f1 = evaluate_model(model_trtr, X_test, y_test)
utility_results["Original Data"] = {'TRTR_accuracy': trtr_accuracy, 'TRTR_f1': trtr_f1}

# Evaluate TSTR (Training on Synthetic, Testing on Real)
# The loop variable must not be called `data`: that would overwrite the
# original DataFrame that later cells still read as "Original Data".
for synthetic_title, synthetic_df in zip(titles[1:], dfs[1:]):
    X_synthetic = synthetic_df.drop('target', axis=1)
    y_synthetic = synthetic_df['target']
    model_tstr = RandomForestClassifier(random_state=42)
    model_tstr.fit(X_synthetic, y_synthetic)
    tstr_accuracy, tstr_f1 = evaluate_model(model_tstr, X_test, y_test)
    utility_results[synthetic_title]['TSTR_accuracy'] = tstr_accuracy
    utility_results[synthetic_title]['TSTR_f1'] = tstr_f1

# Display the utility results (one row per dataset)
utility_df = pd.DataFrame(utility_results).T
print("Utility results (F1 score, accuracy comparison of TRTR and TSTR) for all datasets:")
print(utility_df)


In [None]:
# Preview the first five rows of `data`
data.head(n=5)

In [None]:
# Class balance of the target column in every dataset
dfs = [data, df_ros, df_smote, df_adasyn, df_cc, df_gmm, df_tvae, df_ctgan]
titles = ["Original Data", "Random Oversampling", "SMOTE", "ADASYN", "Cluster Centroids", 
          "Gaussian Mixture Model", "TVAE", "CTGAN"]

# The loop variable is deliberately not `df`, so the DataFrame loaded under
# that name earlier in the notebook is not overwritten.
for title, dataset in zip(titles, dfs):
    # Label each block; the counts alone do not identify the dataset
    print(f"DataFrame: {title}")
    print(dataset.target.value_counts())
    print("\n")  # Add a newline for better readability between DataFrames
