In [None]:
import json
import pandas as pd
import numpy as np
import bnlearn as bn
from sklearn.mixture import BayesianGaussianMixture

In [None]:
path = "/home/aschoen/my_storage/aschoen/dataset/flow_chronicle_dataset/xp1/CIDDS_xp1_train.csv"

## Load Training Data

In [None]:
df = pd.read_csv(path)

In [None]:
df

In [None]:
train = df.copy()

In [None]:
train["Date first seen"] = pd.to_datetime(train["Date first seen"])
train["Day"] = train["Date first seen"].dt.dayofweek
train["Time"] = train["Date first seen"].dt.time
train["Time"] = pd.to_timedelta(train["Time"].astype(str)).dt.total_seconds()
train.drop("Date first seen", axis=1, inplace=True)

In [None]:
ip_pubs = [ip for ip in df["Dst IP Addr"].unique() if "_" in ip and "EXT" not in ip]

In [None]:
train.loc[train["Dst IP Addr"].isin(ip_pubs),"Dst IP Addr"] = "IP_PUB"

## Preprocess Training Data

In [8]:
def preprocess(df, n, name, v, max_iter):
    class NumpyEncoder(json.JSONEncoder):
        """ Special json encoder for numpy types """
        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            elif isinstance(obj, np.floating):
                return float(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            return json.JSONEncoder.default(self, obj)
    data = df.copy()
    continuous = data.loc[:,["In Byte", "Out Byte", "Time"]]
    dic = {}
    for col in continuous.columns:
        m = continuous[col].describe([.9])[5]
        temp = continuous.loc[data[col]<=m, col].to_numpy().reshape(-1,1)
        gm = BayesianGaussianMixture(n_components=n, random_state=0, verbose=v, max_iter = max_iter).fit(temp)
        dic[col] = {"edge":m, "max": continuous[col].max(), "weights": gm.weights_.squeeze(), "means": gm.means_.squeeze(), "std": gm.covariances_.squeeze()}
        continuous.loc[data[col]<=m, col] = gm.predict(temp)
        continuous.loc[data[col]>m, col] = n
        data[col] = continuous[col]
    with open(name, "w") as output:
        json.dump(dic, output, cls = NumpyEncoder)
        output.close()
    return data.astype("category")

In [9]:
def reconstruct(df, dic):
    data = df.copy()
    continuous = data.loc[:,["In Byte", "Out Byte", "Time"]].astype(int)
    for col in continuous.columns:
        n = len(dic[col]["weights"])
        continuous.loc[continuous[col] == float(n), col] = np.random.uniform(dic[col]["edge"], dic[col]["max"], (data[col] == float(n)).sum())
        norm = continuous.loc[continuous[col] < float(n), col]
        mean = np.array(dic[col]["means"])[norm.values.astype(int)]
        std = np.array(dic[col]["std"])[norm.values.astype(int)]
        continuous.loc[continuous[col] < float(n), col] = np.random.normal(mean, np.sqrt(std))
        continuous[col]=np.maximum(continuous[col], np.zeros(len(continuous)))
        if col != "Time":
            continuous[col]=continuous[col].round(1)
        data[col] = continuous[col]
    return data

In [10]:
train_p = preprocess(train, 40, "standard_bn_stats.json", v=2, max_iter=50)

Initialization 0
  Iteration 10	 time lapse 50.22334s	 ll change 999.60292
  Iteration 20	 time lapse 42.69724s	 ll change 281.18958
  Iteration 30	 time lapse 42.61872s	 ll change 119.88953
  Iteration 40	 time lapse 42.48185s	 ll change 73.16623
  Iteration 50	 time lapse 42.39951s	 ll change 55.79154
Initialization converged: False	 time lapse 220.42089s	 ll -6697234.64296
Initialization 0
  Iteration 10	 time lapse 51.28072s	 ll change 721.13960
  Iteration 20	 time lapse 44.97155s	 ll change 8969.85265
  Iteration 30	 time lapse 44.69175s	 ll change 136.94265
  Iteration 40	 time lapse 44.43947s	 ll change 73.45324
  Iteration 50	 time lapse 44.88533s	 ll change 44.60963
Initialization converged: False	 time lapse 230.26903s	 ll -5732448.95850
Initialization 0
  Iteration 10	 time lapse 48.45492s	 ll change 627.64709
  Iteration 20	 time lapse 44.57129s	 ll change 212.60583
  Iteration 30	 time lapse 44.23670s	 ll change 110.16332
  Iteration 40	 time lapse 44.00224s	 ll change 67

In [11]:
def build_time_dep(df, steps=5):
    res = pd.concat([df.shift(-i) for i in range(steps)], axis=1)
    res.columns = [f'{col}_{i+1}' for i in range(steps) for col in df.columns]
    res = res.iloc[::steps,:]
    return res.dropna()

In [12]:
train_5step = build_time_dep(train_p, 5)

In [13]:
def regroup(df, steps):
    initial_columns = set('_'.join(col.split('_')[:-1]) for col in df.columns)
    res = []
    for initial_column in initial_columns:
        df_grouped = df[[c for c in df.columns if initial_column in c]]
        ser = []
        for col in df_grouped.columns:
            i = int(col.split('_')[-1])-1
            s = df_grouped[col]
            s.name = initial_column
            s.index = s.index + i
            ser.append(s)
        column = pd.concat(ser, axis=0).sort_index()
        res.append(column)
    reconstructed_df = pd.concat(res,axis=1)
    reconstructed_df = reconstructed_df[['Proto','Src IP Addr','Dst IP Addr','Dst Pt','In Byte','Out Byte','Day','Time']]
    return reconstructed_df

In [14]:
regroup(train_5step, 5)

Unnamed: 0,Proto,Src IP Addr,Dst IP Addr,Dst Pt,In Byte,Out Byte,Day,Time
0,TCP,192.168.220.15,192.168.100.5,445.0,40.0,40.0,2,14.0
1,UDP,192.168.200.4,192.168.200.255,137.0,22.0,17.0,2,14.0
2,TCP,192.168.220.4,192.168.100.5,445.0,40.0,40.0,2,14.0
3,UDP,192.168.200.5,192.168.200.255,138.0,22.0,17.0,2,14.0
4,TCP,192.168.220.12,192.168.100.5,445.0,40.0,40.0,2,14.0
...,...,...,...,...,...,...,...,...
1089915,UDP,192.168.200.5,192.168.200.255,137.0,22.0,17.0,1,40.0
1089916,TCP,192.168.200.8,EXT_SERVER,8000.0,21.0,15.0,1,40.0
1089917,TCP,192.168.200.8,EXT_SERVER,8000.0,26.0,15.0,1,40.0
1089918,TCP,192.168.200.8,EXT_SERVER,8000.0,26.0,15.0,1,40.0


## Train Network Structures

In [142]:
model_indep  = bn.structure_learning.fit(train_p, methodtype='hc', scoretype='bic')

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Compute structure scores for model comparison (higher is better).


In [15]:
model_timedep = bn.structure_learning.fit(train_5step, methodtype='hc', scoretype='bic')

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Compute structure scores for model comparison (higher is better).


## Train Network Parameters

In [146]:
model_indep = bn.parameter_learning.fit(model_indep, train_p, methodtype='bayes')

[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of Src IP Addr:
+------------------------------+-----+------------------------+
| Out Byte                     | ... | Out Byte(40.0)         |
+------------------------------+-----+------------------------+
| Src IP Addr(0.0.0.0)         | ... | 8.494828518294631e-06  |
+------------------------------+-----+------------------------+
| Src IP Addr(17121_51)        | ... | 8.494828518294631e-06  |
+------------------------------+-----+------------------------+
| Src IP Addr(192.168.100.3)   | ... | 0.00016445988011418408 |
+------------------------------+-----+------------------------+
| Src IP Addr(192.168.100.4)   | ... | 0.00020115753931321687 |
+------------------------------+-----+------------------------+
| Src IP Addr(192.168.100.5)   | ... | 0.00018280870971370048 |
+------

In [16]:
model_timedep = bn.parameter_learning.fit(model_timedep, train_5step, methodtype='bayes')

[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of Proto_1:
+----------------+-----+---------------------+
| Dst Pt_1       | ... | Dst Pt_1(50514.0)   |
+----------------+-----+---------------------+
| Proto_1(ICMP ) | ... | 0.24248302618816683 |
+----------------+-----+---------------------+
| Proto_1(IGMP ) | ... | 0.24248302618816683 |
+----------------+-----+---------------------+
| Proto_1(TCP  ) | ... | 0.24248302618816683 |
+----------------+-----+---------------------+
| Proto_1(UDP  ) | ... | 0.2725509214354995  |
+----------------+-----+---------------------+
[bnlearn] >CPD of Dst IP Addr_1:
+--------------------------------+-----+------------------------+
| Proto_1                        | ... | Proto_1(UDP  )         |
+--------------------------------+-----+------------------------+
| Dst IP Addr_1(10000_140)     

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Sample New Data

In [211]:
new_data_indep = bn.sampling(model_indep,len(train_p))

In [19]:
new_data_timedep = regroup(bn.sampling(model_timedep,len(train_5step)), train_5step.shape[1]//train_p.shape[1])

In [20]:
new_data_indep = new_data_indep[train_p.columns]

NameError: name 'new_data_indep' is not defined

## Post-process Generated Data

In [214]:
with open("standard_bn_stats.json", "r") as file:
    dic = json.load(file)

In [215]:
new_data_indep = reconstruct(new_data_indep, dic)
new_data_timedep = reconstruct(new_data_timedep, dic)

In [21]:
new_data_timedep

Unnamed: 0,Proto,Src IP Addr,Dst IP Addr,Dst Pt,In Byte,Out Byte,Day,Time
0,TCP,192.168.220.12,10704_24,80.0,18.0,18.0,4,10.0
1,UDP,192.168.220.12,10031_250,53.0,8.0,0.0,1,10.0
1,TCP,192.168.220.14,10704_22,80.0,5.0,40.0,4,10.0
2,TCP,192.168.220.14,10793_174,443.0,12.0,36.0,6,38.0
2,TCP,192.168.220.14,10797_253,443.0,30.0,7.0,1,10.0
...,...,...,...,...,...,...,...,...
217985,TCP,192.168.200.9,10106_93,443.0,0.0,7.0,2,16.0
217985,UDP,192.168.210.4,DNS,53.0,38.0,0.0,2,26.0
217986,TCP,192.168.220.15,10163_59,443.0,18.0,5.0,0,14.0
217986,TCP,192.168.200.9,10117_88,443.0,20.0,1.0,2,16.0


In [217]:
new_data_indep.loc[new_data_indep["Dst IP Addr"]=="IP_PUB","Dst IP Addr"] = np.random.choice(ip_pubs, (new_data_indep["Dst IP Addr"]=="IP_PUB").sum())
new_data_timedep.loc[new_data_timedep["Dst IP Addr"]=="IP_PUB","Dst IP Addr"] = np.random.choice(ip_pubs, (new_data_timedep["Dst IP Addr"]=="IP_PUB").sum())

In [218]:
origTimestamp = pd.to_datetime(min(pd.to_datetime(df["Date first seen"]).dt.date))

In [227]:
new_data_indep["Date first seen"] = pd.to_timedelta(pd.to_timedelta(new_data_indep.Time, unit='s').dt.total_seconds()+new_data_indep.Day*24*3600,unit='s')+origTimestamp
new_data_timedep["Date first seen"] = pd.to_timedelta(pd.to_timedelta(new_data_timedep.Time, unit='s').dt.total_seconds()+new_data_timedep.Day*24*3600,unit='s')+origTimestamp

new_data_indep.drop(["Day","Time"],axis=1,inplace=True)
new_data_timedep.drop(["Day","Time"],axis=1,inplace=True)

new_data_indep = new_data_indep[df.columns]
new_data_timedep = new_data_timedep[df.columns]

## Save Generated Data

In [None]:
new_data_indep.to_csv("/home/aschoen/my_storage/aschoen/dataset/flow_chronicle_dataset/xp1/BN_indep.csv",index=False)
new_data_timedep.to_csv("/home/aschoen/my_storage/aschoen/dataset/flow_chronicle_dataset/xp1/BN_indep.csv",index=False)