In [68]:
import pandas as pd


df = pd.read_csv('edge/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv', encoding='utf-8', low_memory=False)
df.shape

(2219201, 63)

In [69]:
df.dtypes

frame.time             object
ip.src_host            object
ip.dst_host            object
arp.dst.proto_ipv4     object
arp.opcode            float64
                       ...   
mbtcp.len             float64
mbtcp.trans_id        float64
mbtcp.unit_id         float64
Attack_label            int64
Attack_type            object
Length: 63, dtype: object

In [70]:
print(df['Attack_type'].value_counts())

Attack_type
Normal                   1615643
DDoS_UDP                  121568
DDoS_ICMP                 116436
SQL_injection              51203
Password                   50153
Vulnerability_scanner      50110
DDoS_TCP                   50062
DDoS_HTTP                  49911
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
XSS                        15915
Ransomware                 10925
MITM                        1214
Fingerprinting              1001
Name: count, dtype: int64


In [71]:
drop_columns = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4","arp.dst.proto_ipv4", 



         "http.file_data","http.request.full_uri","icmp.transmit_timestamp",



         "http.request.uri.query", "tcp.options","tcp.payload","tcp.srcport",



         "tcp.dstport", "udp.port", "mqtt.msg"]


df.drop(drop_columns, axis=1, inplace=True)

In [72]:
print(df.isna().any(axis=1).sum(), "rows with at least one NaN to remove")
print(df.isna().sum().sort_values(ascending=False))
df = df.dropna(axis=0, how='any')

0 rows with at least one NaN to remove
arp.opcode                   0
arp.hw.size                  0
icmp.checksum                0
icmp.seq_le                  0
icmp.unused                  0
http.content_length          0
http.request.method          0
http.referer                 0
http.request.version         0
http.response                0
http.tls_port                0
tcp.ack                      0
tcp.ack_raw                  0
tcp.checksum                 0
tcp.connection.fin           0
tcp.connection.rst           0
tcp.connection.syn           0
tcp.connection.synack        0
tcp.flags                    0
tcp.flags.ack                0
tcp.len                      0
tcp.seq                      0
udp.stream                   0
udp.time_delta               0
dns.qry.name                 0
dns.qry.name.len             0
dns.qry.qu                   0
dns.qry.type                 0
dns.retransmission           0
dns.retransmit_request       0
dns.retransmit_request_in    0


In [73]:
print(df.duplicated().sum(), "fully duplicate rows to remove")
print(df.shape)
df.drop_duplicates(subset=None, keep="first", inplace=True)
df.shape

309530 fully duplicate rows to remove
(2219201, 48)


(1909671, 48)

In [74]:
from fastai.tabular.all import df_shrink

In [75]:
# print(df.info(memory_usage="deep"))
# df = df_shrink(df = df, obj2cat=False, int2uint=False)
# df.info(memory_usage='deep')

In [76]:
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [77]:
encode_text_dummy(df,'http.request.method')
encode_text_dummy(df,'http.referer')
encode_text_dummy(df,"http.request.version")
encode_text_dummy(df,"dns.qry.name.len")
encode_text_dummy(df,"mqtt.conack.flags")
encode_text_dummy(df,"mqtt.protoname")
encode_text_dummy(df,"mqtt.topic")

In [78]:
# df = df_shrink(df=df, obj2cat=False, int2uint=False)
# df.info(memory_usage='deep')


In [79]:
from sklearn.preprocessing import MinMaxScaler


df.shape


def normalize_data(df, types=['float64', 'int64']):
    scaler = MinMaxScaler()
    columns_to_normalize = df.select_dtypes(include=types).columns

    for col in columns_to_normalize:
        if not df[col].dropna().apply(lambda x: x in [0, 1]).all():
            df[[col]] = scaler.fit_transform(df[[col]])  # <== le double crochets ici
    print(df.head())

normalize_data(df)
df.shape


   arp.opcode  arp.hw.size  icmp.checksum  icmp.seq_le  icmp.unused  \
0         0.0          0.0            0.0          0.0          0.0   
1         0.0          0.0            0.0          0.0          0.0   
2         0.0          0.0            0.0          0.0          0.0   
3         0.0          0.0            0.0          0.0          0.0   
4         0.0          0.0            0.0          0.0          0.0   

   http.content_length  http.response  http.tls_port       tcp.ack  \
0                  0.0            0.0            0.0  2.531948e-10   
1                  0.0            0.0            0.0  2.531948e-10   
2                  0.0            0.0            0.0  3.797922e-09   
3                  0.0            0.0            0.0  3.797922e-09   
4                  0.0            0.0            0.0  1.265974e-09   

   tcp.ack_raw  ...  mqtt.conack.flags-1471198  mqtt.conack.flags-1471199  \
0     0.734199  ...                      False                      False  

(1909671, 97)

In [80]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["Attack_type"])
df["Attack_type"]=encoded_labels

df.to_csv('preprocessed_DNN-EdgeIIoT-dataset.csv')

In [81]:
df.shape
df.head

<bound method NDFrame.head of          arp.opcode  arp.hw.size  icmp.checksum  icmp.seq_le  icmp.unused  \
0               0.0          0.0       0.000000     0.000000          0.0   
1               0.0          0.0       0.000000     0.000000          0.0   
2               0.0          0.0       0.000000     0.000000          0.0   
3               0.0          0.0       0.000000     0.000000          0.0   
4               0.0          0.0       0.000000     0.000000          0.0   
...             ...          ...            ...          ...          ...   
2219162         0.0          0.0       0.536356     0.668574          0.0   
2219167         0.0          0.0       0.833214     0.675227          0.0   
2219181         0.0          0.0       0.743305     0.691203          0.0   
2219192         0.0          0.0       0.919079     0.694423          0.0   
2219193         0.0          0.0       0.864389     0.694636          0.0   

         http.content_length  http.response  

In [3]:
import pandas as pd
my_df= pd.read_csv("preprocessed_DNN-EdgeIIoT-dataset.csv")

my_df["Attack_label"]

0          0
1          0
2          0
3          0
4          0
          ..
1909666    1
1909667    1
1909668    1
1909669    1
1909670    1
Name: Attack_label, Length: 1909671, dtype: int64

In [5]:
print(my_df['Attack_type'].value_counts())

Attack_type
7     1363998
4      121567
2       67939
11      50826
3       50062
13      50026
8       49933
1       48544
12      36807
0       24026
9       19977
14      15066
10       9689
5         853
6         358
Name: count, dtype: int64


In [8]:
my_df= pd.read_csv("splits/train.csv")

print(my_df['Attack_type'].value_counts())

my_df= pd.read_csv("splits/mia.csv")
print(my_df['Attack_type'].value_counts())

Attack_type
7     954798
4      85097
2      47557
11     35578
3      35044
13     35018
8      34953
1      33981
12     25765
0      16818
9      13984
14     10546
10      6782
5        597
6        251
Name: count, dtype: int64
Attack_type
7     409200
4      36470
2      20382
11     15248
3      15018
13     15008
8      14980
1      14563
12     11042
0       7208
9       5993
14      4520
10      2907
5        256
6        107
Name: count, dtype: int64
