In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [119]:
file_path = "dataset/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv"
df = pd.read_csv(file_path)

In [120]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008748 entries, 0 to 1008747
Data columns (total 23 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ts              1008748 non-null  int64 
 1   uid             1008748 non-null  object
 2   id.orig_h       1008748 non-null  object
 3   id.orig_p       1008748 non-null  int64 
 4   id.resp_h       1008748 non-null  object
 5   id.resp_p       1008748 non-null  int64 
 6   proto           1008748 non-null  object
 7   service         1008748 non-null  object
 8   duration        1008748 non-null  object
 9   orig_bytes      1008748 non-null  object
 10  resp_bytes      1008748 non-null  object
 11  conn_state      1008748 non-null  object
 12  local_orig      1008748 non-null  object
 13  local_resp      1008748 non-null  object
 14  missed_bytes    1008748 non-null  int64 
 15  history         1008748 non-null  object
 16  orig_pkts       1008748 non-null  int64 
 17  orig_ip_

In [121]:
symbol_to_replace = '-'
df.replace(symbol_to_replace, pd.NA, inplace=True)

In [122]:
print(df.isnull().sum())

ts                      0
uid                     0
id.orig_h               0
id.orig_p               0
id.resp_h               0
id.resp_p               0
proto                   0
service           1005507
duration           796300
orig_bytes         796300
resp_bytes         796300
conn_state              0
local_orig        1008748
local_resp        1008748
missed_bytes            0
history             17421
orig_pkts               0
orig_ip_bytes           0
resp_pkts               0
resp_ip_bytes           0
tunnel_parents    1008748
label                   0
detailed-label     469275
dtype: int64


In [124]:
columns_to_remove = ['service','id.resp_p','id.orig_p', 'orig_bytes', 'resp_bytes', 'local_orig', 'local_resp','resp_ip_bytes', 'tunnel_parents', 'detailed-label']
df.drop(columns=columns_to_remove, inplace=True)
df.head()

KeyError: "['service', 'id.resp_p', 'id.orig_p', 'orig_bytes', 'resp_bytes', 'local_orig', 'local_resp', 'resp_ip_bytes', 'tunnel_parents', 'detailed-label'] not found in axis"

In [125]:
column_to_replace = 'duration'

In [126]:
df[column_to_replace].fillna(0, inplace=True)

In [127]:
column_to_check_duplicates = 'uid'

In [128]:
df.drop_duplicates(subset=column_to_check_duplicates, inplace=True)

In [129]:
column_to_check = 'missed_bytes'

In [130]:
non_zero_values = df[df[column_to_check] != 0][column_to_check]
print("Non-zero values in {}: \n{}".format(column_to_check, non_zero_values))

Non-zero values in missed_bytes: 
Series([], Name: missed_bytes, dtype: int64)


In [131]:
df.drop(columns=[column_to_check,'uid'], inplace=True)
print(df)

                 ts        id.orig_h        id.resp_h proto  duration   
0        1525879831  192.168.100.103   65.127.233.163   tcp  2.999051  \
1        1525879831  192.168.100.103    63.150.16.171   tcp         0   
2        1525879831  192.168.100.103     111.40.23.49   tcp         0   
3        1525879832  192.168.100.103  131.174.215.147   tcp  2.998796   
4        1525879832  192.168.100.103      91.42.47.63   tcp         0   
...             ...              ...              ...   ...       ...   
1008743  1526282655  192.168.100.103    16.219.83.137   udp         0   
1008744  1526282682  192.168.100.103   100.57.245.196   udp         0   
1008745  1526282625  192.168.100.103     249.99.119.9   udp         0   
1008746  1526282676  192.168.100.103  205.103.167.192   udp         0   
1008747  1526282660  192.168.100.103    23.70.168.160   udp         0   

        conn_state history  orig_pkts  orig_ip_bytes  resp_pkts      label  
0               S0       S          3         

In [132]:
# Convert 'duration' column to numeric, coercing non-numeric values to NaN
df['duration'] = pd.to_numeric(df['duration'], errors='coerce')

# Calculate the average excluding NaN and 0 values
average_value = df[df['duration'].notnull() & (df['duration'] != 0)]['duration'].mean()

# Replace 0 values with the average
df['duration'] = df['duration'].replace(0, average_value)

In [133]:
integer_columns = df.select_dtypes(include='int')

In [134]:
column_with_timestamps = 'ts'
df[column_with_timestamps] = pd.to_datetime(df[column_with_timestamps], unit='s', origin='unix')

In [135]:
X = df.drop(['label'], axis=1)
y = df['label']
X_sampled, _, y_sampled, _ = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("\nShape of Sampled Data:")
print(X_sampled.shape)


Shape of Sampled Data:
(806998, 10)


In [136]:
df.to_csv('dataset/preprocessed_file.csv', index=False)