In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
from statistics import mean, stdev

## Cleaning Section:

Here we work on the csv files to prepare them for the following processing part. In particular, the capture time must be converted into a proper format so that statistical features such as the IAT (inter-arrival time) can be extracted.

In [None]:
# We need to import all the csv files on colab in order to work with them (modifying or creating the dataset)

traffic_files = ['zoom.csv',
              'whatsapp.csv',
              'wikipedia.csv',
              'youtube.csv']

# Month abbreviation -> month number, used to read the month out of 'frame.time'
MONTH_NUMBERS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# Month used when no known abbreviation is found in 'frame.time'
# (the captures this notebook was written for have the month written as Aug)
DEFAULT_MONTH = 8


def _clean_capture(capture):
  """Prepare one raw capture table for the processing part.

  Parameters
  ----------
  capture : pd.DataFrame
      Raw table with (at least) the columns 'frame.number', 'frame.time' and 'data.len'.
      Every 'frame.time' value must be a string holding six groups of digits, read in this
      order: day, year, hour, minute, second, fraction of the second.
      NOTE(review): the month is read from the first three-letter word of the string;
      confirm this matches the export format of the captures.

  Returns
  -------
  pd.DataFrame
      New table without 'frame.number' and 'frame.time', without the rows whose 'data.len'
      is missing, and with a 'timestamp' column holding the seconds elapsed since the first
      remaining row.
  """
  # We drop columns that we will not need
  capture = capture.drop(columns=['frame.number'])
  frame_time = capture['frame.time']

  # Using regex we keep just the numbers of every time string
  digit_groups = frame_time.apply(lambda text: re.findall(r"\d+", text))
  parts = pd.DataFrame(digit_groups.tolist(),
                       columns=['day', 'year', 'hour', 'minute', 'second', 'fraction'],
                       index=capture.index)

  # The regex above clears the month, so we read its abbreviation separately
  month_names = frame_time.str.extract(r"([A-Za-z]{3})", expand=False).str.title()
  parts['month'] = month_names.map(MONTH_NUMBERS).fillna(DEFAULT_MONTH).astype(int)

  # The fraction is scaled by its number of digits (padded / cut to nanoseconds) instead of
  # being read as microseconds: a 6 digit fraction gives the same result as before, while a
  # 9 digit (nanosecond) fraction is no longer inflated by a factor 1000
  whole_seconds = pd.to_datetime(parts[['year', 'month', 'day', 'hour', 'minute', 'second']])
  fraction_ns = pd.to_numeric(parts['fraction'].str.ljust(9, '0').str[:9])
  moments = whole_seconds + pd.to_timedelta(fraction_ns, unit='ns')

  # We drop the raw time column and the rows without data.len parameters
  capture = (capture.assign(datetime=moments)
                    .drop(columns=['frame.time'])
                    .dropna(subset=['data.len']))

  # We convert the datetime into the amount of seconds since the first remaining row
  if capture.empty:
    # No row left: there is no first row to use as reference
    elapsed = pd.Series(index=capture.index, dtype=float)
  else:
    elapsed = (capture['datetime'] - capture['datetime'].iloc[0]).dt.total_seconds()

  # We add the timestamp and remove the datetime column
  return capture.assign(timestamp=elapsed).drop(columns=['datetime'])


for file_ in traffic_files:

  # We read the csv file imported directly on colab.
  # If the files are in a folder instead, read '/content/ds/' + file_
  # NOTE(review): df is overwritten at every iteration, so after this loop only the
  # table of the last file is available together with its name in file_
  df = _clean_capture(pd.read_csv(file_))

## Processing section:
Here we create the dataset that will be used in the ML part. We extract the required features and then build a complete dataset (uplink and downlink) for each of the chosen steps (in seconds).

In [3]:
  # We create two different dataframes by filtering on mac addresses:
  # one for uplink, denoted ul, and one for downlink, denoted dl
  uplink_src_mac = '14:7d:da:93:75:71'
  downlink_src_mac = '02:7d:60:f0:2d:64'

  uplink_df = df[(df['wlan.sa'] == uplink_src_mac) & (df['wlan.da'] == downlink_src_mac)]
  downlink_df = df[(df['wlan.sa'] == downlink_src_mac) & (df['wlan.da'] == uplink_src_mac)]

  # Columns of the new DataFrame on which we will put the cleaned data
  columns = ['avg datalen dl',
             'std datalen dl',
             'n_packets dl',
             'avg iat dl',
             'std iat dl',
             'avg datalen ul',
             'std datalen ul',
             'n_packets ul',
             'avg iat ul',
             'std iat ul']

  def _window_features(window):
    """Return [avg data.len, std data.len, n_packets, avg iat, std iat] for one window.

    `window` is a slice of uplink_df / downlink_df. The interarrival times (iat) are the
    differences between consecutive 'timestamp' values, in row order.
    Without any interarrival time (0 or 1 packets) the avg iat is 0; with less than two
    interarrival times the std iat cannot be computed and is set to -1.
    """
    lengths = window['data.len']
    iat = np.diff(window['timestamp'].to_numpy()).tolist()

    # Mean needs at least 1 element
    iat_mean = mean(iat) if len(iat) != 0 else 0
    # Stdev needs at least 2 elements; -1 is always assigned otherwise, so that the value
    # of a previous window is never reused (and the name is never left undefined)
    iat_std = stdev(iat) if len(iat) >= 2 else -1

    return [lengths.mean(), lengths.std(), window.shape[0], iat_mean, iat_std]

  # The windows cover the time up to the last downlink packet
  max_time = downlink_df.iloc[-1]['timestamp']

  # TIME INTERVAL ---> we decided to use 6 possible steps : 0.1s, 0.3s, 0.5s, 1s, 2s, 5s
  step = 5

  # One row per window; the DataFrame is built once at the end
  rows = []
  for window_end in np.arange(step, max_time, step):
    window_start = window_end - step

    # Both partitions use the same half-open interval [window_start, window_end), so a
    # packet on a boundary is counted exactly once in each direction
    dl = downlink_df[(downlink_df['timestamp'] >= window_start) & (downlink_df['timestamp'] < window_end)]
    ul = uplink_df[(uplink_df['timestamp'] >= window_start) & (uplink_df['timestamp'] < window_end)]

    # Downlink features first, then uplink ones, as in `columns`
    rows.append(_window_features(dl) + _window_features(ul))

  clean_dataset = pd.DataFrame(rows, columns=columns)

  # We add the supervised label (leading lowercase letters of the file name)
  name = re.findall(r'[a-z]*', file_)[0]
  clean_dataset['supervised'] = name

  # We save the dataset
  # NOTE(review): for the file names in traffic_files, name + '.csv' is the same path the
  # raw capture was read from, so the raw file is overwritten by this dataset
  clean_dataset.to_csv(name + '.csv')

Finally, we can create the complete dataset used in phase 3.

In [4]:
# Labels of the per-application datasets, each one stored as '<label>.csv'
names = ['zoom', 'youtube', 'wikipedia', 'whatsapp']

# One DataFrame per application, in the order of `names`
frames = list(map(pd.read_csv, (label + '.csv' for label in names)))

# We stack them (each frame keeps its own row index) and save the result
# NOTE(review): neither read_csv nor to_csv is given index options here, so the saved
# index of every input file presumably ends up as an extra column - confirm that the
# next phase expects it
dataset = pd.concat(frames)
dataset.to_csv('dataset_5.csv')