Drag and drop the following files in your colab:

- Live_Capture_NetLabMeas.csv
- dns_pcap_yt_s_1_1005.pcap.log
- min_out_pcap_yt_s_1_1005.pcap.log
- requests_yt_s_1_1005.log
- yt_s_1_1005.log

Import the 'Live_Capture_NetLabMeas.csv' file into a dataframe. Inspect its content

In [None]:
from google.colab import drive

drive.mount('/content/drive')

%cd /content/drive/MyDrive/Colab Notebooks/Network measurements /

In [None]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')


# **Monitoring Youtube Video Streaming Traffic**

Today we will complete the following task:


*   Live capturing of Youtube Traffic
*   Identification of Video Streaming traffic flows
*   Representation of downstream and upstream video traffic traces
*   Classification of Video Client HTTP Requests

In [None]:
# Start the capture and import the pcap/csv file in your notebook!

In [None]:
# Traffic Capture
pcap = 'Live_Capture_NetLabMeas.csv'

In [None]:
# Import pcap in Pandas Dataframe
pcap_data = pd.read_csv(pcap, sep=',',encoding='latin-1') #Try to add this if you get encoding problems: encoding='latin-1'
pcap_data.head(5)

##  **Hands On**: Filter Traffic on Video Server's IP Address

Find the IP Address(es) of the Video Server(s) contacted by the Video Client.

Use DNS information to find the match between Youtube Server's domain name (**googlevideo**) and corresponding IP address.

In [None]:
# HINT:

# 1. E.G., we want to select the string "polimi" from a Pandas column vector
list_of_strings = pd.Series(['laboratory','polimi', 'network', 'polimi', 'laboratory'])

# Boolean mask: True where the element contains 'polimi'.
# (Not named `filter`, which would shadow the Python builtin for every later cell.)
is_polimi = list_of_strings.apply(lambda x: 'polimi' in x)

# Keep only the items of "list_of_strings" where the mask is True
print(list_of_strings[is_polimi])

# Other examples
## Build the same boolean mask with the vectorized string accessor
#index = list_of_strings.str.contains('polimi', regex=False)


In [None]:
def filter_traffic(data, domain):
    """Split a capture into uplink/downlink traffic exchanged with the servers of `domain`.

    :param data: pd dataframe of captured packets. Must contain columns:
                 "Protocol", "Info", "Address", "Name", "Source", "Destination", "Length"
    :param domain: substring searched in the "Info" text of DNS responses
                   (e.g. 'googlevideo')
    :return: (ips, server_names, uplink, downlink) where
             ips / server_names are the "Address" / "Name" values of the matching
             DNS responses, uplink holds the packets whose "Destination" is one of
             those addresses and downlink the packets whose "Source" is one of them.
             Rows without a "Length" are dropped from both.
    """

    # Look in DNS Responses for the requested domain.
    # na=False: rows whose "Info" is missing simply do not match.
    dns_data = data[data['Protocol']=='DNS']
    info = dns_data['Info']
    is_answer = (info.str.contains(domain, regex=False, na=False)
                 & info.str.contains('response', regex=False, na=False))
    dns = dns_data[is_answer]
    ips = dns.Address.values
    server_names = dns.Name.values

    # Missing addresses are excluded from the lookup set so that packets with a
    # missing Source/Destination are never matched against them.
    server_ips = dns['Address'].dropna().unique()

    # Filtering on either "Source" or "Destination" IP, get the
    # rows of the dataset that contain at least one of the selected IPs
    downlink = data[data['Source'].isin(server_ips)].dropna(subset=['Length'])

    uplink = data[data['Destination'].isin(server_ips)].dropna(subset=['Length'])

    return ips, server_names, uplink, downlink

## Select IPs of server domain names that include 'googlevideo':
domain_name = 'googlevideo'
ips, server_names, uplink, downlink = filter_traffic(pcap_data,domain_name)

In [None]:
print(server_names)
print(ips)

In [None]:
print(downlink.shape)
downlink.head()

In [None]:
print(uplink.shape)
uplink.head()

## **Hands On**: Find Dominant Traffic Flow
Select the dominant traffic flow, i.e., the traffic flow that carries the **majority** of the session's DL traffic.

How much data volume ( [MB] ) is downloaded by the Video Client?

HINT 1: how to label a traffic flow univocally?

HINT 2: check the method groupby() ...

In [None]:
def find_dominant(uplink, downlink):
  """Select the (Source, Destination) pair carrying the largest downlink volume.

  :param uplink: pd dataframe of client-to-server packets.
                 Must contain columns: "Source", "Destination"
  :param downlink: pd dataframe of server-to-client packets.
                   Must contain columns: "Source", "Destination", "Length"
  :return: (dom_ul, dom_dl): the uplink and downlink packets exchanged between
           the two endpoints of the dominant downlink flow
  """

  # Cumulative DL volume per (Source, Destination) pair, expressed in MB
  flows_DL = downlink.groupby(['Source','Destination'])['Length'].sum()/(10**6)
  print(flows_DL)

  # Get (Source,Destination) IPs of dominant flow in DL direction
  # (idxmax returns the first pair reaching the maximum volume)
  server_ip, client_ip = flows_DL.idxmax()

  # Filter traffic selecting the dominant flow (for both DL and UL).
  # Both endpoints are matched in each direction, so packets exchanged with
  # other servers or other clients are excluded.
  dom_dl = downlink[(downlink['Source']==server_ip) & (downlink['Destination']==client_ip)]
  dom_ul = uplink[(uplink['Source']==client_ip) & (uplink['Destination']==server_ip)]

  return dom_ul, dom_dl


In [None]:
dom_ul, dom_dl = find_dominant(uplink, downlink)

In [None]:
dom_ul.head()

In [None]:
dom_dl.head()

In [None]:
'''
            1 video       1 hour
          (3 mins, MB)	   (MB)

4K (HFR)	     135         2700

1080p	         83.5        1650

720p	         43.5         870

480p	         13.2         264
'''

## Represent Uplink & Downlink Traffic Traces

In [None]:
# OPTION 1: MATPLOTLIB
# V) easy-to-use
# X) not interactive

fig, ax = plt.subplots(figsize=(15,10))

# One blue marker per downlink packet of the dominant flow
ax.scatter(x=dom_dl['Time'],y=dom_dl['Length'],color='blue',s=15)
ax.grid(True)

# One dashed red line per large (> 100 bytes) uplink packet, rising from 0 to
# the packet size. ax.vlines takes ymin/ymax in data coordinates, whereas
# axvline interprets them as fractions of the axes height (0..1).
largeul = dom_ul[dom_ul['Length']>100]
ax.vlines(largeul['Time'], 0, largeul['Length'], colors='red', linestyles='--')

ax.set_xlabel('Time [s]')
ax.set_ylabel('Packet Size [bytes]')

In [None]:
# OPTION 2: PLOTLY
# X) require some extra effort
# V) nicer (and interactive) GUI

x=dom_dl['Time']
y=dom_dl['Length']

x2=dom_ul['Time']
y2=dom_ul['Length']

# Select UL Packets larger than 100 Bytes
largeul = dom_ul[dom_ul['Length']>100]
largeul_timestamp = largeul.Time.values
largeul_size = largeul.Length.values

# Create trace: one marker per each DL Packet
trace = go.Scatter(x = x, y = y,  mode = 'lines+markers', line_shape='hv',
                   line=dict(color='#4363d8', width=0.6), name='Downlink')

# Create trace: one marker per each UL Packet
trace2 = go.Scatter(x = x2, y = y2,  mode = 'markers',
                    marker=dict(color='#e6194b'), name='Uplink')
data = [trace, trace2]

# Create trace: vertical line for the first large UL Packet.
# It is drawn as a trace (not a vline) so that it gets a legend entry;
# it is skipped when the flow has no large UL packet at all.
if len(largeul_timestamp) > 0:
  trace3 = go.Scatter(x = [largeul_timestamp[0], largeul_timestamp[0]], y = [-3000, 3000],  mode = 'lines', line_shape='hv',
                      line=dict(color='#e6194b', width=0.5, dash='dash'), name='HTTP Request')
  data.append(trace3)

layout = go.Layout(height=800, width=1200, title='Dominant Streaming Flow', xaxis=dict(title='Playback [s]'),
                       yaxis=dict(title='Packet Size [bytes]'),legend=dict(orientation="h"))
# Plot and embed in ipython notebook!
fig = go.Figure(data=data, layout=layout)

# Add vertical lines for each remaining large UL Packet
for ts in largeul_timestamp[1:]:
  fig.add_vline(x=ts, line_width=0.5, line_dash="dash", line_color="red")
fig.show()


# **Classification of Video Client HTTP Requests**

*   Dig into Metadata: a close look into encrypted payloads
*   Classification of contents requested by the video client (Audio/Video)





## Repeat same processing for a different capture

The processing applied below repeats what we have done together. The function bodies might be slightly different from what was done during the lecture, but the output is exactly the same.

In [None]:
def filter_traffic_v2(dns_data, traffic_data, domain):
    """Split a capture into uplink/downlink traffic exchanged with the servers of `domain`.

    :param dns_data: pd dataframe of DNS records. Must contain columns: "IP", "Domain_Name"
    :param traffic_data: pd dataframe of packets. Must contain columns: "SrcIP", "DstIP", "Size"
    :param domain: substring searched in "Domain_Name" (e.g. 'googlevideo')
    :return: (uplink, downlink): packets whose "DstIP" / "SrcIP" is one of the
             addresses resolved for `domain`. Rows without a "Size" are dropped
             and "Size" is cast to int.
    """

    print('Getting Video Server IP...')
    # Look in DNS records for the requested domain.
    # na=False: records with a missing domain name simply do not match.
    dns = dns_data[dns_data['Domain_Name'].str.contains(domain, regex=False, na=False)]
    # Missing addresses are excluded so they never match packets with a missing IP
    ips = dns['IP'].dropna().unique()

    print('Filtering Downstream and Upstream Video Traffic...')
    # Filtering on either "Source" or "Destination" IP, get the
    # rows of the dataset that contain at least one of the selected IPs
    downlink = traffic_data[traffic_data['SrcIP'].isin(ips)].dropna(subset=['Size'])

    uplink = traffic_data[traffic_data['DstIP'].isin(ips)].dropna(subset=['Size'])

    downlink['Size'] = downlink['Size'].astype(int)
    uplink['Size'] = uplink['Size'].astype(int)

    return uplink, downlink
  
def find_dominant_v2(uplink, downlink):
  """Select the flow carrying the largest downlink volume, in both directions.

  A flow is identified by the tuple (SrcIP, SrcPort, DstIP, DstPort).

  :param uplink: pd dataframe of client-to-server packets.
                 Must contain columns: "SrcIP", "SrcPort", "DstIP", "DstPort", "Size"
  :param downlink: pd dataframe of server-to-client packets, same columns
  :return: (uplink_dominant, downlink_dominant): the rows of each input that
           belong to the dominant flow, selected through the flow-id index
  """

  flow_key = ['SrcIP','SrcPort','DstIP','DstPort']
  flows_DL = downlink.groupby(flow_key)['Size'].sum()/(10**6)
  flows_UL = uplink.groupby(flow_key)['Size'].sum()/(10**6)

  # Get Dominant Flow ID (first flow reaching the maximum DL volume)
  dominant_DLflow_id = flows_DL.idxmax()

  # The uplink side of that flow is the same connection with the endpoints
  # swapped. Fall back to the largest UL flow only when the capture holds no
  # uplink packet for the dominant connection.
  src_ip, src_port, dst_ip, dst_port = dominant_DLflow_id
  reverse_id = (dst_ip, dst_port, src_ip, src_port)
  if reverse_id in flows_UL.index:
    dominant_ULflow_id = reverse_id
  else:
    dominant_ULflow_id = flows_UL.idxmax()

  # Filter out all the other flows
  downlink_dominant = downlink.set_index(flow_key).loc[dominant_DLflow_id]
  uplink_dominant = uplink.set_index(flow_key).loc[dominant_ULflow_id]

  return uplink_dominant, downlink_dominant

In [None]:
# Load the second capture and reduce it to the dominant video flow.
# NOTE: this cell rebinds `pcap`, `pcap_data`, `uplink` and `downlink`, which
# earlier cells used for the live capture.

# Application Layer Data: DNS
dns = 'dns_pcap_yt_s_1_1005.pcap.log'

# Network Layer Data: IP
pcap = 'min_out_pcap_yt_s_1_1005.pcap.log'

print('Loading data...')
pcap_data = pd.read_csv(pcap, sep=';')
# The DNS log is read without a header row; column names are assigned here
dns_data = pd.read_csv(dns, sep=';',header=None, names = ['IP','Domain_Name'])

print('Pre-Processing data...')
uplink, downlink = filter_traffic_v2(dns_data, pcap_data, domain='googlevideo')
# Normalize timestamps
# `start` is reused by a later cell to time-align the HTTP log
start = min(uplink['Time']) # smallest UL packet timestamp
uplink['Time'] = uplink['Time'] - start
downlink['Time'] = downlink['Time'] - start

print('Selecting dominant flow...')
uplink_dominant, downlink_dominant = find_dominant_v2(uplink, downlink)

print('Done!')

## Dig Into Metadata

What does the payload of encrypted packets look like?

In [None]:
# Application Layer Data: HTTP

measurements = 'yt_s_1_1005.log'
http_requests = 'requests_yt_s_1_1005.log'

In [None]:
# Import HTTP Log in Pandas Dataframe
# Column names are supplied explicitly and the first parsed row is discarded
# (presumably the file's own header line -- TODO confirm against the log file).
# NOTE(review): reading that row as data likely keeps the columns as text, which
# the string itag lists used for labelling would rely on -- verify before
# switching to header=0.
http_data = pd.read_csv(http_requests, sep=',',
                       names=['time','method','protocol','domain','page',
                              'itag','urlparams', 'url']).iloc[1:,:]

# Time-Alignment of Application and Network Layer Data
# `start` is the smallest UL packet timestamp computed when the capture was loaded;
# the division by 1000 presumably converts milliseconds to seconds -- TODO confirm.
http_data['time'] = http_data['time'].astype(float)/1000 - start
# Drop Duplicates Rows in Dataset
# ('time' is not part of the comparison: the first occurrence of each request is kept)
http_data.drop_duplicates(subset=['method','protocol','domain','page','itag','urlparams','url'],
                         keep='first', inplace=True)

# Show Dataframe
http_data.head(20)


In [None]:
http_data.dropna(subset=['itag']).head(20) # Show only requests for video/audio contents

## **Hands On**: Label HTTP Request

Label Requests as being for Audio or Video contents.


In [None]:
audio_itag = list(map(str,[139,140,141,171,172,249,250,251,256,258,325,328]))

video_itag = list(map(str,[167,168,169,170,218,219,242,243,244,245,246,247,
                  248,271,272,278,302,303,308,313,315,330,331,332,
                  333,334,335,336,337,133,134,135,136,137,138,160,
                  212,213,214,215,216,217,264,266,298,299]))

In [None]:
# Hint:
temp = pd.DataFrame(['0', '1', '3', '244'],columns=['A'], index=range(4))
print(temp)
print(temp.isin(video_itag))
print(temp[temp['A'].isin(video_itag)])
print(temp[temp['A'].isin(video_itag)].index)

In [None]:
def label_itag(audio_itag, video_itag, data):
  """Add a 'Content_Type' column marking each request as 'Audio' or 'Video'.

  :param audio_itag: list of itag values that identify audio contents
  :param video_itag: list of itag values that identify video contents
  :param data: pd dataframe with an "itag" column; it is modified in place
  :return: the same dataframe, with the new 'Content_Type' column. Rows whose
           itag is in neither list keep the placeholder string '0.0'.
  """

  # Placeholder label for every row: the string form of a float zero
  data['Content_Type'] = np.zeros(len(data)).astype(str)

  itags_by_label = {'Audio': audio_itag, 'Video': video_itag}
  for label, itags in itags_by_label.items():

      # Index labels of the rows whose itag belongs to this content type
      matching_rows = data.index[data['itag'].isin(itags)]

      # Overwrite the placeholder with the content label on those rows
      data.loc[matching_rows, 'Content_Type'] = label

  return data

http_data.dropna(inplace=True)
http_data = label_itag(audio_itag, video_itag, http_data)
http_data.head()

## Features Extraction

Build a dataset that can be used to perform classification.

Consider data from PB_Time = 2 [s] to PB_Time = 180 [s]

In [None]:
def timebased_filter_v2(data, size=None, min_time=None, max_time=None):
  '''
  Keep only the packets that are large enough and fall inside a time window.

  The input is copied and its index is moved into the columns before filtering,
  so the result carries the original index as extra column(s).

  :param data: pd dataframe to be filtered. Must contain columns: "Size" and "Time"
  :param size: all packets shorter than size [Bytes] will be discarded (default 0)
  :param min_time: all packets with timestamp smaller than min_time [s] will be discarded (default 0)
  :param max_time: all packets with timestamp larger than max_time [s] will be discarded (default 1000)
  :return: filtered copy of data
  '''

  # Resolve the defaults
  size = 0 if size is None else size
  min_time = 0 if min_time is None else min_time
  max_time = 1000 if max_time is None else max_time

  packets = data.copy().reset_index()

  large_enough = packets['Size'] >= size
  in_window = (packets['Time'] >= min_time) & (packets['Time'] <= max_time)

  return packets.loc[large_enough & in_window]

def find_next(array, value):
    '''
    Locate the smallest element of "array" that is not below "value".

    :param array: np.array, array of floats
    :param value: float, reference value
    :return: position of the closest element of the array greater than or equal
             to "value"; position 0 when no element qualifies
    '''
    offsets = np.asarray(array) - value
    # Elements below "value" are pushed to +inf so they can never win the argmin
    candidates = np.where(offsets >= 0, offsets, np.inf)

    return candidates.argmin()

In [None]:
# Match uplink packets with corresponding HTTP Request issued by the client

x2=uplink_dominant['Time']
y2=uplink_dominant['Size']

# Uplink packets above this size are treated as candidate HTTP requests
size_threshold = 100 # Bytes
largeul_timestamp = uplink_dominant[uplink_dominant['Size']>size_threshold].Time
largeul_size = uplink_dominant[uplink_dominant['Size']>size_threshold].Size

# Create trace: one vertical solid line per Large UL Packet
# (each one is a separate trace, so each gets its own numbered legend entry)
i = 0
data = []
for x in largeul_timestamp[:]:
  trace = go.Scatter(x = [x, x], y = [-3000, 3000],
                      mode = 'lines', line_shape='hv',
                      line=dict(color='#e6194b', width=1),
                      name='Large UL P. {}'.format(i))
  data.append(trace)
  i+=1

# Create trace: one marker point per each UL packet
trace2 = go.Scatter(x = x2, y = y2,  mode = 'markers',
                    marker=dict(color='#e6194b'), name='Uplink Packets')
data.append(trace2)
layout = go.Layout(height=800, width=1200, title='Focus on HTTP Requests',
                   xaxis=dict(title='Playback [s]'),
                   legend=dict(orientation="v"))

# Plot and embed in ipython notebook!
fig = go.Figure(data=data, layout=layout)

# Overlay the logged HTTP requests as black vertical lines:
# dashed for Audio, dotted for Video; requests with any other label are not drawn
for x in http_data.index:
  if http_data.loc[x, 'Content_Type'] in ['Audio']:
    fig.add_vline(x=http_data.loc[x, 'time'], line_width=1, line_dash="dash", line_color="black")
  elif http_data.loc[x, 'Content_Type'] in ['Video']:
    fig.add_vline(x=http_data.loc[x, 'time'], line_width=1, line_dash="dot", line_color="black")
fig.show()


In [None]:
# Filter UL/DL Data
playback_start = 2
playback_end = 180
min_ul_size = 100
min_dl_size = 50
ul = timebased_filter_v2(uplink_dominant, min_ul_size, playback_start, playback_end)
dl = timebased_filter_v2(downlink_dominant, min_dl_size, playback_start, playback_end)
# ****************************************************************************
# Create an empty dataset: one row per large UL packet (candidate HTTP request)

dataset = pd.DataFrame(columns=['Request_Size','Inter_RR_Time','DL_Time','DL_Vol','DL_Size','PB_Time'])

# ****************************************************************************
# Feature 1: Client Request Size

dataset['Request_Size'] = list(ul.Size.values)

# ****************************************************************************
# Feature 2: Inter Request-Response Time

rr_time = []
response_time = []
for t in ul.Time:
  # position (in dl) of the first DL packet at or after the request;
  # find_next returns 0 when no such packet exists
  response_time.append(find_next(dl.Time, t))
  rr_time.append(dl.Time.iloc[response_time[-1]] - t)

dataset['Inter_RR_Time'] = rr_time

# ****************************************************************************
# Feature 3-4-5: Download Time, Download Volume, Download Size (# Packets)

dt = []
dv = []
ds = []

# For each request: the DL packets from its response (rt1) up to the packet
# just before the response to the next request (rt2 - 1)
for rt1, rt2 in zip(response_time[:-1], response_time[1:]):

  #Download Time
  dt.append(dl.Time.iloc[rt2-1] - dl.Time.iloc[rt1])

  temp = timebased_filter_v2(dl, 0, dl.Time.iloc[rt1], dl.Time.iloc[rt2-1])
  #Download Volume
  dv.append(temp.Size.sum())

  #Download Size (# Packets)
  ds.append(temp.shape[0])


# Consider also last HTTP iteration: from the last response to the last DL packet.
# The last response position is read from response_time rather than from a
# variable left over by the loop above, which does not exist when there is a
# single request.
if response_time:
  last_rt = response_time[-1]

  #Download Time
  dt.append(dl.Time.iloc[-1] - dl.Time.iloc[last_rt])

  temp = timebased_filter_v2(dl, 0, dl.Time.iloc[last_rt], dl.Time.iloc[-1])
  #Download Volume
  dv.append(temp.Size.sum())

  #Download Size (# Packets)
  ds.append(temp.shape[0])


dataset['DL_Time'] = dt
dataset['DL_Vol'] = dv
dataset['DL_Size'] = ds

# ****************************************************************************
# Feature 6: Playback Time

pbt = list(ul.Time.values)
dataset['PB_Time'] = pbt

# ****************************************************************************
# Check Features Consistency
# Rows with any non-positive feature (e.g. a request with no later DL packet)
# and rows with a download time of 20 s or more are discarded.
dataset = dataset[(dataset > 0).all(axis=1)]
dataset = dataset[dataset['DL_Time']<20]
# ****************************************************************************

print(dataset.shape)
dataset.head()

In [None]:
# Filter HTTP Data to get groundtruth
# Keep the requests logged inside the playback window (both bounds included)
mask = http_data['time'].between(playback_start, playback_end)

# One-column frame of labels, re-indexed from 0
groundtruth = http_data.loc[mask, ['Content_Type']].reset_index(drop=True)

print(groundtruth.shape)
groundtruth.head()


## Classification of HTTP Requests

Compare the performance of two classification scenarios:


1.   Use as groundtruth a **random** vector of zeros and ones
2.   Use as groundtruth the **true** vector of HTTP requests labels





In [None]:
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
import random

def normalize_dataset(training_set, test_set):
  """Standardize both sets with the mean and std of the training set only.

  :param training_set: pd dataframe (or series) of training features
  :param test_set: pd dataframe (or series) of test features, same columns
  :return: (norm_train, norm_test): (x - mean_train) / std_train for each set.
           A feature that is constant in the training set (std = 0) is only
           centered, instead of being divided by zero.
  """

  mean_train = training_set.mean()
  std_train = training_set.std()

  # Replace a zero spread with 1 so constant features map to 0 rather than NaN/inf
  if isinstance(std_train, pd.Series):
    std_train = std_train.replace(0, 1)
  elif std_train == 0:
    std_train = 1

  norm_train = (training_set - mean_train)/std_train
  norm_test = (test_set - mean_train)/std_train

  return norm_train, norm_test

In [None]:
# generate a random vector of 'Audio'/'Video' labels, one per row of the dataset
# (seeded so that the random baseline is reproducible across runs)
rng = np.random.default_rng(42)
random_bits = rng.integers(2, size=len(dataset))

# 0 -> 'Audio', 1 -> 'Video'; built directly as strings instead of overwriting
# an integer Series with text
s = pd.Series(np.where(random_bits == 0, 'Audio', 'Video'))
print(s)

In [None]:
kf = KFold(n_splits=5)

for gt,case in zip([s, groundtruth],['Random', 'Video Streaming']):

  # Fresh accumulators for every case, so the statistics printed for one case
  # do not include the folds of the previous one
  accuracy_knn = []
  accuracy_rf = []

  # Flatten the labels to one dimension (groundtruth is a one-column dataframe).
  # Labels are matched to dataset rows by position.
  labels = pd.Series(np.ravel(gt))

  for train_index, test_index in kf.split(dataset):

    # Get Training and Test Set
    data_train, data_test = dataset.iloc[train_index,:], dataset.iloc[test_index,:]
    labels_train, labels_test = labels.iloc[train_index], labels.iloc[test_index]

    # Normalize datasets: it benefits the learning process
    norm_train, norm_test = normalize_dataset(data_train, data_test)

    # Get Classifiers (new, untrained instances for every fold)
    raw_knn = KNeighborsClassifier(1) # KNN Classifier
    raw_rf = RandomForestClassifier() # Random Forest Classifier
    #raw_lr = LogisticRegression()

    # Train the Classifiers
    kn = raw_knn.fit(norm_train, labels_train) # fit knn classifier on training set
    rf = raw_rf.fit(norm_train, labels_train) # fit random forest classifier on training set

    # Predict
    prediction_kn = kn.predict(norm_test)
    prediction_rf = rf.predict(norm_test)

    # Collect Results
    accuracy_knn.append(metrics.accuracy_score(labels_test, prediction_kn))
    accuracy_rf.append(metrics.accuracy_score(labels_test, prediction_rf))

  print("############\n")
  print("Case: {}\n".format(case))
  print("Detection Performance (KNN):\n")
  print("Accuracy = : {} (std = {})\n".format(np.mean(accuracy_knn), np.std(accuracy_knn)))
  print("Detection Performance (Random Forest):\n")
  print("Accuracy = : {} (std = {})\n".format(np.mean(accuracy_rf), np.std(accuracy_rf)))
  print("############\n")

In [None]:
import pickle
import joblib

# Encode the labels as integers: Audio -> 0, Video -> 1
# (any other label becomes NaN and makes the int conversion below fail loudly)
groundtruth_v2 = groundtruth['Content_Type'].map({'Audio': 0, 'Video': 1})

# Use a Classifier already Trained to make prediction
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a model file obtained from a trusted source.
custom_rf = joblib.load("finalized_model.joblib")

# Features Mean and Std. values found in the Training Set
# (one value per dataset column, in column order)
mean = [609.442804, 0.448292, 0.638842, 505372.314610, 377.254203, 56.366221]
std = [77.769301, 1.698097, 1.128774, 595930.279100, 445.374095, 42.143888]

# Normalize test set
test = (dataset - mean)/std

# Predict
prediction = custom_rf.predict(test)

# Post Processing
prediction = [int(x) for x in prediction]
groundtruth_v2 = [int(x) for x in groundtruth_v2]

print("############\n")
print("Case: {}\n".format('Custom Classifier'))
print("Detection Performance:\n")
print("Accuracy = : {}\n".format(metrics.accuracy_score(groundtruth_v2, prediction)))
print("############\n")