In [33]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from google.colab import drive

In [34]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
df = pd.read_csv('/content/drive/MyDrive/http_requests.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
df = shuffle(df)
df['payload'] = df['payload'].fillna(0)
df['payload'] = df['payload'].apply(lambda x: str(x))
df.drop(columns=['index', 'method', 'url', 'protocol', 'userAgent', 'pragma', 'cacheControl', 'accept', 'acceptEncoding', 'acceptCharset', 'acceptLanguage', 'host', 'connection', 'contentLength', 'contentType', 'cookie'], inplace=True)

In [36]:
# Function to generate a 255 dim feature vector from 'payload'
# Return a list of length 255 with features of payloads mapped to it
# Each list index represents an ASCII character and value represents count
# q = [4, 5, 2, 6, 12,  ....]


def vectorize_payload(payload):
    vec_255 = [0]*255
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1))
    vectorized = vectorizer.fit_transform([payload])
    mapped = list(zip(vectorizer.get_feature_names(), vectorized.sum(0).getA1()))
    for x in mapped:
        vec_255[ord(x[0])] = x[1]
    return vec_255


In [37]:
# conversion to 256 dimensional vector
vectors = np.array(df['payload'].apply(lambda x: vectorize_payload(x)))
vectors


array([list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 3, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [38]:
# Function to generate frequency for 255 dim feature vectors
# Return a list of length 255 with feature frequencies
# Each list index represents an ASCII character and value represents its frequency
# q = [0.23, 0, 0, 0.03, 0.42,  ....] = 1


def calc_frequency(payloads_vectors):
    features_total = 0
    for feature in payloads_vectors:
        features_total += feature
    features_total *= 1.0
    payloads_vectors = [(x/features_total) for x in payloads_vectors]
    return np.array(payloads_vectors, dtype=np.float16)


results = [calc_frequency(x) for x in vectors]
features_frequency = np.vstack(results)
del results


In [39]:
# Function to generate mean vector for n payloads
# Assuming data set has traffic of n packets
# Return a list of length 255 with mean for every 255 features from payload vectors
# x' = [2.3, 3.1, 2.23, 7.5, 12.2,  ....]


def calc_mean(frequency_vectors):
    return np.mean(frequency_vectors, axis=0)


features_mean = calc_mean(features_frequency)


In [40]:
# Function to generate sample covariance matrix n-1
# Assuming data set has traffic of n packets
# Return a covariance matrix for given frequency vectors


def calc_cov_mat():
    mean_shift = np.transpose(np.subtract(features_frequency, features_mean))    
    return np.cov(mean_shift), mean_shift


cov_mat, mean_shift = calc_cov_mat()


In [9]:
eig_val, eig_vec = np.linalg.eigh(cov_mat)
eig_pairs = [(np.abs(eig_val[x]), eig_vec[:,x], x) for x in range(len(eig_val))]


In [None]:
# Sort eig_pairs if need be
eig_pairs.sort(key=lambda x: x[0], reverse=True)
for i in eig_pairs:
    print(i[0])


In [None]:
def scatter_plot_data():
    idx = np.argsort(eig_val)[::-1]
    evecs = eig_vec[:,idx]
    evecs = evecs[:, :2]
    
    data_resc = np.dot(evecs.T, mean_shift).T
    
    # scatter plot for data correlation
    clr1 = '#2026B2'
    fig2 = plt.figure(figsize=(20, 20))
    ax1 = fig2.add_subplot(111)
    ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
    plt.show()


scatter_plot_data()


Tier 2

Cumulative Energy Test via Eig Vals and Vecs

In [None]:
# Function to sum eigen vals which is equal to the cumulative energy
# Takes into account for a range of eigen values to count for
# Returns a single number representing the sum


def sum_cumulative_energy(pairs):
    THRESHOLD = 100e-6
    sum = 0
    for x in pairs:
        sum += x[0] if x[0] > THRESHOLD else 0
    return sum


cumulative_sum = sum_cumulative_energy(eig_pairs)
cumulative_sum


0.1611124834441891

In [None]:
# This method goes through ordered eig_pairs and calculates cumsum
# If diff of cumsum with previous is less than threshold 155e-6
# It stops when cumsum is minimum to ensure maximum cumulative energy
# and minimum PC's
# It then returns the num of eigen values that can be used as PC


def cumulative_constant(cumulative_sum, pairs, cumulative_threshold=99.5):
    current_sum, curr_cumsum = 0, 0
    i = 0
    pc = []
    while i != len(pairs)-1 and curr_cumsum < cumulative_threshold:
        # print(pairs[i][2])
        current_sum += pairs[i][0]
        curr_cumsum = (current_sum/cumulative_sum)*100
        print("Current Sum: %f, Curr cumsum: %f" % (current_sum, curr_cumsum))
        pc.append(pairs[i][2])
        i += 1
    return i, pc


num_of_PC, cumsum = cumulative_constant(cumulative_sum, eig_pairs)
num_of_PC, cumsum

Scree Test via Sklearn PCA and bar plots

In [None]:
# A simple line plot showing trends in PC's variances


def simple_scree_plot(pca):
    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot()
    ax.plot(np.cumsum(pca.explained_variance_ratio_))
    
    ax.set_xlabel('number of components')
    ax.set_ylabel('cumulative explained variance')
    
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    
    plt.show()


In [None]:
# A scree bar plot showing contribution of components in the data


def scree_plot(pca):
    
    num_of_components = len(pca.explained_variance_ratio_)
    index = np.arange(num_of_components)
    values = pca.explained_variance_ratio_
    
    plt.figure(figsize=(20, 10))
    ax = plt.subplot(111)
    cumulative = np.cumsum(values)
    
    ax.bar(index, cumulative)
    ax.plot(index, cumulative)
    
    for i in range(num_of_components):
        ax.annotate(r'%s' % (str(values[i]*100)[:3]), (index[i], values[i]), 
                    va='bottom', ha='center', fontsize=14)
    ax.set_xlabel('Principal component number')
    ax.set_ylabel('Variance Explained Percentage')
    
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    
    plt.title('Scree Plot for Payloads Frequency')


In [None]:
# Function to generate PCA using the features matrix
# It uses sklearn library PCA definition
# Returns the PCA object


def cal_pca(features_data):
    pca = PCA(30)
    pca.fit_transform(features_data)
    return pca


pca = cal_pca(features_frequency)


In [None]:
simple_scree_plot(pca)


In [None]:
scree_plot(pca)

In [41]:
compressed_SVD = TruncatedSVD(30)
compressed_data = compressed_SVD.fit_transform(features_frequency)
pd.DataFrame(compressed_SVD.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,245,246,247,248,249,250,251,252,253,254
0,1.109332e-20,3.540121e-24,-2.434385e-25,2.359862e-25,-3.8242189999999996e-26,1.5264910000000001e-27,5.4667600000000005e-27,-2.506744e-27,1.585636e-27,6.013041e-29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.8289690000000002e-18,3.586693e-21,-3.1695550000000003e-22,9.201621e-24,-5.533152e-24,1.0221099999999999e-24,1.329416e-24,7.008335e-26,-7.175486e-26,-1.035228e-25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.3462680000000002e-18,-2.6348200000000003e-17,5.215806999999999e-19,-3.1259979999999997e-19,7.703856999999999e-20,-6.622996e-21,1.381852e-20,9.527466999999999e-21,-4.010586e-21,9.167268000000001e-23,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
3,-3.089175e-18,1.36363e-17,-8.904945000000001e-17,-8.705111000000001e-18,3.9502949999999996e-19,-6.886886e-21,2.050098e-19,1.1318029999999998e-19,-6.403640999999999e-20,-1.0715559999999999e-20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-7.273205e-19,5.680708e-17,-5.1808520000000005e-17,-5.4329160000000003e-17,1.245006e-17,-4.667417999999999e-19,-5.950828e-19,-3.7663299999999995e-19,2.0405739999999998e-19,-1.410483e-19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.218547e-18,1.5161780000000003e-17,1.258482e-16,-2.5515450000000002e-17,-6.8951740000000005e-18,-2.901324e-20,-5.369901e-18,8.213959999999999e-19,-1.18428e-18,2.1599769999999997e-19,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
6,2.778697e-18,9.848025e-17,3.1545e-17,2.0365320000000003e-17,-6.367067e-18,-3.3180200000000005e-17,3.7486280000000004e-17,2.751437e-18,-6.340147e-19,1.427305e-18,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
7,2.514318e-20,1.543398e-17,3.508627e-17,-2.1691980000000002e-17,-4.571753e-17,-2.3056320000000002e-17,-4.4570570000000005e-17,-4.7574940000000006e-17,1.9988850000000003e-17,-2.049352e-18,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
8,-1.315366e-18,-2.2595900000000002e-17,4.5929740000000005e-17,3.802239e-17,5.3569910000000005e-17,1.1961530000000001e-17,-1.506966e-17,-3.5345720000000004e-17,1.588358e-17,-7.08511e-18,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
9,8.804584999999999e-19,5.294439e-18,1.3448170000000001e-17,4.9061010000000005e-17,2.5781370000000003e-17,1.56775e-17,2.881152e-18,-6.622481e-17,4.375945e-17,-1.802237e-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
compressed_SVD.explained_variance_ratio_

array([3.7437128 , 0.33433089, 0.37608431, 0.23053849, 0.20311546,
       0.16532139, 0.11362129, 0.10462727, 0.10063692, 0.08713229,
       0.07635089, 0.0631516 , 0.05515907, 0.0523507 , 0.04016385,
       0.03904492, 0.03610638, 0.0340988 , 0.03253479, 0.03089397,
       0.02946278, 0.0263502 , 0.02544903, 0.02293572, 0.01981963,
       0.01918852, 0.01711988, 0.01426646, 0.01314757, 0.00922638])

In [13]:
compressed_SVD.explained_variance_

array([0.09802568, 0.00875415, 0.00984742, 0.00603644, 0.00531839,
       0.00432879, 0.00297507, 0.00273957, 0.00263509, 0.00228148,
       0.00199918, 0.00165357, 0.00144429, 0.00137076, 0.00105165,
       0.00102236, 0.00094541, 0.00089285, 0.00085189, 0.00080893,
       0.00077146, 0.00068996, 0.00066636, 0.00060055, 0.00051896,
       0.00050243, 0.00044827, 0.00037355, 0.00034426, 0.00024158])

In [14]:
np.sum(compressed_SVD.explained_variance_)

0.16014033350581877

In [15]:
pd.DataFrame(compressed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.005446,0.251554,-0.077316,-0.003824,0.059662,-0.024499,0.024644,-0.042709,-0.004539,0.081682,...,-0.000628,0.058475,-0.022573,-0.034763,0.032393,0.002517,-0.010849,0.014900,0.002472,0.016719
1,0.006165,0.283481,-0.080622,-0.007703,0.056432,0.033775,-0.025597,0.001352,-0.073419,0.006946,...,-0.003514,-0.001238,-0.032586,-0.007399,-0.019324,-0.020654,0.002260,0.002544,0.015767,-0.014622
2,0.006474,0.267162,-0.001570,0.070406,-0.154439,0.006584,-0.052697,0.002850,0.009794,0.108215,...,0.007879,-0.012028,-0.008262,-0.038118,0.008022,-0.005597,-0.008109,0.003274,-0.020417,0.010843
3,0.006093,0.266574,-0.082303,-0.069123,-0.111201,0.136237,0.004872,-0.040032,0.034695,0.009641,...,0.001724,-0.010377,0.015525,0.001336,0.006058,0.008331,0.001539,-0.005672,0.002236,0.005966
4,0.999452,-0.026655,-0.015377,0.010428,0.001942,0.005305,0.000483,0.001545,0.000634,0.001381,...,-0.000156,0.000596,0.000276,-0.000233,-0.000051,0.000060,0.000079,0.000187,-0.000350,0.000148
5,0.005758,0.272508,-0.106794,-0.051524,0.131908,0.049253,-0.100457,-0.025042,-0.046235,0.014606,...,0.002083,-0.018670,0.003883,-0.004555,0.010714,0.011716,0.001998,-0.002898,-0.002100,0.009166
6,0.005232,0.195347,0.055154,0.105189,0.018723,-0.054673,0.009848,0.035923,-0.019580,-0.048602,...,-0.020515,-0.013330,0.035052,0.025922,0.008680,0.005627,-0.089491,0.026374,-0.008088,-0.002944
7,0.056558,0.078666,0.165198,-0.143689,-0.026841,-0.073916,0.026354,0.011745,-0.078972,0.059479,...,-0.041832,-0.013628,-0.008945,-0.022436,-0.006182,0.001408,-0.006655,0.017924,-0.009984,0.002011
8,0.005100,0.208822,0.010035,0.072686,-0.090047,-0.020596,0.041746,0.095835,0.025107,-0.035961,...,-0.004018,0.014451,0.027479,0.033982,0.018667,0.077782,-0.008351,-0.009363,-0.004243,-0.005865
9,0.056748,0.084739,0.164430,-0.152003,-0.051219,-0.035903,0.042747,-0.039372,-0.057738,0.011736,...,-0.054236,-0.040489,-0.023008,-0.017973,-0.008753,0.010892,-0.020116,0.011391,-0.009200,0.005790


In [16]:
sum_of_pcs = np.sum(compressed_data, axis=0)

In [None]:
for x in sum_of_pcs:
  print(x)

In [21]:
compressed_data.shape

(223585, 30)

In [42]:
def calc_mean(frequency_vectors):
    return np.mean(frequency_vectors, axis=0)

features_reduced_mean = calc_mean(compressed_data)
features_reduced_mean.shape

(30,)

In [43]:
def calc_sd(features_frequency):
    return np.std(features_frequency, axis=0)

features_reduced_sd = calc_sd(compressed_data)
features_reduced_sd.shape

(30,)

In [44]:
def format_freq():
    features = np.transpose(compressed_data)
    covs = np.array([], dtype=np.float16)
    return np.append(covs, [cal_covariance(x) for x in features])

def cal_covariance(feature):
    feature_vector = np.array([feature, feature])
    result = np.matmul(feature_vector, np.transpose(feature_vector))
    return result[0][0]

covariances = format_freq()


In [25]:
covariances

array([25956.58905232,  9642.17859005,  2248.2587503 ,  1358.43141829,
        1189.59127805,   978.76991892,   665.34480995,   612.57843095,
         589.43469374,   510.13025381,   448.32775332,   369.80686996,
         323.27152272,   306.52637941,   235.24492415,   228.61571637,
         211.44021996,   199.6273395 ,   190.47640888,   180.86633031,
         172.48596537,   154.27170956,   149.0205592 ,   134.30529449,
         116.07034189,   112.33858611,   100.23426948,    83.53392636,
          76.98609397,    54.02437048])

In [45]:
np.square(features_reduced_sd)

array([0.09802568, 0.00875415, 0.00984742, 0.00603644, 0.00531839,
       0.00432879, 0.00297507, 0.00273957, 0.00263509, 0.00228148,
       0.00199918, 0.00165357, 0.00144429, 0.00137076, 0.00105165,
       0.00102236, 0.00094541, 0.00089285, 0.00085189, 0.00080893,
       0.00077146, 0.00068996, 0.00066636, 0.00060055, 0.00051896,
       0.00050243, 0.00044827, 0.00037355, 0.00034426, 0.00024158])

In [46]:
def calc_mahalanobis_dist(feature_x1, feature_x2, cov_x1, cov_x2):
    feature_diff = feature_x1-feature_x2
    feature_diff = feature_diff*np.transpose(feature_diff)
    if cov_x1+cov_x2 == 0:
        return 0
    return feature_diff/(cov_x1+cov_x2)
  

In [47]:
def construct_mdm(features):
    mdm = []
    i=0
    for index, feature1 in enumerate(features):
        feature_map = []
        for index2, feature2 in enumerate(features):
            feature_map.append(calc_mahalanobis_dist(feature1, feature2, covariances[index], covariances[index2]))
        mdm.append(feature_map)
    return np.array(mdm)

payload_map = np.array([construct_mdm(x) for x in compressed_data[:10000]])


In [None]:
print("Payload map size: %d bytes" % (payload_map.size * payload_map.itemsize))

Payload map size: 72000000 bytes


In [None]:
payload_map.shape

(10000, 30, 30)

In [None]:
payload_map[0]

In [48]:
payloads_map_mean = np.mean(payload_map, axis=0)
payloads_map_mean.shape

(30, 30)

In [49]:
payloads_map_sd = np.std(payload_map, axis=0)
payloads_map_sd.shape

(30, 30)

In [50]:
payloads_map_variances = np.square(payloads_map_sd)
payloads_map_variances.shape

(30, 30)

In [54]:
payloads_map_variances

array([[0.00000000e+00, 8.30376269e-11, 1.35016179e-10, 1.29381630e-10,
        1.36133220e-10, 1.36505013e-10, 1.43226467e-10, 1.43212796e-10,
        1.43993864e-10, 1.44526132e-10, 1.46446901e-10, 1.47532954e-10,
        1.48219784e-10, 1.47763327e-10, 1.49120712e-10, 1.49093673e-10,
        1.49069782e-10, 1.49332478e-10, 1.49588081e-10, 1.49384033e-10,
        1.49810756e-10, 1.49510784e-10, 1.49786081e-10, 1.50313494e-10,
        1.50422020e-10, 1.50411160e-10, 1.50544933e-10, 1.50704000e-10,
        1.51140490e-10, 1.51116406e-10],
       [8.30376269e-11, 0.00000000e+00, 1.90946681e-11, 1.31650837e-11,
        1.78157165e-11, 8.17713618e-12, 1.28727491e-11, 1.13177929e-11,
        1.30016428e-11, 9.70880520e-12, 1.20795843e-11, 9.72579298e-12,
        7.96123517e-12, 8.95570439e-12, 8.16975876e-12, 7.34161687e-12,
        9.23676256e-12, 7.11011141e-12, 6.84279464e-12, 7.04255351e-12,
        6.76539086e-12, 8.02283871e-12, 8.31077932e-12, 8.20168583e-12,
        8.42735900e-12,

In [None]:
def construct_payload_map(payload):
  vectorized = vectorize_payload(payload)
  payload_frequency = calc_frequency(vectorized)
  payload_mean = calc_mean(payload_frequency)
  payload_sd = np.std(payload_frequency)
  mean_shift = np.transpose(np.subtract(features_frequency, features_mean))    
  payload_cov_mat = np.cov(mean_shift)
  print(payload_cov_mat)

In [51]:
real_payload_map = construct_mdm(compressed_data[130000])

In [52]:
real_payload_map.shape

(30, 30)

In [53]:
def cal_weight(real_payload_map):
  return (np.square(real_payload_map - payloads_map_mean)/payloads_map_variances)

payload_weight = cal_weight(real_payload_map)
payload_weight

  


array([[       nan, 7.54262303, 7.56200619, 7.54763418, 7.56542273,
        7.5609581 , 7.56563377, 7.56704978, 7.5690074 , 7.56699655,
        7.56871227, 7.56905498, 7.57138578, 7.5693801 , 7.56744445,
        7.56835888, 7.56944343, 7.56834489, 7.568262  , 7.56781634,
        7.5690603 , 7.56977957, 7.57113765, 7.57079044, 7.57081862,
        7.57047026, 7.57150587, 7.57087274, 7.5716803 , 7.57102416],
       [7.54262303,        nan, 1.03841974, 1.44197082, 1.10209907,
        2.31195844, 1.51240596, 1.70417053, 1.49319293, 2.00869271,
        1.60812798, 1.97671928, 2.42633433, 2.16073828, 2.36686418,
        2.64486902, 2.10670379, 2.70589339, 2.82454541, 2.76592258,
        2.86816723, 2.40451552, 2.30454788, 2.35706526, 2.28875733,
        2.43546944, 2.49674888, 2.51103536, 2.4881386 , 2.58466722],
       [7.56200619, 1.03841974,        nan, 0.29035386, 0.63643997,
        0.57157836, 0.42829748, 0.3290314 , 0.41348444, 0.32006061,
        0.43383061, 0.29635568, 0.39672482, 0.