In [54]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle


In [3]:
# Load the raw HTTP request log. Every column is read as text, and
# error_bad_lines=False makes pandas drop malformed rows instead of raising.
df = pd.read_csv(
    'http_requests.csv',
    sep=',',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)
# Randomise the row order (unseeded, so the order differs between runs).
df = shuffle(df)
# A missing payload becomes the literal string '0'; everything is forced to str.
df['payload'] = df['payload'].fillna(0).apply(str)
# Discard the request metadata columns that the analysis below does not use.
df = df.drop(columns=[
    'index', 'method', 'url', 'protocol', 'userAgent', 'pragma',
    'cacheControl', 'accept', 'acceptEncoding', 'acceptCharset',
    'acceptLanguage', 'host', 'connection', 'contentLength', 'contentType',
    'cookie',
])

In [4]:
# Preview the first rows of the shuffled frame after the metadata columns
# were dropped (plain-text rendering via print).
print(df.head())

                          payload label
219824  apellidos=Acosta+Hausdorf  norm
167792                          0  norm
222921                remember=on  norm
132575          B2=Vaciar+carrito  norm
165244       ciudad=Villames%EDas  norm


In [5]:
# Function to generate a 255 dim feature vector from 'payload'
# Return a list of length 255 with features of payloads mapped to it
# Each list index represents an ASCII character and value represents count
# q = [4, 5, 2, 6, 12,  ....]


def vectorize_payload(payload, size=255):
    """Count how often each character code occurs in ``payload``.

    The text is normalised the same way the previously used
    ``CountVectorizer(analyzer='char', ngram_range=(1, 1))`` normalised it by
    default: it is lower-cased, and every run of two or more whitespace
    characters is collapsed into a single space. Upper-case letters are
    therefore counted under their lower-case code point.

    Parameters
    ----------
    payload : str
        Request payload to vectorise. An empty string is allowed and yields
        an all-zero vector.
    size : int, optional
        Length of the returned vector. The default (255) keeps the shape the
        rest of the notebook was built around; pass 256 to cover the whole
        single-byte range.

    Returns
    -------
    list of int
        ``result[c]`` is the number of characters with ``ord(char) == c``.
        Characters whose code point is ``>= size`` have no slot and are
        skipped rather than raising ``IndexError``.
    """
    # Counting directly avoids fitting a new vectorizer for every row and the
    # "empty vocabulary" error raised for an empty payload.
    text = re.sub(r"\s\s+", " ", payload.lower())
    counts = [0] * size
    for char in text:
        code = ord(char)
        if code < size:
            counts[code] += 1
    return counts


In [6]:
# Turn every payload into its 255-element character-count list.
# The result is a 1-D object array whose items are plain Python lists.
vectors = np.array(df['payload'].apply(vectorize_payload))
vectors


array([list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 2, 1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 3, 1, 0, 1, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
# Function to convert a 255-dim character-count vector into relative frequencies
# Returns an array of length 255 holding each feature's share of the total count
# Each index represents an ASCII character and the value represents its frequency
# q = [0.23, 0, 0, 0.03, 0.42, ...], with sum(q) = 1


def calc_frequency(payloads_vectors):
    """Convert one vector of character counts into relative frequencies.

    Parameters
    ----------
    payloads_vectors : sequence of numbers
        Counts for a single payload (one entry per character code).

    Returns
    -------
    numpy.ndarray
        ``float16`` array of the same length where each entry is
        ``count / sum(counts)``. If the counts sum to zero there is nothing
        to normalise, so an all-zero vector is returned instead of dividing
        by zero.
    """
    counts = np.asarray(payloads_vectors, dtype=np.float64)
    total = counts.sum()
    if total == 0:
        return np.zeros(counts.shape, dtype=np.float16)
    # Divide in float64 first, then store as float16 exactly as before
    # (halves/quarters the memory of the stacked matrix at reduced precision).
    return (counts / total).astype(np.float16)


# Normalise every count vector, then stack the per-payload rows into a single
# 2-D matrix (one row per payload). The temporary list is released afterwards.
results = list(map(calc_frequency, vectors))
features_frequency = np.vstack(results)
del results


In [8]:
# Function to generate the mean vector for n payloads
# Assuming the data set has traffic of n packets
# Returns a vector of length 255 with the mean of each feature across all payload frequency vectors
# x' = [0.023, 0.031, 0.002, 0.075, 0.12, ...]


def calc_mean(frequency_vectors):
    """Return the column-wise mean of the stacked frequency vectors.

    Parameters
    ----------
    frequency_vectors : array-like
        2-D collection with one frequency vector per row.

    Returns
    -------
    numpy.ndarray
        One mean value per column, i.e. the average taken along axis 0.
    """
    column_means = np.mean(frequency_vectors, axis=0)
    return column_means


# Per-feature mean taken over all rows of the stacked frequency matrix.
features_mean = calc_mean(features_frequency)


In [9]:
# Function to generate the sample covariance matrix (normalised by n-1)
# Assuming the data set has traffic of n packets
# Returns the covariance matrix of the frequency vectors together with the mean-shifted data


def calc_cov_mat(frequency_vectors=None, mean_vector=None):
    """Mean-centre the frequency vectors and compute their covariance matrix.

    Parameters
    ----------
    frequency_vectors : array-like, optional
        2-D data with one observation per row and one feature per column.
        Defaults to the notebook-level ``features_frequency``.
    mean_vector : array-like, optional
        Per-feature mean to subtract from every row. Defaults to the
        notebook-level ``features_mean``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(cov, mean_shift)`` where ``mean_shift`` is the centred data
        transposed to shape (features, observations) and ``cov`` is
        ``np.cov(mean_shift)``. With numpy's defaults each row is treated as
        a variable and the estimate is normalised by ``n - 1``.
    """
    # Explicit arguments make the inputs visible; the fall-back keeps the
    # original zero-argument call working unchanged.
    if frequency_vectors is None:
        frequency_vectors = features_frequency
    if mean_vector is None:
        mean_vector = features_mean
    mean_shift = np.transpose(np.subtract(frequency_vectors, mean_vector))
    return np.cov(mean_shift), mean_shift


# cov_mat: covariance matrix of the features; mean_shift: the mean-centred
# data transposed so that each row is one feature (reused for the projection).
cov_mat, mean_shift = calc_cov_mat()


In [10]:
# Eigen-decomposition of the covariance matrix. eigh is the routine for
# symmetric matrices: it returns the eigenvalues in ascending order and the
# matching eigenvectors as the columns of eig_vec.
eig_val, eig_vec = np.linalg.eigh(cov_mat)


In [11]:
# Indices that order the eigenvalues from largest to smallest.
idx = np.argsort(eig_val)[::-1]
evals = eig_val[idx]
# Keep only the eigenvectors (columns) belonging to the two largest eigenvalues.
evecs = eig_vec[:, idx[:2]]


In [12]:
# Pair each eigenvalue magnitude with its eigenvector (a column of eig_vec).
eig_pairs = list(zip(np.abs(eig_val), eig_vec.T))


0.10455772603410776
0.01098373665373174
0.006383121788944296
0.005339163514272897
0.004762442434858571
0.002984705390482854
0.0027433028929525548
0.0026534916036020577
0.0022841861785507793
0.002115367584953084
0.001662469587153863
0.0014812182541796323
0.001374342992510149
0.0010702896559079548
0.001025643016130007
0.0009536517321429655
0.0008929507665629224
0.0008529954106616987
0.00080938993938536
0.0007715309158515647
0.0006927365956453035
0.0006741885224295125
0.0006085880605266296
0.0005299970452115453
0.0005027235599449835
0.0004508444182056775
0.00037956875797764196
0.00034993503686711727
0.00024904620321695027
0.00022221503925848936
0.00021750360915805743
0.0001568850152431578
0.00013026935087834441
0.0001256762343040862
0.00012057964837882773
9.678737566974062e-05
5.1395324556304715e-05
4.414492231053872e-05
4.358924944373092e-05
3.353893733039595e-05
1.1601364085250412e-05
9.873456504876644e-06
1.0098393161357111e-06
4.1218535586887686e-10
1.2859582481655535e-17
4.6671828806

In [None]:
# Order the (|eigenvalue|, eigenvector) pairs from largest to smallest
# magnitude, then list the magnitudes one per line.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
for magnitude, _vector in eig_pairs:
    print(magnitude)


In [62]:
# Project the mean-centred data onto the two retained eigenvectors; the final
# transpose puts one payload per row and one component per column.
data_resc = evecs.T.dot(mean_shift).transpose()

In [None]:
# Scatter plot of the payloads in the plane of the two retained components.
clr1 = '#2026B2'
fig2, ax1 = plt.subplots(figsize=(20, 20))
ax1.plot(
    data_resc[:, 0],
    data_resc[:, 1],
    '.',
    markerfacecolor=clr1,
    markeredgecolor=clr1,
)
plt.show()
