In [1]:
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle


In [2]:
# CSIC Dataset
# Every column is read as text (dtype='unicode') and no column is used as the
# index. Malformed rows are skipped with a warning: `on_bad_lines='warn'` is the
# replacement for the removed `error_bad_lines=False` keyword (pandas >= 1.3).

df = pd.read_csv('http_requests.csv', sep=',', on_bad_lines='warn', index_col=False, dtype='unicode')


In [3]:
# Shuffle the rows, fill missing payloads and keep only the columns used later.
# The shuffle is seeded so that a fresh "Restart & Run All" yields the same row
# order (later cells slice rows by position).
SHUFFLE_SEED = 42

# Request metadata columns that are not used by the payload analysis.
UNUSED_COLUMNS = ['index', 'method', 'url', 'protocol', 'userAgent', 'pragma', 'cacheControl', 'accept', 'acceptEncoding', 'acceptCharset', 'acceptLanguage', 'host', 'connection', 'contentLength', 'contentType', 'cookie']

df = shuffle(df, random_state=SHUFFLE_SEED)
# Missing payloads become the text "0": fill with 0, then convert everything to
# str so each payload can be processed character by character.
df['payload'] = df['payload'].fillna(0).apply(str)
# Rebind instead of mutating in place so the result of the step is explicit.
df = df.drop(columns=UNUSED_COLUMNS)


In [4]:
# Preview the first rows of the prepared frame as plain text.
print(df.head())


                                  payload label
154379                                  0  norm
152717  email=daughtry%40thecupidhouse.az  norm
150334                                  0  norm
23639                        login=cluett  anom
113923                provincia=Cantabria  anom


In [5]:
# Build a fixed-length character-count vector from a 'payload' string.
# Each list index is a character code point and the value is the number of
# times that character occurs in the payload:
# q = [4, 5, 2, 6, 12,  ....]

# Runs of two or more whitespace characters; collapsed to a single space before
# counting.
_WHITESPACE_RUN = re.compile(r"\s\s+")


def vectorize_payload(payload, vector_size=255):
    """Count the characters of ``payload`` into a list of ``vector_size`` slots.

    The payload is lowercased and every run of two or more whitespace
    characters is collapsed to one space before counting. This is intended to
    match the default preprocessing of ``CountVectorizer(analyzer='char')``,
    which this function used to fit once per payload.

    Args:
        payload: The payload text (a ``str``).
        vector_size: Length of the returned list. Defaults to 255, the length
            the rest of the notebook was written against.

    Returns:
        A list of ``vector_size`` ints where index ``i`` holds the number of
        occurrences of the character with code point ``i``. Characters whose
        code point is ``>= vector_size`` are ignored (they previously raised
        ``IndexError``). An empty payload yields an all-zero list instead of
        raising.
    """
    normalized = _WHITESPACE_RUN.sub(" ", payload.lower())
    vec = [0] * vector_size
    for char, count in Counter(normalized).items():
        code_point = ord(char)
        # Skip characters that do not fit in the vector (e.g. non-ASCII text).
        if code_point < vector_size:
            vec[code_point] = count
    return vec


In [6]:
# Turn every payload into its character-count list (see vectorize_payload)
# and collect the per-row results in a NumPy array.
vectors = np.array(df['payload'].apply(vectorize_payload))
vectors


array([list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
# Report the bytes held by the array's own elements (size * itemsize).
print("%d bytes" % vectors.nbytes)


1788680 bytes


In [8]:
# Convert one character-count vector into relative frequencies.
# Each index keeps its meaning (a character); the value becomes that
# character's share of all counted characters, so the entries sum to 1:
# q = [0.23, 0, 0, 0.03, 0.42,  ....] = 1


def calc_frequency(payloads_vectors):
    """Normalise a single count vector so that its entries sum to 1.

    Args:
        payloads_vectors: Sequence of non-negative counts for ONE payload
            (despite the plural name).

    Returns:
        A ``float16`` NumPy array of the same length holding ``count / total``.
        If the counts sum to zero the result is all zeros, which avoids a
        division by zero.
    """
    # Divide in float64; only the stored result is reduced to float16.
    counts = np.asarray(payloads_vectors, dtype=np.float64)
    features_total = counts.sum()
    if features_total == 0:
        return np.zeros(counts.shape, dtype=np.float16)
    return (counts / features_total).astype(np.float16)


# Normalise every count vector, then stack the rows into one 2-D array.
results = list(map(calc_frequency, vectors))
features_frequency = np.vstack(results)
del results  # the intermediate list is not needed once the rows are stacked


In [9]:
# Mean vector over all payloads.
# Input: one frequency vector per payload (rows) with one column per feature.
# Output: a single vector holding the mean of every feature column.


def calc_mean(frequency_vectors):
    """Return the column-wise (axis 0) mean of the stacked frequency vectors."""
    column_means = np.mean(frequency_vectors, axis=0)
    return column_means


# Per-feature mean over all payload frequency vectors.
features_mean = calc_mean(features_frequency)


In [10]:
# Standard-deviation vector over all payloads.
# Input: one frequency vector per payload (rows) with one column per feature.
# Output: a single vector holding the standard deviation of every feature column.


def calc_sd(features_frequency):
    """Return the column-wise (axis 0) standard deviation of the input."""
    column_sd = np.std(features_frequency, axis=0)
    return column_sd


# Per-feature standard deviation over all payload frequency vectors.
features_sd = calc_sd(features_frequency)


In [11]:
def calc_covariance(sd=None):
    """Square the per-feature standard deviations.

    Note: the result is the element-wise square of ``sd``, i.e. one variance
    per feature, not a full covariance matrix.

    Args:
        sd: Array-like of standard deviations. When omitted, the notebook-level
            ``features_sd`` is used, which keeps the original zero-argument
            call working.

    Returns:
        ``np.square(sd)``.
    """
    if sd is None:
        # Backward-compatible default: fall back to the global computed earlier.
        sd = features_sd
    return np.square(sd)


# Squared per-feature standard deviations, stored as float16.
covariances = np.array(calc_covariance(), dtype=np.float16)


In [12]:
def calc_mahalanobis_dist(feature_x1, feature_x2, cov_x1, cov_x2):
    """Return the squared feature difference divided by the summed covariances.

    Computes ``(x1 - x2) * transpose(x1 - x2) / (cov_x1 + cov_x2)``; when the
    two covariance terms add up to zero the distance is defined as 0.
    """
    delta = feature_x1 - feature_x2
    squared_delta = delta * np.transpose(delta)
    pooled_cov = cov_x1 + cov_x2
    # Guard clause: nothing to scale by, so report a zero distance.
    if pooled_cov == 0:
        return 0
    return squared_delta / pooled_cov


In [24]:
def construct_mdm(features, cov=None):
    """Build the pairwise distance map for one payload's feature vector.

    Entry ``[i, j]`` is ``(features[i] - features[j])**2 / (cov[i] + cov[j])``,
    or 0 where ``cov[i] + cov[j]`` is 0. This is the same per-pair formula as
    ``calc_mahalanobis_dist``, evaluated for all pairs at once with NumPy
    broadcasting instead of a nested Python loop.

    Args:
        features: 1-D array-like of feature values for a single payload.
        cov: 1-D array-like with one variance term per feature. When omitted,
            the notebook-level ``covariances`` is used, which keeps the
            original single-argument call working.

    Returns:
        A ``float64`` array of shape ``(len(features), len(features))``.

    Raises:
        ValueError: If ``features`` is not 1-D or ``cov`` has fewer entries
            than ``features``.
    """
    if cov is None:
        # Backward-compatible default: fall back to the global computed earlier.
        cov = covariances

    # Work in float64 so small squared differences are not lost to float16.
    values = np.asarray(features, dtype=np.float64)
    variances = np.asarray(cov, dtype=np.float64)
    if values.ndim != 1:
        raise ValueError("features must be a 1-D vector")
    if variances.ndim != 1 or variances.size < values.size:
        raise ValueError("cov must provide one entry per feature")
    variances = variances[:values.size]

    squared_diff = np.square(values[:, None] - values[None, :])
    pooled = variances[:, None] + variances[None, :]

    # Divide only where the denominator is non-zero; other cells stay 0.
    mdm = np.zeros_like(squared_diff)
    np.divide(squared_diff, pooled, out=mdm, where=pooled != 0)

    print("Completed...")
    return mdm


# Build the distance map for the rows selected by features_frequency[1:2]
# (a single row) and wrap the resulting maps in an outer array.
payload_map = np.array([construct_mdm(x) for x in features_frequency[1:2]])


F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F

F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F

F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
F1: 0.000000, F2: 0.000000
Completed...


In [13]:
# Bytes occupied by the stacked frequency matrix (size * itemsize).
print("Feature Frequency arr size: %d bytes" % features_frequency.nbytes)


Feature Frequency arr size: 114028350 bytes


In [14]:
# Bytes occupied by the mean vector (size * itemsize).
print("Features mean arr size: %d bytes" % features_mean.nbytes)


Features mean arr size: 510 bytes


In [15]:
# Bytes occupied by the standard-deviation vector (size * itemsize).
print("Feature SD arr size: %d bytes" % features_sd.nbytes)


Feature SD arr size: 510 bytes


In [None]:
# Bytes occupied by the covariances vector (size * itemsize).
print("Covariances arr size: %d bytes" % covariances.nbytes)


In [None]:
# Show the shape of every distance map held in payload_map.
for x in payload_map:
    print(np.shape(x))


