# Machine Learning for Encrypted Malicious Traffic Detection 

In [1]:
#importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from numpy import genfromtxt
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from matplotlib.pyplot import hist
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

## About the Dataset

-  **There are in total approximately 1,38,047 samples in the entire dataset.**
-  **The dataset contains 41,323 binary files (including exe, dll files) which are legitimate in nature means they are malware free.**
- **Also, the dataset contains 96,724 malware samples.**
- **It has 57 features.**

## Dataset Exploration

In [2]:
#Reading the dataset from the csv file

data= pd.read_csv("/content/drive/MyDrive/Malware_Data_Updated.csv") 

#Printing all the columns in the dataset
print("Features :- \n" )
for i in data.columns:
    print(i)

FileNotFoundError: ignored

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
print("Shape of the dataset is :- ",data.shape)

In [None]:
# Viewing the top 10 dataset rows
data.head()

So, here we have in total 57 features in the dataset and the last feature '**legitimate**' which tells which sample is legitimate and which sample has malware. When legitimate value is 0, it is malware free and when legitimate value is 1, it means it is a malware encrypted file.

**I'm also dividing the dataset into two parts containing malware and not containing malware.**

**Here we will Dropping the legitimate column, because I'll be using it as a label in the output.**

In [None]:
malwarefree = data[0:41323].drop(['legitimate'],axis=1)
has_malware = data[41323::].drop(['legitimate'],axis=1)

print("The shape of malware-free dataset is: %s samples, %s features"%(malwarefree.shape[0],malwarefree.shape[1]))
print("The shape of malware containing dataset is: %s samples, %s features"%(has_malware.shape[0],has_malware.shape[1]))

## Feature Extraction

In [None]:
# ExtraTreesClassifier is used for optimizing the dataset
from sklearn.ensemble import ExtraTreesClassifier      

# SelectFromModel is used to increase the accuracy of the model by selecting few important features from the dataset.
from sklearn.feature_selection import SelectFromModel  

# train_test_split is used to split the data
from sklearn.model_selection import train_test_split   
from sklearn.metrics import accuracy_score

###### In code written below, I'm removing some irrelevant features such as Name, md5, legitimate value from the original dataset.
###### The values in 'legitimate' column will be used as a label for producing outputs.

In [None]:
import pandas as pd

# remove Nan values
res = data.dropna()
data = res

# I removed three columns from the original dataset
input_data = data.drop(['Name','md5','legitimate'],axis=1).values 

# Values in legitimate column are used as our labels
labels = data['legitimate'].values 

#Fitting the data inside ExtraTreesClassifier
extratrees = ExtraTreesClassifier().fit(input_data, labels)

#Feature Selection is achieved using SelectFromModel,which is used to select features based on their importances which is Gini score
selection =  SelectFromModel(extratrees,prefit=True)
new_data = selection.transform(input_data)

#Thus, SelectFromModel transforms the dataset and selects fewer features based on their importances

#Checking the shape of old as well as new data
print(input_data.shape,new_data.shape)

In [None]:
features = input_data.shape[1]

# Computing the importance score of each feature
importances = extratrees.feature_importances_

# Now sorting these features on values of their importances in reverse order using numpy's argsort() method
indices = np.argsort(importances)[::-1]

weights = {}

#Looping through all the features and printing their indices, name of feature and their corresponding importance score
print("Top Features after feature Extraction:-")
for i in range(10):
    weights[data.columns[2+indices[i]]] = importances[indices[i]]*100
    print("%d"%(i+1),":-",data.columns[2+indices[i]],":-",(importances[indices[i]]*100))

names = list(weights.keys())
values = list(weights.values())

plt.rcParams["figure.figsize"] = (20,15)


# visualising
plt.bar(range(len(weights)), values, tick_label=names)
plt.show()

#### Thus ExtraTreesClassifier and SelectFromModel Helped in reducing the features with respect to their imporatnces.

### Splitting data for training and testing

Splitting :- Training (80%)  Testing (20%)

I am splitting the dataset into four parts namely- legitimate_train, legitimate_test, malware_train and malware_test

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(new_data,labels,test_size=0.2)

#We will start implementing the classifiers and store their outputs
output_acc ={}
output_prec = {}

## Naive Bayes Classifier Implementation

In [None]:
# implementing Multinomial Naive Bayes Classifier
# supervised machine learning algorithm, which is used for classification tasks based on applying Bayes' theorem with independence assumptions between the features

MNB = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
MNB.fit(X_train,Y_train)
score = MNB.score(X_test,Y_test)
print ("MNB Accuracy : ", score)
output_acc["MNB"] = score
y_score = MNB.predict(X_test)
precision = average_precision_score(Y_test, y_score)
recall = recall_score(Y_test, y_score, average='macro')
print("Precision : ",precision)
output_prec["MNB"] = precision
print("Recall : ",recall)
print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

## Linear Discriminant Analysis

In [None]:
#implementing Linear Discriminant Analysis
# linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.

LDA= LinearDiscriminantAnalysis()
LDA.fit(X_train,Y_train)
score=LDA.score(X_test,Y_test)
print ("LDA Accuracy : ", score)
output_acc["LDA"] = score
y_score = LDA.predict(X_test)
precision = average_precision_score(Y_test, y_score)
recall=recall_score(Y_test, y_score, average='macro')
print("Precision : ",precision)
output_prec["LDA"] = precision
print("Recall : ",recall)

print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

## Multilayer Perceptron (MLP classifier)

In [None]:
#implementing Multilayer Perceptron Classifier Neural Network

MLP=MLPClassifier(alpha=0.1)
MLP.fit(X_train,Y_train)
score=MLP.score(X_test,Y_test)
print ("MLP : ", score)
output_acc["MLP"] = score
y_score = MLP.predict(X_test)
precision = average_precision_score(Y_test, y_score)
recall=recall_score(Y_test, y_score, average='macro')
print("Precision : ",precision)
output_prec["MLP"] = precision
print("Recall : ",recall)

print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

## Adaboost Classifier

In [None]:
#implementing Adaboost Classifier (Boosting Technique)
print("Adaboost Classifier :-")

adb = AdaBoostClassifier(n_estimators=100)
model = adb.fit(X_train,Y_train)
prediction = model.predict(X_test)

result1 = confusion_matrix(Y_test, prediction)
print("Confusion Matrix:")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print(result1,"\n")

result2 = classification_report(Y_test, prediction,output_dict=True)
print("Classification Report:",)
print (result2,"\n")

precision =  result2['weighted avg']['precision']
output_prec['Adaboost'] = precision

result3 = accuracy_score(Y_test,prediction)
print("The accuracy score (in percentage) of the algorithm: ",result3)
output_acc["Adaboost"] = result3

In [None]:
algos = list(output_acc.keys())
print(algos)

In [None]:
accuracies =list(output_acc.values())
new_accuracies=[i * 100 for i in accuracies]
print(new_accuracies)
precisions = list(output_prec.values())
new_precisions =[i * 100 for i in precisions]
print(new_precisions)

## Visualising the Results

In [None]:
import matplotlib.pyplot as plt

# create sample data

Data = {}

Data['Accuracies'] = new_accuracies
Data['Precisions'] = new_precisions

# data = {
#     'Accuracies': [93.36834480260775, 95.09235784136183, 92.70191959434987, 98.47519014849692],
#     'Precisions': [83.9617735350683, 87.60188269121217, 82.44935128105438, 98.47317652094412]
# }

# create x-axis labels
labels = ['MNB', 'LDA', 'MLP', 'Adaboost']

# set bar width
bar_width = 0.2

# create index for each group
index = np.arange(len(labels))

# plot bars for each group
plt.bar(index, Data['Accuracies'], bar_width, label = 'Accuracies')
plt.bar(index + bar_width, Data['Precisions'], bar_width, label='Precisions')
# set x-axis ticks and labels
plt.xticks(index + 0.5*bar_width, labels)


plt.rcParams["figure.figsize"] = (8,5)
plt.legend()

# set chart title and legend
plt.xlabel('Algorithm Used')
plt.title('Plot of Accuracies and Precisions')

# show chart

plt.show()

#### So, as of now we have discovered 4 algorithms as implemented above. We are getting an accuracy of 98.7 at max from LDA Classifier. In future we are discovering more algorithms which will give more accuracy.

K NEAREST NEIGHBOUR

In [None]:
#implementing K Nearest Neighbour

KNN=KNeighborsClassifier(3)
KNN.fit(X_train,Y_train)
score=KNN.score(X_test,Y_test)
print ("KNN : ", score)
output_acc["KNN"]=score
y_score = KNN.predict(X_test)
precision = average_precision_score(Y_test, y_score)
recall=recall_score(Y_test, y_score, average='macro')
print("Precision : ",precision)
output_prec["KNN"] = precision
print("Recall : ",recall)
print("Fscore : " ,2*precision*recall/(precision+recall))
print( confusion_matrix(Y_test, y_score))

print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

DECISION TREE CLASSIFIER

In [None]:
#implementing Decision Tree Classifier

DTC= DecisionTreeClassifier(max_depth=5)
DTC.fit(X_train,Y_train)
score=DTC.score(X_test,Y_test)
print ("DTC : ", score)
output_acc["Decision Tree"]=score
y_score = DTC.predict(X_test)
precision = average_precision_score(Y_test, y_score)
recall=recall_score(Y_test, y_score, average='macro')
print("Precision : ",precision)
output_prec["Decision Tree"]=precision
print("Recall : ",recall)
print("Fscore : " ,2*precision*recall/(precision+recall))
print( confusion_matrix(Y_test, y_score))

print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

BERNOULLI NAIVE NAYES

In [None]:
#implementing Bernoulli Naive Bayes

BNB=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
BNB.fit(X_train,Y_train)
score=BNB.score(X_test,Y_test)
print ("BNB : ", score)
output_acc["BNB"]=score
y_score = BNB.predict(X_test)
precision = average_precision_score(Y_test, y_score)
recall=recall_score(Y_test, y_score, average='macro')
print("Precision : ",precision)
output_prec["BNB"]=precision
print("Recall : ",recall)
print("Fscore : " ,2*precision*recall/(precision+recall))
print( confusion_matrix(Y_test, y_score))

print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

RANDOM FOREST CLASSIFIER

In [None]:
#implementing Random forest

rfc = RandomForestClassifier(n_estimators=50)

#Now, we need to fit the classifier on the training dataset, Therefore
rfc.fit(X_train,Y_train)
prediction = rfc.predict(X_test)

#calculating the confusion matrix
res = confusion_matrix(Y_test, prediction)
print("Confusion Matrix:")
print(res,"\n")

#printing the classification report
res1 = classification_report(Y_test, prediction)
print("Classification Report:",)
print (res1)

#Calculating the accuracy of the trained model 
accuracy = accuracy_score(Y_test,prediction)
print("The accuracy score (in percentage) of the algorithm: ",accuracy*100)
output_acc["Random Forest"]=accuracy

print("\nConfusion Matrix :- ")
cm = confusion_matrix(Y_test, y_score)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
cm_display.plot()
plt.show()
print( "\n",confusion_matrix(Y_test, y_score))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gdb = GradientBoostingClassifier(n_estimators=50)
gdb.fit(X_train,Y_train)
prediction2 = gdb.predict(X_test)


#calculating the confusion matrix
res3 = confusion_matrix(Y_test, prediction2)
print("Confusion Matrix:")
print(res3,"\n")

#printing the classification report
res4 = classification_report(Y_test, prediction2)
print("Classification Report:",)
print (res4)

#Calculating the accuracy of the trained model
res5 = accuracy_score(Y_test,prediction2)
print("The accuracy score (in percentage) of the algorithm: ",res5*100)