# Multi-Classification Machine Learning for Malware Analysis
## 9 Types of Malware in this dataset:
1. Ramnit         - RAT
2. Lollipop       - Adware
3. Kelihos_ver3   - RAT
4. Vundo          - Adware
5. Simda          - Botnet
6. Tracur         - Malicious Browser Plugin
7. Kelihos_ver1   - RAT
8. Obfuscator.ACY - Obfuscates other malware/information
9. Gatak          - RAT

## Game Plan:

- Look into creating more metrics to show off my model
- Improve the way I import data for the model
- Explain my code and solution in detail
- Port into the main program/script



## Imports

In [None]:
#pip install scikit-learn
#pip install seaborn
#pip install matplotlib
#pip install pandas
#pip install torch
#pip install torchvision
#pip install jupyterthemes

In [None]:
# All imports centralised here

import sys
import os
import re
import csv
import shutil
import heapq
import codecs
import json
from collections import Counter, OrderedDict, defaultdict
from pathlib import Path #Convert all directory accesses to this
from functools import reduce
import glob

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn import svm
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

pd.options.mode.chained_assignment = None  # default='warn'

## Functions

In [None]:
# Functions are all contained inside here

def createFullDirectory(mainDirectory,subDirectory):
    """Concatenate a base directory and a sub-directory into one path string.

    No separator is inserted between the two parts, so ``mainDirectory`` has
    to end with one already.
    """
    combinedDirectory = mainDirectory + subDirectory
    return str(combinedDirectory)

def createFullPathToFile(fullDirectory, fileName):
    """Concatenate a directory and a file name into one path string.

    No separator is inserted between the two parts, so ``fullDirectory`` has
    to end with one already.
    """
    combinedPath = fullDirectory + fileName
    return str(combinedPath)

def listFilesInDirectory(directoryContainingFiles):
    """Return the paths matching a glob pattern, sorted alphabetically.

    Despite the parameter name, ``directoryContainingFiles`` is a glob
    *pattern* (for example ``"/data/class-1/*.asm"``), not a bare directory.

    ``glob.glob`` yields matches in arbitrary, filesystem-dependent order.
    Sorting makes every step built on this list (for example deciding which
    files to keep when trimming a class) repeatable between runs.
    """
    return sorted(glob.glob(directoryContainingFiles))

def stripFilePathAndExtension(filePath, prefixToStrip, suffixToStrip):
    """Reduce a file path to its bare name (no directory, no extension).

    ``prefixToStrip`` is removed only when it is the leading part of
    ``filePath`` and ``suffixToStrip`` only when it is the trailing part.
    The previous ``str.replace`` calls removed *every* occurrence, which
    corrupted names whenever the prefix or suffix text also appeared in the
    middle of the path.

    Whatever remains is passed through ``Path(...).stem``, which drops any
    remaining directory components and the final extension, so a prefix that
    does not match still yields the bare file name.
    """
    if prefixToStrip and filePath.startswith(prefixToStrip):
        filePath = filePath[len(prefixToStrip):]
    # Guard against an empty suffix: ``text[:-0]`` would be the empty string.
    if suffixToStrip and filePath.endswith(suffixToStrip):
        filePath = filePath[:-len(suffixToStrip)]
    return Path(filePath).stem

def replaceFilePathAndExtension(filePath, prefixToStrip, prefixToInsert, suffixToStrip, suffixToInsert):
    """Swap the leading prefix and trailing suffix of a path.

    ``prefixToStrip`` is replaced by ``prefixToInsert`` only when it is the
    leading part of ``filePath``; ``suffixToStrip`` is replaced by
    ``suffixToInsert`` only when it is the trailing part. The previous
    ``str.replace`` calls rewrote every occurrence anywhere in the path.

    An empty ``prefixToStrip`` / ``suffixToStrip`` matches trivially, so the
    corresponding insert is simply prepended / appended. A part that does
    not match is left untouched.
    """
    if filePath.startswith(prefixToStrip):
        filePath = prefixToInsert + filePath[len(prefixToStrip):]
    if filePath.endswith(suffixToStrip):
        # Explicit end index so that an empty suffix keeps the whole string.
        filePath = filePath[:len(filePath) - len(suffixToStrip)] + suffixToInsert
    return filePath

def printDataFrame(dataframe):
    """Print ``dataframe`` with pandas' row and column display limits lifted.

    The limits are changed only for the duration of the print.
    """
    unlimitedDisplay = pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
    )
    with unlimitedDisplay:
        print(dataframe)

def zeroOutDataframe(dataframe):
    """Return a copy of ``dataframe`` with every missing value replaced by 0."""
    return dataframe.fillna(0)

def countEntriesInDataframe(dataframe):
    """Return how many cells of ``dataframe`` are non-zero."""
    nonZeroCells = np.count_nonzero(dataframe)
    return nonZeroCells

def sortDictionary(dictionary):
    """Return ``(key, count)`` pairs sorted by count, then by key.

    ``dictionary`` is passed through ``Counter``: a mapping keeps its values
    as counts, any other iterable is tallied element by element.
    """
    def _countThenKey(pair):
        key, count = pair
        return (count, key)

    tallies = dict(Counter(dictionary))
    return sorted(tallies.items(), key=_countThenKey)

def fileNewlineIntoList(filePath):
    """Read a text file into a list with one stripped string per line.

    Leading and trailing whitespace (including the newline) is removed from
    every line; blank lines are kept as empty strings.
    """
    with open(filePath) as openFile:
        return [rawLine.strip() for rawLine in openFile]

def stripNewlineAndWhitespace(textStringToStrip):
    """Return the string with every tab, newline and space character removed."""
    charactersToDrop = str.maketrans("", "", "\t\n ")
    return textStringToStrip.translate(charactersToDrop)

def stripNewlineAndWhitespaceFromList(listToStrip):
    """Remove every tab, newline and space from each string of a list.

    The list is modified in place and the same list object is returned.
    """
    charactersToDrop = str.maketrans("", "", "\t\n ")
    for position, entry in enumerate(listToStrip):
        listToStrip[position] = entry.translate(charactersToDrop)
    return listToStrip

def regexSearchFile(filePath, regexPattern):
    """Return every match of ``regexPattern`` in the file at ``filePath``.

    The whole file is read into memory and searched with ``re.findall``, so
    the result follows ``findall`` rules (a list of strings, or of tuples
    when the pattern has several groups).

    The ``with`` block already closes the file; the extra ``close()`` call
    that used to follow it was redundant and has been removed.
    """
    with open(filePath) as openFile:
        return re.findall(regexPattern, openFile.read())

def cleanFileNameList(fileNameList,malwareClass, sortedDatasetDirectory): #NEED TO PORT THIS
    """Replace each path in ``fileNameList`` with its bare file name.

    Each entry is passed to ``stripFilePathAndExtension`` with the class
    directory prefix and the ``.asm`` extension. The list is modified in
    place and the same list object is returned.

    The dictionary the previous version filled on every iteration was never
    read or returned, so it has been dropped.

    NOTE(review): the prefix is built as
    ``sortedDatasetDirectory + "/class-" + ...``; if the directory already
    ends with "/" the prefix contains "//" and will not match the real path.
    The bare name is still produced because ``stripFilePathAndExtension``
    finishes with ``Path(...).stem``.
    """
    classDirectoryPrefix = sortedDatasetDirectory+"/class-"+str(malwareClass)+"/"
    for position, fullPath in enumerate(fileNameList):
        fileNameList[position] = stripFilePathAndExtension(fullPath, classDirectoryPrefix, ".asm")
    return fileNameList

def generateClassDataFrame(listColumnsToUse,listRowsToUse):
    """Build a frame with the given columns and index, with missing values set to 0.

    The frame is created without data, then passed through
    ``zeroOutDataframe``.
    """
    emptyFrame = pd.DataFrame(columns=listColumnsToUse, index=listRowsToUse)
    return zeroOutDataframe(emptyFrame)

def moveFilesToClassFolders(backupFileList, fullFileNamesListFromCSV, unsortedDataset,sortedDataset):
    """Copy each dataset file into the folder of its labelled class.

    Despite the name, files are *copied* (``shutil.copyfile``), not moved.

    Parameters:
        backupFileList: full paths of the files to sort.
        fullFileNamesListFromCSV: label frame with an ``Id`` column (the bare
            file name) and a ``Class`` column.
        unsortedDataset: directory prefix of the paths in ``backupFileList``.
        sortedDataset: destination root; a file lands in
            ``sortedDataset + "class-<Class>/" + <Id> + ".asm"``.

    This stays best-effort like the original: a file with no label, or one
    that cannot be copied (for example a missing class folder), is skipped.
    Only those two failures are tolerated now; the previous bare ``except``
    also hid programming errors, and its ``fileIndex`` increment had no
    effect because the ``for`` loop rebinds the index.

    The label frame is indexed on a copy, so the caller's frame is no longer
    modified and the function can be called twice with the same frame.
    """
    labelsById = fullFileNamesListFromCSV.set_index("Id")
    for sourcePath in backupFileList:
        # fileClean is the bare file name, which is also the Id in the label CSV
        fileClean = stripFilePathAndExtension(sourcePath, unsortedDataset, ".asm")
        try:
            malwareClass = labelsById.loc[fileClean, "Class"]
        except KeyError:
            continue  # no label for this file
        destinationPath = sortedDataset+"class-"+str(malwareClass)+"/"+str(fileClean)+".asm"
        try:
            shutil.copyfile(sourcePath, destinationPath)
        except OSError:
            continue  # unreadable source or missing destination folder

def generateFilenameToDirectoryDict(fileDirectory):
    """Map each file's bare name (``Path.stem``) to its full path.

    ``fileDirectory`` is an iterable of paths. When two paths share the same
    stem, the later one wins.
    """
    return {Path(fullPath).stem: fullPath for fullPath in fileDirectory}

def populateMalwareDataframe(fileDirectoryTopLevel,instructionList):
    """Count instruction mnemonics per file and return them as one frame.

    Parameters:
        fileDirectoryTopLevel: glob pattern selecting the ``.asm`` files.
        instructionList: instruction names that must appear as columns even
            when no file contains them.

    Returns:
        An integer frame with one row per file (indexed by bare file name)
        and one column per instruction, in alphabetical column order. The
        columns are the union of ``instructionList`` and every token the
        regex actually extracted.

    Changes from the previous implementation:
      * Each file is tallied once with ``Counter`` and the frame is built in
        a single step. Before, ``value_counts`` ran twice per file and the
        whole frame was regrouped with ``groupby(axis=1)`` (deprecated in
        recent pandas) after every file.
      * Duplicate names in ``instructionList`` no longer create duplicate
        columns. Previously a label-based write filled every duplicate and
        the merge step then summed them, inflating the count.

    NOTE(review): extracted tokens are not lower-cased here although the
    notebook lower-cases ``instructionList``; an upper-case mnemonic would
    get its own column. Confirm whether that is intended.
    """
    filePathToNameDict = generateFilenameToDirectoryDict(listFilesInDirectory(fileDirectoryTopLevel))
    fileNames = list(filePathToNameDict)

    instructionCountsPerFile = []
    extractedInstructions = set()
    for fileName in fileNames:
        # Pull the mnemonics out of the listing, then strip their padding
        instructionsForThisFile = stripNewlineAndWhitespaceFromList(
            regexSearchFile(filePathToNameDict[fileName],"(?:\t{3,7}       (?!db|dd)[a-zA-Z]{2,6} {1,})"))
        fileCounts = Counter(instructionsForThisFile)
        instructionCountsPerFile.append(fileCounts)
        extractedInstructions.update(fileCounts)

    allColumns = sorted(set(instructionList) | extractedInstructions)

    dataFrame = pd.DataFrame(instructionCountsPerFile, index=fileNames)
    # reindex adds the instructions no file used; those cells become 0 below
    dataFrame = dataFrame.reindex(columns=allColumns)
    return dataFrame.fillna(0).astype(int)

def classDataFrameCompletion(instructionList,sortedDataset,classList,classInteger):
    """Build the labelled instruction-count frame for one malware class.

    Parameters:
        instructionList: instruction names to use as feature columns.
        sortedDataset: root directory holding one sub-folder per class.
        classList: folder names; ``classList[classInteger-1]`` is used.
        classInteger: 1-based class number, also written to the ``class``
            column.

    Returns:
        The frame from ``populateMalwareDataframe`` for the class folder,
        with missing values set to 0, rows without any counted instruction
        removed, and a leading ``class`` column.

    Changes from the previous implementation:
      * The frame first built through ``generateClassDataFrame`` /
        ``cleanFileNameList`` was overwritten immediately; that dead work is
        gone.
      * The "drop all-zero rows" filter was evaluated and thrown away. Its
        result is now kept, so files in which no instruction was found no
        longer enter the dataset as all-zero samples.
    """
    classGlobPattern = sortedDataset+classList[classInteger-1]+"/*.asm"
    print(classGlobPattern)  # progress: starting this class

    dataFrameInFunction = zeroOutDataframe(
        populateMalwareDataframe(classGlobPattern, instructionList))

    # Keep only the files with at least one counted instruction
    hasAnyInstruction = ~(dataFrameInFunction==0).all(axis=1)
    dataFrameInFunction = dataFrameInFunction.loc[hasAnyInstruction].copy()

    dataFrameInFunction.insert(0,"class",classInteger)

    print(classGlobPattern)  # progress: finished this class

    return dataFrameInFunction

def removeNanValuesFromDataframe(dataframeToSanitise):
    """Return a copy of the frame in which every NaN has been replaced by 0."""
    return dataframeToSanitise.replace(to_replace=np.nan, value=0)

def normaliseData(dataframeToNormalise):
    """Prepare a feature frame for the model.

    No scaling is applied at present: the frame is only passed through
    ``removeNanValuesFromDataframe``. Two scaling experiments are kept below
    for reference.
    """
    # Disabled alternatives:
    #   (data - trainStats["mean"]) / trainStats['std']
    #   data.div(data.sum(axis=1), axis=0)
    return removeNanValuesFromDataframe(dataframeToNormalise)
    
def modelSVMClassifierCreate(cValue, kernelType):
    """Create an unfitted ``svm.SVC`` with the given ``C`` value and kernel."""
    classifier = svm.SVC(kernel=kernelType, C=cValue)
    return classifier
    
def svmModelFit(modelToFit,trainingDataframe, trainingDatasetLabels):
    """Fit ``modelToFit`` on the training data and return what ``fit`` returns."""
    fittedModel = modelToFit.fit(trainingDataframe, trainingDatasetLabels)
    return fittedModel

def svmModelPredict(modelForPrediction, dataframeToPredictWith):
    """Return the model's predictions for the rows of ``dataframeToPredictWith``."""
    predictions = modelForPrediction.predict(dataframeToPredictWith)
    return predictions

def trainAndPredictModel(cValue, kernelType, trainingDataframe, trainingLabels):
    """Create an SVM, fit it, and predict on the same training data.

    Returns ``(predictions, fittedModel)``. The predictions are for
    ``trainingDataframe`` itself, so they measure training fit, not
    generalisation.
    """
    fittedModel = svmModelFit(
        modelSVMClassifierCreate(cValue, kernelType),
        trainingDataframe,
        trainingLabels,
    )
    return svmModelPredict(fittedModel, trainingDataframe), fittedModel



## Defining Directories and required structures

In [None]:
# Dataset locations and the per-class folder names (class-1 ... class-9)

baseDirectory = "/home/eddy/machine-learning/data/"
classList = [f"class-{classNumber}" for classNumber in range(1, 10)]

# Raw sanitised files live in the first folder, the per-class copies in the second
unsortedDataset, sortedDataset = (
    createFullDirectory(baseDirectory, subDirectory)
    for subDirectory in ("dataset-training-full-sanitised/", "dataset-training-subset-sorted/")
)

## Pulling the files from the dataset into the class folders

In [None]:
# Copying files from the sanitised-but-unsorted folder into the sanitised, per-class sorted folders.
# The copy is a one-off step and is left disabled; uncomment the call below to run it again.

#moveFilesToClassFolders(listFilesInDirectory(unsortedDataset+"*"),pd.read_csv("/home/eddy/machine-learning/data/trainLabels.csv"),unsortedDataset,sortedDataset)
print(len(listFilesInDirectory(unsortedDataset+"*")))

### Making sure there are at most 250 files in each class

In [None]:
# Deleting all but 250 files in each class.
# WARNING: destructive - surplus files are removed from disk.

maxFilesPerClass = 250

for fileClass in classList:
    directory = str(sortedDataset+fileClass+"/*")
    # Sorted so that the same files are kept on every run (glob order is arbitrary)
    fileList = sorted(listFilesInDirectory(directory))

    print(fileClass)
    print(len(fileList))

    for surplusFile in fileList[maxFilesPerClass:]:
        os.remove(surplusFile)

    # Report what is left in THIS class (the old code always reported class-1)
    print(len(listFilesInDirectory(sortedDataset+fileClass+"/*.asm")))


## Creating the Pandas DataFrame for the malware classes

In [None]:
# Building one instruction-count DataFrame per malware class

# Instruction names are read one per line and lower-cased
instructionList = [
    instruction.lower()
    for instruction in fileNewlineIntoList("/home/eddy/machine-learning/instructionListComplete.txt")
]

# Classes are processed in order 1..9; each name receives the frame of its class
(
    dataframeClassOne,
    dataframeClassTwo,
    dataframeClassThree,
    dataframeClassFour,
    dataframeClassFive,
    dataframeClassSix,
    dataframeClassSeven,
    dataframeClassEight,
    dataframeClassNine,
) = (
    classDataFrameCompletion(instructionList, sortedDataset, classList, classNumber)
    for classNumber in range(1, 10)
)

In [None]:
# Constructing the Final Dataframe

dataframesList = [dataframeClassOne,dataframeClassTwo,dataframeClassThree,dataframeClassFour,dataframeClassFive,dataframeClassSix,dataframeClassSeven,dataframeClassEight,dataframeClassNine]

# NOTE: drop_duplicates compares row values only (class + counts), not file names
finalDF = pd.concat(dataframesList).drop_duplicates()
finalDF = zeroOutDataframe(finalDF)

# Drop files with no counted instruction at all. The old statement discarded its
# result, and it also tested the "class" column (never 0), so it could not match.
featureColumns = finalDF.columns.drop("class")
finalDF = finalDF.loc[~(finalDF[featureColumns]==0).all(axis=1)]

finalDF = finalDF.loc[:, (finalDF != 0).any(axis=0)] # Removes columns with no values
finalDF = finalDF.sample(frac=1) # Shuffles the rows
finalDF.info()

## Splitting the data into train, validation and test sets

In [None]:
# Dividing up the dataset into train, validation and test sets.
# First split: 60% train / 40% held out. Second split halves the held-out part,
# giving 20% test and 20% validation of the full dataset.
# NOTE(review): no random_state is passed, so the split differs on every run.
trainDF, testAndValidDF = train_test_split(finalDF, test_size=0.4)
testDF, validDF = train_test_split(testAndValidDF, test_size=0.5)

print(f"Training Dataset rows and columns: {trainDF.shape}")
print(f"Test Dataset rows and columns: {testDF.shape}")
print(f"Validation Dataset rows and columns: {validDF.shape}")

In [None]:
# Summary statistics of the training frame. The "class" column is popped out of the
# statistics (trainDF itself is untouched); as the last expression of the cell, the
# popped class statistics are what the notebook displays.
# NOTE(review): the "Training Stats" cell below recomputes the same statistics.
trainStats = trainDF.describe()
trainStats.pop("class")
#not doing sns here

## Training Stats

In [None]:
# Per-feature summary statistics of the training set (one row per instruction), saved to CSV.
# The "class" label column is excluded from the statistics.
trainStats = trainDF.describe().drop(columns="class").transpose()
trainStats.to_csv("/home/eddy/traindata.csv")

In [None]:
# Separating the "class" labels from each split (pop removes the column from the
# feature frame) and reporting how many rows each split holds
trainLabels, validLabels, testLabels = (
    splitFrame.pop("class") for splitFrame in (trainDF, validDF, testDF)
)

print("Training data rows:    "+str(len(trainLabels)))
print("Validation data rows:  "+str(len(validLabels)))
print("Testing data rows:     "+str(len(testLabels)))

## Data Normalisation/Scaling

In [None]:
# Running every split through the same normalisation + NaN clean-up pipeline

normalisedTrainDF, normalisedValidDF, normalisedTestDF = (
    removeNanValuesFromDataframe(normaliseData(splitFrame))
    for splitFrame in (trainDF, validDF, testDF)
)

normalisedTrainDF.head(10)

## Training the model and creating a small prediction for testing

In [None]:
# Training one SVM per kernel (C = 1.5) on the training set.
# Each call returns (predictions on the TRAINING data, fitted model), so the
# prediction variables below describe training fit only.
(
    (y_pred, model),
    (svmPolyModelPrediction, modelPoly),
    (svmRBFModelPrediction, modelRBF),
    (svmSigmoidModelPrediction, modelSig),
) = (
    trainAndPredictModel(1.5, kernelName, normalisedTrainDF, trainLabels)
    for kernelName in ("linear", "poly", "rbf", "sigmoid")
)

# Linear-kernel training predictions next to the true training labels
print(y_pred)
print(trainLabels)

In [None]:
# Example prediction: the linear model on the first ten rows of the test set

exampleResult = model.predict(normalisedTestDF.head(10))

# One line per file: predicted class on the left, file name on the right
print(pd.Series(normalisedTestDF.head(10).index.tolist(), index=exampleResult).to_string())
print(f"Predicted values: {exampleResult}")

## Accuracy of training, validation and testing

In [None]:
# Training-set accuracy of each kernel (predictions were made on the training data itself)

print("Linear Train Accuracy: ",metrics.accuracy_score(y_true=trainLabels, y_pred=y_pred))
print("Poly Train Accuracy: ",metrics.accuracy_score(y_true=trainLabels, y_pred=svmPolyModelPrediction))
print("RBF Train Accuracy: ",metrics.accuracy_score(y_true=trainLabels, y_pred=svmRBFModelPrediction))
print("Sigmoid Train Accuracy: ",metrics.accuracy_score(y_true=trainLabels, y_pred=svmSigmoidModelPrediction))

In [None]:
# Validation-set accuracy of the linear model

svmValidationDatasetPrediction = model.predict(normalisedValidDF)
print("Linear Valid Accuracy: ",metrics.accuracy_score(y_true=validLabels, y_pred=svmValidationDatasetPrediction))

In [None]:
# Test-set accuracy of the linear model.
# The old chained assignment also stored these TEST predictions under the name
# svmValidationPrediction; that misleading alias has been removed.

svmTestDatasetPrediction = svmModelPredict(model, normalisedTestDF)
print("Test Accuracy: ",metrics.accuracy_score(testLabels,svmTestDatasetPrediction))

## Confusion Matrix for the model

In [None]:
# Confusion matrix of the linear model on the test set

classLabels = [1,2,3,4,5,6,7,8,9]
classTickLabels = ["1","2","3","4","5","6","7","8","9"]

predictResults = model.predict(normalisedTestDF)

# Rows are the TRUE classes, columns the PREDICTED ones. The old call passed the
# predictions for both arguments, which always yields a perfect diagonal.
# Fixing `labels` keeps the matrix 9x9 even if a class is missing from the test
# split, so the hard-coded tick labels always line up.
cm = confusion_matrix(testLabels, predictResults, labels=classLabels)

ax = plt.subplot()
# fmt="d" prints whole counts instead of the default compact float format
sns.heatmap(cm, annot=True, fmt="d", ax=ax, yticklabels=classTickLabels, xticklabels=classTickLabels)

# Axis text is set after drawing so the heatmap call cannot override it
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix - Linear"); #Semicolon removes the annoying text above the graph

In [None]:
# Classification report of the linear model on the test set.
# The report used to score testLabels against y_pred, but y_pred holds predictions
# for the TRAINING rows, so the two did not describe the same samples.
print(classification_report(testLabels, model.predict(normalisedTestDF)))

## Permutation importance stats for the model's weighting of features

In [None]:
# Permutation importance of the 30 most influential instructions, shown next to
# how often each of those instructions occurs on average in the dataset

from sklearn import preprocessing

permutationImportance = permutation_importance(model, normalisedTrainDF, trainLabels)
featuresList = np.array(list(normalisedTrainDF.columns))
sortedIDX = permutationImportance.importances_mean.argsort()  # ascending importance

# argsort is ascending, so the last 30 positions ARE the 30 most important features.
# The old code looked each top value up again with list.index(), which returns the
# first match: features with equal importance collapsed onto the same entry.
mostImportantIndexesPermutation = list(range(len(sortedIDX)))[-30:][::-1]  # positions in sorted order, best first
topFeatureIDX = sortedIDX[-30:]

### Showing the largest features (ascending, so the most important bar is drawn on top)
newFeaturesList = list(featuresList[topFeatureIDX])
newPermutationImportanceList = list(permutationImportance.importances_mean[topFeatureIDX])

# Mean count per file for each selected instruction, kept as a number
# (it was previously truncated to an int and stored as a string)
occurancesQuantity = {feature: finalDF[feature].mean() for feature in newFeaturesList[::-1]}

plt.subplot(1, 2, 1)
plt.barh(newFeaturesList, newPermutationImportanceList);
plt.xlabel("Permutation Importance/Feature");
# Real importance values on the axis: the old tick labels were double the plotted values
plt.margins(x=0)

plt.subplot(1, 2, 2)
# Occurrences are min-max scaled to 0..1, which is what the old relabelled ticks claimed to show
plt.barh(list(occurancesQuantity.keys())[::-1], preprocessing.minmax_scale(list(occurancesQuantity.values())[::-1],feature_range=(0,1)));
plt.xlabel("Mean Relative occurances/Feature");
plt.margins(x=0)
plt.tight_layout()


In [None]:
# F1 score and support per class for the linear model on the test set

# The report must compare the test labels with predictions for the TEST rows;
# y_pred (used before) holds predictions for the training rows.
testPredictions = model.predict(normalisedTestDF)

# First nine rows of the report are the per-class rows.
# NOTE(review): this assumes all nine classes occur in the test split.
classificationReportDF = pd.DataFrame(classification_report(testLabels,testPredictions,output_dict=True)).transpose()[:9]

# Columns 2 and 3 are "f1-score" and "support"; copy so the edit below works on
# its own frame rather than on a slice of the report
classificationReportF1Supp = classificationReportDF[classificationReportDF.columns[2:4]].copy()
# Support is divided by 100 so it fits on the same 0..1 axis as the F1 score
classificationReportF1Supp["support"] = classificationReportF1Supp["support"].astype(int).div(100)

classNames = classificationReportF1Supp.index.values.tolist()

fig = plt.figure()
ax = fig.add_subplot(111)

f1ScoreBar = ax.bar(
    x=classNames,
    height=classificationReportF1Supp["f1-score"],
    width=0.5,
    align='center',
    label="f1 score")

supportScoreBar = ax.bar(
    x=classNames,
    height=classificationReportF1Supp["support"],
    width=0.35,
    align='center',
    label="support / 100")

# The legend uses the drawn bars themselves, so its colours always match the plot
ax.legend(handles=[f1ScoreBar, supportScoreBar],bbox_to_anchor=(0.5, -0.055), loc="upper center",ncol=2)
ax.set_title("A graph demonstrating the relationship between F1 scores and support")

plt.tight_layout()
plt.show()