# Multi-Classification Machine Learning for Malware Analysis
## 9 Types of Malware in this dataset:
1. Ramnit         - RAT
2. Lollipop       - Adware
3. Kelihos_ver3   - RAT
4. Vundo          - Adware
5. Simda          - Botnet
6. Tracur         - Malicious Browser Plugin
7. Kelihos_ver1   - RAT
8. Obfuscator.ACY - Obfuscates other malware/information
9. Gatak          - RAT

## Game Plan:

- Create Functions
    - List files in a directory /
    - Regex Search within a file
    - Replace parts of string (perhaps predefined file path and extension) /
    - Pull list of instructions out of the file (newline separated)
    - Sort a dictionary /
    - Print a DF /
    

## Questions:




In [1]:
import sys
import os
import re
import csv
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import json
import torch
import torch.nn as nn
from torch.autograd import Variable
from collections import Counter, OrderedDict, defaultdict
import shutil

In [2]:
def listFilesInDirectory(directoryContainingFiles):
    """Return the list of paths matching a glob pattern.

    The argument is passed to ``glob.glob`` unchanged, so callers supply a
    pattern (for example a directory followed by ``*`` or ``*.asm``).
    """
    matchingPaths = glob.glob(directoryContainingFiles)
    return matchingPaths

def stripFilePathAndExtension(filePath, prefixToStrip, suffixToStrip):
    """Return ``filePath`` without its leading prefix and trailing suffix.

    Only a prefix at the very start and a suffix at the very end are removed.
    Anchored slicing is used instead of ``str.replace`` because ``replace``
    deletes every occurrence anywhere in the string, which corrupts names
    that contain the extension or directory text in the middle
    (e.g. ``sample.asm.bak.asm``).

    Args:
        filePath: Path (or any string) to trim.
        prefixToStrip: Text to drop from the start, typically the directory.
        suffixToStrip: Text to drop from the end, typically the extension.

    Returns:
        The trimmed string; a prefix or suffix that is not present is ignored.
    """
    if prefixToStrip and filePath.startswith(prefixToStrip):
        filePath = filePath[len(prefixToStrip):]
    # Skip an empty suffix explicitly: filePath[:-0] would evaluate to "".
    if suffixToStrip and filePath.endswith(suffixToStrip):
        filePath = filePath[:-len(suffixToStrip)]
    return filePath

def replaceFilePathAndExtension(filePath, prefixToStrip, prefixToInsert, suffixToStrip, suffixToInsert):
    """Swap the leading prefix and trailing suffix of ``filePath``.

    The replacement values are actually inserted; a version that only strips
    and ignores ``prefixToInsert`` / ``suffixToInsert`` does not do what the
    name promises.

    Args:
        filePath: Path (or any string) to rewrite.
        prefixToStrip: Text expected at the start of ``filePath``.
        prefixToInsert: Text that takes the place of ``prefixToStrip``.
        suffixToStrip: Text expected at the end of ``filePath``.
        suffixToInsert: Text that takes the place of ``suffixToStrip``.

    Returns:
        The rewritten string. A prefix/suffix that is not present at the
        start/end is left alone (nothing is inserted for it). An empty
        ``prefixToStrip`` / ``suffixToStrip`` always matches, so the
        corresponding insert value is simply prepended / appended.
    """
    if filePath.startswith(prefixToStrip):
        filePath = prefixToInsert + filePath[len(prefixToStrip):]
    if filePath.endswith(suffixToStrip):
        # Compute the cut position explicitly so an empty suffix keeps the whole string.
        cutPosition = len(filePath) - len(suffixToStrip)
        filePath = filePath[:cutPosition] + suffixToInsert
    return filePath

def printDataFrame(dataframe):
    """Print ``dataframe`` with the row and column display limits lifted.

    The pandas display options are changed only for the duration of the
    print call and are restored afterwards by the context manager.
    """
    unlimitedDisplay = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*unlimitedDisplay):
        print(dataframe)

def zeroOutDataframe(dataframe):
    """Return a copy of ``dataframe`` in which every missing value is 0.

    The input frame itself is not modified.
    """
    return dataframe.fillna(value=0)

def countEntriesInDataframe(dataframe):
    """Return how many cells of ``dataframe`` hold a non-zero value."""
    nonZeroCells = np.count_nonzero(dataframe)
    return nonZeroCells

def sortDictionary(dictionary):
    """Return ``(key, count)`` pairs ordered by ascending count, then key.

    The input is first run through ``Counter``: a mapping keeps its values,
    while a plain iterable is tallied. The result is a list of tuples, not
    a dict.
    """
    tallies = Counter(dictionary)
    orderedPairs = list(tallies.items())
    # Ties on the count are broken by the key itself.
    orderedPairs.sort(key=lambda pair: (pair[1], pair[0]))
    return orderedPairs

def fileNewlineIntoList(filePath):
    """Read ``filePath`` and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.
    """
    with open(filePath) as openFile:
        return [rawLine.strip() for rawLine in openFile]

def stripNewlineAndWhitespace(textStringToStrip):
    """Return the text with every tab, newline and space character removed.

    Characters are removed wherever they occur, not only at the ends.
    """
    for unwanted in ("\t", "\n", " "):
        textStringToStrip = textStringToStrip.replace(unwanted, "")
    return textStringToStrip

def stripNewlineAndWhitespaceFromList(listToStrip):
    """Remove tabs, newlines and spaces from every string in ``listToStrip``.

    The list is modified in place and the same list object is returned.
    """
    for position, entry in enumerate(listToStrip):
        for unwanted in ("\t", "\n", " "):
            entry = entry.replace(unwanted, "")
        listToStrip[position] = entry
    return listToStrip

def regexSearchFile(filePath, regexPattern, encoding=None, errors=None):
    """Return every match of ``regexPattern`` found in the file's text.

    Args:
        filePath: Path of the text file to scan; it is read in full.
        regexPattern: Pattern handed to ``re.findall``.
        encoding: Optional text encoding forwarded to ``open``. ``None``
            keeps the platform default.
        errors: Optional decode-error policy forwarded to ``open`` (for
            example ``"ignore"``). ``None`` keeps the default strict handling.

    Returns:
        The list produced by ``re.findall`` (strings, or tuples when the
        pattern has several capturing groups).
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filePath, encoding=encoding, errors=errors) as openFile:
        return re.findall(regexPattern, openFile.read())


## Pulling the files from the dataset into the class folders

In [3]:
# Label table indexed by sample Id, used to look up the class each file belongs to.
subsetSourceDirectory = "/home/eddy/machine-learning/data/dataset-subsetFullInitialSubset/"
fullFileNamesListFromCSV = pd.read_csv("/home/eddy/machine-learning/data/trainLabels.csv").set_index("Id")
backupFileList = listFilesInDirectory(subsetSourceDirectory + "*")

# `file` is the full path on disk; `fileClean` is the bare name without directory or ".asm".
for file in backupFileList:
    fileClean = stripFilePathAndExtension(file, subsetSourceDirectory, ".asm")
    # The copy into the per-class folders is currently disabled; re-enable the next line to run it.
    #shutil.copyfile(file,"/home/eddy/machine-learning/data/dataset-subset/class-"+str(fullFileNamesListFromCSV.loc[fileClean,"Class"])+"/"+str(fullFileNamesListFromCSV.loc[fileClean].name)+".asm")
    #print("from: "+file+" ------------- to: "+"/home/eddy/machine-learning/data/dataset-subset/class-"+str(fullFileNamesListFromCSV.loc[fileClean,"Class"])+"/"+str(fullFileNamesListFromCSV.loc[fileClean].name)+".asm")
        

    


## Creating the Pandas DataFrame for the malware classes

In [4]:
classOneDirectory = "/home/eddy/machine-learning/data/dataset-subset/class-1/"

# Instruction vocabulary: one entry per line of the text file, normalised to lowercase.
instructionList = [
    instruction.lower()
    for instruction in fileNewlineIntoList("/home/eddy/machine-learning/instructionListComplete.txt")
]

classOneFileList = listFilesInDirectory(classOneDirectory + "*.asm")
# NOTE: fileNameList is the same list object as classOneFileList, so the in-place
# rewrite in the loop below leaves both names holding the stripped file names.
fileNameList = classOneFileList

# Map each stripped file name back to its full path while replacing the list
# entries with the stripped names (these become the DataFrame index).
filePathToNameDict = {}
for i, fullPath in enumerate(fileNameList):
    strippedFile = stripFilePathAndExtension(fullPath, classOneDirectory, ".asm")
    filePathToNameDict[strippedFile] = fullPath
    fileNameList[i] = strippedFile

# One row per file, one column per instruction, every cell starting at 0.
dataframeClassOne = zeroOutDataframe(pd.DataFrame(columns=instructionList, index=fileNameList))

In [5]:
# Fill dataframeClassOne with the per-file instruction counts.
# Pattern: 3-7 tabs, seven spaces, then a 2-6 letter word that does not start
# with "db"/"dd", followed by one or more spaces.
instructionPattern = "(?:\t{3,7}       (?!db|dd)[a-zA-Z]{2,6} {1,})"

for file, fileDirectory in filePathToNameDict.items():  # stripped file name -> full path on disk
    # Pull the raw matches, then drop tabs/newlines/spaces so only the word itself remains.
    instructionsForThisFile = stripNewlineAndWhitespaceFromList(regexSearchFile(fileDirectory, instructionPattern))

    # Tally once per file (value_counts() used to be evaluated twice for the same data).
    instructionCounts = pd.Series(instructionsForThisFile).value_counts()
    pandasSeriesTest = instructionCounts.index, instructionCounts.values  # (instructions, counts)
    for instruction, count in zip(pandasSeriesTest[0], pandasSeriesTest[1]):
        dataframeClassOne.loc[file, instruction] = count

    # Per-file clean-up, deliberately left inside the loop and in this order:
    # first merge columns that share a name, then drop columns that are zero for every row.
    # NOTE(review): running this on every iteration means columns dropped here can be
    # re-created by later files; moving it after the loop may change the result when
    # instructionList contains duplicate names - verify before hoisting.
    # NOTE(review): groupby(axis=1) appears to be deprecated in recent pandas releases -
    # confirm against the installed version.
    dataframeClassOne = dataframeClassOne.groupby(axis=1, level=0).sum()
    dataframeClassOne = dataframeClassOne.loc[:, (dataframeClassOne != 0).any(axis=0)]


  dataframeClassOne.loc[file,pandasSeriesTest[0][i]] = pandasSeriesTest[1][i]  #0 = instruction and 1 = count columns ||| Second value is index within that column
  dataframeClassOne.loc[file,pandasSeriesTest[0][i]] = pandasSeriesTest[1][i]  #0 = instruction and 1 = count columns ||| Second value is index within that column
  dataframeClassOne.loc[file,pandasSeriesTest[0][i]] = pandasSeriesTest[1][i]  #0 = instruction and 1 = count columns ||| Second value is index within that column
  dataframeClassOne.loc[file,pandasSeriesTest[0][i]] = pandasSeriesTest[1][i]  #0 = instruction and 1 = count columns ||| Second value is index within that column
  dataframeClassOne.loc[file,pandasSeriesTest[0][i]] = pandasSeriesTest[1][i]  #0 = instruction and 1 = count columns ||| Second value is index within that column
  dataframeClassOne.loc[file,pandasSeriesTest[0][i]] = pandasSeriesTest[1][i]  #0 = instruction and 1 = count columns ||| Second value is index within that column
  dataframeClassOne.lo

  pandasSeriesTest = pd.Series(instructionsForThisFile).value_counts().index, pd.Series(instructionsForThisFile).value_counts().values # Counting each instruction up
  pandasSeriesTest = pd.Series(instructionsForThisFile).value_counts().index, pd.Series(instructionsForThisFile).value_counts().values # Counting each instruction up


In [6]:
# Persist the class-1 instruction-count table, then echo it to the cell output.
classOneCsvPath = "/home/eddy/machine-learning/data/datasetClassOne.csv"
dataframeClassOne.to_csv(classOneCsvPath)
print(dataframeClassOne)