In [None]:
#IMPORT LIBRARIES AND INTRODUCE USER-DEFINED FUNCTIONS

import numpy as np
import pandas as pd
import random
import ast
import csv
from sklearn import svm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import *
from sklearn.metrics import confusion_matrix, classification_report
from ast import literal_eval
import matplotlib.pyplot as plt


def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; both are passed to ``confusion_matrix``
        and ``unique_labels``.
    classes : array-like
        Display names for the ticks. When the labels found in the data are
        integers they are used as indices into ``classes``; otherwise
        (e.g. string labels such as 'gcc') the label values themselves are
        used as tick names, since they cannot index an array.
    normalize : bool, default False
        If True, every row of the matrix is divided by its row sum.
    title : str, optional
        Figure title; a default depending on ``normalize`` is used if empty.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap handed to ``imshow``.

    Returns
    -------
    ax : the matplotlib Axes the matrix was drawn on.
    """
    # Local import: the notebook's import cell does not bring this name into
    # scope, so the function would otherwise fail with a NameError.
    from sklearn.utils.multiclass import unique_labels

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Only use the labels that appear in the data
    labels = unique_labels(y_true, y_pred)
    if np.issubdtype(labels.dtype, np.integer):
        # Integer labels select their display names from `classes`
        # (asarray so that a plain list can be fancy-indexed too).
        classes = np.asarray(classes)[labels]
    else:
        # Non-integer labels cannot be used as indices; show them directly.
        classes = labels

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells above half of the maximum get white text so they stay readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
  

# Confirmation message, printed once the imports and the helper defined above have run.
print('Libraries imported.')

Libraries imported.


In [None]:
#MOUNT DRIVE
# Mounts Google Drive under /content/drive; the dataset cells read their
# files from "/content/drive/My Drive/...".
# NOTE(review): google.colab is presumably only available inside Colab, so
# this cell is expected to fail elsewhere - confirm before running locally.

from google.colab import drive
# force_remount=True is passed so the mount is requested again on every run.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
#IMPORT DATASET
# Loads the training set from the mounted Drive into the DataFrame `db`.

filepath = "/content/drive/My Drive/train_dataset.jsonl"
# lines=True: the .jsonl file is read as one JSON record per line.
db=pd.read_json(filepath, lines=True)
print('File imported')
# Uncomment to inspect the loaded frame:
#db.info

File imported


In [None]:
#IMPORT TESTSET
# Loads the blind test set from the mounted Drive into the DataFrame `bs`.
filepath2 = "/content/drive/My Drive/test_dataset_blind.jsonl"
# lines=True: the .jsonl file is read as one JSON record per line.
bs=pd.read_json(filepath2, lines=True)
print('File imported')
# Uncomment to inspect the loaded frame:
#bs.info

File imported


In [None]:
#GENERATE LIST OF BLIND DATA
# Collect the first column of the blind test set as a plain Python list.
# The whole column is taken instead of looping over a hard-coded 3000 rows,
# so the cell neither fails on a smaller file nor silently drops rows of a
# larger one.

blind_instructions = bs.iloc[:, 0].tolist()

In [None]:
#GENERATE LISTS OF SAMPLES AND LABELS
# Columns of `db` are addressed by position:
#   0 -> instruction listing, 1 -> optimization label, 2 -> compiler label.
# Whole columns are converted at once, so the cell no longer depends on the
# training file having exactly 30000 rows.

instructions = db.iloc[:, 0].tolist()
optimization = db.iloc[:, 1].tolist()
compilers = db.iloc[:, 2].tolist()

print(instructions[1])
print(compilers[1])
print('Instructions and compilers lists created')
print(len(instructions))

['xor edx edx', 'cmp rdi rsi', 'mov eax 0xffffffff', 'seta dl', 'cmovae eax edx', 'ret']
gcc
Instructions and compilers lists created
30000


In [None]:
#CONSIDER LIMITED PART OF DATA
# Rebuilds `instructions` and `compilers` from three 1000-row slices of `db`
# (rows 0-999, 11000-11999 and 22000-22999), replacing the full lists.

instructions = []
compilers = []
for block_start in (0, 11000, 22000):
    for row in range(block_start, block_start + 1000):
        instructions.append(db.iloc[row, 0])
        compilers.append(db.iloc[row, 2])

# First compiler label of each slice, then the subset size and one sample.
print(compilers[0])
print(compilers[1000])
print(compilers[2000])
print(len(instructions))
print(instructions[1])

gcc
icc
clang
3000
['xor edx edx', 'cmp rdi rsi', 'mov eax 0xffffffff', 'seta dl', 'cmovae eax edx', 'ret']


In [None]:
#GENERATE LIST OF SPLIT INSTRUCTIONS
# For every sample, flatten its instruction strings into a single list of
# whitespace-separated tokens, keeping the original order.
# (The previous nested loops were not indented under their `for` header,
# which made the cell a syntax error.)

instructions_split = [
    [token for line in listing for token in line.split()]
    for listing in instructions
]

print(instructions[0])
print(instructions_split[0])
print(instructions[0])

['push r12', 'push rbp', 'push rbx', 'test byte [rdi + 0x46] 1', 'je 0x80003cf', 'mov edx 0', 'mov esi 0xf2', 'mov edi 0', 'call 0x80003cf', 'mov rbp rdi', 'mov r12d esi', 'mov rbx rdi', 'jmp 0x80003dd', 'mov rbx rax', 'mov rax qword [rbx + 0xd0]', 'test rax rax', 'je 0x80003ef', 'test byte [rax + 0x48] 1', 'je 0x80003da', 'cmp qword [rbx + 0x10] 0', 'je 0x8000418', 'cmp rbx qword [0x080003fd]', 'je 0x800040f', 'mov rdi rbx', 'call sym.snip', 'mov rdi rbx', 'call entry0', 'mov rax qword [rbx + 0x10]', 'jmp 0x800049f', 'mov eax 0', 'test r12b 1', 'jne 0x800049f', 'mov rdi rbx', 'call 0x800042b', 'test rax rax', 'je 0x8000464', 'test r12b 2', 'jne 0x800045e', 'mov rsi qword [rbx + 0x30]', 'mov rdi qword [rbx + 0x10]', 'mov edx 0', 'call 0x8000448', 'test eax eax', 'je 0x800045e', 'test r12b 4', 'jne 0x800045e', 'mov edi 1', 'call 0x800045c', 'jmp 0x8000464', 'mov rax qword [rbx + 0x10]', 'jmp 0x800049f', 'call 0x8000469', 'mov edi eax', 'call 0x8000470', 'mov rbx rax', 'mov edx 5', 'mov 

In [None]:
#GENERATE LIST OF MNEMONICS
# For every sample keep only the first whitespace-separated word of each
# instruction string.

mnemonics = [
    [text.split()[0] for text in listing]
    for listing in instructions
]

print(mnemonics[0])
print(mnemonics[1])
print(mnemonics[2])

['push', 'push', 'push', 'test', 'je', 'mov', 'mov', 'mov', 'call', 'mov', 'mov', 'mov', 'jmp', 'mov', 'mov', 'test', 'je', 'test', 'je', 'cmp', 'je', 'cmp', 'je', 'mov', 'call', 'mov', 'call', 'mov', 'jmp', 'mov', 'test', 'jne', 'mov', 'call', 'test', 'je', 'test', 'jne', 'mov', 'mov', 'mov', 'call', 'test', 'je', 'test', 'jne', 'mov', 'call', 'jmp', 'mov', 'jmp', 'call', 'mov', 'call', 'mov', 'mov', 'mov', 'mov', 'call', 'mov', 'mov', 'mov', 'mov', 'call', 'mov', 'pop', 'pop', 'pop', 'ret']
['xor', 'cmp', 'mov', 'seta', 'cmovae', 'ret']
['mov', 'add', 'mov', 'call', 'mov', 'movzx', 'mov', 'call', 'test', 'je', 'movzx', 'mov', 'call', 'mov', 'call', 'mov', 'movzx', 'mov', 'call', 'mov', 'mov', 'call', 'mov', 'and', 'test', 'je', 'movzx', 'mov', 'call', 'mov', 'call', 'mov', 'movzx', 'mov', 'call', 'mov', 'call', 'mov', 'mov', 'mov', 'mov', 'call', 'add', 'jmp', 'mov', 'lea', 'movzx', 'lea', 'lea', 'mov', 'mov', 'mov', 'call', 'mov', 'mov', 'mov', 'call', 'mov', 'mov', 'call', 'mov', '

In [None]:
#GENERATE TRUNCATED LIST OF MNEMONICS
# Listings longer than 20 instructions are reduced to 20 mnemonics: the first
# ten in order, followed by the last ten in reverse order. Listings with 20
# or fewer instructions keep every mnemonic. Overwrites `mnemonics`.

mnemonics = []
for listing in instructions:
    if len(listing) > 20:
        # [:-11:-1] walks backwards from the end and yields the last ten items
        kept = listing[:10] + listing[:-11:-1]
    else:
        kept = listing
    mnemonics.append([text.split()[0] for text in kept])

print(mnemonics[0])

['push', 'push', 'push', 'test', 'je', 'mov', 'mov', 'mov', 'call', 'mov', 'ret', 'pop', 'pop', 'pop', 'mov', 'call', 'mov', 'mov', 'mov', 'mov']


In [None]:
#CONVERT LISTS INTO LISTS OF STRINGS AND ARRAYS
# Each sample (a list) is turned into its str() representation so it can be
# fed to a text vectorizer. Every list is converted on its own: the blind set
# used to be filled inside a loop bounded by len(instructions), which raised
# IndexError or dropped samples whenever the two datasets differed in size.

ins = [str(listing) for listing in instructions]
mnem = [str(listing) for listing in mnemonics]
blind_ins = [str(listing) for listing in blind_instructions]

cmp_array = np.asarray(compilers)

print('Instructions lists converted into strings')
print('Compilers list converted into array')
print(mnem[1])
print(mnem[0])
print(ins[1])
print(len(ins))

Instructions lists converted into strings
Compilers list converted into array
['xor', 'cmp', 'mov', 'seta', 'cmovae', 'ret']
['push', 'push', 'push', 'test', 'je', 'mov', 'mov', 'mov', 'call', 'mov', 'mov', 'mov', 'jmp', 'mov', 'mov', 'test', 'je', 'test', 'je', 'cmp', 'je', 'cmp', 'je', 'mov', 'call', 'mov', 'call', 'mov', 'jmp', 'mov', 'test', 'jne', 'mov', 'call', 'test', 'je', 'test', 'jne', 'mov', 'mov', 'mov', 'call', 'test', 'je', 'test', 'jne', 'mov', 'call', 'jmp', 'mov', 'jmp', 'call', 'mov', 'call', 'mov', 'mov', 'mov', 'mov', 'call', 'mov', 'mov', 'mov', 'mov', 'call', 'mov', 'pop', 'pop', 'pop', 'ret']
['xor edx edx', 'cmp rdi rsi', 'mov eax 0xffffffff', 'seta dl', 'cmovae eax edx', 'ret']
3000


In [None]:
#FEATURE EXTRACTION
# Vectorizes the stringified instruction lists (`ins`) into the feature
# matrix X_all and uses the compiler array as the target y_all.

# Unused stop-word list kept for reference:
#StopWords= ['ret','jump']

print(ins[1])
# Alternative vectorizers that were tried, kept for reference:
#vectorizer = HashingVectorizer() # multivariate
#vectorizer = CountVectorizer(ngram_range=(2,7), binary=True) # multinomial
# ngram_range=(2,7) requests n-grams of length 2 through 7.
# NOTE(review): with the vectorizer's default analyzer these are presumably
# word-level n-grams - confirm against the scikit-learn documentation.
vectorizer = TfidfVectorizer(ngram_range=(2,7), binary=True)
# fit_transform learns the vocabulary from all of `ins` and vectorizes it in
# one step; the train/test split is made afterwards, so the held-out samples
# also contribute to the fitted vocabulary.
X_all = vectorizer.fit_transform(ins)
y_all= cmp_array
# Sanity output: matrix/target shapes (the target shape is printed twice),
# then the features and label of sample 1.
print(X_all.shape)
print(y_all.shape)
print(y_all.shape)
print(X_all[1])
print(y_all[1])

['xor edx edx', 'cmp rdi rsi', 'mov eax 0xffffffff', 'seta dl', 'cmovae eax edx', 'ret']
(3000, 6286721)
(3000,)
(3000,)
  (0, 1104975)	0.1361969669267073
  (0, 2052869)	0.1361969669267073
  (0, 3655096)	0.1361969669267073
  (0, 5850693)	0.1361969669267073
  (0, 5701506)	0.1361969669267073
  (0, 1765141)	0.1361969669267073
  (0, 2569721)	0.1361969669267073
  (0, 2591909)	0.1361969669267073
  (0, 6273319)	0.1361969669267073
  (0, 5964750)	0.1361969669267073
  (0, 1104974)	0.1361969669267073
  (0, 2052868)	0.1361969669267073
  (0, 3655095)	0.1361969669267073
  (0, 5850692)	0.1361969669267073
  (0, 5701505)	0.1361969669267073
  (0, 1765140)	0.1361969669267073
  (0, 2569720)	0.1361969669267073
  (0, 2591908)	0.1361969669267073
  (0, 6273318)	0.1361969669267073
  (0, 1789087)	0.1361969669267073
  (0, 5964749)	0.1361969669267073
  (0, 1104973)	0.1361969669267073
  (0, 2052867)	0.1361969669267073
  (0, 3655094)	0.1361969669267073
  (0, 5850691)	0.1361969669267073
  :	:
  (0, 1789085)	0.136196

In [None]:
#TRAINING
# Splits the vectorized data into a training part and a held-out part
# (test_size=0.2 -> 20% held out). random_state is fixed to 15 so the same
# split is requested on every run.

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=15)

# shape[0] is the number of rows (samples) in each part.
print("Size of Train set: %d - Size of Test set: %d" %(X_train.shape[0],X_test.shape[0]))
#print('%d %s %s' %(id,str(y_train[id]),str(X_train[id])))
print(y_train)

Size of Train set: 2400 - Size of Test set: 600
['gcc' 'icc' 'icc' ... 'clang' 'clang' 'clang']


In [None]:
#BERNOULLI 
# Fits a Bernoulli naive Bayes classifier on the training split.
# NOTE: this cell, the MULTINOMIAL cell and the SVM cell all assign to
# `model`; the evaluation and prediction cells use whichever ran last.

model = BernoulliNB().fit(X_train, y_train)
print('Bernoulli Model created')

Bernoulli Model created


In [None]:
#MULTINOMIAL
# Fits a multinomial naive Bayes classifier on the training split.
# NOTE: this cell, the BERNOULLI cell and the SVM cell all assign to
# `model`; the evaluation and prediction cells use whichever ran last.

model = MultinomialNB().fit(X_train, y_train)
print('Multinomial Model created')

Multinomial Model created


In [None]:
#SVM
# Builds and fits a linear-kernel SVC (C=1) on the training split.
# NOTE: this cell and the two naive Bayes cells all assign to `model`; the
# evaluation and prediction cells use whichever ran last.

model = svm.SVC(kernel='linear', C=1)
# The message is printed before fit() is called, i.e. when the estimator
# object exists but has not been trained yet.
print('SVM Model created')
model.fit(X_train, y_train)

SVM Model created


In [None]:
#ACCURACY, PRECISION, RECALL AND CONFUSION MATRIX
# Evaluates the current `model` on the held-out split and prints the score,
# the confusion matrix and the per-class report.

# score() is evaluated on (X_test, y_test); predict() is then called
# separately on the same X_test to obtain the labels used below.
acc = model.score(X_test, y_test)
print("Accuracy: %.3f \n" %acc)
y_pred = model.predict(X_test)
print("Confusion Matrix: \n")
# Both calls take the true labels first and the predicted labels second.
print(confusion_matrix(y_test, y_pred))
print(" \n\t\t\t Calssification Report: \n")
print(classification_report(y_test, y_pred))

Accuracy: 0.922 

Confusion Matrix: 

[[178  10   3]
 [ 15 182   3]
 [ 14   2 193]]
 
			 Calssification Report: 

              precision    recall  f1-score   support

       clang       0.86      0.93      0.89       191
         gcc       0.94      0.91      0.92       200
         icc       0.97      0.92      0.95       209

    accuracy                           0.92       600
   macro avg       0.92      0.92      0.92       600
weighted avg       0.92      0.92      0.92       600



In [None]:
#PREDICTIONS

# Vectorize the blind samples with the already-fitted vectorizer and classify
# them in one batched call instead of one predict() per row. The labels are
# kept as a flat list of strings (previously a list of 1-element arrays).
X_all_blind = vectorizer.transform(blind_ins)
predictions = model.predict(X_all_blind).tolist()

# Number of samples assigned to each compiler.
gcc = predictions.count('gcc')
icc = predictions.count('icc')
clang = predictions.count('clang')

print(predictions)
print(len(predictions))
print(gcc)
print(icc)
print(clang)

#Write those predictions on a csv file

# One label per row. newline='' is the mode the csv module documents for files
# passed to csv.writer. The `with` block closes the file, so no explicit
# close() is needed afterwards.
with open('predictions2.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    # Each label is wrapped in a list: writerows() treats a bare string as a
    # sequence of characters and would write one field per character.
    writer.writerows([label] for label in predictions)

[array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['clang'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['clang'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['gcc'], dtype='<U5'), array(['g