In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection, linear_model, metrics
import pickle

### Predict class labels for test samples

In [5]:
# Load the tab-separated test set; each row carries a comma-separated
# list of library names in the `libs` column.
df = pd.read_csv('test.tsv', sep='\t')

# Keep the raw library strings handy for vectorization further down.
libs = df['libs']

# Preview the first rows.
df.head()

Unnamed: 0,libs
0,"user32.dll,kernel32.dll"
1,"cygwin1.dll,cygiconv-2.dll,cygintl-8.dll,cygz...."
2,"winmm.dll,tapi32.dll,ws2_32.dll,setupapi.dll,v..."
3,kernel32.dll
4,kernel32.dll


In [6]:
# Sanity check: (number of test samples, number of columns).
df.shape

(1200, 1)

In [7]:
# Convert text data to a matrix of token counts.
#
# The vocabulary is rebuilt from the training set so that the test feature
# columns line up with the training features.
# NOTE(review): this assumes train.tsv and the tokenizer below are exactly what
# was used when the classifier was trained — confirm, or persist and load the
# fitted vectorizer alongside the model instead of refitting it here.
df_train = pd.read_csv('train.tsv', sep='\t')
libs_train = df_train.libs

# Each sample is a comma-separated list of library names: one token per name.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))

# Only the learned vocabulary is needed, so fit() is used rather than
# fit_transform(), whose returned training matrix was being thrown away.
vectorizer.fit(libs_train)

# Encode the test samples with the training vocabulary; tokens that never
# appeared in the training data are ignored by transform().
V = vectorizer.transform(libs)
X_test = V.toarray()

In [8]:
# Load the classifier and make a prediction based on the test samples.
#
# NOTE(review): unpickling can execute arbitrary code — only load model files
# that come from a trusted source.
# NOTE(review): the file carries a .joblib extension but is read with pickle;
# confirm how it was written and use the matching loader.
# The context manager guarantees the file handle is closed after loading
# (the bare `pickle.load(open(...))` form left it open).
with open('Classifier.joblib', 'rb') as model_file:
    classifier = pickle.load(model_file)

prediction = classifier.predict(X_test)

In [9]:
# Save the prediction as a file: a header line followed by one label per line.
with open('prediction.txt', 'w') as file:
    # write() is the right call for a single string; writelines() expects an
    # iterable of lines and only worked here by iterating over the characters.
    file.write('prediction' + '\n')
    file.writelines(str(i) + '\n' for i in prediction)

### Prediction's explanation

In [31]:
# Load the decision threshold, stored as a single number in a text file.
with open('threshold.txt', 'r') as threshold_file:
    raw_threshold = threshold_file.read()

# float() tolerates surrounding whitespace such as a trailing newline.
thr = float(raw_threshold)

In [None]:
# Probability of the second class (column 1) for every test sample.
# NOTE(review): presumably column 1 is the "virus" class — confirm against
# classifier.classes_.
class_probabilities = classifier.predict_proba(X_test)
predict_proba = class_probabilities[:, 1]

# Positions of the samples whose probability exceeds the threshold.
# The result is a one-element tuple holding the index array.
above_threshold = predict_proba > thr
proba_indexes = np.nonzero(above_threshold)

In [35]:
# Build one explanation line per test sample: samples whose probability
# exceeded the threshold get a "virus" line with that probability, all other
# samples get an empty line.
#
# A set gives constant-time membership checks; testing `index in <ndarray>`
# rescans the whole array for every sample.
flagged = set(proba_indexes[0].tolist())
lines = [
    'virus ({:.2%} probability)'.format(proba) if index in flagged else ''
    for index, proba in enumerate(predict_proba)
]

# Preview the first ten explanation lines.
for line in lines[:10]:
    print(line)

virus (92.43% probability)


virus (90.04% probability)
virus (90.04% probability)


virus (100.00% probability)
virus (71.58% probability)
virus (100.00% probability)


In [36]:
# Write the explanation file: one line per test sample, in sample order
# (empty lines for samples that were not flagged).
with open('explain.txt', 'w') as explain_file:
    for text in lines:
        explain_file.write(text + '\n')