In [68]:
import pandas as pd
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
from glob import glob
import json
from pprint import pprint
import numpy as np
import os
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
import sklearn
import pickle

In [12]:
train_dataset_path = "c:\\hackathon\\dataset"
os.chdir(train_dataset_path)
def get_all_dataset_files(path):
    return [os.path.join(dp, f) for dp, dn, filenames in os.walk(path) for f in filenames if os.path.splitext(f)[1] in ['.txt','.json'] ]

In [13]:
train_files = get_all_dataset_files(train_dataset_path)
train_files

['c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\0028674d11b4995333b419ca593b6596ed482c28fba06b9349718a838a43618e.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\005754ced6f73a197a4a21c58da39d5e3ee84e484640765dbda2475f4ba2d3bd.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\009997e531dc3f4f5c02ed2de7cfa06de6cf66d6945899646d0a1aa60f3187c0.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\01090324565d0c453fb6a5cc95f5b0b49353184cc475e301391032296f590cac.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\010990b2459a598d1bab97167fe23d0777f282f32e4ff2f86cf08b0808f68ca9.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\0119247be3c7707b1421dc5de6457eae359ac7c493daaf0e68d3288e8a2296b8.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\Benign\\017101b3c538cf9368dcb91b9f8b252322183dcbca90ee01ed338e331f029166.json',
 'c:\\hackathon\\dataset\\Dynamic_Analysis_Data_Part1\\

In [14]:
for tr in train_files:
    if "String" in tr:
        print(tr)
        break

c:\hackathon\dataset\Static_Analysis_Data\Benign\00060e7df9844d69e0548fc6b6787c9f3c7f6048507516ff5c83c134884a04a1\String.txt


In [15]:
hashes = {}
for train_file in train_files:
    hash_val = Path(train_file).parts[-1].replace(".json","") if os.path.splitext(train_file)[1] == ".json" else Path(train_file).parts[-2]
    file_paths = hashes.get(hash_val, {})
    p = Path(train_file).parts
    if "Malware" in train_file:
        file_paths["category"] = "Malware"
        file_paths["sub_category"] = p[-3]
    elif "Benign" in train_file: 
        file_paths["category"] = "Benign"
        file_paths["sub_category"] = "Benign"

    if os.path.splitext(train_file)[1] == ".json":
        file_paths["dynamic_analysis"] = train_file
    elif p[-1] == "String.txt":
        file_paths["static_analysis_strings"] = train_file
    elif p[-1] == "Structure_Info.txt":
        file_paths["static_analysis_structure"] = train_file
    hashes[hash_val] = file_paths
    
rows = []
for hash_val in hashes:
    row = {"hash":hash_val}
    row.update(hashes[hash_val])
    rows.append(row)
    


In [17]:
df = pd.DataFrame(rows)
print(df.columns)
df["dynamic_analysis"] = df["dynamic_analysis"].fillna("")
df["static_analysis_strings"] = df["static_analysis_strings"].fillna("")
df["static_analysis_structure"] = df["static_analysis_structure"].fillna("")

Index(['hash', 'category', 'sub_category', 'dynamic_analysis', 'static_analysis_strings', 'static_analysis_structure'], dtype='object')


In [18]:
# Feature extraction
def check_string_presence(file_path, string=""):
    if os.path.isfile(file_path):
        if string in open(file_path).read():
            return 1
        else:
            return 0
    return 0
check_string_presence = np.vectorize(check_string_presence)

df["virus_total"] = check_string_presence(df["dynamic_analysis"],string="VirusTotal")
df["icmp_traffic_presence"] = check_string_presence(df["dynamic_analysis"],string="Generates some ICMP traffic")
df["high_port_presence"] = check_string_presence(df["dynamic_analysis"],string="Communication to multiple IPs on high port numbers")
df["shit_over_string_presence"] = check_string_presence(df["static_analysis_strings"],string="Shit,IsOver!!") # For worms

In [58]:
feature_cols = ["virus_total","icmp_traffic_presence","high_port_presence","shit_over_string_presence"]
category_mappings = {"Malware":1,"Benign":0}
df["category"] = df["category"].map(category_mappings)

In [66]:
clf = DecisionTreeClassifier(random_state=1234)
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(df[feature_cols],df["category"],random_state=1234)
clf.fit(x_train, y_train)

DecisionTreeClassifier(random_state=1234)

In [67]:
print(sklearn.metrics.accuracy_score(clf.predict(x_test),y_test))
print(sklearn.metrics.precision_score(clf.predict(x_test),y_test))
print(sklearn.metrics.recall_score(clf.predict(x_test),y_test))
print(sklearn.metrics.f1_score(clf.predict(x_test),y_test))
print(sklearn.metrics.confusion_matrix(clf.predict(x_test),y_test))

0.9968
0.993214588634436
1.0
0.9965957446808511
[[1321    8]
 [   0 1171]]


In [71]:
pickle.dump(clf, open("malware_detection.model","wb"))