In [201]:
import requests
import shodan 
import datetime
import pandas as pd
import numpy as np
import re
import sys
import requests
import json
import time
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
VTApiKey = ""
BazaarApiKey = ""

# Studying Maicious Documents and Their Actions
___

By pulling from public sources such as MalwareBazaar and cross-referencing behavior data from VirusTotal, we can gather relevant information concerning how attackers utilizing malicious documents to drop and execute their payloads. Both MalwareBazaar and VirusTotal have free and public APIs that can be queried to gather the necessary information needed to create the tables and required metrics for building detections.

___
We can first begin by gathering hashes from MalwareBazaar, which have been tagged as ```docs```. In this case, we will collect 50 document reports and save them as a JSON object, and read them into a Pandas DataFrame 

___
_Note: The number of documents analyzed is arbitrary, but was limited to 50 due to query restrictions._

In [143]:
url = "https://mb-api.abuse.ch/api/v1/"
payload={'query': 'get_taginfo',
'tag': 'doc',
'limit': '50'}
files=[
]
headers = {
  'API-KEY': BazaarApiKey
}
response = requests.request("POST", url, headers=headers, data=payload, files=files)
if response.status_code == 200:
    print("MalwareBazaar data received")
else:
    print(str(response.status_code))
BazaarJsonobj = json.loads(response.text)

Document data received


___

Once receiving the document data, we will iterate through the list of returned hashes and request a behavior report from VirusTotal. Because of the query restrictions for free accounts on VirusTotal, a timer of 30 seconds is set. Fifty documents can be processed in VT in about 25 mins in this case. However, if an account has a higher threshold for queries, then that limiter can be removed.
___

In [5]:
payload={}
files=[
]
headers = {
  'x-apikey': VTApiKey
}
behavioursList = []
for i in BazaarJsonobj["data"]:
    response = requests.request("GET","https://www.virustotal.com/api/v3/files/{}/behaviours".format(i["sha256_hash"]),headers=headers, data=payload, files=files)
    behavioursList.append(json.loads(response.text))
    print(str(response.status_code) + " " + i["sha256_hash"])
    time.sleep(30)
    
if len(behavioursList) > 1:
    print("\nAttributes in list")

200 2e5eabee7001e4a5adec24a18e680548ce354128edfdb10946fe4ee8a15b92cf
200 289c19cf74628058d7012f643351f2bcc6bb0add93689ec7a6248ed4c55a3d01
200 018cfbeb6f155503fbe7a7e4dfd09f899fcf8d96ad8711b9295153b66945381d
200 a6258b46adfb734f248d3aa3d0f7b57c0d1a5bbfde1e8eb665d6ee21548ec089
200 6b2e23e38be7ad27c11af03599f5caaf69dff237e39a5ffb1904db398e613221
200 06b686985f4246819d7fed52a2b9fc1dbed7406d80f902d655866aed61392cbd
200 a1b7cd862762ff80cf95b544e80dfc6f887d9e0e9a8fffeec7c2574812b917d6
200 454b8a1236e5c819a5eb73785d04de86f951ee7dc99d00698a60f2b7abdad407
200 7b4dd1f8ad728126f05c9abf07ad34daf3ff31307f90a8b57cd2cae3c220d4de
200 f6a8163b680a0e66e368319d353a9b6f5bd83ecdabc2299e51b1881fe688f204
200 f72c86685e49999377ce344c9fa547abb9c7fa979f68bbc0e9a7bfe28aee309d
200 3fee9dd9ad7eeb2357adc1d38e030485a52de34f9b2fc823558d263d23b498e7
200 d48ae331b7f91bea05d05ee995f5d6e78fb8321c3490bfe926596f2e2d17d287
200 192bd09f84bb87300ee9c86aefb2f47e4c27ae9709a5a44329024b2dd2f273f0
200 f6447113407e7229303779e6688ab9

___

One issue that will commonly come up as data is gathered from VirusTotal is the heterogeneous manner in which it is presented. Some data will return the absolute path, while others will present data as environment variables. In this case, we need the absolute path for data to be processed correctly. So we will create several regular expressions to find and replace the non-uniform data. 

The data we are interested in is:
- what data was written to disk?
- What, if any, processes were created?

With this, we can start setting up the DataFrames used later to present the data.
___

In [144]:
#print(json.dumps(behavioursList,indent=4))
#Pass along and santize the list
DroppedFiles = []
CommandExecutions = []
CommandDict = {}
for i in behavioursList:
    for j in i["data"]:
        if "files_dropped" in j["attributes"]:
            for k in j["attributes"]["files_dropped"]:
                x = re.search("^\/", k["path"])
                if x:
                   pass
                else:
                    translated = ""
                    if re.search(r"%APPDATA%", k["path"]):
                        translated = re.sub(r"%APPDATA%",r"c:\\users\\infected\\AppData\\Roaming", k["path"])
                    elif re.search(r"%LOCALAPPDATA%",k["path"]):
                        translated = re.sub(r"%LOCALAPPDATA%",r"c:\\users\\infected\\AppData\\Local", k["path"])
                    elif re.search(r"%HOMEPATH%",k["path"]):
                        ranslated = re.sub(r"%HOMEPATH%",r"c:\\users\\infected", k["path"])
                    elif re.search(r"%TEMP%", k["path"]):
                        translated = re.sub(r"%TEMP%",r"c:\\windows\\temp", k["path"])
                    elif re.search(r"%ALLUSERSPROFILE%", k["path"]):
                        translated = re.sub(r"%ALLUSERSPROFILE%",r"c:\\programdata", k["path"])
                    elif re.search(r"<SYSTEM32>", k["path"]):
                        translated = re.sub(r"<SYSTEM32>",r"c:\\windows\\system32", k["path"])
                    elif re.search(r"(%windir%|%WINDIR%)", k["path"]):
                        translated = re.sub(r"(%windir%|%WINDIR%)",r"c:\\windows", k["path"])
                    elif re.search(r"<USER>", k["path"]):
                        translated = re.sub(r"<USER>",r"infected", k["path"])
                    elif re.search(r"<DRIVERS>", k["path"]):
                        translated = re.sub(r"<DRIVERS>",r"c:\\windows\\system32\\Drivers", k["path"])
                    elif re.search(r"^C:\\", k["path"]) != True:
                        pass
                    elif re.search(r"(<CURRENT_DIR>|<PATH_SAMPLE>)", k["path"]):
                        pass              
                    else:
                        DroppedFiles.append(k["path"])
                    if translated != "":
                        DroppedFiles.append(translated.lower())
                        
FilesWritten = pd.DataFrame({"Files Written":DroppedFiles})

___

We can also look at the command lines to determine what processes were launched by the malicious document. In this case, we take the same data set and iterate through all of the process executions, and put the data into a DataFrame for viewing.
___

_Note: Due to the size of the DataFrames, these are commented out using #, to view the data, just uncomment the code_

In [196]:
CommandLines = []
for i in behavioursList:
    for j in i["data"]:
        if "command_executions" in j["attributes"]:
            for k in j["attributes"]["command_executions"]:
                x = re.search("\(.*\.exe\)", k)
                if x:
                    split = re.split(r"(?<=\))\s",k)
                    CommandLines.append(split[1])
                else:
                    pass

CommandsDataFrame = pd.DataFrame({"Command Lines":CommandLines})

In [176]:
#FilesWritten.style.set_properties(**{'text-align': 'left'})

In [182]:
#CommandsDataFrame.style.set_properties(**{'text-align': 'left','column-align':'left'})

___

Using Regular Expressions, we can match on directories of interest to prioritize what to build detections on as well as look at what extensions that should be included to cut down on the number of false positives.
___

In [198]:
System32Count = 0
TempCount = 0
AppdataCount = 0
ProgramdataCount = 0

System32List = []
TempList = []
AppdataList = []
ProgramdataList = []

for i in DroppedFiles:
        if re.search(r"c:\\windows\\system32", i):
            System32Count+=1
            System32List.append(i)
        elif re.search(r"c:\\windows\\temp", i):
            TempCount+=1
            TempList.append(i)
        elif re.search(r"c:\\users\\infected\\appdata", i):
            AppdataCount+=1
            AppdataList.append(i)
        elif re.search(r"c:\\programdata", i):
            ProgramdataCount+=1
            ProgramdataList.append(i)
PathMetrics = pd.DataFrame(
    {
        "System32":[System32Count],
        "Windows\\Temp":[TempCount],
        "Appdata":[AppdataCount],
        "ProgramData":[ProgramdataCount]
                          
    }
)

System32DF = pd.DataFrame(
    {
        "System32":System32List    
    }
)

TempDF = pd.DataFrame(
    {
        "Windows\\Temp":TempList   
    }
)
AppDataDF = pd.DataFrame(
    {
        "Appdata":AppdataList   
    }
)
ProgramdataDF = pd.DataFrame(
    {
        "ProgramData":ProgramdataList  
    }
)



## Number of Files Written to Paths of Interest

In [184]:
PathMetrics

Unnamed: 0,System32,Windows\Temp,Appdata,ProgramData
0,9,29,54,23


## Files Written to System32

In [199]:
System32DF.style.set_properties(**{'text-align': 'left'})

Unnamed: 0,System32
0,c:\windows\system32\tasks\opera scheduled autoupdate 3131961357
1,c:\windows\system32\tasks\opera scheduled autoupdate 3131961373
2,c:\windows\system32\tasks\updates\cafiuewyvqpje
3,c:\windows\system32\tasks\updates\yonupifni
4,c:\windows\system32\tasks\updates\shxjar
5,c:\windows\system32\tasks\updates\ikhjhnpnttekys
6,c:\windows\system32\drivers\etc\hosts
7,c:\windows\system32\tasks\updates\bwwiry
8,c:\windows\system32\tasks\updates\fnjibdddilh


## Files Written to Windows\\Temp

In [200]:
TempDF.style.set_properties(**{'text-align': 'left'})

Unnamed: 0,Windows\Temp
0,c:\windows\temp\69577.exe
1,c:\windows\temp\c300.tmp
2,c:\windows\temp\c301.tmp
3,c:\windows\temp\c38f.tmp
4,c:\windows\temp\c390.tmp
5,c:\windows\temp\yfqrb.exe
6,c:\windows\temp\69577.exe
7,c:\windows\temp\c80f.tmp
8,c:\windows\temp\c820.tmp
9,c:\windows\temp\c830.tmp


## Files Written to any AppData Subdirectory

In [192]:
AppDataDF.style.set_properties(**{'text-align': 'left'})

Unnamed: 0,Appdata
0,c:\users\infected\appdata\roaming\microsoft\windows\start menu\programs\notepod\notepod.exe
1,c:\users\infected\appdata\local\microsoft\vault\4bf4c442-9b8a-41a0-b380-dd4a704ddb28\policy.vpol
2,c:\users\infected\appdata\roaming\microsoft\windows\gduvujte\gduvujte
3,c:\users\infected\appdata\roaming\microsoft\windows\gduvujte\tsjtwjst.exe
4,c:\users\infected\appdata\roaming\microsoft\windows\start menu\programs\startup\gduvujte.lnk
5,c:\users\infected\appdata\local\microsoft\vault\4bf4c442-9b8a-41a0-b380-dd4a704ddb28\policy.vpol
6,c:\users\infected\appdata\roaming\damiano2749.exe
7,c:\users\infected\appdata\local\microsoft\vault\4bf4c442-9b8a-41a0-b380-dd4a704ddb28\policy.vpol
8,c:\users\infected\appdata\roaming\microsoft\windows\start menu\programs\startup\wfsrujfs.lnk
9,c:\users\infected\appdata\roaming\microsoft\windows\wfsrujfs\sitvdwfw.exe


## Files Written to ProgramData

In [172]:
ProgramdataDF.style.set_properties(**{'text-align': 'left'})

Unnamed: 0,ProgramData
0,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\2f1a6504-0641-44cf-8bb5-3612d865f2e5.vsch
1,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\3ccd5499-87a8-4b10-a215-608888dd3b55.vsch
2,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\policy.vpol
3,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\2f1a6504-0641-44cf-8bb5-3612d865f2e5.vsch
4,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\3ccd5499-87a8-4b10-a215-608888dd3b55.vsch
5,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\policy.vpol
6,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\2f1a6504-0641-44cf-8bb5-3612d865f2e5.vsch
7,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\3ccd5499-87a8-4b10-a215-608888dd3b55.vsch
8,c:\programdata\microsoft\vault\ac658cb4-9126-49bd-b877-31eedab3f204\policy.vpol
9,c:\programdata:applicationdata
