## Data scraping from the attack.mitre.org website

### needed imports

In [None]:
import requests
import json
from bs4 import BeautifulSoup, SoupStrainer
import csv
import re

### Step 1: saving the links to the software pages in a dictionary 

In [1]:

# Scrape the software overview table and collect name, ID and page URL of
# every listed software entry in `software_data` (a list of dicts).

# Base URL of the site. The hrefs found in the table are site-relative and
# already start with "/", so the base must not end with a slash; otherwise
# the resulting links contain a double slash ("...org//software/...").
base_url = 'https://attack.mitre.org'
url = base_url + '/software/'

# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() turns HTTP errors into an exception instead of letting
# an error page be parsed as if it were the software list.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# search for the table by its class names and keep it in software_table
software_table = soup.find('table', class_='table table-bordered table-alternate mt-2')
if software_table is None:
    raise RuntimeError("software table not found on " + url + " (page layout may have changed)")

# going one step deeper, because all the data is in the body of the table
software_data = []

for table_body in software_table.find_all('tbody'):
    for row in table_body.find_all('tr'):
        # the name is read from the second cell of the row; the first link
        # in the row points to the software's detail page
        software_name = row.find_all('td')[1].text.strip()
        software_link = row.find('a', href=True)
        href = software_link['href']

        # href has the form "/software/<ID>...", so index 2 after splitting
        # on "/" is the software ID
        software_id = href.split('/')[2]
        software_url = base_url + href

        # save all data in a dict
        software_data.append({
            "software_name": software_name,
            "software_link": software_url,
            "software_id": software_id,
        })


In [5]:
# Serialize the collected software entries as JSON and save them in a
# .txt file. The `with` block closes the file even if writing fails.
jsonStr = json.dumps(software_data)

with open('softwarelist.txt', 'w') as myList:
    myList.write(jsonStr)

### Step 2: reading the software list and collecting the JSON links from the website

In [4]:
# Visit the page of every software entry in `software_data` and collect the
# link to its layer JSON file (e.g. ".../S0066/S0066-enterprise-layer.json").
json_list = []

# Iterate over the entries themselves instead of a hard-coded count, so the
# loop keeps working when the number of software entries on the site changes.
for entry in software_data:
    r = requests.get(entry['software_link'], timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # scraping the link for the json-file
    # NOTE(review): the second-to-last "dropdown-item" anchor is taken to be
    # the JSON download link -- this depends on the page layout; confirm if
    # the site changes.
    json_link = "https://attack.mitre.org" + soup.find_all('a', class_="dropdown-item")[-2]['href']

    json_list.append({"json_link": json_link})

print(json_list)

[{'json_link': 'https://attack.mitre.org/software/S0066/S0066-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0065/S0065-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0469/S0469-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0202/S0202-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0552/S0552-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0309/S0309-mobile-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0045/S0045-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0440/S0440-mobile-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0331/S0331-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0092/S0092-enterprise-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0319/S0319-mobile-layer.json'}, {'json_link': 'https://attack.mitre.org/software/S0504/S0504-ente

In [33]:
# Save the collected JSON links in a .txt file. The `with` block closes the
# file even if writing fails.
jsonStr = json.dumps(json_list)

with open('json_links.txt', 'w') as myList:
    myList.write(jsonStr)

In [35]:
# Download the JSON file of every software entry and extract all used
# techniques and subtechniques. Software name and ID are taken from the
# matching entry of `software_data` (same position as in `json_list`).

complete_data = []

# zip pairs each software entry with its JSON link by position and stops at
# the shorter list, so no hard-coded entry count is needed.
for software, link in zip(software_data, json_list):
    r = requests.get(link['json_link'], timeout=30)
    r.raise_for_status()

    # Parse the response body directly as JSON. Passing it through an HTML
    # parser first can re-escape characters such as "&" or "<" and thereby
    # alter or break the JSON document.
    json_data = json.loads(r.text)

    technique_data = [
        {
            "technique_id": technique['techniqueID'],
            "show_subtechnique": technique['showSubtechniques'],
        }
        for technique in json_data['techniques']
    ]

    complete_data.append({
        "software_name": software['software_name'],
        "software_id": software['software_id'],
        "technique_data": technique_data,
    })

print(complete_data)

[{'software_name': '3PARA RAT', 'software_id': 'S0066', 'technique_data': [{'technique_id': 'T1071', 'show_subtechnique': True}, {'technique_id': 'T1071.001', 'show_subtechnique': True}, {'technique_id': 'T1573', 'show_subtechnique': True}, {'technique_id': 'T1573.001', 'show_subtechnique': True}, {'technique_id': 'T1083', 'show_subtechnique': False}, {'technique_id': 'T1070', 'show_subtechnique': True}, {'technique_id': 'T1070.006', 'show_subtechnique': True}]}, {'software_name': '4H RAT', 'software_id': 'S0065', 'technique_data': [{'technique_id': 'T1071', 'show_subtechnique': True}, {'technique_id': 'T1071.001', 'show_subtechnique': True}, {'technique_id': 'T1059', 'show_subtechnique': True}, {'technique_id': 'T1059.003', 'show_subtechnique': True}, {'technique_id': 'T1573', 'show_subtechnique': True}, {'technique_id': 'T1573.001', 'show_subtechnique': True}, {'technique_id': 'T1083', 'show_subtechnique': False}, {'technique_id': 'T1057', 'show_subtechnique': False}, {'technique_id'

In [37]:
#saves the information collected above in a txt-file
# Serialize the software/technique data collected above as JSON and save it
# in a .txt file. The `with` block closes the file even if writing fails.
jsonStr = json.dumps(complete_data)

with open('software-technique_data.txt', 'w') as myList:
    myList.write(jsonStr)

## Comment

At this point, MITRE published an update with ready-to-go Excel files. Only the integration of the timestamp and the formatting into a binary matrix are left to be done.