# Web Scraping Cybersecurity Data

In [1]:
import requests 
from bs4 import BeautifulSoup 
import json
import os

In [2]:
%pwd

'c:\\Users\\amman\\Documents\\Generative AI\\End-to-End-AI-Cyber-Security-Assistant\\notebooks'

In [3]:
os.chdir("../")
%pwd

'c:\\Users\\amman\\Documents\\Generative AI\\End-to-End-AI-Cyber-Security-Assistant'

In [4]:
URL_LIST = ["https://attack.mitre.org/tactics/enterprise/", "https://attack.mitre.org/tactics/mobile/", 
            "https://attack.mitre.org/tactics/ics/"]


### Scrape Tactics Data

In [5]:
def get_tactic_urls(base_url_list):
    base_url = "https://attack.mitre.org/tactics/"
    tactic_urls = []
    for url in base_url_list:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Get tactic IDS
        for row in soup.find_all("tr")[1:]:
            cols = row.find_all("td")
            tactic_id = cols[0].text.strip()
            tactic_url = base_url + tactic_id + "/"
            tactic_urls.append(tactic_url)
            

    return tactic_urls

In [6]:
tactic_urls = get_tactic_urls(URL_LIST)
tactic_urls

['https://attack.mitre.org/tactics/TA0043/',
 'https://attack.mitre.org/tactics/TA0042/',
 'https://attack.mitre.org/tactics/TA0001/',
 'https://attack.mitre.org/tactics/TA0002/',
 'https://attack.mitre.org/tactics/TA0003/',
 'https://attack.mitre.org/tactics/TA0004/',
 'https://attack.mitre.org/tactics/TA0005/',
 'https://attack.mitre.org/tactics/TA0006/',
 'https://attack.mitre.org/tactics/TA0007/',
 'https://attack.mitre.org/tactics/TA0008/',
 'https://attack.mitre.org/tactics/TA0009/',
 'https://attack.mitre.org/tactics/TA0011/',
 'https://attack.mitre.org/tactics/TA0010/',
 'https://attack.mitre.org/tactics/TA0040/',
 'https://attack.mitre.org/tactics/TA0027/',
 'https://attack.mitre.org/tactics/TA0041/',
 'https://attack.mitre.org/tactics/TA0028/',
 'https://attack.mitre.org/tactics/TA0029/',
 'https://attack.mitre.org/tactics/TA0030/',
 'https://attack.mitre.org/tactics/TA0031/',
 'https://attack.mitre.org/tactics/TA0032/',
 'https://attack.mitre.org/tactics/TA0033/',
 'https://

In [7]:
def get_tactic_data(tactic_urls):

    data = []
    

    for url in tactic_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        tactic_name = soup.find("h1")
        tactic_desc = soup.find_all("p")
        tactic_text = ""
        for desc in tactic_desc:
            tactic_text = tactic_text + desc.text.strip()
        
        data.append(tactic_name.text.strip() + ": " + tactic_text)

    return data
        

data = get_tactic_data(tactic_urls)
data



['Reconnaissance: The adversary is trying to gather information they can use to plan future operations.Reconnaissance consists of techniques that involve adversaries actively or passively gathering information that can be used to support targeting. Such information may include details of the victim organization, infrastructure, or staff/personnel. This information can be leveraged by the adversary to aid in other phases of the adversary lifecycle, such as using gathered information to plan and execute Initial Access, to scope and prioritize post-compromise objectives, or to drive and lead further Reconnaissance efforts.',
 'Resource Development: The adversary is trying to establish resources they can use to support operations.Resource Development consists of techniques that involve adversaries creating, purchasing, or compromising/stealing resources that can be used to support targeting. Such resources include infrastructure, accounts, or capabilities. These resources can be leveraged 

In [None]:
def save_data_to_txt(filepath, data: list):
    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, "w") as f:
            for i in data:
                f.write(i + "\n")
        print("Data saved in text file successfully!")

    else:
        print("Text file already exists!")

In [None]:
save_data_to_txt("data/raw/mitre_tactics.csv", data)

Data saved in CSV successfully!


### Scrape Techniques Data

In [11]:
URL_LIST = ["https://attack.mitre.org/techniques/enterprise/", "https://attack.mitre.org/techniques/mobile/",
            "https://attack.mitre.org/techniques/ics/"]


def get_techniques_data(URL_list):
    data = []

    for url in URL_list:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        
        for row in soup.select("tr", class_="technique"):
            cols = row.find_all("td")
            
            for col in cols[1:]:
                data.append(col.text.strip())
    
    return data
                
data = get_techniques_data(URL_LIST)


In [12]:
print(len(set(data)))

1592


In [None]:
save_data_to_txt("data/raw/mitre_techniques.csv", data)

Data saved in CSV successfully!


### Scrape Mitigations Data

In [14]:
URL_LIST = ["https://attack.mitre.org/mitigations/enterprise/", "https://attack.mitre.org/mitigations/mobile/",
            "https://attack.mitre.org/mitigations/ics/"]

In [16]:
def get_mitigations_data(URL_list):
    data = []
    for url in URL_list:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        for row in soup.select("tr")[1:]:
            cols = row.find_all("td")
            mitigation_name = cols[1].text.strip()
            mitigation_desc = cols[2].text.strip()
            data.append(mitigation_name + ": " + mitigation_desc)
    
    return data

data = get_mitigations_data(URL_LIST)

In [None]:
save_data_to_txt("data/raw/mitre_mitgations.csv", data)

Data saved in CSV successfully!
