In [1]:
%load_ext lab_black

```python
import sys

sys.path.append("../")
```

In [2]:
import json

from functional import seq
from operator import itemgetter
from itertools import product

In [3]:
feeds = (
    seq.open("../pulse_20210310.json.gz")
    .map(lambda feed: feed.decode())
    .map(lambda feed: json.loads(feed))
    .cache()
)

In [4]:
"""
feeds.take(2).map(lambda feed: itemgetter("id", "references")(feed)).starmap(
    lambda feed_id, reference_urls: product([feed_id], reference_urls)
).flatten()
"""

'\nfeeds.take(2).map(lambda feed: itemgetter("id", "references")(feed)).starmap(\n    lambda feed_id, reference_urls: product([feed_id], reference_urls)\n).flatten()\n'

---

In [5]:
import requests
from pathlib import Path, PurePosixPath
from urllib.parse import urlparse, unquote

In [6]:
downloading_list = (
    feeds.map(lambda feed: itemgetter("id", "references")(feed))
    .starmap(lambda feed_id, reference_urls: product([feed_id], reference_urls))
    .flatten()
    .to_list()
)

---

In [7]:
import multiprocessing.pool
import functools


def timeout(max_timeout):
    """Timeout decorator, parameter in seconds."""

    def timeout_decorator(item):
        """Wrap the original function."""

        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Closure for function."""
            pool = multiprocessing.pool.ThreadPool(processes=1)
            async_result = pool.apply_async(item, args, kwargs)
            # raises a TimeoutError if execution exceeds max_timeout
            return async_result.get(max_timeout)

        return func_wrapper

    return timeout_decorator

---

In [8]:
from pyparsing import Combine, Keyword, Word, alphanums, ZeroOrMore

In [9]:
# @timeout(5.0)  # if execution takes longer than 5 seconds, raise a TimeoutError
def download_report(feed_id, report_url):

    IGNORED_DOMAINS = ["github.com", "twitter.com"]

    def extract(url):
        """
        extract domain and resource name
        """
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        indentifier = parsed_url.path
        resource = PurePosixPath(indentifier).parts[-1]
        return domain, resource

    def is_html(string):
        """
        check string is or not html document
        """
        parser = Combine(
            Word("<") + Keyword("!DOCTYPE html") + ZeroOrMore(alphanums) + Word(">")
        )
        parsing_results = parser.scanString(string)

        if not parsing_results:  # check empty list
            return False
        else:
            return True

    # download
    domain, resource = extract(report_url)

    try:
        if domain not in IGNORED_DOMAINS:

            # download
            resp = requests.get(report_url, timeout=3)

            # write into file
            if is_html(resp.text):
                # setup file path and name
                base_dir = Path("./reports")
                file_path = Path(base_dir, Path(feed_id), Path(resource))
                file_path.parent.mkdir(parents=True, exist_ok=True)

                with file_path.open("w", encoding="utf-8") as file:
                    file.write(resp.text)

                return True

            else:
                return False

        else:
            return False
    except:
        return False

In [10]:
from tqdm import tqdm

# from tqdm.asyncio import tqdm

results = []

for reference in tqdm(downloading_list):
    try:
        results.append((downloading_list, download_report(*reference)))
    except:
        results.append((downloading_list, False))

In [11]:
# results = [download_report(*reference) for reference in tqdm(downloading_list)]

In [12]:
# reversed_downloading_list = seq(downloading_list).reverse().to_list()

In [13]:
results = []

for reference in tqdm(downloading_list):
    try:
        results.append((downloading_list, download_report(*reference)))
    except:
        results.append((downloading_list, False))

100%|██████████| 3998/3998 [1:45:43<00:00,  1.59s/it]    


In [18]:
results[1]

([('603eb1abdd4812819c64e197',
   'https://www.microsoft.com/security/blog/2021/03/02/hafnium-targeting-exchange-servers/'),
  ('603eb1abdd4812819c64e197',
   'https://www.volexity.com/blog/2021/03/02/active-exploitation-of-microsoft-exchange-zero-day-vulnerabilities/'),
  ('603eb1abdd4812819c64e197',
   'https://us-cert.cisa.gov/ncas/alerts/aa21-062a'),
  ('603eb1abdd4812819c64e197',
   'https://unit42.paloaltonetworks.com/microsoft-exchange-server-vulnerabilities/'),
  ('603eb1abdd4812819c64e197',
   'https://www.fireeye.com/blog/threat-research/2021/03/detection-response-to-exploitation-of-microsoft-exchange-zero-day-vulnerabilities.html'),
  ('603eb1abdd4812819c64e197',
   'https://github.com/cert-lv/exchange_webshell_detection'),
  ('603eb1abdd4812819c64e197',
   'https://github.com/nsacyber/Mitigating-Web-Shells'),
  ('603eb1abdd4812819c64e197',
   'https://blog.truesec.com/2021/03/07/exchange-zero-day-proxylogon-and-hafnium/'),
  ('603eb1abdd4812819c64e197',
   'https://twitter.