In [1]:
%load_ext lab_black

In [2]:
# !jupyter nbextension enable --py widgetsnbextension

In [3]:
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [4]:
model_checkpoint = "distilbert-base-uncased"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
tokenizer._tokenizer.id_to_token(5000)

'knight'

---

In [7]:
import json
from functional import seq

In [8]:
# Parse every line of the gzipped dump as one JSON document and memoise the result.
feeds = (
    seq.open("../pulse_20210310.json.gz")
    .map(json.loads)
    .cache()
)

==> Installing minio from minio/stable
Warning: A newer Command Line Tools release is available.
Update them from Software Update in System Preferences or run:
  softwareupdate --all --install --force

If that doesn't show you any updates, run:
  sudo rm -rf /Library/Developer/CommandLineTools
  sudo xcode-select --install

Alternatively, manually download them from:
  https://developer.apple.com/download/all/.
You should download the Command Line Tools for Xcode 13.0.

==> Download complete!
==> Useful links:
Command-line Access: https://docs.min.io/docs/minio-client-quickstart-guide

Object API (Amazon S3 compatible):
   Go:         https://docs.min.io/docs/golang-client-quickstart-guide
   Java:       https://docs.min.io/docs/java-client-quickstart-guide
   Python:     https://docs.min.io/docs/python-client-quickstart-guide
   JavaScript: https://docs.min.io/docs/javascript-client-quickstart-guide
   .NET:       https://docs.min.io/docs/dotnet-client-quickstart-guide

Talk to the community: https://slack.min.io
==> Get started:
NAME:
  minio server - start object storage server

USAGE:
  minio server [FLAGS] DIR1 [DIR2..]
  minio server [FLAGS] DIR{1...64}
  minio server [FLAGS] DIR{1...64} DIR{65...128}

DIR:
  DIR points to a directory on a filesystem. When you want to combine
  multiple drives into a single large system, pass one directory per
  filesystem separated by space. You may also use a '...' convention
  to abbreviate the directory arguments. Remote directories in a
  distributed setup are encoded as HTTP(s) URIs.

FLAGS:
  --address value              bind to a specific ADDRESS:PORT, ADDRESS can be an IP or hostname (default: ":9000")
  --listeners value            bind N number of listeners per ADDRESS:PORT (default: 1)
  --console-address value      bind to a specific ADDRESS:PORT for embedded Console UI, ADDRESS can be an IP or hostname
  --certs-dir value, -S value  path to certs directory (default: "/Users/200722-csti-mac/.minio/certs")
  --quiet                      disable startup information
  --anonymous                  hide sensitive information from logging
  --json                       output server logs and startup information in json format
  --help, -h                   show help
  
EXAMPLES:
  1. Start minio server on "/home/shared" directory.
     $ minio server /home/shared

  2. Start single node server with 64 local drives "/mnt/data1" to "/mnt/data64".
     $ minio server /mnt/data{1...64}

  3. Start distributed minio server on an 32 node setup with 32 drives each, run following command on all the nodes
     $ export MINIO_ROOT_USER=minio
     $ export MINIO_ROOT_PASSWORD=miniostorage
     $ minio server http://node{1...32}.example.com/mnt/export{1...32}

  4. Start distributed minio server in an expanded setup, run the following command on all the nodes
     $ export MINIO_ROOT_USER=minio
     $ export MINIO_ROOT_PASSWORD=miniostorage
     $ minio server http://node{1...16}.example.com/mnt/export{1...32} \
            http://node{17...64}.example.com/mnt/export{1...64}
🍺  /usr/local/Cellar/minio/RELEASE.2021-11-09T03-21-45Z_1: 3 files, 98MB, built in 7 seconds

To migrate existing data from a previous major version of PostgreSQL run:
  brew postgresql-upgrade-database

This formula has created a default database cluster with:
  initdb --locale=C -E UTF-8 /usr/local/var/postgres
For more details, read:
  https://www.postgresql.org/docs/14/app-initdb.html

To restart postgresql after an upgrade:
  brew services restart postgresql
Or, if you don't want/need a background service you can just run:
  /usr/local/opt/postgresql/bin/postgres -D /usr/local/var/postgres

---

In [9]:
import io
import os
from collections import namedtuple
from pathlib import PurePosixPath
from urllib.parse import urlparse

import requests
from minio import Minio
from pyparsing import Combine, Keyword, Word, alphanums, nums, ZeroOrMore

In [10]:
# MinIO client setup.
# Endpoint and credentials come from the environment so real secrets never have
# to be committed with the notebook; the fallbacks are the stock values of a
# local, freshly installed MinIO server.
client = Minio(
    os.environ.get("MINIO_ENDPOINT", "127.0.0.1:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    secure=False,
)

---

In [11]:
import logging

logging.basicConfig(filename="feed_download.log", filemode="w", level=logging.INFO)

from itertools import product
from tqdm import tqdm

---

In [40]:
class FeedReferencesIngestion:
    """Download the reference documents of feeds into a MinIO bucket and read them back.

    A reference is stored as ``<feed id>/<file name>`` and carries the feed id,
    the source URL and (when known) the charset as user metadata. Only documents
    that are confidently HTML or PDF are stored.
    """

    # Hosts whose URLs are never downloaded.
    IgnoredDomains = [
        "github.com",
        "gist.github.com",
        "raw.githubusercontent.com",
        "twitter.com",
    ]

    def __init__(self, minio_client, bucket="feed_references"):
        """
        :param minio_client: object exposing ``put_object``, ``list_objects``
            and ``get_object`` (a ``minio.Minio`` client in this notebook).
        :param bucket: name of the bucket used for every read and write.
        """
        self.client = minio_client
        self.bucket = bucket

    @staticmethod
    def check_mimetype(response):
        """Classify a ``requests`` response as HTML, PDF or unknown.

        Three independent checks are made (URL file name, ``Content-Type``
        header, first characters of the body). A concrete type is returned
        only when all three agree.

        :param response: ``requests.Response`` of the downloaded reference.
        :return: ``"application/pdf"``, ``"text/html"`` or
            ``"application/octet-stream"`` when the checks disagree.
        """

        def guess_mimetype(response):
            # Use the URL that was originally requested (before any redirect).
            original_url = urlparse(
                response.history[0].url if response.history else response.url
            )
            # ``.name`` is "" for an empty or root path instead of raising.
            filename = PurePosixPath(original_url.path).name
            return "application/pdf" if ".pdf" in filename.lower() else "text/html"

        def get_mimetype(response):
            # The header may be absent; treat that as "no declared type".
            content_type = response.headers.get("Content-Type") or ""
            mimetype, _sep, _charset = content_type.partition(";")
            # Media types are case-insensitive and may carry padding.
            return mimetype.strip().lower()

        def parse_mimetype(response):
            # Grammar for the leading bytes of a PDF file.
            pdf = Combine(
                Word("%pdf-") + Word(nums) + "." + Word(nums) + "\r"
            ).setResultsName("application/pdf")

            # Grammar for an HTML5 doctype declaration.
            html = Combine(
                Word("<") + Keyword("!doctype html") + ZeroOrMore(alphanums) + Word(">")
            ).setResultsName("text/html")

            parser = html | pdf
            # Only the first 20 lower-cased characters are inspected.
            input_chunk = response.text.lower()[:20]
            try:
                parsing_result = parser.parseString(input_chunk, parseAll=False)
                return parsing_result.getName()
            except Exception:
                # Anything the grammar does not recognise is "unknown".
                return "application/octet-stream"

        mimetypes = (
            guess_mimetype(response),
            get_mimetype(response),
            parse_mimetype(response),
        )

        for candidate in ("application/pdf", "text/html"):
            if all(mimetype == candidate for mimetype in mimetypes):
                return candidate
        return "application/octet-stream"

    def download(self, feed_id, reference_url):
        """Download one reference and store it in the bucket.

        Outcomes are written to the log; the method never raises for ordinary
        failures (network errors, storage errors), it logs them as errors.

        :param feed_id: id of the feed the reference belongs to.
        :param reference_url: URL of the reference document.
        :return: ``None``.
        """
        try:
            parsed_url = urlparse(reference_url)
            source_domain = parsed_url.netloc

            # Ignored hosts are skipped up front instead of being reported
            # as download errors.
            if source_domain in FeedReferencesIngestion.IgnoredDomains:
                logging.info("{} {} {}".format(feed_id, reference_url, "IGNORED"))
                return

            # Without a last path component there is no object name to use.
            file_name = PurePosixPath(parsed_url.path).name
            if not file_name:
                logging.warning(
                    "{} {} {}".format(feed_id, reference_url, "NO FILE NAME")
                )
                return

            response = requests.get(reference_url, timeout=3)
            content = response.content

            metadata = {"feed": feed_id, "url": reference_url}
            # Leave the charset out when requests could not determine one,
            # rather than sending a ``None`` metadata value.
            if response.encoding is not None:
                metadata["charset"] = response.encoding

            reference = {
                "bucket_name": self.bucket,
                "object_name": "{}/{}".format(feed_id, file_name),
                "data": io.BytesIO(content),
                "content_type": FeedReferencesIngestion.check_mimetype(response),
                # Size of the bytes actually uploaded. The Content-Length
                # header describes the (possibly compressed) wire body and
                # can be smaller than the decoded content.
                "length": len(content),
                "metadata": metadata,
            }

            message = "{} {} {}".format(feed_id, reference_url, response.reason)
            if (
                response.ok
                and reference.get("content_type") != "application/octet-stream"
            ):
                self.client.put_object(**reference)  # save
                logging.info(message)
            else:
                logging.warning(message)
        except Exception:
            # Best effort: one bad reference must not stop a bulk download,
            # but keep the traceback so the cause is not lost.
            message = "{} {} {}".format(feed_id, reference_url, "ERROR")
            logging.error(message, exc_info=True)

    async def async_download(self, feed_id, reference_url):

        """
        # Todo:
            aiohttp.get behavior is not the same as requests.get when
            check_mimetype is applied; this has to be refactored for the
            production stage.

        # Sample Code:
            import asyncio
            import aiohttp

            async def call_url(url):
                response = await aiohttp.ClientSession().get(url)
                data = await response.text()
                return data

            futures = [call_url(url) for url in urls]

            # python 3.7+
            # asyncio.run(asyncio.wait(futures))

            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(futures))
        """
        pass

    def download_references(self, feed):
        """Download every reference URL listed in ``feed["references"]``.

        :param feed: mapping with an ``"id"`` and an optional ``"references"``
            iterable of URLs; a missing or empty list downloads nothing.
        """
        feed_id = feed.get("id")
        for reference_url in feed.get("references") or []:
            self.download(feed_id, reference_url)

    def list_references(self, feed_id, content_type=("text/html", "application/pdf")):
        """List the stored reference objects of a feed.

        :param feed_id: feed id, used as the object-name prefix.
        :param content_type: accepted values of the ``content-type`` metadata.
        :return: list of non-directory objects whose content type is accepted;
            ``[]`` when listing or filtering raises ``TypeError``.
        """
        try:
            objects_gen = self.client.list_objects(
                bucket_name=self.bucket,
                prefix=feed_id,
                include_user_meta=True,
                recursive=True,
            )
            reference_objects = [
                reference
                for reference in objects_gen
                if reference.is_dir is False
                and (reference.metadata or {}).get("content-type") in content_type
            ]
        except TypeError:
            reference_objects = []

        return reference_objects

    def read_references(self, feed, content_type=("text/html", "application/pdf")):
        """Open the stored references of a feed.

        :param feed: mapping with the feed ``"id"``.
        :param content_type: accepted values of the ``content-type`` metadata.
        :return: list of ``(feed id, source url, response)`` tuples, where
            ``response`` is whatever ``get_object`` returned; the caller is
            responsible for closing each response. ``[]`` if any object
            cannot be opened.
        """
        reference_objects = self.list_references(feed.get("id"), content_type)

        opened = []
        try:
            for ref in reference_objects:
                metadata = ref.metadata or {}
                opened.append(
                    (
                        metadata.get("X-Amz-Meta-Feed"),
                        metadata.get("X-Amz-Meta-Url"),
                        self.client.get_object(
                            bucket_name=self.bucket,
                            object_name=ref.object_name,
                        ),
                    )
                )
        except Exception:
            logging.error(
                "{} {} {}".format(feed.get("id"), "read_references", "ERROR"),
                exc_info=True,
            )
            # Do not leak the connections that were opened before the failure.
            for _feed, _url, body in opened:
                body.close()
                body.release_conn()
            opened = []

        return opened

In [41]:
fri = FeedReferencesIngestion(client)

```python
for feed in tqdm(feeds.to_list()):
    fri.download_references(feed)
```

---

In [42]:
fri.list_references("60413cd1bb43dcf1d22c274b")

[<minio.datatypes.Object at 0x1282877b8>,
 <minio.datatypes.Object at 0x128287a58>]

In [14]:
client.list_buckets()

[Bucket('feed_references')]

In [15]:
client.bucket_exists("feed_references")

True

In [16]:
# list_objects
seq(
    client.list_objects(
        bucket_name="feed_references",
        prefix="60413cd1bb43dcf1d22c274b",  # remember "/"
        include_user_meta=True,
        recursive=True,
    )
).map(lambda obj: (obj.object_name, obj.content_type, obj.metadata))

0,1,2
60413cd1bb43dcf1d22c274b/goldmax-goldfinder-sibot-analyzing-nobelium-malware,,"{'content-type': 'text/html', 'X-Amz-Meta-Charset': 'UTF-8', 'X-Amz-Meta-Feed': '60413cd1bb43dcf1d22c274b', 'X-Amz-Meta-Url': 'https://www.microsoft.com/security/blog/2021/03/04/goldmax-goldfinder-sibot-analyzing-nobelium-malware/'}"
60413cd1bb43dcf1d22c274b/sunshuttle-second-stage-backdoor-targeting-us-based-entity.html,,"{'X-Amz-Meta-Feed': '60413cd1bb43dcf1d22c274b', 'X-Amz-Meta-Url': 'https://www.fireeye.com/blog/threat-research/2021/03/sunshuttle-second-stage-backdoor-targeting-us-based-entity.html', 'content-type': 'text/html', 'X-Amz-Meta-Charset': 'UTF-8'}"


In [17]:
# get_object
# Start from None so the cleanup below never touches an unbound (or stale,
# from an earlier cell run) `response` when get_object itself fails.
response = None
try:
    response = client.get_object(
        bucket_name="feed_references",
        object_name="60413cd1bb43dcf1d22c274b/goldmax-goldfinder-sibot-analyzing-nobelium-malware",
    )
    # Read data from response.
    print(response.headers, "\n", response.read()[:50])

finally:
    # Release the connection only if the object was actually opened.
    if response is not None:
        response.close()
        response.release_conn()

HTTPHeaderDict({'Accept-Ranges': 'bytes', 'Content-Length': '40357', 'Content-Security-Policy': 'block-all-mixed-content', 'Content-Type': 'text/html', 'ETag': '"ee3132eb3132e0bc4dc5beff229b8645"', 'Last-Modified': 'Tue, 07 Dec 2021 09:39:11 GMT', 'Server': 'MinIO', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains', 'Vary': 'Origin', 'X-Amz-Request-Id': '16BEF5F690897E28', 'X-Content-Type-Options': 'nosniff', 'X-Xss-Protection': '1; mode=block', 'x-amz-meta-charset': 'UTF-8', 'x-amz-meta-feed': '60413cd1bb43dcf1d22c274b', 'x-amz-meta-url': 'https://www.microsoft.com/security/blog/2021/03/04/goldmax-goldfinder-sibot-analyzing-nobelium-malware/', 'Date': 'Thu, 09 Dec 2021 02:42:33 GMT'}) 
 b'<!DOCTYPE html>\n<!--[if lt IE 7]>      <html class'


---

In [43]:
def list_references(
    feed={"id": "60413cd1bb43dcf1d22c274b"},
    content_type=["text/html", "application/pdf"],
):
    """Return the stored reference objects of ``feed``.

    Lists everything in the ``feed_references`` bucket under the feed id
    prefix and keeps the non-directory objects whose ``content-type``
    metadata is one of ``content_type``. A ``TypeError`` raised while
    listing or filtering yields an empty list.
    """
    try:
        listing = client.list_objects(
            bucket_name="feed_references",
            prefix=feed.get("id"),
            include_user_meta=True,
            recursive=True,
        )
        matching = [
            obj
            for obj in listing
            if obj.is_dir is False
            and obj.metadata.get("content-type") in content_type
        ]
    except TypeError:
        matching = []

    return matching

In [44]:
list(list_references())

[<minio.datatypes.Object at 0x12730ac88>,
 <minio.datatypes.Object at 0x12730afd0>]

In [45]:
list(list_references(feed={"id": "223456"}))

[]

In [18]:
def read_references(
    feed={"id": "60413cd1bb43dcf1d22c274b"},
    content_type=["text/html", "application/pdf"],
):
    """Open the stored references of ``feed``.

    :param feed: mapping with the feed ``"id"``, used as the object prefix.
    :param content_type: accepted values of the ``content-type`` metadata.
    :return: list of ``(feed id, source url, response)`` tuples, one per
        matching object in the ``feed_references`` bucket. The caller must
        close each response. Returns ``[]`` if listing or opening fails; the
        failure is logged instead of being silently discarded.
    """
    objects = client.list_objects(
        bucket_name="feed_references",
        prefix=feed.get("id"),
        include_user_meta=True,
        recursive=True,
    )

    opened = []
    try:
        for ref in objects:
            if ref.is_dir is not False:
                continue
            if ref.metadata.get("content-type") not in content_type:
                continue
            opened.append(
                (
                    ref.metadata.get("X-Amz-Meta-Feed"),
                    ref.metadata.get("X-Amz-Meta-Url"),
                    # ref.object_name,
                    client.get_object(
                        bucket_name="feed_references",
                        object_name=ref.object_name,
                    ),
                )
            )
    except Exception:
        # Still best effort, but keep a trace of what went wrong.
        logging.exception("read_references failed for feed %s", feed.get("id"))
        # Release the connections that were opened before the failure.
        for _feed, _url, body in opened:
            body.close()
            body.release_conn()
        opened = []

    return opened

In [19]:
read_references()

[('60413cd1bb43dcf1d22c274b',
  'https://www.microsoft.com/security/blog/2021/03/04/goldmax-goldfinder-sibot-analyzing-nobelium-malware/',
  <urllib3.response.HTTPResponse at 0x1273b7390>),
 ('60413cd1bb43dcf1d22c274b',
  'https://www.fireeye.com/blog/threat-research/2021/03/sunshuttle-second-stage-backdoor-targeting-us-based-entity.html',
  <urllib3.response.HTTPResponse at 0x1273b7780>)]

In [21]:
o = client.list_objects(
    bucket_name="feed_references",
    prefix="60413cd1bb43dcf1d22c274b",
    include_user_meta=True,
    recursive=True,
)

In [23]:
if o:
    print(list(o))

[<minio.datatypes.Object object at 0x1283394a8>, <minio.datatypes.Object object at 0x1283394e0>]


In [None]:
client.ge

In [172]:
client.stat_object(bucket_name="feed_references", object_name="60413cd1bb43dcf1d22c274b")

S3Error: S3 operation failed; code: NoSuchKey, message: Object does not exist, resource: /feed_references/60413cd1bb43dcf1d22c274b, request_id: 16BEBE7C683B0218, host_id: None, bucket_name: feed_references, object_name: 60413cd1bb43dcf1d22c274b

In [167]:
help(client.stat_object)

Help on method stat_object in module minio.api:

stat_object(bucket_name, object_name, ssec=None, version_id=None, extra_query_params=None) method of minio.api.Minio instance
    Get object information and metadata of an object.
    
    :param bucket_name: Name of the bucket.
    :param object_name: Object name in the bucket.
    :param ssec: Server-side encryption customer key.
    :param version_id: Version ID of the object.
    :param extra_query_params: Extra query parameters for advanced usage.
    :return: :class:`Object <Object>`.
    
    Example::
        # Get object information.
        result = client.stat_object("my-bucket", "my-object")
    
        # Get object information of version-ID.
        result = client.stat_object(
            "my-bucket", "my-object",
            version_id="dfbd25b3-abec-4184-a4e8-5a35a5c1174d",
        )
    
        # Get SSE-C encrypted object information.
        result = client.stat_object(
            "my-bucket", "my-object",
         

In [154]:
client.list_objects(bucket_name="feed_references", object_name="60413cd1bb43dcf1d22c274b", recursive=True)

TypeError: list_objects() got an unexpected keyword argument 'object_name'

In [155]:
client.stat_object(
    bucket_name="feed_references",
    object_name="60413cd1bb43dcf1d22c274b",
)

S3Error: S3 operation failed; code: NoSuchKey, message: Object does not exist, resource: /feed_references/60413cd1bb43dcf1d22c274b, request_id: 16BEBE1E2AD6ED00, host_id: None, bucket_name: feed_references, object_name: 60413cd1bb43dcf1d22c274b

In [143]:
client.stat_object(
    bucket_name="feed_references",
    object_name="60413cd1bb43dcf1d22c274b/goldmax-goldfinder-sibot-analyzing-nobelium-malware",
)

<minio.datatypes.Object at 0x12ce1ea58>

In [144]:
dir(
    client.stat_object(
        bucket_name="feed_references",
        object_name="60413cd1bb43dcf1d22c274b/goldmax-goldfinder-sibot-analyzing-nobelium-malware",
    )
)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_bucket_name',
 '_content_type',
 '_etag',
 '_is_delete_marker',
 '_is_latest',
 '_last_modified',
 '_metadata',
 '_object_name',
 '_owner_id',
 '_owner_name',
 '_size',
 '_storage_class',
 '_version_id',
 'bucket_name',
 'content_type',
 'etag',
 'fromxml',
 'is_delete_marker',
 'is_dir',
 'is_latest',
 'last_modified',
 'metadata',
 'object_name',
 'owner_id',
 'owner_name',
 'size',
 'storage_class',
 'version_id']

In [139]:
help(client.stat_object)

Help on method stat_object in module minio.api:

stat_object(bucket_name, object_name, ssec=None, version_id=None, extra_query_params=None) method of minio.api.Minio instance
    Get object information and metadata of an object.
    
    :param bucket_name: Name of the bucket.
    :param object_name: Object name in the bucket.
    :param ssec: Server-side encryption customer key.
    :param version_id: Version ID of the object.
    :param extra_query_params: Extra query parameters for advanced usage.
    :return: :class:`Object <Object>`.
    
    Example::
        # Get object information.
        result = client.stat_object("my-bucket", "my-object")
    
        # Get object information of version-ID.
        result = client.stat_object(
            "my-bucket", "my-object",
            version_id="dfbd25b3-abec-4184-a4e8-5a35a5c1174d",
        )
    
        # Get SSE-C encrypted object information.
        result = client.stat_object(
            "my-bucket", "my-object",
         

# get

---

In [None]:
FeedReference(feeds)

In [32]:
client.get_object(
    bucket_name="feed_references",
    object_name="546fc7bf11d4083bc021c37f/operation_doubletap.html",
)

<urllib3.response.HTTPResponse at 0x12bc71d68>

In [22]:
# feeds.first()

In [23]:
references = FeedReferences(feeds.first(), client)

In [24]:
seq(references.feed_references).map(lambda url: (url, FeedReferences.extract(url)))

0,1
https://www.microsoft.com/security/blog/2021/03/02/hafnium-targeting-exchange-servers/,"('www.microsoft.com', 'hafnium-targeting-exchange-servers')"
https://www.volexity.com/blog/2021/03/02/active-exploitation-of-microsoft-exchange-zero-day-vulnerabilities/,"('www.volexity.com', 'active-exploitation-of-microsoft-exchange-zero-day-vulnerabilities')"
https://us-cert.cisa.gov/ncas/alerts/aa21-062a,"('us-cert.cisa.gov', 'aa21-062a')"
https://unit42.paloaltonetworks.com/microsoft-exchange-server-vulnerabilities/,"('unit42.paloaltonetworks.com', 'microsoft-exchange-server-vulnerabilities')"
https://www.fireeye.com/blog/threat-research/2021/03/detection-response-to-exploitation-of-microsoft-exchange-zero-day-vulnerabilities.html,"('www.fireeye.com', 'detection-response-to-exploitation-of-microsoft-exchange-zero-day-vulnerabilities.html')"
https://github.com/cert-lv/exchange_webshell_detection,"('github.com', 'exchange_webshell_detection')"
https://github.com/nsacyber/Mitigating-Web-Shells,"('github.com', 'Mitigating-Web-Shells')"
https://blog.truesec.com/2021/03/07/exchange-zero-day-proxylogon-and-hafnium/,"('blog.truesec.com', 'exchange-zero-day-proxylogon-and-hafnium')"
https://twitter.com/SBousseaden/status/1368241345454870528,"('twitter.com', '1368241345454870528')"
https://twitter.com/JohnLaTwC/status/1368952992221700096,"('twitter.com', '1368952992221700096')"


In [25]:
client.list_objects(
    bucket_name="feed_references", prefix="546fc7bf11d4083bc021c37f", recursive=True
)

<generator object Minio._list_objects at 0x12abe65c8>

In [13]:
seq(
    client.list_objects(
        bucket_name="feed_references", prefix="546fc7bf11d4083bc021c37f", recursive=True
    )
).map(lambda obj: dir(obj))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
__class__,__delattr__,__dict__,__dir__,__doc__,__eq__,__format__,__ge__,__getattribute__,__gt__,__hash__,__init__,__init_subclass__,__le__,__lt__,__module__,__ne__,__new__,__reduce__,__reduce_ex__,__repr__,__setattr__,__sizeof__,__str__,__subclasshook__,__weakref__,_bucket_name,_content_type,_etag,_is_delete_marker,_is_latest,_last_modified,_metadata,_object_name,_owner_id,_owner_name,_size,_storage_class,_version_id,bucket_name,content_type,etag,fromxml,is_delete_marker,is_dir,is_latest,last_modified,metadata,object_name,owner_id,owner_name,size,storage_class,version_id


In [14]:
seq(client.list_objects(bucket_name="feed_references")).map(
    lambda obj: (obj.object_name, obj.is_dir)
)

0,1
.DS_Store,False
546fc7bf11d4083bc021c37f/,True
5473709d11d4083bc021c387/,True
547e0a9511d4080d5a98d83f/,True
548885ef11d40843c065f6ee/,True
54948dda11d408634421e4e0/,True
54a12d6311d4080471a1a386/,True
54b6b08d11d4080471a1a38e/,True
54b6c01611d4080471a1a390/,True
54bd4b0411d4087235fb7130/,True


In [177]:
# list the names of all objects under a directory (objects whose name starts with the prefix)
seq(
    client.list_objects(
        bucket_name="feed_references", prefix="546fc7bf11d4083bc021c37f", recursive=True
    )
).map(lambda obj: obj.object_name)

['546fc7bf11d4083bc021c37f/operation_doubletap.html']

---

# TODO: Refactor content extraction using a readability library
- https://github.com/buriy/python-readability (Python)
- https://github.com/mozilla/readability (Node.js)

In [74]:
# with open("./reports/546fc7bf11d4083bc021c37f/operation_doubletap.html", "r") as f:
# Decode explicitly so the result does not depend on the platform's default
# encoding. NOTE(review): assumes the saved page is UTF-8 — confirm.
with open(
    "../data/hellokitty-ransomware-lacks-stealth-but-still-strikes-home.html",
    "r",
    encoding="utf-8",
) as f:
    t = f.read()

In [75]:
from readability import Document

In [76]:
doc = Document(t)

In [81]:
doc.title()

'HelloKitty Ransomware Lacks Stealth, But Still Strikes Home - SentinelLabs'

In [77]:
doc.summary()

'<html><body><div><section class="entry-content " itemprop="articleBody">\n\t\t\t\t\n\t\t\t\t<p>Game studio CD Projekt Red recently <a href="https://twitter.com/CDPROJEKTRED/status/1359048125403590660/photo/1" target="_blank" rel="noopener noreferrer">disclosed</a> that it became a victim of a targeted, highly-impactful ransomware. In the days following the disclosure, it was revealed that the ransomware family most likely behind the attack was “HelloKitty”.</p>\n<p><img loading="lazy" src="https://labs.sentinelone.com/wp-content/uploads/2021/03/cdprojekt.jpg" alt="" class="alignnone size-full wp-image-28576" srcset="https://labs.sentinelone.com/wp-content/uploads/2021/03/cdprojekt.jpg 936w, https://labs.sentinelone.com/wp-content/uploads/2021/03/cdprojekt-300x155.jpg 300w, https://labs.sentinelone.com/wp-content/uploads/2021/03/cdprojekt-768x397.jpg 768w, https://labs.sentinelone.com/wp-content/uploads/2021/03/cdprojekt-96x50.jpg 96w" sizes="(max-width: 936px) 100vw, 936px"></p>\n<p>H

In [79]:
# `IPython.display` is the public import path for the display helpers;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# display(HTML("<h1>Hello, world!</h1>"))

In [80]:
display(HTML(doc.summary()))

In [89]:
doc.xpath

False

In [91]:
doc.content()

  raw_html = str_(tostring(doc.body or doc))


'b\'<body class="post-template-default single single-post postid-28566 single-format-standard custom-background group-blog header-sticky" itemscope="itemscope" itemtype="http://schema.org/WebPage">\\n\\n<div id="page" class="site">\\n\\n\\t<header id="masthead" class="site-header site-header__default" role="banner" itemscope itemtype="http://schema.org/WPHeader">\\n\\n\\t\\t<div class="container height-100">\\n\\n\\t\\t\\t<div class="site-header__inner height-100">\\n\\n\\t\\t\\t\\t<div class="site-branding height-100">\\n\\n\\t\\t\\t\\t\\t<div class="display-table height-100">\\n\\n\\t\\t\\t\\t\\t\\t<div class="display-table-cell va-middle">\\n\\n\\t\\t\\t\\t\\t\\t\\t<div class="logo-img">\\n\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t<a href="https://labs.sentinelone.com/">\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t<img src="https://labs.sentinelone.com/wp-content/uploads/2019/10/SentinelLabs_Logo_RGB_WhitePurp.png" alt="SentinelLabs">\\n\\t\\t\\t\\t\\t\