In [1]:
!pwd

/opt/workspace


In [2]:
cc_dir = "/opt/workspace/datasets/common-crawl/"
!ls $cc_dir

CC-MAIN-20240724014956-20240724044956-00798.warc     get_files.sh
CC-MAIN-20240725114544-20240725144544-00476.warc     warc.paths
CC-MAIN-20240725114544-20240725144544-00476_extract


In [3]:
!pip install selectolax

Collecting selectolax
  Downloading selectolax-0.3.21-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading selectolax-0.3.21-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: selectolax
Successfully installed selectolax-0.3.21
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
import re
import time

from warcio.archiveiterator import ArchiveIterator
from warcio.recordloader import ArcWarcRecord

from selectolax.parser import HTMLParser

from pyspark.sql.types import StructField, StructType, StringType, ArrayType
from pyspark.sql.functions import col, sum, when, size, round, concat_ws

In [5]:
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder \
    .appName("script_extraction") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 2) \
    .config("spark.dynamicAllocation.minExecutors", 0) \
    .config("spark.dynamicAllocation.maxExecutors", 10) \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/24 14:42:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/24 14:42:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
warc_files = [file for file in os.listdir(cc_dir) if file.endswith(".warc")]
warc_files

['CC-MAIN-20240724014956-20240724044956-00798.warc',
 'CC-MAIN-20240725114544-20240725144544-00476.warc']

In [10]:
max_records = 4
input_file = cc_dir + warc_files[1]

with open(input_file, 'rb') as stream:
    print(f"Parsing .warc file {input_file}")
    record_idx = 0
    for record in ArchiveIterator(stream):
        print(type(record))
        record_idx += 1
        print(f"Record Type: {record.rec_type}")
        print(f"Record Headers: {record.rec_headers}")
        if "WARC-Target-URI" in record.rec_headers:
            print(f"Target URI: {record.rec_headers.get_header('WARC-Target-URI')}")
        if record_idx == max_records: break

Parsing .warc file /opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc
<class 'warcio.recordloader.ArcWarcRecord'>
Record Type: warcinfo
Record Headers: WARC/1.0
WARC-Type: warcinfo
WARC-Date: 2024-07-25T11:45:44Z
WARC-Record-ID: <urn:uuid:4310b454-baf3-414f-99af-d0049e37d45d>
Content-Length: 487
Content-Type: application/warc-fields
WARC-Filename: CC-MAIN-20240725114544-20240725144544-00476.warc.gz

<class 'warcio.recordloader.ArcWarcRecord'>
Record Type: request
Record Headers: WARC/1.0
WARC-Type: request
WARC-Date: 2024-07-25T13:48:10Z
WARC-Record-ID: <urn:uuid:8370f230-97b9-4070-8639-b66757902bdb>
Content-Length: 272
Content-Type: application/http; msgtype=request
WARC-Warcinfo-ID: <urn:uuid:4310b454-baf3-414f-99af-d0049e37d45d>
WARC-IP-Address: 52.68.28.174
WARC-Target-URI: http://0081.b-ch.com/blog/0081/archives/2111
WARC-Protocol: http/1.1

Target URI: http://0081.b-ch.com/blog/0081/archives/2111
<class 'warcio.recordloader.ArcWarcRecord'>
Recor

In [11]:
def encode_byte_stream(input_stream):
    return input_stream.encode('utf-8').decode("unicode_escape").encode("latin-1").decode("utf-8", errors="replace")

def warc_script_extraction(input_file):
    data = []
    email_regex = re.compile(r"(mailto:)?([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.IGNORECASE)

    schema = StructType([
        StructField("warc_target_uri", StringType(), True),
        StructField("warc_ip_address", StringType(), True),
        StructField("content_type", StringType(), True),
        StructField("content_length", StringType(), True),
        StructField("server", StringType(), True),
        StructField("page_title", StringType(), True),
        StructField("page_description", StringType(), True),
        StructField("emails", ArrayType(StringType()), True),
        StructField("keywords", StringType(), True),
        StructField("script_src_attrs", ArrayType(StringType()), True),
        StructField("script_type_attrs", ArrayType(StringType()), True)
    ])

    with open(input_file, 'rb') as stream:
        print(f"Parsing .warc file: {input_file} \n")
        start_time = time.perf_counter()
        for record in ArchiveIterator(stream):
            if record.rec_type == "response":
                
                raw_text = record.raw_stream.read()
                str_text = str(raw_text).strip().lower().replace('\t', " ").replace('\n', "")
                slax_txt = HTMLParser(str_text)

                target_uri = record.rec_headers.get_header('WARC-Target-URI')
                ip_address = record.rec_headers.get_header("WARC-IP-Address")
                content_type = record.rec_headers.get_header("Content-Type")
                content_length = record.rec_headers.get_header("Content-Length")
                server = record.http_headers.get_header("Server")
                
                title = slax_txt.tags('title')[0].text().strip() if slax_txt.tags('title') else "\uFFFD"
                title = encode_byte_stream(title)

                # Extract emails from anchor tags
                emails = [
                    encode_byte_stream(atag.attributes.get("href"))
                    for atag in slax_txt.tags("a")
                    if atag.attributes.get("href") and email_regex.match(atag.attributes.get("href"))
                ]

                # Extract meta tag descriptions and keywords
                desc, keywords = None, None
                meta_tags = slax_txt.tags('meta')
                for m_idx, mtag in enumerate(meta_tags):
                    desc_tag = mtag.css_first('meta[name="description"]')
                    key_tag = mtag.css_first('meta[name="keywords"]')
                    if desc_tag:
                        desc = desc_tag.attributes.get("content")
                        if desc:
                            desc = encode_byte_stream(desc)
                    if key_tag:
                        keywords = key_tag.attributes.get("content")
                        if keywords:
                            keywords = encode_byte_stream(keywords)
                

                # Extract script src and type attributes
                src_attrs = [
                    script.attributes.get('src')
                    for script in slax_txt.tags('script')
                    if script.attributes.get('src')
                ]

                type_attrs = [
                    script.attributes.get('type')
                    for script in slax_txt.tags('script')
                    if script.attributes.get('type')
                ]

                record_data = {
                    "warc_target_uri": target_uri,
                    "warc_ip_address": ip_address,
                    "content_type": content_type,
                    "content_length": content_length,
                    "server": server,
                    "page_title": title,
                    "page_description": desc,
                    "emails": emails,
                    "keywords": keywords,
                    "script_src_attrs": src_attrs,
                    "script_type_attrs": type_attrs
                }
                data.append(record_data)
                
    df = spark.createDataFrame(data, schema)
    end_time = time.perf_counter()
    print(f"Total time: {(end_time - start_time):.4f}s")
    return df
    
df = warc_script_extraction(input_file).coalesce(1)
df.show()

Parsing .warc file: /opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc 

Total time: 44.5791s


                                                                                

+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|     warc_target_uri|warc_ip_address|        content_type|content_length|              server|                       page_title|                 page_description|              emails|                         keywords|    script_src_attrs|   script_type_attrs|
+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|http://0081.b-ch....|   52.68.28.174|application/http;...|         13739|Apache/2.2.34 (Am...|             playstation(r)3�ֵ...|                             NULL|                  []|                             NULL

In [12]:
type(df)

pyspark.sql.dataframe.DataFrame

In [73]:
df.select('page_description', 'emails').show(n=10, truncate=True)

[Stage 36:>                                                         (0 + 1) / 1]

+-------------------------+--------------------+
|         page_description|              emails|
+-------------------------+--------------------+
|                     NULL|                  []|
|   发现 our vegan, ewg...|                  []|
|            �������t��ѫ� |                  []|
|    ���⼤�� - sogou�׾¬...|                  []|
|天天看片高清影视在线观看 |                  []|
|     տ���˲������ķ���տ�...|                  []|
|                     NULL|[mailto:info@0xbe...|
|                     NULL|                  []|
|                     NULL|                  []|
|                     NULL|                  []|
+-------------------------+--------------------+
only showing top 10 rows



                                                                                

In [74]:
total_rows = df.count()
total_rows

28178

In [75]:
df.printSchema()

root
 |-- warc_target_uri: string (nullable = true)
 |-- warc_ip_address: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- server: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- page_description: string (nullable = true)
 |-- emails: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- keywords: string (nullable = true)
 |-- script_src_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- script_type_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [80]:
array_columns = ['emails', 'script_src_attrs', 'script_type_attrs']
null_counts = df.select(
    [
        sum(
            when(
                col(c).isNull() | (size(col(c)) == 0), 1
            ).otherwise(0)
        ).alias(c)
        if c in array_columns else
        sum(col(c).isNull().cast("int")).alias(c)
        for c in df.columns
    ]
)
null_counts.show()

+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|warc_target_uri|warc_ip_address|content_type|content_length|server|page_title|page_description|emails|keywords|script_src_attrs|script_type_attrs|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|              0|              0|           0|             0|  1989|         0|           13390| 22995|   22025|            2565|             4215|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+



In [81]:
null_frac = null_counts.select(
    [
        round((col(c) / total_rows) * 100, 3).alias(c)
        for c in null_counts.columns
    ]
)
null_frac.show()

+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|warc_target_uri|warc_ip_address|content_type|content_length|server|page_title|page_description|emails|keywords|script_src_attrs|script_type_attrs|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|            0.0|            0.0|         0.0|           0.0| 7.059|       0.0|          47.519|81.606|  78.164|           9.103|           14.958|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+



In [82]:
df.printSchema()

root
 |-- warc_target_uri: string (nullable = true)
 |-- warc_ip_address: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- server: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- page_description: string (nullable = true)
 |-- emails: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- keywords: string (nullable = true)
 |-- script_src_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- script_type_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [83]:
array_columns = ['emails', 'script_src_attrs', 'script_type_attrs']
for col_name in array_columns:
    df = df.withColumn(col_name, concat_ws("|", col(col_name)))
df.show()

[Stage 42:>                                                         (0 + 1) / 1]

+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|     warc_target_uri|warc_ip_address|        content_type|content_length|              server|                       page_title|                 page_description|              emails|                         keywords|    script_src_attrs|   script_type_attrs|
+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|http://0081.b-ch....|   52.68.28.174|application/http;...|         13739|Apache/2.2.34 (Am...|             playstation(r)3�ֵ...|                             NULL|                    |                             NULL

                                                                                

In [60]:
df.printSchema()

root
 |-- warc_target_uri: string (nullable = true)
 |-- warc_ip_address: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- server: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- page_description: string (nullable = true)
 |-- emails: string (nullable = false)
 |-- keywords: string (nullable = true)
 |-- script_src_attrs: string (nullable = false)
 |-- script_type_attrs: string (nullable = false)



In [86]:
input_file.replace(".warc", "_extract")

'/opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476_extract'

In [87]:
output_file = input_file.replace(".warc", "_extract")
df.write.option("delimiter", "\t").mode("overwrite").csv(output_file, header=True)

                                                                                

In [64]:
!du -hs datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc_extract.tsv

103M	datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc_extract.tsv
