In [None]:
import re
import time

from pyspark.sql.functions import col, sum, when, size, round, concat_ws
from pyspark.sql.types import StructField, StructType, StringType, ArrayType
from selectolax.parser import HTMLParser
from warcio.archiveiterator import ArchiveIterator

In [16]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session shared by the cells below.
spark = (
    SparkSession.builder
    .appName("script-extraction")
    .master("spark://spark-master:7077")
    # Write event logs to the local spark-events directory.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", 1)
    # Dynamic allocation: scale between 1 and 15 executors.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    # initialExecutors below minExecutors is rejected by Spark with a warning
    # and ignored, so keep it equal to minExecutors.
    .config("spark.dynamicAllocation.initialExecutors", 1)
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.maxExecutors", 15)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.scheduler.mode", "FAIR")
    .getOrCreate()
)
# Only show errors from Spark's own logging.
spark.sparkContext.setLogLevel("ERROR")

In [20]:
# Peek at the first few records of the WARC file: print each record's WARC
# headers and collect the set of record types encountered.
max_records = 5
input_file = "CC-MAIN-20221126080725-20221126110725-00000.warc"

distinct_record_types = set()

with open(input_file, 'rb') as stream:
    print(f"Parsing .warc file {input_file}")
    record_idx = 0
    for record_idx, record in enumerate(ArchiveIterator(stream), start=1):
        distinct_record_types.add(record.rec_type)
        print(f"Record Headers: {record.rec_headers}")

        # Stop as soon as the sample size has been reached.
        if record_idx == max_records:
            break

print("\nDistinct record types found:")
for rec_type in distinct_record_types:
    print(rec_type)

Parsing .warc file CC-MAIN-20221126080725-20221126110725-00000.warc
Record Headers: WARC/1.0
WARC-Type: warcinfo
WARC-Date: 2022-11-26T08:07:25Z
WARC-Record-ID: <urn:uuid:a6bb155b-41c1-4e16-988c-5a3efb66dc53>
Content-Length: 500
Content-Type: application/warc-fields
WARC-Filename: CC-MAIN-20221126080725-20221126110725-00000.warc.gz

Record Headers: WARC/1.0
WARC-Type: request
WARC-Date: 2022-11-26T09:33:48Z
WARC-Record-ID: <urn:uuid:db75ce35-e5d7-4386-b7cd-13b12ca838a4>
Content-Length: 321
Content-Type: application/http; msgtype=request
WARC-Warcinfo-ID: <urn:uuid:a6bb155b-41c1-4e16-988c-5a3efb66dc53>
WARC-IP-Address: 47.252.36.221
WARC-Target-URI: http://0130.com.cn/news/shownews.php?id=188

Record Headers: WARC/1.0
WARC-Type: response
WARC-Date: 2022-11-26T09:33:48Z
WARC-Record-ID: <urn:uuid:230340ee-4f5c-4c5a-b1d6-d98a986f47d9>
Content-Length: 37265
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:a6bb155b-41c1-4e16-988c-5a3efb66dc53>
WARC-Concurrent-To: 

In [11]:
def encode_byte_stream(input_stream):
    r"""Turn backslash-escaped byte sequences in a string back into text.

    The string is encoded to bytes, decoded with ``unicode_escape`` (so a
    literal ``\xc3\xa9`` becomes two code points), re-encoded as Latin-1 to
    recover the byte values, and finally decoded as UTF-8 with undecodable
    bytes replaced by U+FFFD.

    Args:
        input_stream: A ``str`` that may contain backslash escape sequences.

    Returns:
        The decoded string. If the escapes cannot be processed (for example a
        trailing backslash, or an escape that decodes to a code point above
        U+00FF and therefore has no Latin-1 byte), ``input_stream`` is
        returned unchanged instead of raising.
    """
    try:
        recovered_bytes = (
            input_stream.encode('utf-8')
            .decode("unicode_escape")
            .encode("latin-1")
        )
    except UnicodeError:
        # Covers both UnicodeDecodeError (malformed escape) and
        # UnicodeEncodeError (code point outside Latin-1).
        return input_stream
    return recovered_bytes.decode("utf-8", errors="replace")

def warc_script_extraction(input_file):
    """Extract page metadata and ``<script>`` attributes from a WARC file.

    Every record of type ``response`` is parsed with selectolax and turned
    into one row holding WARC/HTTP header values, the page title, the
    description and keywords meta tags, e-mail hrefs found in anchor tags, and
    the ``src`` / ``type`` attributes of all script tags.

    Uses the notebook-level ``spark`` session and ``encode_byte_stream``.
    Prints the file being parsed and the total elapsed time.

    Args:
        input_file: Path of the WARC file to read.

    Returns:
        A Spark DataFrame with one row per response record, following the
        schema defined below.
    """
    data = []
    email_regex = re.compile(r"(mailto:)?([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.IGNORECASE)

    schema = StructType([
        StructField("warc_target_uri", StringType(), True),
        StructField("warc_ip_address", StringType(), True),
        StructField("content_type", StringType(), True),
        StructField("content_length", StringType(), True),
        StructField("server", StringType(), True),
        StructField("page_title", StringType(), True),
        StructField("page_description", StringType(), True),
        StructField("emails", ArrayType(StringType()), True),
        StructField("keywords", StringType(), True),
        StructField("script_src_attrs", ArrayType(StringType()), True),
        StructField("script_type_attrs", ArrayType(StringType()), True)
    ])

    with open(input_file, 'rb') as stream:
        print(f"Parsing .warc file: {input_file} \n")
        start_time = time.perf_counter()
        for record in ArchiveIterator(stream):
            if record.rec_type != "response":
                continue

            raw_text = record.raw_stream.read()
            # NOTE(review): raw_text is presumably bytes, so str() yields its
            # repr (b'...' with backslash escapes). The whole document is
            # lower-cased here; the escapes are undone per field by
            # encode_byte_stream further down.
            str_text = str(raw_text).strip().lower().replace('\t', " ").replace('\n', "")
            slax_txt = HTMLParser(str_text)

            rec_headers = record.rec_headers
            target_uri = rec_headers.get_header('WARC-Target-URI')
            ip_address = rec_headers.get_header("WARC-IP-Address")
            content_type = rec_headers.get_header("Content-Type")
            content_length = rec_headers.get_header("Content-Length")

            # http_headers may be None when the record carries no parsed
            # HTTP headers; store a NULL server instead of crashing.
            http_headers = record.http_headers
            server = http_headers.get_header("Server") if http_headers is not None else None

            # Fall back to U+FFFD when the page has no <title>.
            title_nodes = slax_txt.tags('title')
            title = title_nodes[0].text().strip() if title_nodes else "\uFFFD"
            title = encode_byte_stream(title)

            # Extract emails from anchor tags
            emails = []
            for atag in slax_txt.tags("a"):
                href = atag.attributes.get("href")
                if href and email_regex.match(href):
                    emails.append(encode_byte_stream(href))

            # Extract meta tag descriptions and keywords; when several
            # matching tags exist the last one wins.
            desc, keywords = None, None
            for mtag in slax_txt.tags('meta'):
                desc_tag = mtag.css_first('meta[name="description"]')
                key_tag = mtag.css_first('meta[name="keywords"]')
                if desc_tag:
                    desc = desc_tag.attributes.get("content")
                    if desc:
                        desc = encode_byte_stream(desc)
                if key_tag:
                    keywords = key_tag.attributes.get("content")
                    if keywords:
                        keywords = encode_byte_stream(keywords)

            # Extract script src and type attributes in a single pass.
            src_attrs, type_attrs = [], []
            for script in slax_txt.tags('script'):
                src_value = script.attributes.get('src')
                if src_value:
                    src_attrs.append(src_value)
                type_value = script.attributes.get('type')
                if type_value:
                    type_attrs.append(type_value)

            data.append({
                "warc_target_uri": target_uri,
                "warc_ip_address": ip_address,
                "content_type": content_type,
                "content_length": content_length,
                "server": server,
                "page_title": title,
                "page_description": desc,
                "emails": emails,
                "keywords": keywords,
                "script_src_attrs": src_attrs,
                "script_type_attrs": type_attrs
            })

    df = spark.createDataFrame(data, schema)
    # The reported time covers parsing plus DataFrame creation.
    end_time = time.perf_counter()
    print(f"Total time: {(end_time - start_time):.4f}s")
    return df
    
# Run the extraction, then collapse the result into a single partition.
extracted_df = warc_script_extraction(input_file)
df = extracted_df.coalesce(1)
df.show()

Parsing .warc file: /opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc 

Total time: 44.5791s


                                                                                

+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|     warc_target_uri|warc_ip_address|        content_type|content_length|              server|                       page_title|                 page_description|              emails|                         keywords|    script_src_attrs|   script_type_attrs|
+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|http://0081.b-ch....|   52.68.28.174|application/http;...|         13739|Apache/2.2.34 (Am...|             playstation(r)3�ֵ...|                             NULL|                  []|                             NULL

In [12]:
# Display the Python type of `df`.
type(df)

pyspark.sql.dataframe.DataFrame

In [73]:
# Preview the description and e-mail columns for the first ten rows.
df.select(col('page_description'), col('emails')).show(10, True)

[Stage 36:>                                                         (0 + 1) / 1]

+-------------------------+--------------------+
|         page_description|              emails|
+-------------------------+--------------------+
|                     NULL|                  []|
|   发现 our vegan, ewg...|                  []|
|            �������t��ѫ� |                  []|
|    ���⼤�� - sogou�׾¬...|                  []|
|天天看片高清影视在线观看 |                  []|
|     տ���˲������ķ���տ�...|                  []|
|                     NULL|[mailto:info@0xbe...|
|                     NULL|                  []|
|                     NULL|                  []|
|                     NULL|                  []|
+-------------------------+--------------------+
only showing top 10 rows



                                                                                

In [74]:
# Count all rows; `total_rows` is the denominator for the null percentages
# computed further down.
total_rows = df.count()
total_rows

28178

In [75]:
# Print the column names and types of `df`.
df.printSchema()

root
 |-- warc_target_uri: string (nullable = true)
 |-- warc_ip_address: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- server: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- page_description: string (nullable = true)
 |-- emails: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- keywords: string (nullable = true)
 |-- script_src_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- script_type_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [80]:
array_columns = ['emails', 'script_src_attrs', 'script_type_attrs']


def _missing_count(column_name):
    """Build the aggregate that counts missing values in one column.

    Array columns count as missing when they are NULL or empty; every other
    column counts as missing only when it is NULL.
    """
    column = col(column_name)
    if column_name in array_columns:
        is_missing = column.isNull() | (size(column) == 0)
        return sum(when(is_missing, 1).otherwise(0)).alias(column_name)
    return sum(column.isNull().cast("int")).alias(column_name)


null_counts = df.select([_missing_count(name) for name in df.columns])
null_counts.show()

+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|warc_target_uri|warc_ip_address|content_type|content_length|server|page_title|page_description|emails|keywords|script_src_attrs|script_type_attrs|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|              0|              0|           0|             0|  1989|         0|           13390| 22995|   22025|            2565|             4215|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+



In [81]:
# Express each missing-value count as a percentage of all rows, rounded to
# three decimals.
percent_columns = []
for name in null_counts.columns:
    share = (col(name) / total_rows) * 100
    percent_columns.append(round(share, 3).alias(name))

null_frac = null_counts.select(percent_columns)
null_frac.show()

+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|warc_target_uri|warc_ip_address|content_type|content_length|server|page_title|page_description|emails|keywords|script_src_attrs|script_type_attrs|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+
|            0.0|            0.0|         0.0|           0.0| 7.059|       0.0|          47.519|81.606|  78.164|           9.103|           14.958|
+---------------+---------------+------------+--------------+------+----------+----------------+------+--------+----------------+-----------------+



In [82]:
# Print the schema again as a reference point before the array columns are
# joined into strings in the next cell.
df.printSchema()

root
 |-- warc_target_uri: string (nullable = true)
 |-- warc_ip_address: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- server: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- page_description: string (nullable = true)
 |-- emails: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- keywords: string (nullable = true)
 |-- script_src_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- script_type_attrs: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [83]:
# Join each array column into a single "|"-separated string; all other
# columns pass through unchanged and the column order is kept.
array_columns = ['emails', 'script_src_attrs', 'script_type_attrs']
df = df.select(
    [
        concat_ws("|", col(name)).alias(name) if name in array_columns else col(name)
        for name in df.columns
    ]
)
df.show()

[Stage 42:>                                                         (0 + 1) / 1]

+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|     warc_target_uri|warc_ip_address|        content_type|content_length|              server|                       page_title|                 page_description|              emails|                         keywords|    script_src_attrs|   script_type_attrs|
+--------------------+---------------+--------------------+--------------+--------------------+---------------------------------+---------------------------------+--------------------+---------------------------------+--------------------+--------------------+
|http://0081.b-ch....|   52.68.28.174|application/http;...|         13739|Apache/2.2.34 (Am...|             playstation(r)3�ֵ...|                             NULL|                    |                             NULL

                                                                                

In [60]:
# Print the schema once more: after the concat_ws step the three former array
# columns are plain strings.
df.printSchema()

root
 |-- warc_target_uri: string (nullable = true)
 |-- warc_ip_address: string (nullable = true)
 |-- content_type: string (nullable = true)
 |-- content_length: string (nullable = true)
 |-- server: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- page_description: string (nullable = true)
 |-- emails: string (nullable = false)
 |-- keywords: string (nullable = true)
 |-- script_src_attrs: string (nullable = false)
 |-- script_type_attrs: string (nullable = false)



In [86]:
# Preview the output path derived from the input path; str.replace swaps
# every ".warc" occurrence for "_extract".
input_file.replace(".warc", "_extract")

'/opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476_extract'

In [87]:
# Derive the output location from the input path by swapping only a trailing
# ".warc" extension for "_extract".
warc_suffix = ".warc"
if input_file.endswith(warc_suffix):
    output_file = input_file[:-len(warc_suffix)] + "_extract"
else:
    # Never reuse the input path itself: mode("overwrite") below would then
    # target the source file.
    output_file = input_file + "_extract"

# Write the flattened DataFrame as tab-separated text with a header row.
df.write.option("delimiter", "\t").mode("overwrite").csv(output_file, header=True)

                                                                                

In [64]:
!du -hs datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc_extract.tsv

103M	datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc_extract.tsv
