In [1]:
import re
from urllib.parse import urlparse

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType

In [2]:
from huggingface_hub import snapshot_download

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
spark = SparkSession.builder \
    .appName("top-20-jslibs") \
    .master("spark://spark-master:7077") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 0) \
    .config("spark.dynamicAllocation.minExecutors", 1) \
    .config("spark.dynamicAllocation.maxExecutors", 15) \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.scheduler.mode", "FAIR") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/01 14:03:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/01 14:03:41 WARN Utils: spark.dynamicAllocation.initialExecutors less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
24/10/01 14:03:41 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.


In [4]:
snapshot_download(repo_id="big-banyan-tree/BBT_CommonCrawl_2024", repo_type="dataset",
                  allow_patterns="script_extraction_out/*.parquet", local_dir=".")

Fetching 10 files: 100%|██████████| 10/10 [02:12<00:00, 13.25s/it]


'/opt/workspace/spark_qa'

In [7]:
df = spark.read.parquet("script_extraction_out/")

In [8]:
df.show()

+---------------+--------------------+------------------+--------------------+--------------------+----+
|             ip|                host|            server|              emails|    script_src_attrs|year|
+---------------+--------------------+------------------+--------------------+--------------------+----+
|   217.160.0.83|https://nadinesim...|            Apache|mailto:nadine@nad...|https://nadinesim...|2024|
| 185.127.236.35|https://azionecat...|         Glaucorm3|                    |\'https://azionec...|2024|
|   52.15.227.12|https://secureftp...|              NULL|                    |js/dist/jquery.mi...|2024|
|   2.59.135.142|https://www.zeitk...|             nginx|                    |/_assets/a739cde7...|2024|
|    104.21.60.7|http://www.250r.r...|        cloudflare|                    |http://www.250r.r...|2024|
|  46.183.11.162|https://taliarand...|             nginx|mailto:hello@tali...|https://taliarand...|2024|
|    65.60.61.13|https://almost4x4...|            Apach

In [9]:
# df = spark.read.parquet("/opt/workspace/warc_yearly/script_extraction_out_2022/")

In [9]:
jslib_regex = re.compile(r"/?(?:js\.)?([^/?]+\.js)")
regex_broadcast = spark.sparkContext.broadcast(jslib_regex)

In [10]:
@F.udf(StringType())
def extract_domain(host):
    parsed_url = urlparse(host)
    return parsed_url.netloc

In [11]:
df = df.withColumn("domain", extract_domain("host"))

In [17]:
# @F.udf(ArrayType(StringType()))
# def attrs_to_libs(src_attrs):
#     """Parse list of src attrs to have JS libs ONLY."""
#     splits = src_attrs.split('|')
#     parser = parser_broadcast.value
#     for i, s in enumerate(splits):
#         # handles cases like: "\'http://armenia.pl/wp-content/plugins/uk-cookie-consent/assets/js/uk-cookie-consent-js.js?ver=2.3.0\'"
#         s = s.strip("\'")
#         s = s.strip("\\")

#         try:
#             s = parser(s).path.split('/')[-1]
#         except:
#             # handles cases like "https://wpsuspension.hujquery(document).off('click.fb-start', '[data-trigger]');?8600c2288b94838e7ddbb879e6329a62"
#             continue

#         if not "js" in s:
#             splits[i] = None
#             continue

#         # jquery.min.js and jquery.js are the same
#         s = re.sub(r"\.min\.", '.', s)

#         # handles cases like: js.cookie.min.js?ver=2.1.4-wc.9.1.4
#         s = re.sub(r"^js\.", '', s)
#         if s == "js":
#             continue
#         splits[i] = s

#     return splits

In [12]:
@F.udf(ArrayType(StringType()))
def attrs_to_libs(src_attrs):
    """Parse list of src attrs to have JS libs ONLY."""
    splits = src_attrs.split('|')
    regex_obj = regex_broadcast.value
    for i, s in enumerate(splits):
        # m = re.search(r"/?(?:js\.)?([^/?]+\.js)", s)
        m = regex_obj.search(s)
        if m:
            splits[i] = m.group(1)
        else:
            splits[i] = None
            continue
        splits[i] = re.sub(r"\.min\.", '.', splits[i])

    return splits

In [13]:
domain_lib_df = df.withColumn("js_lib", attrs_to_libs("script_src_attrs"))

In [14]:
domain_lib_df = domain_lib_df.select(domain_lib_df.domain, F.explode(domain_lib_df.js_lib).alias("js_lib")).dropna()

In [15]:
domain_lib_df = domain_lib_df.dropDuplicates(["domain", "js_lib"])

In [16]:
count_df = domain_lib_df.groupby("js_lib").agg(F.count("domain").alias("domain_count"))

In [17]:
sorted_df = count_df.sort("domain_count", ascending=False)

In [18]:
sorted_df.show(n=100, truncate=False)



+--------------------------------------------------------------------------------+------------+
|js_lib                                                                          |domain_count|
+--------------------------------------------------------------------------------+------------+
|jquery.js                                                                       |3649381     |
|jquery-migrate.js                                                               |2291335     |
|core.js                                                                         |959250      |
|index.js                                                                        |937461      |
|bootstrap.js                                                                    |892675      |
|hooks.js                                                                        |798132      |
|main.js                                                                         |787037      |
|i18n.js                                

                                                                                