In [1]:
import re
from urllib.parse import urlparse

import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StructType, StringType, ArrayType
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("top-20-jslibs") \
    .master("spark://spark-master:7077") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 0) \
    .config("spark.dynamicAllocation.minExecutors", 1) \
    .config("spark.dynamicAllocation.maxExecutors", 15) \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.scheduler.mode", "FAIR") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 12:01:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/25 12:01:35 WARN Utils: spark.dynamicAllocation.initialExecutors less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
24/09/25 12:01:35 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.


In [3]:
df = spark.read.parquet("/opt/workspace/warc_yearly/script_extraction_out_2022/")

In [10]:
jslib_regex = re.compile(r"/?(?:js\.)?([^/?]+\.js)")
regex_broadcast = spark.sparkContext.broadcast(jslib_regex)

In [9]:
@F.udf(StringType())
def extract_domain(host):
    parsed_url = urlparse(host)
    return parsed_url.netloc

In [6]:
df = df.withColumn("domain", extract_domain("host"))

In [7]:
# @F.udf(ArrayType(StringType()))
# def attrs_to_libs(src_attrs):
#     """Parse list of src attrs to have JS libs ONLY."""
#     splits = src_attrs.split('|')
#     parser = parser_broadcast.value
#     for i, s in enumerate(splits):
#         # handles cases like: "\'http://armenia.pl/wp-content/plugins/uk-cookie-consent/assets/js/uk-cookie-consent-js.js?ver=2.3.0\'"
#         s = s.strip("\'")
#         s = s.strip("\\")
        
#         try:
#             s = parser(s).path.split('/')[-1]
#         except:
#             # handles cases like "https://wpsuspension.hujquery(document).off('click.fb-start', '[data-trigger]');?8600c2288b94838e7ddbb879e6329a62"
#             continue
            
#         if not "js" in s:
#             splits[i] = None
#             continue

#         # jquery.min.js and jquery.js are the same
#         s = re.sub(r"\.min\.", '.', s)
        
#         # handles cases like: js.cookie.min.js?ver=2.1.4-wc.9.1.4
#         s = re.sub(r"^js\.", '', s)
#         if s == "js":
#             continue
#         splits[i] = s
        
#     return splits

In [8]:
@F.udf(ArrayType(StringType()))
def attrs_to_libs(src_attrs):
    """Parse list of src attrs to have JS libs ONLY."""
    splits = src_attrs.split('|')
    regex_obj = regex_broadcast.value
    for i, s in enumerate(splits):
        # m = re.search(r"/?(?:js\.)?([^/?]+\.js)", s)
        m = regex_obj.search(s)
        if m:
            splits[i] = m.group(1)
        else:
            splits[i] = None
            continue
        splits[i] = re.sub(r"\.min\.", '.', splits[i])
            
    return splits

In [9]:
domain_lib_df = df.withColumn("js_lib", attrs_to_libs("script_src_attrs")).limit(746096//2)

In [10]:
domain_lib_df = domain_lib_df.select(domain_lib_df.domain, F.explode(domain_lib_df.js_lib).alias("js_lib")).dropna()

In [11]:
domain_lib_df = domain_lib_df.dropDuplicates(["domain", "js_lib"])

In [30]:
# domain_lib_df.show(n=100, truncate=False)

In [25]:
count_df = domain_lib_df.groupby("js_lib").agg(F.count("domain").alias("domain_count"))

In [26]:
count_df.sort("domain_count", ascending=False).show(n=50, truncate=False)

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+------------+
|js_lib                                                                          |domain_count|
+--------------------------------------------------------------------------------+------------+
|jquery.js                                                                       |150047      |
|jquery-migrate.js                                                               |89902       |
|bootstrap.js                                                                    |37372       |
|core.js                                                                         |33776       |
|index.js                                                                        |32533       |
|comment-reply.js                                                                |30968       |
|wp-polyfill.js                                                                  |29149       |
|scripts.js                             

24/09/25 08:20:14 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /172.18.0.7:40948 is closed
24/09/25 08:20:14 WARN BlockManagerMasterEndpoint: Error trying to remove shuffle 4 from block manager BlockManagerId(42, 172.18.0.7, 34059, None)
java.io.IOException: Connection from /172.18.0.7:40948 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:147)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:305)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:281)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:274)
	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundHandlerAdapter.java