In [11]:
import re
from urllib.parse import urlparse

import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StructType, StringType, ArrayType
import pandas as pd

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("top-20-jslibs") \
    .master("spark://spark-master:7077") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 0) \
    .config("spark.dynamicAllocation.minExecutors", 1) \
    .config("spark.dynamicAllocation.maxExecutors", 15) \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.scheduler.mode", "FAIR") \
    .getOrCreate()

In [13]:
df = spark.read.parquet("/opt/workspace/warc_yearly/script_extraction_out_2022/")

In [14]:
jslib_regex = re.compile(r"/?(?:js\.)?([^/?]+\.js)")
regex_broadcast = spark.sparkContext.broadcast(jslib_regex)

In [15]:
@F.udf(StringType())
def extract_domain(host):
    parsed_url = urlparse(host)
    return parsed_url.netloc

In [16]:
df = df.withColumn("domain", extract_domain("host"))

In [17]:
# @F.udf(ArrayType(StringType()))
# def attrs_to_libs(src_attrs):
#     """Parse list of src attrs to have JS libs ONLY."""
#     splits = src_attrs.split('|')
#     parser = parser_broadcast.value
#     for i, s in enumerate(splits):
#         # handles cases like: "\'http://armenia.pl/wp-content/plugins/uk-cookie-consent/assets/js/uk-cookie-consent-js.js?ver=2.3.0\'"
#         s = s.strip("\'")
#         s = s.strip("\\")
        
#         try:
#             s = parser(s).path.split('/')[-1]
#         except:
#             # handles cases like "https://wpsuspension.hujquery(document).off('click.fb-start', '[data-trigger]');?8600c2288b94838e7ddbb879e6329a62"
#             continue
            
#         if not "js" in s:
#             splits[i] = None
#             continue

#         # jquery.min.js and jquery.js are the same
#         s = re.sub(r"\.min\.", '.', s)
        
#         # handles cases like: js.cookie.min.js?ver=2.1.4-wc.9.1.4
#         s = re.sub(r"^js\.", '', s)
#         if s == "js":
#             continue
#         splits[i] = s
        
#     return splits

In [18]:
@F.udf(ArrayType(StringType()))
def attrs_to_libs(src_attrs):
    """Parse list of src attrs to have JS libs ONLY."""
    splits = src_attrs.split('|')
    regex_obj = regex_broadcast.value
    for i, s in enumerate(splits):
        # m = re.search(r"/?(?:js\.)?([^/?]+\.js)", s)
        m = regex_obj.search(s)
        if m:
            splits[i] = m.group(1)
        else:
            splits[i] = None
            continue
        splits[i] = re.sub(r"\.min\.", '.', splits[i])
            
    return splits

In [19]:
domain_lib_df = df.withColumn("js_lib", attrs_to_libs("script_src_attrs")).limit(746096//2)

In [20]:
domain_lib_df = domain_lib_df.select(domain_lib_df.domain, F.explode(domain_lib_df.js_lib).alias("js_lib")).dropna()

In [21]:
domain_lib_df = domain_lib_df.dropDuplicates(["domain", "js_lib"])

In [23]:
count_df = domain_lib_df.groupby("js_lib").agg(F.count("domain").alias("domain_count"))

In [25]:
sorted_df = count_df.sort("domain_count", ascending=False)

In [28]:
sorted_df.show(n=100, truncate=False)

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+------------+
|js_lib                                                                          |domain_count|
+--------------------------------------------------------------------------------+------------+
|jquery.js                                                                       |151156      |
|jquery-migrate.js                                                               |90603       |
|bootstrap.js                                                                    |37613       |
|core.js                                                                         |34099       |
|index.js                                                                        |32928       |
|comment-reply.js                                                                |31605       |
|wp-polyfill.js                                                                  |29462       |
|scripts.js                             

                                                                                