In [1]:
import re
from urllib.parse import urlparse

import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("co-occurrence") \
    .master("spark://spark-master:7077") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 0) \
    .config("spark.dynamicAllocation.minExecutors", 1) \
    .config("spark.dynamicAllocation.maxExecutors", 10) \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.scheduler.mode", "FAIR") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/01 14:30:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/01 14:30:46 WARN Utils: spark.dynamicAllocation.initialExecutors less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
24/10/01 14:30:46 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.


In [3]:
df = spark.read.parquet("script_extraction_out/")

In [4]:
df.show(n=5)

                                                                                

+--------------+--------------------+----------+--------------------+--------------------+----+
|            ip|                host|    server|              emails|    script_src_attrs|year|
+--------------+--------------------+----------+--------------------+--------------------+----+
|  217.160.0.83|https://nadinesim...|    Apache|mailto:nadine@nad...|https://nadinesim...|2024|
|185.127.236.35|https://azionecat...| Glaucorm3|                    |\'https://azionec...|2024|
|  52.15.227.12|https://secureftp...|      NULL|                    |js/dist/jquery.mi...|2024|
|  2.59.135.142|https://www.zeitk...|     nginx|                    |/_assets/a739cde7...|2024|
|   104.21.60.7|http://www.250r.r...|cloudflare|                    |http://www.250r.r...|2024|
+--------------+--------------------+----------+--------------------+--------------------+----+
only showing top 5 rows



In [5]:
jslib_regex = re.compile(r"/?(?:js\.)?([^/?]+\.js)")
regex_broadcast = spark.sparkContext.broadcast(jslib_regex)

In [6]:
@F.udf(StringType())
def extract_domain(host):
    parsed_url = urlparse(host)
    return parsed_url.netloc

In [7]:
df = df.withColumn("domain", extract_domain("host"))

to remove:
- "js" not in string
- "js"
- having ','

In [9]:
# @udf(ArrayType(StringType()))
# def attrs_to_libs(src_attrs):
#     """Parse list of src attrs to have JS libs ONLY."""
#     splits = src_attrs.split('|')
#     for i, s in enumerate(splits):
#         if s.strip('.') == "js":
#             print(s, file=open("/opt/workspace/test.txt", 'a'))
#         s = s.split('/')[-1]
#         if "js" not in s:
#             splits[i] = ''
#             continue
#         # handles this case: js.cookie.min.js?ver=2.1.4-wc.9.1.4
#         s = s.replace(".min", '')
#         s = s.split("js")[-2] + "js"
#         splits[i] = s.strip('.')
#     return splits

In [8]:
@F.udf(ArrayType(StringType()))
def attrs_to_libs(src_attrs):
    """Parse list of src attrs to have JS libs ONLY."""
    splits = src_attrs.split('|')
    regex_obj = regex_broadcast.value
    for i, s in enumerate(splits):
        # m = re.search(r"/?(?:js\.)?([^/?]+\.js)", s)
        m = regex_obj.search(s)
        if m:
            splits[i] = m.group(1)
        else:
            splits[i] = None
            continue
        splits[i] = re.sub(r"\.min\.", '.', splits[i])

    return splits

In [19]:
# @F.udf(ArrayType(StringType()))
# def attrs_to_libs(src_attrs):
#     """Parse list of src attrs to have JS libs ONLY."""
#     splits = src_attrs.split('|')
#     parser = parser_broadcast.value
#     for i, s in enumerate(splits):
#         try:
#             s = parser(s).path.split('/')[-1]
#         except:
#             # this handles cases like "https://wpsuspension.hujquery(document).off('click.fb-start', '[data-trigger]');?8600c2288b94838e7ddbb879e6329a62"
#             pass
#         s = s.split(',')[0]
#         s = s.replace('/', '')

#         if not "js" in s \
#         or s == "js":
#             splits[i] = None
#             continue

#         # handles this case: js.cookie.min.js?ver=2.1.4-wc.9.1.4
#         try:
#             splits[i] = s.split("js")[-2].strip('.') + ".js"
#         except:
#             pass
#     return splits

In [12]:
domain_lib_df = df.withColumn("js_lib", attrs_to_libs("script_src_attrs"))

In [13]:
domain_lib_df.select("js_lib").show(n=5)

+--------------------+
|              js_lib|
+--------------------+
|[timeme.js, burst...|
|[jquery.js, boots...|
|[jquery.js, jquer...|
|[optout.js, NULL,...|
|[script.js, theme...|
+--------------------+
only showing top 5 rows



In [22]:
domain_lib_df.count()

29843875

In [23]:
29843875 // 4

7460968

In [11]:
# domain_lib_df = domain_lib_df.limit(7460968)

In [14]:
domain_lib_df = domain_lib_df.select(domain_lib_df.domain, F.explode(domain_lib_df.js_lib).alias("js_lib")).dropna()

In [18]:
domain_lib_df.show(truncate=False)



+-----------------+----------------------------------+
|domain           |js_libs                           |
+-----------------+----------------------------------+
|ecosupplements.nl|jquery.min.js                     |
|ecosupplements.nl|jquery.easy-autocomplete.min.js   |
|ecosupplements.nl|husky.js                          |
|ecosupplements.nl|jquery.blockui.min.js             |
|ecosupplements.nl|add-to-cart.min.js                |
|ecosupplements.nl|jquery.flexslider.min.js          |
|ecosupplements.nl|photoswipe.min.js                 |
|ecosupplements.nl|photoswipe-ui-default.min.js      |
|ecosupplements.nl|cookie.min.js                     |
|ecosupplements.nl|main-front.js                     |
|ecosupplements.nl|wpm-public__premium_only.p1.min.js|
|ecosupplements.nl|tokenization-form.min.js          |
|ecosupplements.nl|underscore.min.js                 |
|ecosupplements.nl|wp-util.min.js                    |
|ecosupplements.nl|add-to-cart-variation.min.js      |
|ecosupple

                                                                                

In [16]:
domain_lib_df = domain_lib_df.dropDuplicates(["domain", "js_lib"])

In [18]:
joined_df = domain_lib_df.alias("df1").join(
    domain_lib_df.alias("df2"),
    F.col("df1.domain") == F.col("df2.domain"),
    "inner"
).select(
    F.col("df1.domain"),
    F.col("df1.js_lib").alias("js_lib1"),
    F.col("df2.js_lib").alias("js_lib2"),
)

In [19]:
joined_df = joined_df.filter(F.col("js_lib1") != F.col("js_lib2"))

In [20]:
co_occ_df = joined_df.groupBy("js_lib1", "js_lib2").agg(F.count_distinct("domain").alias("domain_count"))

In [21]:
co_occ_df.sort("domain_count", ascending=False).write.mode("overwrite").parquet("jslibs_co_occ_2024")

                                                                                

In [22]:
temp = spark.read.parquet("./jslibs_co_occ_2024/")

In [24]:
temp.sort("domain_count", ascending=False).show(n=50, truncate=False)



+-----------------+-----------------+------------+
|js_lib1          |js_lib2          |domain_count|
+-----------------+-----------------+------------+
|jquery-migrate.js|jquery.js        |2285881     |
|jquery.js        |jquery-migrate.js|2285881     |
|jquery.js        |core.js          |847560      |
|core.js          |jquery.js        |847560      |
|index.js         |jquery.js        |806591      |
|jquery.js        |index.js         |806591      |
|i18n.js          |hooks.js         |765630      |
|hooks.js         |i18n.js          |765630      |
|jquery.js        |hooks.js         |763411      |
|hooks.js         |jquery.js        |763411      |
|core.js          |jquery-migrate.js|758554      |
|jquery-migrate.js|core.js          |758554      |
|i18n.js          |jquery.js        |738677      |
|jquery.js        |i18n.js          |738677      |
|jquery-migrate.js|index.js         |714294      |
|index.js         |jquery-migrate.js|714294      |
|comment-reply.js |jquery.js   

                                                                                