In [1]:
from urllib.parse import urlparse

import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StructType, StringType, ArrayType
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by every cell below.
# A parenthesised chain replaces backslash continuations so that individual
# options can carry comments.
spark = (
    SparkSession.builder
    .appName("co-occurrence")
    .master("spark://spark-master:7077")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", 1)
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    # initialExecutors must not be lower than minExecutors. The previous value
    # of 0 was reported as invalid and ignored by Spark at start-up, so it is
    # aligned with minExecutors here.
    .config("spark.dynamicAllocation.initialExecutors", 1)
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.maxExecutors", 10)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.scheduler.mode", "FAIR")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/21 11:10:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/21 11:10:17 WARN Utils: spark.dynamicAllocation.initialExecutors less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
24/09/21 11:10:17 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52186)
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/lib/python3.11/socketserver.py", line 348, in process_request
    self.fini

In [3]:
# Read the parquet dataset at the path below into a Spark DataFrame.
script_extraction_path = "/opt/workspace/warc_yearly/script_extraction_out_2023/"
df = spark.read.parquet(script_extraction_path)

In [4]:
# Preview the first five rows of the loaded data.
df.show(5)

+--------------+--------------------+---------+------+--------------------+
|            ip|                host|   server|emails|    script_src_attrs|
+--------------+--------------------+---------+------+--------------------+
| 168.76.191.88|http://00-am.com/...|    nginx|      |/template/mxone/m...|
| 168.76.191.88|http://00-am.com/...|    nginx|      |/template/mxone/m...|
|  103.43.188.3|http://0731dj.com...|wts/1.7.0|      |\'/packs/layui/la...|
|  103.43.188.3|http://0731dj.com...|wts/1.7.0|      |                    |
|157.112.176.59|http://100-meizan...|    nginx|      |\'http://100-meiz...|
+--------------+--------------------+---------+------+--------------------+
only showing top 5 rows



In [5]:
# Broadcast the urlparse function; the UDFs below read it via `.value`.
# NOTE(review): broadcasting a plain stdlib function may be unnecessary — confirm.
sc = spark.sparkContext
parser_broadcast = sc.broadcast(urlparse)

In [6]:
@F.udf(StringType())
def extract_domain(host):
    """Return the network-location (``netloc``) part of a URL.

    Args:
        host: URL string taken from the ``host`` column; may be None.

    Returns:
        The ``netloc`` component produced by the broadcast ``urlparse``,
        or None when ``host`` is None or ``urlparse`` rejects it.
    """
    if host is None:
        # Null passthrough: urlparse(None) does not produce a str result.
        return None
    parser = parser_broadcast.value
    try:
        return parser(host).netloc
    except ValueError:
        # urlparse raises ValueError for some malformed URLs (for example an
        # unbalanced "[" in the host part); emit null instead of failing the task.
        return None

In [7]:
# Add a "domain" column derived from the "host" URL via the extract_domain UDF.
host_col = F.col("host")
df = df.withColumn("domain", extract_domain(host_col))

In [8]:
# Preview the first five rows, now including the derived "domain" column.
df.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------+--------------------+---------+------+--------------------+--------------+
|            ip|                host|   server|emails|    script_src_attrs|        domain|
+--------------+--------------------+---------+------+--------------------+--------------+
| 168.76.191.88|http://00-am.com/...|    nginx|      |/template/mxone/m...|     00-am.com|
| 168.76.191.88|http://00-am.com/...|    nginx|      |/template/mxone/m...|     00-am.com|
|  103.43.188.3|http://0731dj.com...|wts/1.7.0|      |\'/packs/layui/la...|    0731dj.com|
|  103.43.188.3|http://0731dj.com...|wts/1.7.0|      |                    |    0731dj.com|
|157.112.176.59|http://100-meizan...|    nginx|      |\'http://100-meiz...|100-meizan.com|
+--------------+--------------------+---------+------+--------------------+--------------+
only showing top 5 rows



                                                                                

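Cases to handle when cleaning the extracted script names:
- names that do not contain "js" (drop)
- the bare name "js" (drop)
- names containing ',' (keep only the text before the first comma)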

In [9]:
# NOTE: superseded draft of attrs_to_libs, kept commented out for reference only.
# @udf(ArrayType(StringType()))
# def attrs_to_libs(src_attrs):
#     """Parse list of src attrs to have JS libs ONLY."""
#     splits = src_attrs.split('|')
#     for i, s in enumerate(splits):
#         if s.strip('.') == "js":
#             print(s, file=open("/opt/workspace/test.txt", 'a'))
#         s = s.split('/')[-1]
#         if "js" not in s:
#             splits[i] = ''
#             continue
#         # handles this case: js.cookie.min.js?ver=2.1.4-wc.9.1.4
#         s = s.replace(".min", '')
#         s = s.split("js")[-2] + "js"
#         splits[i] = s.strip('.')
#     return splits

In [10]:
# Scratch check: the last path segment of a script URL is the file name,
# and the query string is not part of the parsed path.
sample_src = "https://img1.shanghaixiaochagu.com/assets/js/redbag-cdown.js?zone=asia/hong_kong&1"
urlparse(sample_src).path.split('/')[-1]

'redbag-cdown.js'

In [19]:
@F.udf(ArrayType(StringType()))
def attrs_to_libs(src_attrs):
    """Reduce a '|'-separated list of script ``src`` values to JS file names.

    Args:
        src_attrs: '|'-separated string of script ``src`` attribute values;
            may be None.

    Returns:
        A list with one entry per input value: the derived ``<name>.js``
        string, or None when the cleaned value does not contain "js" or is
        exactly "js". Returns None when ``src_attrs`` itself is None.
    """
    if src_attrs is None:
        # Null row: calling .split on None would raise AttributeError.
        return None

    parser = parser_broadcast.value
    libs = []
    for src in src_attrs.split('|'):
        try:
            # Keep only the last path segment; the parsed path excludes the query string.
            name = parser(src).path.split('/')[-1]
        except ValueError:
            # urlparse rejects some malformed values, e.g.
            # "https://wpsuspension.hujquery(document).off('click.fb-start', '[data-trigger]');?8600c2288b94838e7ddbb879e6329a62"
            # In that case fall back to the raw string.
            name = src
        name = name.split(',')[0].replace('/', '')

        if "js" not in name or name == "js":
            libs.append(None)
            continue

        # Take the text between the last two "js" occurrences and re-append ".js".
        # This drops trailing text such as "?ver=2.1.4-wc.9.1.4" when it survived parsing.
        # "js" is guaranteed to be present here, so the split always has >= 2 parts.
        # NOTE(review): names with several "js" occurrences lose their leading part
        # (e.g. "js.cookie.min.js" -> "cookie.min.js"); confirm this is intended.
        libs.append(name.split("js")[-2].strip('.') + ".js")
    return libs

In [20]:
# Attach the per-row list of derived JS file names as a "js_libs" column.
src_attrs_col = F.col("script_src_attrs")
domain_lib_df = df.withColumn("js_libs", attrs_to_libs(src_attrs_col))

In [21]:
# Preview the derived "js_libs" arrays for the first five rows.
domain_lib_df.select(F.col("js_libs")).show(5)

+--------------------+
|             js_libs|
+--------------------+
|[jquery.js, jquer...|
|[jquery.js, jquer...|
|[layui.js, jquery...|
|              [NULL]|
|[leaflet.js, leaf...|
+--------------------+
only showing top 5 rows



In [22]:
# Total number of rows in domain_lib_df.
domain_lib_df.count()

29843875

In [23]:
# One quarter of 29843875 (integer division); the result is used as the row limit in the next cell.
29843875 // 4

7460968

In [24]:
# Cap the data at 29843875 // 4 == 7460968 rows.
# NOTE(review): limit() without an explicit ordering may not select the same rows on every run — confirm.
row_cap = 7460968
domain_lib_df = domain_lib_df.limit(row_cap)

In [25]:
# Flatten to one (domain, js_libs) row per array element, then drop rows that contain a null.
# NOTE(review): this cell rebinds domain_lib_df, so "js_libs" is no longer an array
# afterwards; re-running the cell on its own output is not expected to work.
exploded_lib = F.explode(F.col("js_libs")).alias("js_libs")
domain_lib_df = domain_lib_df.select(F.col("domain"), exploded_lib).dropna()

In [18]:
# Show the default 20 rows without truncating long file names.
domain_lib_df.show(20, truncate=False)



+-----------------+----------------------------------+
|domain           |js_libs                           |
+-----------------+----------------------------------+
|ecosupplements.nl|jquery.min.js                     |
|ecosupplements.nl|jquery.easy-autocomplete.min.js   |
|ecosupplements.nl|husky.js                          |
|ecosupplements.nl|jquery.blockui.min.js             |
|ecosupplements.nl|add-to-cart.min.js                |
|ecosupplements.nl|jquery.flexslider.min.js          |
|ecosupplements.nl|photoswipe.min.js                 |
|ecosupplements.nl|photoswipe-ui-default.min.js      |
|ecosupplements.nl|cookie.min.js                     |
|ecosupplements.nl|main-front.js                     |
|ecosupplements.nl|wpm-public__premium_only.p1.min.js|
|ecosupplements.nl|tokenization-form.min.js          |
|ecosupplements.nl|underscore.min.js                 |
|ecosupplements.nl|wp-util.min.js                    |
|ecosupplements.nl|add-to-cart-variation.min.js      |
|ecosupple

                                                                                

In [26]:
# Keep a single row per (domain, js_libs) pair.
pair_key = ["domain", "js_libs"]
domain_lib_df = domain_lib_df.drop_duplicates(subset=pair_key)

In [27]:
# Self-join on domain: pair every library of a domain with every library of the same domain.
libs_left = domain_lib_df.alias("df1")
libs_right = domain_lib_df.alias("df2")
same_domain = F.col("df1.domain") == F.col("df2.domain")

joined_df = libs_left.join(libs_right, on=same_domain, how="inner").select(
    F.col("df1.domain"),
    F.col("df1.js_libs").alias("js_lib1"),
    F.col("df2.js_libs").alias("js_lib2"),
)

In [28]:
# Remove self-pairs (a library paired with itself).
# Both orderings of a pair, (a, b) and (b, a), are kept.
is_distinct_pair = F.col("js_lib1") != F.col("js_lib2")
joined_df = joined_df.where(is_distinct_pair)

In [30]:
# For each ordered library pair, count the distinct domains on which both appear.
distinct_domains = F.count_distinct("domain").alias("domain_count")
co_occ_df = joined_df.groupBy("js_lib1", "js_lib2").agg(distinct_domains)

In [18]:
# Sort by descending domain count and write the result as parquet, replacing any previous output.
ranked_co_occ = co_occ_df.orderBy(F.col("domain_count").desc())
ranked_co_occ.write.mode("overwrite").parquet("jslibs_co_occ_2023")

                                                                                

In [19]:
# Read the co-occurrence parquet output back in for inspection.
co_occ_path = "./jslibs_co_occ_2023/"
temp = spark.read.parquet(co_occ_path)

In [24]:
# Show the 50 pairs with the highest domain count, without truncating names.
temp.orderBy(F.col("domain_count").desc()).show(50, truncate=False)



+--------------------------+--------------------------+------------+
|js_lib1                   |js_lib2                   |domain_count|
+--------------------------+--------------------------+------------+
|jquery.min.js             |jquery.min.js             |4005823     |
|jquery-migrate.min.js     |jquery-migrate.min.js     |2862142     |
|jquery.min.js             |jquery-migrate.min.js     |2604344     |
|jquery-migrate.min.js     |jquery.min.js             |2604344     |
|index.js                  |index.js                  |1125419     |
|wp-polyfill.min.js        |wp-polyfill.min.js        |1035894     |
|bootstrap.min.js          |bootstrap.min.js          |994347      |
|core.min.js               |core.min.js               |989264      |
|comment-reply.min.js      |comment-reply.min.js      |983022      |
|wp-polyfill.min.js        |jquery.min.js             |972877      |
|jquery.min.js             |wp-polyfill.min.js        |972877      |
|index.js                  |jquery

                                                                                

In [75]:
# Scratch: sample script URL with a query string that itself contains a '/'.
s = "https://img1.shanghaixiaochagu.com/assets/js/redbag-cdown.js?zone=asia/hong_kong&1"

In [127]:
# Scratch: a bare token without a scheme is parsed entirely into `path`.
urlparse("wp-includes")

ParseResult(scheme='', netloc='', path='wp-includes', params='', query='', fragment='')

In [42]:
# Scratch: text between the last two "js" occurrences of `s`.
# NOTE(review): the saved output does not match the `s` assigned in the cell shown
# above (execution counts are out of order); `s` held a different value when this ran.
s.split("js")[-2]

'cookie.min.'

In [43]:
# Scratch: same split as the previous cell, with surrounding dots stripped and ".js" re-appended.
# NOTE(review): the saved output comes from a different value of `s` than the one shown above.
s.split("js")[-2].strip('.') + ".js"

'cookie.min.js'

In [139]:
# Scratch: check whether "js" is a substring of "wp-content".
# It is not, so nothing is printed.
if "js" in "wp-content":
    print("hi")