In [3]:
import re
from urllib.parse import urlparse

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, array_contains
from pyspark.sql.types import StringType, ArrayType

In [4]:
# Build (or reuse) the Spark session for this notebook: standalone master,
# event logging to a local directory, and dynamic allocation between 2 and 10
# single-core 4g executors.
spark = (
    SparkSession.builder
    .appName("url-tokenize")
    .master("spark://spark-master:7077")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:///opt/spark/spark-events/")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", 1)
    # Stated explicitly and kept equal to minExecutors: when left unset Spark
    # logs "spark.executor.instances less than
    # spark.dynamicAllocation.minExecutors is invalid" at startup.
    .config("spark.executor.instances", 2)
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    .config("spark.dynamicAllocation.initialExecutors", 2)
    .config("spark.dynamicAllocation.minExecutors", 2)
    .config("spark.dynamicAllocation.maxExecutors", 10)
    .config("spark.scheduler.mode", "FAIR")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/24 14:39:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/24 14:39:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/24 14:39:34 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 54726)
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/lib/python3.11/socketserve

In [5]:
# Load the 2024 script-extraction parquet output and report its
# (row count, column count).
spark_df = spark.read.parquet('/opt/workspace/warc_yearly/script_extraction_out_2024/')
df_rows, df_cols = spark_df.count(), len(spark_df.columns)
(df_rows, df_cols)

                                                                                

(22127510, 5)

In [6]:
# Preview the first rows (20 rows, long cell values truncated).
spark_df.show(n=20, truncate=True)

+--------------+--------------------+--------------------+--------------------+--------------------+
|            ip|                host|              server|              emails|    script_src_attrs|
+--------------+--------------------+--------------------+--------------------+--------------------+
| 38.165.50.153|http://04v.career...|               nginx|mailto:al_andino@...|http://kit.fontaw...|
| 159.75.83.151|http://088022.com...|Apache/2.4.39 (Wi...|                    |http://088022.com...|
| 38.165.49.137|http://1.vapemanz...|               nginx|                    |http://query.aliy...|
|178.254.10.206|http://118527.web...|              Apache|                    |./styles/pronight...|
|219.234.30.122|http://1395656471...|           wts/1.7.4|                    |/pc/js/jquery-1.1...|
|  173.231.4.85|http://14star.com...|Apache/2.2.31 (Un...|mailto:www14star@...|include/js/common...|
| 159.69.237.22|http://15938.home...|               nginx|mailto:emilyy.rod...|//img.homepa

In [7]:
# Take the `host` value of the first row and check which part of it
# urlparse reports as the network location.
first_row = spark_df.first()
sample_host = first_row['host']
urlparse(sample_host).netloc

                                                                                

'04v.career-bengoshi.net'

In [8]:
# Broadcast handle to `urlparse`, read by the UDF below on the executors.
# NOTE(review): broadcasting a plain stdlib function is probably unnecessary
# (the UDF could call `urlparse` directly); kept so that any other cell that
# reads `domain_parser` keeps working.
domain_parser = spark.sparkContext.broadcast(urlparse)


@udf(StringType())
def get_domain(host_url):
    """Return the network location (``netloc``) of ``host_url``.

    Args:
        host_url: URL string taken from the ``host`` column; may be null.

    Returns:
        The ``netloc`` component as a string, or ``None`` (a null cell) when
        the input is null / not a string or when ``urlparse`` rejects it.
    """
    # urlparse(None) yields a bytes result, which is not a valid value for a
    # StringType column, so nulls are passed through as nulls instead.
    if not isinstance(host_url, str):
        return None
    parser = domain_parser.value
    try:
        return parser(host_url).netloc
    except ValueError:
        # urlparse raises ValueError for malformed network locations (for
        # example unmatched square brackets); one bad crawled URL should not
        # abort the whole job.
        return None

In [9]:
# Derive a `domain` column from `host` with the get_domain UDF and preview it.
spark_df_domains = spark_df.withColumn('domain', get_domain(col('host')))
spark_df_domains.show(n=20, truncate=True)

+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            ip|                host|              server|              emails|    script_src_attrs|              domain|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 38.165.50.153|http://04v.career...|               nginx|mailto:al_andino@...|http://kit.fontaw...|04v.career-bengos...|
| 159.75.83.151|http://088022.com...|Apache/2.4.39 (Wi...|                    |http://088022.com...|          088022.com|
| 38.165.49.137|http://1.vapemanz...|               nginx|                    |http://query.aliy...|    1.vapemanzil.com|
|178.254.10.206|http://118527.web...|              Apache|                    |./styles/pronight...|118527.webhosting...|
|219.234.30.122|http://1395656471...|           wts/1.7.4|                    |/pc/js/jquery-1.1...|     13956564713.com|
|  173.231.4.85|http://1

                                                                                

# tiktoken

In [17]:
import tiktoken

In [18]:
# Encode a sample domain with the "o200k_base" encoding, then decode every
# token id on its own so the individual pieces can be inspected.
cl_enc = tiktoken.get_encoding("o200k_base")
txt = "04v.career-bengoshi.net"
enc_txt = cl_enc.encode(txt)

outs = [cl_enc.decode([token_id]) for token_id in enc_txt]
' | '.join(outs)

'04 | v | .c | areer | -b | eng | oshi | .net'

# BERT tokenizer

In [19]:
from transformers import BertTokenizer

In [20]:
# Load the pretrained "bert-base-uncased" tokenizer and split the sample
# domain string `txt` (defined in the tiktoken cell) into tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
txt_tokens = tokenizer.tokenize(txt)
# Bare last expression so the notebook displays the token list.
txt_tokens

['04', '##v', '.', 'career', '-', 'ben', '##gos', '##hi', '.', 'net']

In [21]:
# Pattern for a run of lowercase ASCII letters delimited by word boundaries;
# used with `.match`, so it only matches at the start of a token.
word_match_pattern = re.compile(pattern=r"\b[a-z]+\b", flags=0)

In [22]:
# Keep only the tokens that the word pattern matches.
words = list(filter(word_match_pattern.match, txt_tokens))
words

['career', 'ben', 'net']

In [23]:
# Broadcast the compiled word pattern; the topic UDF reads it through `.value`.
broadcast_pattern = spark.sparkContext.broadcast(
    value=word_match_pattern
)

In [24]:
@udf(ArrayType(StringType()))
def get_url_topics(host_url):
    """Tokenize a domain string and keep only its plain-word tokens.

    Args:
        host_url: String to tokenize (the ``domain`` column in this
            notebook); may be null.

    Returns:
        List of the tokens produced by ``tokenizer`` that
        ``broadcast_pattern`` matches, in their original order, or ``None``
        (a null cell) when the input is null.
    """
    # Null cells are propagated as nulls instead of being handed to the
    # tokenizer, which expects text.
    if host_url is None:
        return None
    word_parser = broadcast_pattern.value
    return [
        token
        for token in tokenizer.tokenize(host_url)
        if word_parser.match(token)
    ]

In [25]:
# Add the word tokens of each domain as `domain_topics` and preview 20 rows.
url_topic_df = spark_df_domains.withColumn(
    "domain_topics",
    get_url_topics(col("domain")),
)
url_topic_df.show(20)

[Stage 8:>                                                          (0 + 1) / 1]

+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|            ip|                host|              server|              emails|    script_src_attrs|              domain|     domain_topics|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
| 38.165.50.153|http://04v.career...|               nginx|mailto:al_andino@...|http://kit.fontaw...|04v.career-bengos...|[career, ben, net]|
| 159.75.83.151|http://088022.com...|Apache/2.4.39 (Wi...|                    |http://088022.com...|          088022.com|             [com]|
| 38.165.49.137|http://1.vapemanz...|               nginx|                    |http://query.aliy...|    1.vapemanzil.com|         [va, com]|
|178.254.10.206|http://118527.web...|              Apache|                    |./styles/pronight...|118527.webhosting...|         [web, de]|
|219.234.30.1

                                                                                

In [27]:
# Count the rows whose `domain_topics` array contains the token "com".
# This matches "com" at any position in the domain (e.g. a "com-..." label),
# not only a trailing ".com" TLD, so the label describes it as a token count.
com_cnt = url_topic_df.filter(
    array_contains(col("domain_topics"), "com")
).count()
print(f"Number of domains with a 'com' token:\n{com_cnt}")

                                                                                

"Number of domains with '.com': 10500444"