In [1]:
from sparktools.sparkhandler import SparkHandler
from dataproc.dataproctools import get_extracted_wet, save_rdd, load_rdd
from pyspark import StorageLevel

We use the `SparkHandler` class so that all of our functions share a single Spark session. It initializes the Spark session and keeps both the session and the Spark context, which we retrieve from it below.

In [None]:
# Create the shared Spark handler, allowing it 4 cores.
handler = SparkHandler(available_cores=4) # look at the parameters for SparkHandler to increase max memory
# Fetch the session and context from the handler: `sc` is used to load the raw data,
# `ss` is used to build the DataFrame and is stopped at the end of the notebook.
ss = handler.get_spark_session()
sc = handler.get_spark_context()

Here we load the raw string data. Note that `get_extracted_wet` does not return exactly the requested sample size, only an approximate amount that depends on the WET files we use.

In [3]:
# Extract raw WET text; `approx_sample_size` is an approximate target, not an exact count.
# Presumably "../data/wet.paths" lists the WET files to sample from -- confirm in dataproctools.
# To reuse a previously saved RDD instead of re-extracting, swap in:
#   load_rdd(spark_context=sc, path_to_load="../saved_intermediates/rawStrRDD")
loaded_str_data = get_extracted_wet(spark_context=sc, approx_sample_size=100000, wet_paths_file="../data/wet.paths")

                                                                                

Here's what the start of a WET file looks like

And this is what it looks like when printed in a prettier format

In [4]:
def split_sites(string):
    """Split raw WET text into its individual ``conversion`` records.

    Args:
        string: Raw text of a WET file (or a chunk of one).

    Returns:
        list[str]: One entry per record, each starting right after the
        ``WARC-Type: conversion`` line. Anything before the first record
        header is dropped; the list is empty when no header is found.
    """
    import re
    # Every record starts with "WARC/<version>\r\nWARC-Type: conversion\r\n".
    # The version is matched generically (any major.minor), with the dot
    # escaped so that only a literal "." is accepted.
    return re.split(r"WARC/\d+\.\d+\r\nWARC-Type: conversion\r\n", string)[1:]

In [5]:
# Flatten: every loaded string contributes all the records that split_sites finds in it.
split_rdd = loaded_str_data.flatMap(split_sites)

In [6]:
def regex_split_to_kv(string):
    """Split one record into its header text and its body.

    The ``Content-Length: <n>\\r\\n`` line is used as the separator and is
    not included in either part.

    Args:
        string: Text of a single record, as produced by ``split_sites``.

    Returns:
        list[str]: ``[headers, body]`` when a Content-Length line is present,
        otherwise ``[string]`` unchanged.
    """
    import re
    # maxsplit=1: only the first Content-Length line ends the headers. A body
    # that happens to contain the same text must stay in one piece, otherwise
    # callers reading element 1 would silently get truncated content.
    return re.split(r"Content-Length: \d+\r\n", string, maxsplit=1)

In [7]:
# Turn each record into a tuple of the pieces returned by regex_split_to_kv
# (header text first, then the body).
KV_mapped = split_rdd.map(lambda x: tuple(regex_split_to_kv(x)))

In [8]:
# Break the header text (element 0) into individual lines on CRLF;
# the body (element 1) is passed through untouched.
Ksplit = KV_mapped.map(lambda x: (x[0].split('\r\n') ,x[1]))

In [9]:
def get_warc_target_uri(split_key):
    """Return the value of the first header line (expected: ``WARC-Target-URI``).

    Args:
        split_key: List of header lines of the form ``"Name: value"``.

    Returns:
        str: Everything after the first ``": "`` of ``split_key[0]``.

    Raises:
        IndexError: If ``split_key`` is empty or its first line has no ``": "``.
    """
    # Split only once so a URI that itself contains ": " is returned whole.
    return split_key[0].split(': ', 1)[1]
def get_warc_target_date(split_key):
    """Return the value of the second header line (expected: ``WARC-Date``).

    The line is split on ``": "`` and the piece right after the header name
    is returned.
    """
    date_line = split_key[1]
    pieces = date_line.split(': ')
    return pieces[1]
def get_warc_record_id(split_key):
    """Return the value of the third header line (expected: ``WARC-Record-ID``).

    The line is split on ``": "`` and the piece right after the header name
    is returned.
    """
    record_id_line = split_key[2]
    pieces = record_id_line.split(': ')
    return pieces[1]
def get_warc_languages(split_key):
    """Return the languages listed in ``WARC-Identified-Content-Language``.

    Args:
        split_key: List of header lines of the form ``"Name: value"``.

    Returns:
        list[str]: The comma-separated language codes from the header, or an
        empty list when the record carries no such header.
    """
    # Look the header up by name instead of by position: it is optional, and
    # when it is missing a fixed index would pick up a different header
    # (e.g. Content-Type) and report its value as the "languages".
    header_prefix = "WARC-Identified-Content-Language: "
    for line in split_key:
        if line.startswith(header_prefix):
            return line[len(header_prefix):].split(',')
    return []
def get_tld_from_url(url):
    """Return the top-level domain of ``url`` including its leading dot.

    Only the host part of the URL is inspected, so dots in the path or query
    (``index.html``, ``?file=a.php``) can never be mistaken for a TLD.

    Args:
        url: URL string; a scheme-less form such as ``"example.com/x"`` is
            accepted as well.

    Returns:
        str: The TLD with its leading dot (e.g. ``".com"``, ``".xn--p1ai"``),
        or the string ``"None"`` when the host has no recognizable TLD
        (IP addresses, single-label hosts, unparsable URLs).
    """
    import re
    from urllib.parse import urlsplit

    try:
        host = urlsplit(url).hostname
        if host is None:
            # No "//" authority found (scheme-less input): re-parse the text
            # as a network-path reference so the host is still recognized.
            host = urlsplit("//" + url).hostname
    except ValueError:
        # urlsplit rejects malformed authorities (e.g. broken IPv6 brackets).
        return "None"
    if not host:
        return "None"

    # The TLD is the last dot-separated label; ignore a trailing root dot.
    _, dot, last_label = host.rstrip(".").rpartition(".")
    # Accept punycode TLDs ("xn--...") or purely alphabetic ones of 2+ letters.
    # `hostname` is already lower-cased by urlsplit.
    if dot and re.fullmatch(r"xn--[a-z0-9]+|[a-z]{2,}", last_label):
        return "." + last_label
    return "None"
# Quick sanity check on a sample URL; this should display '.com'.
get_tld_from_url("http://022hezi.com/tag/%E4%B9%9D%E5%8D%81%E5%85%AB")

'.com'

In [10]:
def _extract_record_fields(kv_pair):
    """Build (record id, target URI, date, languages, body) from a (header lines, body) pair."""
    header_lines = kv_pair[0]
    return (
        get_warc_record_id(header_lines),
        get_warc_target_uri(header_lines),
        get_warc_target_date(header_lines),
        get_warc_languages(header_lines),
        kv_pair[1],
    )


# Replace the raw header lines of every record with the parsed fields.
with_processed_keys = Ksplit.map(_extract_record_fields)

In [24]:
# Pull the first 20 processed records to the driver for inspection.
# NOTE(review): `ex` is not used by any later cell shown here -- likely a leftover debugging step.
ex = with_processed_keys.take(20)

                                                                                

In [11]:
# Insert the TLD (derived from the target URI at index 1) between the
# languages and the raw content.
tlds_added = with_processed_keys.map(
    lambda rec: (*rec[:4], get_tld_from_url(rec[1]), rec[4])
)

In [28]:
# Build a Spark DataFrame from the 6-field tuples; the column names follow the tuple order of `tlds_added`.
df = ss.createDataFrame(tlds_added, schema=['warc_id', 'target_uri', 'date', 'languages', 'tld', 'raw_content'])

                                                                                

In [29]:
# Display the result as a pandas DataFrame.
# NOTE(review): toPandas() collects every row (including the full raw_content) to the driver;
# consider df.limit(n).toPandas() when only a preview is needed.
df.toPandas()

                                                                                

Unnamed: 0,warc_id,target_uri,date,languages,tld,raw_content
0,<urn:uuid:02be2c7b-d20b-4e22-8a93-401bf6614637>,http://022hezi.com/tag/%E4%B9%9D%E5%8D%81%E5%8...,2025-11-15T21:55:56Z,[zho],.com,\r\n安徽禾兹电子交流圈电子网_分享电子电娱最新资讯-安徽禾兹电子交流圈电子网\n安徽禾兹...
1,<urn:uuid:ca033d18-91ca-4ca3-a00e-d3bf42eced7b>,http://0skgxi9.jtznech.com/?3235018.html?xiang...,2025-11-15T21:28:16Z,[zho],.com,\r\n最新版宝马棋牌/稳定版APP-体验极致娱乐，畅享无限乐趣！\n硬件\nAPP下载\n...
2,<urn:uuid:c5f758ee-2df0-4b53-836c-b705cc3d67d6>,http://100-days-of-freedom.com/2015/10/31/tag-...,2025-11-15T21:14:55Z,"[deu, eng]",.com,\r\nTag 117 – Geburtstag bei BMW Marmotor | 10...
3,<urn:uuid:c659d503-6d5c-4905-9eaa-11afebc09edf>,http://10hv.com/docs,2025-11-15T21:37:54Z,[zho],.com,\r\n樱花动漫imo-免费VP加速器\n樱花动漫imo\n工具|时间：2025-11-16...
4,<urn:uuid:e0b888cf-22dc-4cf4-b1df-2772fae69926>,http://10lfgzt.evs888.com/?3359101.html?qingyu...,2025-11-15T21:16:39Z,"[zho, eng]",.com,\r\n竞彩首页专家推荐/平板兼容APP-畅享最佳娱乐体验，尽在此处！\n侨网•正文\n侨宝...
...,...,...,...,...,...,...
22070,<urn:uuid:a6366f62-e2bf-4f43-8b95-44069e0dd306>,https://zustrebisov.sk/udalosti/pozvanka-ldo-7...,2025-11-15T20:57:23Z,[slk],.sk,\r\nPozvánka – LDO 7. ročník súťaže „VLASTNÁ T...
22071,<urn:uuid:5610bead-a464-46b0-bd83-e63355517e7c>,https://zwroty.smile.pl/nadaj-zwrot?domain=mxm...,2025-11-15T20:41:15Z,[pol],.pl,\r\nKod do Paczkomatu InPost | | Smile\nSkorzy...
22072,<urn:uuid:18a28d69-3090-40ec-80a0-c950bca1224f>,https://zyorna.ru/catalog/item/smola-naturalna...,2025-11-15T21:59:45Z,[rus],.ru,"\r\nЛадан натуральный «Dammar» (50 г), цена — ..."
22073,<urn:uuid:7c3f1f36-3a75-4586-9a94-7d022fc1c8a8>,https://zzmicky.com/vod/lgjqp/223337.html,2025-11-15T22:09:11Z,"[zho, eng]",.com,\r\n《731》电影片经典回顾合集-免费高清剧情片系列资源-老光棍电影院\n首页\n电影片...


In [14]:
# Shut down the Spark session now that the notebook is done with it.
ss.stop()