In [39]:
import os
import time
import shutil
import pathlib
import subprocess
from urllib.request import urlretrieve

In [32]:
os.makedirs("warc_paths", exist_ok=True)
with open("yearly_crawls.txt", 'r') as f:
    for cc_crawl in f:
        year = cc_crawl.split('-')[-2]
        file_name = f"./warc_paths/warc_{year}.paths.gz"
        urlretrieve(cc_crawl, file_name)
        os.system(f"gzip -d {file_name}")

gzip: ./warc_paths/warc_2013.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2014.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2015.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2016.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2017.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2018.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2019.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2020.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2021.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2022.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2023.paths already exists;	not overwritten
gzip: ./warc_paths/warc_2024.paths already exists;	not overwritten


In [40]:
def num_warcs_to_proc(wp_file: str) -> int:
    """Returns the number of lines in the warc.paths file."""
    with open(f"./warc_paths/{wp_file}", 'r') as f:
        for count,_ in enumerate(f):
            pass
    return count + 1

In [51]:
def gen_file_splits(wp_file: str):
    """Given a warc.paths file, generates `.txt` files having specified number of WARC filepaths"""
    warc_sample_len = num_warcs_to_proc(wp_file) // 100
    os.system(f"./file_split.sh warc_paths/{wp_file} warc_splits/ {warc_sample_len} {wp_file.split('_')[-1].split('.')[0]}")

In [42]:
def to_paths(input_txt):
    """Converts the WARC URLs to their corresponding paths on the device."""
    updated = []
    with open(input_txt, 'r') as f:
        for l in f:
            l = l.split('/')[-1]
            updated.append("/opt/workspace/datasets/common_crawl/" + '.'.join(l.split('.')[:-1]))

    with open(input_txt, 'w') as f:
        for l in updated:
            f.write(l + "\n")

In [43]:
def submit_job(input_txt: str):
    """Submits two spark jobs and waits for them to finish."""
    cmd1 = ["spark-submit", "ipwarc_mmdb_pdudf.py", "--input_file", f"warc_splits/{input_txt}", "--output_dir", "ipmaxmind_out"]
    cmd2 = ["spark-submit", "script_extraction.py", "--input_file", f"warc_splits/{input_txt}", "--output_dir", "script_extraction_out"]

    process1 = subprocess.Popen(cmd1)
    process2 = subprocess.Popen(cmd2)

    process1.wait()
    process2.wait()

In [38]:
def process_wp(wp_file: str):
    """Process a warc.paths file by generating splits, and submitting each of the split `.txt` file to spark."""
    start_time = time.time()
    os.makedirs("warc_splits", exist_ok=True)
    gen_file_splits(wp)
    
    ckpt_dir = pathlib.Path("warc_splits/.ipynb_checkpoints/")
    if ckpt_dir.exists() and ckpt_dir.is_dir():
        shutil.rmtree(ckpt_dir)

    data_dir = "/opt/workspace/datasets/common_crawl/"
    for input_txt in sorted(os.listdir("warc_splits")):
        os.makedirs(data_dir)
        os.system(f"./get_files.sh warc_splits/{input_txt} {data_dir}")
        to_paths(f"warc_splits/{input_txt}")
        submit_job(input_txt)
        shutil.rmtree(data_dir)
        
    shutil.rmtree("./warc_splits")
    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total time taken: {total_time:.2f} seconds")

In [30]:
# for wp in sorted(os.listdir("warc_paths")):
#     # remove exist_ok arg in the actual run
#     process_wp(wp)
#     # break

wp = "warc_2024.paths"
process_wp(wp)

File split completed. Files saved in warc_splits/
Downloading files from file: warc_splits/warc_part_001.txt ...
Total files to download: 50
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722640461318.24/warc/CC-MAIN-20240806001923-20240806031923-00708.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722640476915.25/warc/CC-MAIN-20240806064139-20240806094139-00035.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722641085898.84/warc/CC-MAIN-20240813204036-20240813234036-00442.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722641333615.45/warc/CC-MAIN-20240816030812-20240816060812-00313.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722641045630.75/warc/CC-MAIN-20240812155418-20240812185418-00075.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722641

24/09/07 10:11:14 INFO SparkContext: Running Spark version 3.5.1
24/09/07 10:11:14 INFO SparkContext: OS info Linux, 6.8.0-39-generic, amd64
24/09/07 10:11:14 INFO SparkContext: Java version 11.0.24
24/09/07 10:11:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/07 10:11:14 INFO ResourceUtils: No custom resources configured for spark.driver.
24/09/07 10:11:14 INFO SparkContext: Submitted application: script_extraction
24/09/07 10:11:14 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 4096, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
24/09/07 10:11:14 INFO ResourceProfile: Limiting resource is cpus at 1 tasks per executor
24/09/07 10:11:14 INFO ResourceProfileManager: Added ResourceProfile id: 0
24/09/07 