In [5]:
import os
import time
import shutil
import pathlib
import subprocess
from urllib.request import urlretrieve

In [2]:
os.makedirs("warc_paths", exist_ok=True)
with open("yearly_crawls.txt", 'r') as f:
    for cc_crawl in f:
        year = cc_crawl.split('-')[-2]
        file_name = f"./warc_paths/warc_{year}.paths.gz"
        urlretrieve(cc_crawl, file_name)
        os.system(f"gzip -d {file_name}")

In [3]:
os.makedirs("./unsuccessful/", exist_ok=True)
os.makedirs("./success/", exist_ok=True)

In [4]:
def num_warcs_to_proc(wp_file: str) -> int:
    """Returns the number of lines in the warc.paths file."""
    with open(f"./warc_paths/{wp_file}", 'r') as f:
        for count,_ in enumerate(f):
            pass
    return count + 1

In [5]:
def gen_file_splits(wp_file: str):
    """Given a warc.paths file, generates `.txt` files having specified number of WARC filepaths"""
    warc_sample_len = num_warcs_to_proc(wp_file) // 100
    os.system(f"./file_split.sh warc_paths/{wp_file} warc_splits/ {warc_sample_len} {wp_file.split('_')[-1].split('.')[0]}")

In [6]:
def to_paths(input_txt):
    """Converts the WARC URLs to their corresponding paths on the device."""
    updated = []
    with open(input_txt, 'r') as f:
        for l in f:
            l = l.split('/')[-1]
            updated.append("/opt/workspace/datasets/common_crawl/" + '.'.join(l.split('.')[:-1]))

    with open(input_txt, 'w') as f:
        for l in updated:
            f.write(l + "\n")

In [7]:
def submit_job(input_txt: str):
    """Submits two spark jobs and waits for them to finish. If both jobs succeed, then the `input_txt` file is moved to success/ dir."""
    os.makedirs("tmp/", exist_ok=True)
    cmd1 = ["spark-submit", "ipwarc_mmdb_pdudf-errh.py", "--input_file", f"warc_splits/{input_txt}", "--output_dir", "tmp/ipmaxmind_out"]
    cmd2 = ["spark-submit", "script_extraction-errh.py", "--input_file", f"warc_splits/{input_txt}", "--output_dir", "tmp/script_extraction_out"]

    status_file = "job_status.txt"
    if os.path.exists(status_file):
        os.remove(status_file)

    process1 = subprocess.Popen(cmd1)
    process2 = subprocess.Popen(cmd2)

    process1.wait()
    process2.wait()

    with open(status_file, 'r') as f:
        statuses = f.readlines()

    # Check if both jobs succeeded
    if all("success" in status for status in statuses):
        
        # Move temp output to final directory
        for filename in os.listdir("tmp/ipmaxmind_out/"):
            if filename == ".ipynb_checkpoints": continue
            src_file = os.path.join("tmp/ipmaxmind_out/", filename)
            dst_file = os.path.join("ipmaxmind_out/", filename)
            shutil.move(src_file, dst_file)

        for filename in os.listdir("tmp/script_extraction_out/"):
            if filename == ".ipynb_checkpoints": continue
            src_file = os.path.join("tmp/script_extraction_out/", filename)
            dst_file = os.path.join("script_extraction_out/", filename)
            shutil.move(src_file, dst_file)
            
        print("Both jobs succeeded. Outputs moved to final directories.")
        
        input_dir = os.path.dirname(f"warc_splits/{input_txt}")
        shutil.move(f"warc_splits/{input_txt}", os.path.join("success/", os.path.basename(f"warc_splits/{input_txt}")))
        
        print(f"Processing completed successfully. Input file warc_splits/{input_txt} moved to success/")
        
    else:
        # If any job failed, discard temporary output
        shutil.rmtree('tmp/ipmaxmind_out', ignore_errors=True)
        shutil.rmtree('tmp/script_extraction_out', ignore_errors=True)
        print("One or more jobs failed. Outputs discarded.")


In [9]:
def process_wp(wp_file: str):
    """Process a warc.paths file by generating splits, and submitting each of the split `.txt` file to spark."""
    start_time = time.time()
    os.makedirs("warc_splits", exist_ok=True)
    gen_file_splits(wp_file)
    
    ckpt_dir = pathlib.Path("warc_splits/.ipynb_checkpoints/")
    if ckpt_dir.exists() and ckpt_dir.is_dir():
        shutil.rmtree(ckpt_dir)

    data_dir = "/opt/workspace/datasets/common_crawl/"
    # data_dir = "/opt/workspace/warc_yearly/data/"
    for input_txt in sorted(os.listdir("warc_splits")):
        if input_txt == ".ipynb_checkpoints": continue
        os.makedirs(data_dir)
        os.system(f"./get_files.sh warc_splits/{input_txt} {data_dir}")
        to_paths(f"warc_splits/{input_txt}")
        submit_job(input_txt)
        shutil.rmtree(data_dir)

    # files that are processed successfully are moved to `success/`.
    # remaining files are hence not processed successfully.
    for file in os.listdir("warc_splits"):
        if file == ".ipynb_checkpoints": continue
        shutil.move(f"warc_splits/{file}", os.path.join("unsuccessful/", os.path.basename(file)))
        
    end_time = time.time()
    total_time = end_time - start_time
    with open("times.txt", 'w') as f:
        f.write(f"[{wp_file}]: {total_time:.2f} seconds")


In [16]:
# for wp in sorted(os.listdir("warc_paths")):
#     # remove exist_ok arg in the actual run
#     process_wp(wp)
#     # break

wp = "warc_2024.paths"
process_wp(wp)

File split completed. Files saved in warc_splits/
Downloading files from file: warc_splits/warc_part_001_2024.txt ...
Total files to download: 50
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722640353668.0/warc/CC-MAIN-20240802234508-20240803024508-00404.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722641085898.84/warc/CC-MAIN-20240813204036-20240813234036-00257.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722640694594.35/warc/CC-MAIN-20240807143134-20240807173134-00080.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722640997721.66/warc/CC-MAIN-20240811110531-20240811140531-00889.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/1722640723918.41/warc/CC-MAIN-20240808062406-20240808092406-00440.warc.gz ...
Downloading https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-33/segments/172

24/09/10 17:18:31 INFO SparkContext: Running Spark version 3.5.1
24/09/10 17:18:31 INFO SparkContext: OS info Linux, 6.8.0-39-generic, amd64
24/09/10 17:18:31 INFO SparkContext: Java version 11.0.24
24/09/10 17:18:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/10 17:18:31 INFO ResourceUtils: No custom resources configured for spark.driver.
24/09/10 17:18:31 INFO SparkContext: Submitted application: script_extraction
24/09/10 17:18:31 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 4096, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
24/09/10 17:18:31 INFO ResourceProfile: Limiting resource is cpus at 1 tasks per executor
24/09/10 17:18:31 INFO ResourceProfileManager: Added ResourceProfile id: 0
24/09/10 

Traceback (most recent call last):
  File "/opt/workspace/warc_yearly/script_extraction-errh.py", line 117, in <module>
    df.repartition(1).write.mode("append").parquet(args.output_dir)
  File "/usr/local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 1721, in parquet
  File "/usr/local/lib/python3.11/site-packages/pyspark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__
  File "/usr/local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/errors/exceptions/captured.py", line 179, in deco
  File "/usr/local/lib/python3.11/site-packages/pyspark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o82.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 14) (172.18.0.6 executor 1): org.apache.s

24/09/10 17:18:37 INFO SparkUI: Stopped Spark web UI at http://fc5436c4b308:4041
24/09/10 17:18:37 INFO DAGScheduler: ShuffleMapStage 1 (csv at NativeMethodAccessorImpl.java:0) failed in 0.644 s due to Stage cancelled because SparkContext was shut down
24/09/10 17:18:37 INFO SparkContext: SparkContext is stopping with exitCode 0.
24/09/10 17:18:37 INFO SparkContext: SparkContext already stopped.
24/09/10 17:18:37 INFO StandaloneSchedulerBackend: Shutting down all executors
24/09/10 17:18:37 INFO StandaloneSchedulerBackend$StandaloneDriverEndpoint: Asking each executor to shut down
24/09/10 17:18:37 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
24/09/10 17:18:37 INFO MemoryStore: MemoryStore cleared
24/09/10 17:18:37 INFO BlockManager: BlockManager stopped
24/09/10 17:18:37 INFO BlockManagerMaster: BlockManagerMaster stopped
24/09/10 17:18:37 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
24/09/10 17:18:37 IN

KeyboardInterrupt: 

### note: the input.txt file is moved to success/ even if only 1 script runs successfully on it.