In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("email-warc") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 2) \
    .config("spark.dynamicAllocation.minExecutors", 0) \
    .config("spark.dynamicAllocation.maxExecutors", 10) \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/14 14:03:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/14 14:03:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark.sparkContext.setLogLevel("ERROR")

In [4]:
import re
from pathlib import Path

from warcio.archiveiterator import ArchiveIterator

from selectolax.parser import HTMLParser

In [5]:
cc_dir = Path("/opt/workspace/datasets/common-crawl/")
! ls $cc_dir

CC-MAIN-20240724014956-20240724044956-00798.warc	      get_files.sh
CC-MAIN-20240725114544-20240725144544-00476.warc	      warc.paths
CC-MAIN-20240725114544-20240725144544-00476.warc_extract.csv


In [6]:
warc_files = []
with open("paths.txt") as f:
    for i, line in enumerate(f):
        warc_files.append(Path(line.strip()))
        print(f"file {i+1}: {line}")

file 1: /opt/workspace/datasets/common-crawl/CC-MAIN-20240724014956-20240724044956-00798.warc

file 2: /opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc


In [7]:
max_records = 2
input_file = cc_dir/warc_files[1]

with open(input_file, 'rb') as file_stream:
    record_num = 0
    for record in ArchiveIterator(file_stream):
        if record.rec_type == "response":
            record_num += 1
            print(f"Record headers: {record.rec_headers}")
        if record_num == max_records: break

Record headers: WARC/1.0
WARC-Type: response
WARC-Date: 2024-07-25T13:48:10Z
WARC-Record-ID: <urn:uuid:bbe49616-1067-4b37-b519-155471da90bd>
Content-Length: 13739
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:4310b454-baf3-414f-99af-d0049e37d45d>
WARC-Concurrent-To: <urn:uuid:8370f230-97b9-4070-8639-b66757902bdb>
WARC-IP-Address: 52.68.28.174
WARC-Target-URI: http://0081.b-ch.com/blog/0081/archives/2111
WARC-Protocol: http/1.1
WARC-Payload-Digest: sha1:MU75EIRB5T77Q3YS72HG3GNPPAI2N3E3
WARC-Block-Digest: sha1:NVRIFEHQMHR5JRN3YB2WSFFKS7N336IO
WARC-Identified-Payload-Type: text/html

Record headers: WARC/1.0
WARC-Type: response
WARC-Date: 2024-07-25T13:03:49Z
WARC-Record-ID: <urn:uuid:85bd4a43-e728-4634-b1a0-3f87d7ddd16a>
Content-Length: 967850
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:4310b454-baf3-414f-99af-d0049e37d45d>
WARC-Concurrent-To: <urn:uuid:3b2b7cd6-c8ff-4312-ae02-9645e3ffd0e8>
WARC-IP-Address: 38.165.51.151
WAR

In [8]:
def encode_byte_stream(input_stream):
    return input_stream.encode('utf-8').decode("unicode_escape").encode("latin-1").decode("utf-8", errors="replace")

In [9]:
email_regex = re.compile(r"(mailto:)?([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")

In [10]:
emails = []
with open(input_file, 'rb') as file_stream:
    for record_idx, record in enumerate(ArchiveIterator(file_stream)):
        if record.rec_type == "response":
            raw_text = record.raw_stream.read()
            str_text = str(raw_text).strip().lower().replace("\t", " ").replace("\n", " ")
            slax_txt = HTMLParser(str_text)

            # get anchor tags and filter emails
            anchor_tags = slax_txt.tags("a")
            for a_idx, atag in enumerate(anchor_tags):
                href_attr = atag.attributes.get("href")
                if href_attr:
                    href = encode_byte_stream(href_attr)
                    if email_regex.match(href):
                        emails.append(href)

In [12]:
emails = set(emails)

In [20]:
list(emails)[5000:6000]

['mailto:tour@gallipolitour.com',
 'mailto:info@nababbosail.com',
 'mailto:mairie@donzere.net',
 'mailto:info@nipztix.com',
 'mailto:provincia.ancona@cert.provincia.ancona.it',
 'mailto:gamecockgirlblog@gmail.com',
 'mailto:info@tophop.ru',
 'info@vegliolux.com',
 'mailto:dwright@pappasgrubbs.com',
 'mailto:coke@pfcexpress.com',
 'mailto:cs@outboundtuban.com',
 'mailto:cn@cncoterie.com',
 'mailto:bond@ieee.org',
 'mailto:sales@icorptechnologies.co.za',
 'mailto:info@robinsun.com',
 'mailto:support@shipmondo.com',
 'mailto:info@dewasport.be',
 'mailto:sales@schwahrtechnology.com',
 'mailto:sales@countryradio.live',
 'mailto:enquiries@argyll-bute.gov.uk',
 'mailto:villaseherzada@gmail.com',
 'mailto:erinrommeck@remc.org',
 'mailto:info@pelastakaalapset.fi',
 'mailto:khalife@lix.polytechnique.fr',
 'mailto:websitesfeedback@gmail.com',
 'mailto:szafirowyprezent.kontakt@gmail.com',
 'mailto:support@playroomavenue.com',
 'mailto:rentals@ajaxgolfcarts.com',
 'hank@gd-jet.com',
 'mailto:team@r