In [2]:
from warcio.recordloader import ArcWarcRecord

from warcio.archiveiterator import ArchiveIterator
import os
import gzip
import shutil
from urllib.parse import urlparse
from urllib.request import urlretrieve
from pyspark.sql.types import StructType, StructField, StringType

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("iphost-warc") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.cores", 1) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", 3) \
    .config("spark.dynamicAllocation.minExecutors", 0) \
    .config("spark.dynamicAllocation.maxExecutors", 5) \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/19 17:23:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Downloads sample warc file and extracts it if not already available

warc_url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2017-13/segments/1490218186353.38/warc/CC-MAIN-20170322212946-00000-ip-10-233-31-227.ec2.internal.warc.gz"
warcgz_filepath = "/opt/workspace/datasets/cc-warc/CC-MAIN-20170322212946-00000-ip-10-233-31-227.ec2.internal.warc.gz"
warc_filepath = ".".join(warcgz_filepath.split(".")[:-1])

if not os.path.isfile(warc_filepath):
    _, _ = urlretrieve(warc_url, warcgz_filepath)
    with gzip.open(warcgz_filepath, 'rb') as f_in:
        with open(warc_filepath, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(warcgz_filepath)

In [5]:
rtypes = set()
ctypes = set()
with open(warc_filepath, 'rb') as stream:
    for i,record in enumerate(ArchiveIterator(stream)):
        rtypes.add(record.rec_type)
        ctypes.add(record.content_type)

print("Record types: ", rtypes)
print("Content types: ", ctypes)

Record types:  {'request', 'warcinfo', 'response', 'metadata'}
Content types:  {'application/http; msgtype=request', 'application/warc-fields', 'application/http; msgtype=response'}


In [10]:
temp_record = None
with open(warc_filepath, 'rb') as stream:
    for i,record in enumerate(ArchiveIterator(stream)):
        if record.rec_type == "response":
            temp_record = record
            print(record.rec_headers)
            break

WARC/1.0
WARC-Type: response
WARC-Date: 2017-03-22T22:16:45Z
WARC-Record-ID: <urn:uuid:1eba28d7-5c50-4520-a58b-b18bb9691201>
Content-Length: 36415
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:c9737a57-b812-4c1c-b82c-66f820799890>
WARC-Concurrent-To: <urn:uuid:b0f277a6-5b6a-45dc-a17a-dbf2bd24f959>
WARC-IP-Address: 104.244.98.64
WARC-Target-URI: http://00ena00.blog.fc2.com/?tag=SL
WARC-Payload-Digest: sha1:W2ZCZ4N7UPYD3SIOVWQQVJ7RVIEJNQ6A
WARC-Block-Digest: sha1:OCC7ULZJRWXPVVMQNRLNOQ7KY5BH46HQ



In [11]:
def get_header(record: ArcWarcRecord, header: str):
    """Utility function to get header"""
    return record.rec_headers.get_header(header, "na")

ip = get_header(temp_record, "WARC-IP-Address")
url = get_header(temp_record, "WARC-Target-URI")

print(ip, url)

104.244.98.64 http://00ena00.blog.fc2.com/?tag=SL


In [12]:
urlparse(url).hostname

'00ena00.blog.fc2.com'

In [13]:
def process_record(record):
    """Get ip and url headers"""
    ip = get_header(record, "WARC-IP-Address")
    url = get_header(record, "WARC-Target-URI")
    return ip, url

def process_warc(filepath):
    """Yield tuple containing ip, url if record is of response type"""
    with open(filepath, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == "response":
                res = process_record(record)
                yield res

def proc_wrapper(_id, iterator):
    """Wrapper function for `process_warc` to handle multiple `warc` files"""
    for filepath in iterator:
        for res in process_warc(filepath):
            yield res

In [14]:
output_schema = StructType([
    StructField("ip", StringType(), True),
    StructField("host", StringType(), True)
])

In [15]:
data_files = spark.sparkContext.textFile("paths.txt")
output = data_files.mapPartitionsWithIndex(proc_wrapper)

In [16]:
output.collect()[:5]

                                                                                

[('107.163.232.92', 'http://0.furkid.net/'),
 ('192.229.64.48',
  'http://01rt.allstarpestprofessionalstx.com/waibao.html?nid=10&id=102'),
 ('195.170.8.34',
  'http://0807.syzefxis.gov.gr/%CF%80%CF%81%CE%BF%CF%83%CE%BA%CE%BB%CE%B7%CF%83%CE%B7-%CF%83%CF%85%CE%B3%CE%BA%CE%BB%CE%B7%CF%83%CE%B7%CF%83-%CE%B4%CE%B7%CE%BC%CE%BF%CF%84%CE%B9%CE%BA%CE%BF%CF%85-%CF%83%CF%85%CE%BC%CE%B2%CE%BF-28/'),
 ('107.190.226.20', 'http://0krls.9032666.com/appset/detail/96ytw4ScD'),
 ('172.67.142.198',
  'http://101webtemplate.com/entity/rhapsodic722064336.html')]

In [23]:
df = spark.createDataFrame(output, schema=output_schema)
df.show(truncate=False)

+---------------+----------------------------------------------------------------------------------------------------------------------------+
|ip             |host                                                                                                                        |
+---------------+----------------------------------------------------------------------------------------------------------------------------+
|104.244.98.64  |http://00ena00.blog.fc2.com/?tag=SL                                                                                         |
|104.244.98.63  |http://00pon00.blog130.fc2.com/?xml                                                                                         |
|104.244.98.65  |http://00pon00.blog130.fc2.com/blog-category-8.html                                                                         |
|136.243.111.229|http://03online.com/news/dvizhenie_pod_kozhey/2016-1-3-128710                                                               |