In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, connected to the
# master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("maxmind-warc")
    .master("spark://spark-master:7077")
    # Per-executor resources.
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", 1)
    # Dynamic allocation with shuffle tracking: start with 2 executors and
    # allow anywhere between 0 and 10.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    .config("spark.dynamicAllocation.initialExecutors", 2)
    .config("spark.dynamicAllocation.minExecutors", 0)
    .config("spark.dynamicAllocation.maxExecutors", 10)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 19:06:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Register the local ip_utils.py helper with the SparkContext so it is shipped
# to the executors and can be imported by code running there.
spark.sparkContext.addPyFile("./ip_utils.py")

In [3]:
from pathlib import Path
from warcio import ArchiveIterator

from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    BooleanType,
    FloatType
)

In [4]:
# Directory holding the Common Crawl WARC files used in this notebook.
cc_dir = Path("/opt/workspace/datasets/common-crawl/")
# IPython shell escape: list the directory ($cc_dir is expanded by IPython).
!ls $cc_dir

CC-MAIN-20240724014956-20240724044956-00798.warc     get_files.sh
CC-MAIN-20240725114544-20240725144544-00476.warc     warc.paths
CC-MAIN-20240725114544-20240725144544-00476_extract


In [5]:
# Read the list of WARC files to process from paths.txt (one path per line).
warc_files = []
with open("paths.txt") as f:
    for line in f:
        path_str = line.strip()
        if not path_str:
            # A blank line would otherwise become Path("") == Path("."),
            # i.e. a bogus "current directory" entry in warc_files.
            continue
        warc_files.append(Path(path_str))
        # Print the stripped path: printing the raw line also emitted its
        # trailing newline, producing an empty line after every entry.
        print(f"file {len(warc_files)}: {path_str}")

file 1: /opt/workspace/datasets/common-crawl/CC-MAIN-20240724014956-20240724044956-00798.warc

file 2: /opt/workspace/datasets/common-crawl/CC-MAIN-20240725114544-20240725144544-00476.warc


In [6]:
# Peek at the WARC headers of the first `max_records` response records.
max_records = 2
input_file = cc_dir / warc_files[0]

with open(input_file, "rb") as file_stream:
    record_num = 0
    for record in ArchiveIterator(file_stream):
        # Only "response" records are counted and printed.
        if record.rec_type != "response":
            continue
        record_num += 1
        print(f"Record headers: {record.rec_headers}")
        if record_num == max_records:
            break

Record headers: WARC/1.0
WARC-Type: response
WARC-Date: 2024-07-24T02:50:03Z
WARC-Record-ID: <urn:uuid:f1ee65d4-9676-42a8-9e58-4255974742d2>
Content-Length: 104808
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:e9e442c8-3d43-4395-a202-dd2aa8c91b0a>
WARC-Concurrent-To: <urn:uuid:d7712f1c-2f25-4280-9c3c-6a0bb149d67a>
WARC-IP-Address: 107.163.232.92
WARC-Target-URI: http://0.furkid.net/
WARC-Protocol: http/1.1
WARC-Payload-Digest: sha1:7TBKRIWHL3ND2HAIEPKDTARXTOYGYESP
WARC-Block-Digest: sha1:FWGYYUJS6NSB5DLDH2C4PHJDCWQAESPZ
WARC-Identified-Payload-Type: text/html

Record headers: WARC/1.0
WARC-Type: response
WARC-Date: 2024-07-24T02:42:19Z
WARC-Record-ID: <urn:uuid:c0beded9-7d7b-4a3d-b4ef-161431f16cf1>
Content-Length: 154
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:e9e442c8-3d43-4395-a202-dd2aa8c91b0a>
WARC-Concurrent-To: <urn:uuid:86f9ed18-d923-4e5f-90a7-af5565d2e7bd>
WARC-IP-Address: 192.229.64.48
WARC-Target-URI: http://01r

In [7]:
def process_record(record):
    """Extract the IP address and target URI from a WARC response record.

    Args:
        record: A WARC record exposing ``rec_type`` and ``rec_headers``.

    Returns:
        An ``(ip, url)`` tuple when ``record.rec_type`` is ``"response"``;
        each element is read with ``"-"`` passed as the header default.
        ``None`` for every other record type.
    """
    if record.rec_type != "response":
        return None
    headers = record.rec_headers
    return (
        headers.get_header("WARC-IP-Address", "-"),
        headers.get_header("WARC-Target-URI", "-"),
    )
        
def process_warc(filepath):
    """Stream a WARC file and yield the processed result of each record.

    Args:
        filepath: Path of the WARC file; opened in binary mode.

    Yields:
        Every truthy value returned by ``process_record`` for the records
        read from the file, in file order. Records for which
        ``process_record`` returns a falsy value (e.g. ``None``) are skipped.
    """
    with open(filepath, "rb") as warc_stream:
        # Lazy pipeline: each record is processed before the iterator advances.
        extracted = (process_record(rec) for rec in ArchiveIterator(warc_stream))
        yield from (pair for pair in extracted if pair)

In [8]:
# Collect one dict per response record of the first WARC file on the driver.
# Note: the "hostname" key receives the second tuple element from
# process_warc, which is the record's WARC-Target-URI value.
data = [
    {"ip": ip, "hostname": url}
    for ip, url in process_warc(warc_files[0])
]

In [9]:
# Build a Spark DataFrame from the collected dicts: one Row per dict, with
# columns "ip" and "hostname".
warciphost_df = spark.createDataFrame([Row(**d) for d in data])

In [11]:
# Preview the IP/URL DataFrame (the output below shows the top 20 rows).
warciphost_df.show()

24/08/23 19:06:25 WARN TaskSetManager: Stage 0 contains a task of very large size (1405 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+---------------+--------------------+
|             ip|            hostname|
+---------------+--------------------+
| 107.163.232.92|http://0.furkid.net/|
|  192.229.64.48|http://01rt.allst...|
|   195.170.8.34|http://0807.syzef...|
| 107.190.226.20|http://0krls.9032...|
| 172.67.142.198|http://101webtemp...|
|   23.108.56.90|http://123any.com...|
| 137.184.244.32|http://137.184.24...|
| 172.67.198.153|http://16trader.c...|
|  220.228.6.123|http://1700866.mk...|
|  220.228.6.241|http://1701057.mw...|
|    220.228.6.6|http://1746747.ut...|
|   61.66.228.75|http://176592.s34...|
|  77.232.40.211|http://2012god.ru...|
|   198.2.232.33|http://201314zz.c...|
|  220.228.6.119|http://2130148.hk...|
| 114.118.10.124|http://21cpm.net/...|
|107.163.236.251|http://25.lasaqls...|
| 208.109.212.43|http://252churcht...|
| 208.109.212.43|http://252churcht...|
|107.163.212.188|http://3.lennonau...|
+---------------+--------------------+
only showing top 20 rows



In [11]:
# reader.city("27.7.108.104").traits.is_anonymous_proxy,\
# reader.city("27.7.108.104").traits.is_satellite_provider,\
# reader.city("27.7.108.104").traits.is_hosting_provider,\
# reader.city("27.7.108.104").postal.code,\
# reader.city("27.7.108.104").location.latitude,\
# reader.city("27.7.108.104").location.longitude,\
# reader.city("27.7.108.104").location.accuracy_radius,\
# reader.city("27.7.108.104").traits.is_anycast,\
# reader.city("27.7.108.104").continent.code,\
# reader.city("27.7.108.104").continent.name,\
# reader.city("27.7.108.104").country.iso_code,\
# reader.city("27.7.108.104").country.name,\
# reader.city("27.7.108.104").subdivisions[0].iso_code,\
# reader.city("27.7.108.104").subdivisions[0].name,\
# reader.city("27.7.108.104").city.name,\
# reader.city("27.7.108.104").location.metro_code,\
# reader.city("27.7.108.104").location.time_zone,\
# reader.city("27.7.108.104").represented_country.is_in_european_union

In [12]:
import ip_utils
# Location of the GeoLite2 City database handed to the reader wrapper.
geolite2_city_db = "/opt/workspace/datasets/maxmind/GeoLite2-City_20240820/GeoLite2-City.mmdb"

# Broadcast the reader wrapper so code running on the executors can obtain a
# reader via reader_broadcast.value.get_reader().
# NOTE(review): the executor traceback recorded further down in this notebook
# shows that loading this broadcast imports ip_utils.py, which fails at module
# level (its line 20) with "NameError: name 'argparse' is not defined".
# ip_utils.py presumably needs `import argparse`, or its CLI code guarded by
# `if __name__ == "__main__":` -- confirm in that file.
reader_broadcast = spark.sparkContext.broadcast(
    ip_utils.SerializableReader(geolite2_city_db)
)

In [13]:
# (column name, Spark type) pairs describing the struct returned by the IP
# lookup. The order matters: it must line up positionally with the tuple
# returned by get_ip_info.
_ip_info_fields = [
    ("is_anonymous_proxy", BooleanType()),
    ("is_satellite_provider", BooleanType()),
    ("is_hosting_provider", BooleanType()),
    ("postal_code", StringType()),
    ("latitude", FloatType()),
    ("longitude", FloatType()),
    ("accuracy_radius", IntegerType()),
    ("is_anycast", BooleanType()),
    ("continent_code", StringType()),
    ("continent_name", StringType()),
    ("country_iso_code", StringType()),
    ("country_name", StringType()),
    ("subdivision_iso_code", StringType()),
    ("subdivision_name", StringType()),
    ("city_name", StringType()),
    ("metro_code", IntegerType()),
    ("time_zone", StringType()),
    ("is_in_european_union", BooleanType()),
]

# Every field is nullable (third StructField argument is True).
ip_info_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _ip_info_fields]
)

In [14]:
def get_ip_info(ip):
    """Look up city-level geolocation attributes for one IP address.

    Args:
        ip: IP address as a string (the DataFrame's ``ip`` column). May be
            ``None``, empty, or a placeholder such as ``"-"``, which
            ``process_record`` emits when a record has no WARC-IP-Address
            header.

    Returns:
        An 18-element tuple whose positions match the fields of
        ``ip_info_schema``, or ``None`` (a null struct in Spark) when ``ip``
        is missing, is not a syntactically valid IPv4/IPv6 address, or is not
        present in the database.

    Raises:
        Any reader error other than an "address not found" lookup miss is
        re-raised unchanged.
    """
    # Function-scope stdlib import keeps the UDF self-contained on executors.
    import ipaddress

    # Reject null/empty/placeholder values before touching the reader, so a
    # single bad row cannot fail the whole Spark task.
    if not ip:
        return None
    try:
        ipaddress.ip_address(ip)
    except ValueError:
        return None

    reader = reader_broadcast.value.get_reader()
    try:
        response = reader.city(ip)
    except Exception as exc:
        # Presumably a geoip2 reader, which reports "IP not in database" with
        # AddressNotFoundError. geoip2 is not imported in this notebook, so the
        # exception is matched by class name; anything else propagates.
        if type(exc).__name__ != "AddressNotFoundError":
            raise
        return None

    # Only the first subdivision is reported; it may be absent entirely.
    subdivision = response.subdivisions[0] if response.subdivisions else None
    return (
        response.traits.is_anonymous_proxy,
        response.traits.is_satellite_provider,
        response.traits.is_hosting_provider,
        response.postal.code,
        response.location.latitude,
        response.location.longitude,
        response.location.accuracy_radius,
        response.traits.is_anycast,
        response.continent.code,
        response.continent.name,
        response.country.iso_code,
        response.country.name,
        subdivision.iso_code if subdivision is not None else None,
        subdivision.name if subdivision is not None else None,
        response.city.name,
        response.location.metro_code,
        response.location.time_zone,
        response.represented_country.is_in_european_union
    )

# Wrap get_ip_info as a Spark UDF whose return type is the ip_info_schema struct.
get_ip_info_udf = udf(get_ip_info, ip_info_schema)

In [15]:
# Add a struct column "ip_info" by applying the lookup UDF to the "ip" column.
result_df = warciphost_df.withColumn("ip_info", get_ip_info_udf("ip"))

In [16]:
# Preview the enriched rows (the ip_info struct is truncated in the output below).
result_df.show()

24/08/23 19:06:35 WARN TaskSetManager: Stage 1 contains a task of very large size (1405 KiB). The maximum recommended task size is 1000 KiB.


+---------------+--------------------+--------------------+
|             ip|            hostname|             ip_info|
+---------------+--------------------+--------------------+
| 107.163.232.92|http://0.furkid.net/|{false, false, fa...|
|  192.229.64.48|http://01rt.allst...|{false, false, fa...|
|   195.170.8.34|http://0807.syzef...|{false, false, fa...|
| 107.190.226.20|http://0krls.9032...|{false, false, fa...|
| 172.67.142.198|http://101webtemp...|{false, false, fa...|
|   23.108.56.90|http://123any.com...|{false, false, fa...|
| 137.184.244.32|http://137.184.24...|{false, false, fa...|
| 172.67.198.153|http://16trader.c...|{false, false, fa...|
|  220.228.6.123|http://1700866.mk...|{false, false, fa...|
|  220.228.6.241|http://1701057.mw...|{false, false, fa...|
|    220.228.6.6|http://1746747.ut...|{false, false, fa...|
|   61.66.228.75|http://176592.s34...|{false, false, fa...|
|  77.232.40.211|http://2012god.ru...|{false, false, fa...|
|   198.2.232.33|http://201314zz.c...|{f

In [17]:
# Flatten the ip_info struct into top-level columns next to "ip"
# (the "hostname" column is not selected).
final_df = result_df.select("ip", "ip_info.*")

In [18]:
# Preview the flattened result.
final_df.show()

24/08/23 19:06:41 WARN TaskSetManager: Stage 2 contains a task of very large size (1405 KiB). The maximum recommended task size is 1000 KiB.
24/08/23 19:06:42 WARN TaskSetManager: Lost task 0.0 in stage 2.0 (TID 2) (172.18.0.9 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_2133/425540900.py", line 2, in get_ip_info
  File "/opt/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 279, in value
    self._value = self.load_from_path(self._path)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 226, in load_from_path
    return self.load(f)
           ^^^^^^^^^^^^
  File "/opt/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 265, in load
    gc.enable()
  File "/opt/spark/work/app-20240823190605-0003/0/ip_utils.py", line 20, in <module>
    parser = argparse.ArgumentParser()
             ^^^^^^^^
NameError: name 'argparse' is not defined

	

+---------------+------------------+---------------------+-------------------+-----------+--------+---------+---------------+----------+--------------+--------------+----------------+-------------+--------------------+----------------+---------------+----------+-------------------+--------------------+
|             ip|is_anonymous_proxy|is_satellite_provider|is_hosting_provider|postal_code|latitude|longitude|accuracy_radius|is_anycast|continent_code|continent_name|country_iso_code| country_name|subdivision_iso_code|subdivision_name|      city_name|metro_code|          time_zone|is_in_european_union|
+---------------+------------------+---------------------+-------------------+-----------+--------+---------+---------------+----------+--------------+--------------+----------------+-------------+--------------------+----------------+---------------+----------+-------------------+--------------------+
| 107.163.232.92|             false|                false|              false|       NUL

24/08/23 19:07:47 WARN BlockManagerMasterEndpoint: No more replicas available for broadcast_1_python !
