In [None]:
!pip install dnspython

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from dns import resolver
from concurrent.futures import ThreadPoolExecutor
import time

import os

# PostgreSQL access data.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# values that were previously hard-coded here.
host = os.environ.get("POSTGRES_HOST", "bda_gr4_database")
port = os.environ.get("POSTGRES_PORT", "5432")
database = os.environ.get("POSTGRES_DB", "domainanalysis")
user = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "postgres")

# PostgreSQL connection url
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("domain_analysis")
    .getOrCreate()
)

# Read the `domain` table from the database through the JDBC data source
domains_df = (
    spark.read
    .format("jdbc")
    .option("url", connection)
    .option("dbtable", "domain")
    .option("user", user)
    .option("password", password)
    .load()
)

In [None]:
def fn_to_be_threaded(domain, record_type='AAAA'):
    """Resolve DNS records for ``domain`` and summarise the outcome.

    Args:
        domain: Domain name to look up.
        record_type: DNS record type to request. Defaults to ``'AAAA'``.

    Returns:
        * a list with the text form of every record in the answer, or
        * ``None`` when the resolver reports ``NoAnswer``, or
        * one of the marker strings ``"Domain not available"`` (NXDOMAIN),
          ``"No Nameservers"`` or ``"Timeout"``.

    Any other exception raised by the resolver propagates to the caller.
    """
    # The exception classes are reached through the imported ``resolver``
    # module. Only ``from dns import resolver`` is in scope, so spelling them
    # ``dns.resolver.X`` raised NameError as soon as any lookup failed.
    try:
        answer = resolver.resolve(domain, record_type)
        return [record.to_text() for record in answer]
    except resolver.NXDOMAIN:
        return "Domain not available"
    except resolver.NoAnswer:
        return None
    except resolver.NoNameservers:
        return "No Nameservers"
    except resolver.Timeout:
        return "Timeout"

In [None]:
def execute_threaded_fn(fn, args, log_at, max_workers=1024, timeout=60):
    """Call ``fn`` once per element of ``args`` on a thread pool.

    Args:
        fn: Callable taking a single positional argument.
        args: Iterable of arguments; one task is submitted per element.
        log_at: Print a progress line every ``log_at`` submitted / processed
            items. ``0``, ``None`` or a negative value disables progress
            output (previously ``0`` raised ZeroDivisionError).
        max_workers: Size of the thread pool. Defaults to 1024.
        timeout: Seconds to wait for each future's result. Defaults to 60.

    Returns:
        A list with one entry per argument, in submission order. An entry is
        ``None`` when the call raised an exception or its result was not
        available within ``timeout`` (and also, of course, when ``fn`` itself
        returned ``None``).
    """
    should_log = bool(log_at) and log_at > 0
    succeeded = failed = 0
    results = []

    # Leaving the ``with`` block waits for all submitted tasks to finish,
    # including those whose result was given up on after ``timeout``.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for submitted, arg in enumerate(args, start=1):
            if should_log and submitted % log_at == 0:
                print("futures:", submitted)
            futures.append(executor.submit(fn, arg))

        for future in futures:
            try:
                value = future.result(timeout=timeout)
            except Exception:
                # Best effort: a failed or timed-out task is recorded as None
                # so that results stay aligned with the input order.
                value = None
                failed += 1
            else:
                succeeded += 1
            results.append(value)

            processed = succeeded + failed
            if should_log and processed % log_at == 0:
                print(f"[Processed: {processed}] Successful: {succeeded} | Failed: {failed}")
    return results

In [None]:
# TODO: Combine with UDF?

# Upper bound on the number of rows taken from `domains_df`.
n = 4_860_885

# Pull the `top_level_domain` column of those rows to the driver as pandas
# and flatten it into a one-dimensional array of lookup arguments.
args = (
    domains_df
    .limit(n)
    .select("top_level_domain")
    .toPandas()
    .values
    .reshape(-1)
)

In [None]:
# Time the complete lookup run. The returned list is bound to a name so the
# lookup results stay available to later cells instead of being discarded.
t0 = time.time()
lookup_results = execute_threaded_fn(fn_to_be_threaded, args, 10000)
t1 = time.time()
print(f"Total time: {t1-t0} for {n} data")