# 1. Useful functions

This notebook contains useful functions that can be imported and reused within a Jupyter notebook.

In [19]:
# Required installations
!pip install ipynb
!pip install dnspython
!pip install geoip2

# Required imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from ipynb.fs.full.Functions import getARecords, getARecords_error, getMXRecords, getMXRecords_error, fn_remove_dot

# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("domain_analysis")
    .getOrCreate()
)

# Read the domain CSV and name its columns positionally via toDF
domains_df = (
    spark.read
    .csv('../data/real_domains.csv', escape = "\"")
    .toDF("top_level_domain", "mx_record", "a_record", "timestamp")
)



In [20]:
from concurrent.futures import ThreadPoolExecutor

def execute_threaded_fn(fn, args, max_workers=32, timeout=60):
    """Call ``fn`` once per element of ``args`` on a thread pool and collect the results.

    Parameters
    ----------
    fn : callable
        One-argument function invoked as ``fn(arg)`` for every element of ``args``.
    args : iterable
        Arguments to process. Each one is used as a key of the returned dict,
        so the elements must be hashable.
    max_workers : int, optional
        Number of worker threads in the pool (default 32).
    timeout : float or None, optional
        Seconds to wait on each individual result once it is being collected
        (default 60). ``None`` waits indefinitely.

    Returns
    -------
    dict
        Maps every argument to the value returned by ``fn(arg)``, or to ``None``
        when the call raised an exception or did not finish within ``timeout``.
    """
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front so the calls overlap, then collect in order.
        futures = [(arg, executor.submit(fn, arg)) for arg in args]
        for arg, future in futures:
            try:
                results[arg] = future.result(timeout=timeout)
            except Exception:
                # Best effort: one failed or timed-out call must not abort the
                # whole batch, so record None for that argument and keep going.
                results[arg] = None
    return results

In [None]:
# Resolve the A records up front with a thread pool instead of wrapping
# getARecords directly in a UDF; the results are looked up per row later.
args = (
    domains_df
    .select("top_level_domain")
    .toPandas()
    .values
    .reshape(-1)  # flatten the single-column frame into a 1-D array of domains
)
result_map = execute_threaded_fn(fn=getARecords, args=args)

In [None]:
def getARecords_udf(top_level_domain):
    """Return the pre-computed A-record lookup result for ``top_level_domain``.

    Reads the notebook-level ``result_map`` dict. A domain that is not present
    in the map yields ``None`` rather than raising ``KeyError``, so a missing
    entry becomes a null value instead of failing the UDF.
    """
    return result_map.get(top_level_domain)

In [None]:
# Wrap the dictionary lookup as a Spark UDF returning an array of strings
udf_getARecords = udf(f=getARecords_udf, returnType=ArrayType(StringType()))

In [None]:
# Attach the looked-up A records as a new column, then preview five rows
# without truncating the cell contents.
domains_df = domains_df.withColumn(
    colName="a_record_checked",
    col=udf_getARecords("top_level_domain"),
)
domains_df.show(n=5, truncate=False)