 # Preparation

In [None]:
!pip install ipynb
!pip install dnspython
!pip install geoip2
!pip install psycopg2-binary

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from ipynb.fs.full.Functions import *
from pyspark.sql.types import IntegerType, BooleanType, ArrayType

In [None]:
# Load the raw domain export into a Spark DataFrame. No header option is
# passed to the reader, so the four columns are named explicitly via toDF();
# the timestamp column is discarded immediately afterwards.
domains_df = (
    SparkSession.builder
    .appName("etl_domains")
    .getOrCreate()
    .read
    .csv('../data/real_domains.csv', escape="\"")
    .toDF("top_level_domain", "mx_record", "a_record", "timestamp")
    .drop('timestamp')
)

# Section I

In [None]:
def clean_data(df, column, to_delete, to_replace):
    """Return ``df`` with regex matches in one column substituted.

    Args:
        df: DataFrame to transform.
        column: Name of the column to rewrite.
        to_delete: Regular expression whose matches are replaced.
        to_replace: Replacement text inserted for each match.

    Returns:
        A DataFrame in which ``column`` holds the substituted values.
    """
    cleaned = regexp_replace(column, to_delete, to_replace)
    return df.withColumn(column, cleaned)


# Strip square brackets and double quotes from every column of the frame.
for column in domains_df.schema.names:
    domains_df = clean_data(domains_df, column, '\\[|\\]|\\"', "")

In [None]:
# Turn the comma-separated record strings into arrays. An empty string is
# mapped to NULL rather than being split.
domains_df = (
    domains_df
    .withColumn(
        'mx_record',
        when(col('mx_record') == '', None).otherwise(split(col('mx_record'), ',')),
    )
    .withColumn(
        'a_record',
        when(col('a_record') == '', None).otherwise(split(col('a_record'), ',')),
    )
)

In [None]:
# Persist the cleaned domains through the upsert helper.
# NOTE(review): the argument roles (target name, key columns, value columns,
# DataFrame) are inferred from the call sites — confirm against upsert's
# definition, which is not part of this notebook.
upsert(
    "domain",
    ["top_level_domain"],
    ["mx_record", "a_record"],
    domains_df,
)

# Section II

In [None]:
def count_arr(arr):
    """Return the number of elements in ``arr``, treating ``None`` as empty.

    Args:
        arr: A sized collection, or ``None``.

    Returns:
        ``0`` when ``arr`` is ``None``, otherwise ``len(arr)``.
    """
    # Identity check on purpose: ``== None`` dispatches to the argument's
    # ``__eq__``, which may be overloaded and report a false match.
    if arr is None:
        return 0
    return len(arr)
# Spark UDF wrapper around count_arr, declared to produce an integer column.
count_arr_udf = udf(f=count_arr, returnType=IntegerType())

def uses_localhost(mx_records):
    """Report whether ``'localhost'`` is one of the given MX records.

    Args:
        mx_records: Collection of MX record values, or ``None``.

    Returns:
        ``True`` if ``mx_records`` is not ``None`` and contains
        ``'localhost'``; ``False`` otherwise.
    """
    # Identity check on purpose: ``!= None`` dispatches to the argument's
    # ``__ne__``, which may be overloaded and hide a real value.
    if mx_records is None:
        return False
    return 'localhost' in mx_records
# Spark UDF wrapper around uses_localhost, declared to produce a boolean column.
uses_localhost_udf = udf(f=uses_localhost, returnType=BooleanType())

In [None]:
# Per-domain summary: record counts plus a localhost flag. The raw record
# arrays are dropped once the derived columns have been added.
domains_df_enhanced = (
    domains_df
    .withColumn("a_record_count", count_arr_udf("a_record"))
    .withColumn("mx_record_count", count_arr_udf("mx_record"))
    .withColumn("mx_uses_localhost", uses_localhost_udf("mx_record"))
    .drop("mx_record", "a_record")
)

# Ten most frequent A record values across all domains: one row per array
# element, grouped by value, sorted by descending count.
a_record_count_top_ten_df = (
    domains_df
    .withColumn('a_record', explode(col('a_record')))
    .groupBy('a_record')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)

# Same aggregation for the MX record values.
mx_record_count_top_ten_df = (
    domains_df
    .withColumn('mx_record', explode(col('mx_record')))
    .groupBy('mx_record')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)

In [None]:
# Persist the two top-ten tables and the per-domain summary.
# NOTE(review): the argument roles (target name, key columns, value columns,
# DataFrame) are inferred from the call sites — confirm against upsert's
# definition.
upsert(
    "a_record_count_global",
    ["a_record"],
    ["count"],
    a_record_count_top_ten_df,
)
upsert(
    "mx_record_count_global",
    ["mx_record"],
    ["count"],
    mx_record_count_top_ten_df,
)
upsert(
    "domain_enhanced_based_on_existing_data",
    ["top_level_domain"],
    ["a_record_count", "mx_record_count", "mx_uses_localhost"],
    domains_df_enhanced,
)

In [None]:
# Drop the notebook's references to the Section II frames; they are not used
# again below (the top-ten names are rebound in Section III).
del a_record_count_top_ten_df, mx_record_count_top_ten_df, domains_df_enhanced

# Section III

In [None]:
# Collect every top_level_domain value to the driver and flatten the
# single-column result into a one-dimensional array; this is the input for
# the threaded lookups below.
args = (
    domains_df
    .select("top_level_domain")
    .toPandas()
    .values
    .reshape(-1)
)

## A

In [None]:
# StringType is imported explicitly here: the notebook's type import lists
# only IntegerType, BooleanType and ArrayType, so this cell would otherwise
# depend on a wildcard import happening to re-export it.
from pyspark.sql.types import StringType

# NOTE(review): execute_threaded_fn presumably runs the given lookup for every
# entry of args and returns a mapping keyed by domain — confirm in the
# Functions notebook. The accessors below only rely on the result being
# indexable by domain and then by position 0 / 1.
result_map_a_record = execute_threaded_fn(get_a_records, args)


def _get_a_records_err(top_level_domain):
    """Return element 0 of the A-record lookup result for the domain."""
    return result_map_a_record[top_level_domain][0]


def _get_a_records(top_level_domain):
    """Return element 1 of the A-record lookup result for the domain."""
    return result_map_a_record[top_level_domain][1]


# Element 0 is exposed as an integer column, element 1 as an array of strings.
udf_get_a_records_err = udf(_get_a_records_err, IntegerType())
udf_get_a_records = udf(_get_a_records, ArrayType(StringType()))

result_map_mx_record = execute_threaded_fn(get_mx_records, args)


def _get_mx_records_err(top_level_domain):
    """Return element 0 of the MX-record lookup result for the domain."""
    return result_map_mx_record[top_level_domain][0]


def _get_mx_records(top_level_domain):
    """Return element 1 of the MX-record lookup result for the domain."""
    return result_map_mx_record[top_level_domain][1]


udf_get_mx_records_err = udf(_get_mx_records_err, IntegerType())
udf_get_mx_records = udf(_get_mx_records, ArrayType(StringType()))

# NOTE(review): fn_remove_dot comes from the Functions notebook; going by the
# UDF's name it strips a trailing character from each array entry — confirm.
udf_remove_last_char_in_array = udf(fn_remove_dot, ArrayType(StringType()))

In [None]:
# Attach the freshly looked-up records and their error values to each domain.
# mx_record_checked is written twice: first the raw lookup result, then the
# same column passed through udf_remove_last_char_in_array.
domains_checked_df = (
    domains_df
    .withColumn("a_record_checked", udf_get_a_records("top_level_domain"))
    .withColumn("a_record_checked_error", udf_get_a_records_err("top_level_domain"))
    .withColumn("mx_record_checked", udf_get_mx_records("top_level_domain"))
    .withColumn("mx_record_checked", udf_remove_last_char_in_array(col("mx_record_checked")))
    .withColumn("mx_record_checked_error", udf_get_mx_records_err("top_level_domain"))
)

# Ten most frequent checked A record values: one row per array element,
# grouped by value, sorted by descending count.
a_record_count_top_ten_df = (
    domains_checked_df
    .withColumn('a_record_checked', explode(col('a_record_checked')))
    .groupBy('a_record_checked')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)

# Same aggregation for the checked MX record values.
mx_record_count_top_ten_df = (
    domains_checked_df
    .withColumn('mx_record_checked', explode(col('mx_record_checked')))
    .groupBy('mx_record_checked')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)

# The original record columns are not part of the checked output.
domains_checked_df = domains_checked_df.drop("a_record", "mx_record")

In [None]:
# Persist the checked top-ten tables and the per-domain lookup results.
# NOTE(review): the argument roles (target name, key columns, value columns,
# DataFrame) are inferred from the call sites — confirm against upsert's
# definition.
upsert(
    "a_record_checked_count_global",
    ["a_record_checked"],
    ["count"],
    a_record_count_top_ten_df,
)
upsert(
    "mx_record_checked_count_global",
    ["mx_record_checked"],
    ["count"],
    mx_record_count_top_ten_df,
)
upsert(
    "domain_records_checked",
    ["top_level_domain"],
    [
        "a_record_checked",
        "a_record_checked_error",
        "mx_record_checked",
        "mx_record_checked_error",
    ],
    domains_checked_df,
)

In [None]:
# Drop the notebook's references to the lookup maps and to the top-ten frames
# of this section.
del result_map_a_record, result_map_mx_record

del a_record_count_top_ten_df, mx_record_count_top_ten_df

## B

In [None]:
# StringType is imported explicitly here: the notebook's type import lists
# only IntegerType, BooleanType and ArrayType, so this cell would otherwise
# depend on a wildcard import happening to re-export it.
from pyspark.sql.types import StringType

# NOTE(review): execute_threaded_fn presumably runs get_redirect_data for
# every entry of args and returns a mapping keyed by domain — confirm in the
# Functions notebook. The accessors below only rely on the result being
# indexable by domain and then by position 0 / 1.
result_map_redirect = execute_threaded_fn(get_redirect_data, args)


def get_status_code(top_level_domain):
    """Return element 0 of the redirect lookup result for the domain."""
    return result_map_redirect[top_level_domain][0]


def get_redirect_url(top_level_domain):
    """Return element 1 of the redirect lookup result for the domain."""
    return result_map_redirect[top_level_domain][1]


# Element 0 is exposed as an integer column, element 1 as a string column.
udf_get_status_code = udf(get_status_code, IntegerType())
udf_get_redirect_url = udf(get_redirect_url, StringType())

In [None]:
# Attach the redirect target and status code to each domain; the record
# arrays are not part of this output.
domains_redirect_df = (
    domains_df
    .withColumn("redirection", udf_get_redirect_url("top_level_domain"))
    .withColumn("status_code", udf_get_status_code("top_level_domain"))
    .drop("a_record", "mx_record")
)

In [None]:
# Persist the redirect information per domain.
# NOTE(review): the argument roles (target name, key columns, value columns,
# DataFrame) are inferred from the call sites — confirm against upsert's
# definition.
upsert(
    "domain_redirection",
    ["top_level_domain"],
    ["redirection", "status_code"],
    domains_redirect_df,
)

In [None]:
# Drop the notebook's references to the redirect lookup map and frame.
del result_map_redirect, domains_redirect_df