In [None]:
!pip install dnspython

import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# PostgreSQL access data. Each value can be overridden through an environment
# variable so that real credentials do not have to be stored in the notebook;
# the fallbacks are the original development settings.
host = os.environ.get("DOMAIN_DB_HOST", "bda_gr4_database")
port = os.environ.get("DOMAIN_DB_PORT", "5432")
database = os.environ.get("DOMAIN_DB_NAME", "domainanalysis")
user = os.environ.get("DOMAIN_DB_USER", "postgres")
password = os.environ.get("DOMAIN_DB_PASSWORD", "postgres")

# PostgreSQL connection url
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Create a Spark session (re-uses an already running session if there is one)
spark = (
    SparkSession.builder
    .appName("ETL_DomainAnaylsis")
    .getOrCreate()
)

# Read the "domain" table from the database through JDBC
domains_df = (
    spark.read
    .format("jdbc")
    .option("url", connection)
    .option("dbtable", "domain")
    .option("user", user)
    .option("password", password)
    .load()
)

# Display the data frame
# domains_df.show()

In [None]:
import dns
from dns import resolver
import requests
from urllib.parse import urlparse


def getARecords(domain):
    """Resolve the A records of a domain.

    Args:
        domain: Domain name passed to ``dns.resolver.resolve``.

    Returns:
        list[str]: The text form of every A record on success. When the
        lookup fails, a single-element list holding a status message
        ("Domain not available", "No Answer", "No Nameservers" or
        "Timeout").
    """
    # The UDF wrapping this function is registered with
    # ArrayType(StringType()) and its column is exploded later on, so the
    # status messages are returned as one-element lists instead of bare
    # strings, which do not match that declared type.
    try:
        result = dns.resolver.resolve(domain, 'A')
        return [ipval.to_text() for ipval in result]
    except dns.resolver.NXDOMAIN:
        return ["Domain not available"]
    except dns.resolver.NoAnswer:
        return ["No Answer"]
    except dns.resolver.NoNameservers:
        return ["No Nameservers"]
    except dns.resolver.Timeout:
        return ["Timeout"]

    
def getIPv6Records(domain):
    """Resolve the AAAA records of a domain.

    Args:
        domain: Domain name passed to ``dns.resolver.resolve``.

    Returns:
        A list with the text form of every AAAA record on success, ``None``
        when the query yields no answer, otherwise one of the status strings
        "Domain not available", "No Nameservers" or "Timeout".
    """
    # NOTE(review): the success value is a list while the UDF built from this
    # function is registered with StringType() - confirm which is intended.
    try:
        answer = dns.resolver.resolve(domain, 'AAAA')
        addresses = [record.to_text() for record in answer]
        return addresses
    except dns.resolver.NoAnswer:
        # A missing AAAA record is not reported as an error message.
        return None
    except dns.resolver.NXDOMAIN:
        return "Domain not available"
    except dns.resolver.NoNameservers:
        return "No Nameservers"
    except dns.resolver.Timeout:
        return "Timeout"

    
def getMXRecords(domain):
    """Resolve the MX records of a domain.

    Args:
        domain: Domain name passed to ``dns.resolver.resolve``.

    Returns:
        A list with the text form of every mail exchange host on success,
        ``None`` when the lookup fails with NXDOMAIN, NoAnswer,
        NoNameservers or a timeout.
    """
    lookup_failures = (
        dns.resolver.NXDOMAIN,
        dns.resolver.NoAnswer,
        dns.resolver.NoNameservers,
        dns.exception.Timeout,
    )
    try:
        answer = dns.resolver.resolve(domain, 'MX')
        return [record.exchange.to_text() for record in answer]
    except lookup_failures:
        # Every failure is currently collapsed into a missing value.
        return None
    # TODO add to Error table
    

def getRedirectUrl(domain):
    """Fetch ``http://<domain>`` and report the domain the request ended on.

    Args:
        domain: Domain name; it is appended to "http://" to build the URL.

    Returns:
        str: The last two dot-separated labels of the host in the final
        response URL (for example "example.com"). If the request fails, one
        of the status strings "Connection Error", "Read Timeout",
        "Too Many Redirects" or "Request Error".
    """
    try:
        response = requests.get("http://" + domain, timeout=5)
    except requests.exceptions.ConnectionError:
        return "Connection Error"
    except requests.exceptions.ReadTimeout:
        return "Read Timeout"
    except requests.exceptions.TooManyRedirects:
        return "Too Many Redirects"
    except requests.exceptions.RequestException:
        # Any other requests failure (invalid URL, broken response encoding,
        # ...) would otherwise propagate out of the UDF.
        return "Request Error"

    # `hostname` leaves out a port or credentials that `netloc` would keep,
    # so "example.com:8080" is reported as "example.com".
    final_host = urlparse(response.url).hostname or ""
    return '.'.join(final_host.split('.')[-2:])

def getStatusCodeUrl(domain):
    """Fetch ``http://<domain>`` and return the HTTP status code.

    Args:
        domain: Domain name; it is appended to "http://" to build the URL.

    Returns:
        The integer status code of the response, or - if the request fails -
        one of the status strings "Connection Error", "Read Timeout",
        "Too Many Redirects" or "Request Error".
    """
    try:
        response = requests.get("http://" + domain, timeout=5)
        return response.status_code
    except requests.exceptions.ConnectionError:
        return "Connection Error"
    except requests.exceptions.ReadTimeout:
        return "Read Timeout"
    except requests.exceptions.TooManyRedirects:
        return "Too Many Redirects"
    except requests.exceptions.RequestException:
        # Any other requests failure (invalid URL, broken response encoding,
        # ...) would otherwise propagate out of the UDF.
        return "Request Error"
    
def getSOAInformation(domain):
    """Resolve the SOA record(s) of a domain.

    Args:
        domain: Domain name passed to ``dns.resolver.resolve``.

    Returns:
        list[str]: The text form of every SOA record on success. When the
        lookup fails, a single-element list holding a status message
        ("Domain not available", "No Answer", "No Nameservers" or
        "Timeout").
    """
    try:
        result = dns.resolver.resolve(domain, 'SOA')
        return [soa.to_text() for soa in result]
    # The messages are wrapped in list literals: list("...") would split the
    # text into a list of single characters.
    except dns.resolver.NXDOMAIN:
        return ["Domain not available"]
    except dns.resolver.NoAnswer:
        return ["No Answer"]
    except dns.resolver.NoNameservers:
        return ["No Nameservers"]
    except dns.exception.Timeout:
        return ["Timeout"]
    
    
def getNameServers(domain):
    """Resolve the NS records of a domain.

    Args:
        domain: Domain name passed to ``dns.resolver.resolve``.

    Returns:
        list[str]: The text form of every NS record on success. When the
        lookup fails, a single-element list holding a status message
        ("Domain not available", "No Answer", "No Nameservers" or
        "Timeout").
    """
    try:
        result = dns.resolver.resolve(domain, 'NS')
        return [nameserver.to_text() for nameserver in result]
    # The messages are wrapped in list literals: list("...") would split the
    # text into a list of single characters.
    except dns.resolver.NXDOMAIN:
        return ["Domain not available"]
    except dns.resolver.NoAnswer:
        return ["No Answer"]
    except dns.resolver.NoNameservers:
        return ["No Nameservers"]
    except dns.exception.Timeout:
        return ["Timeout"]

In [None]:
# Wrap the lookup helpers as Spark UDFs. `returnType` is the Spark SQL type
# declared for the column that each UDF produces.

# DNS lookups
udf_getARecords = udf(f=getARecords, returnType=ArrayType(StringType()))
udf_getIPv6Records = udf(f=getIPv6Records, returnType=StringType())
udf_getMXRecords = udf(f=getMXRecords, returnType=ArrayType(StringType()))

# HTTP checks
udf_getRedirectUrl = udf(f=getRedirectUrl, returnType=StringType())
udf_getStatusCodeUrl = udf(f=getStatusCodeUrl, returnType=StringType())

# Zone information
udf_getSOAInformation = udf(f=getSOAInformation, returnType=ArrayType(StringType()))
udf_getNameServers = udf(f=getNameServers, returnType=ArrayType(StringType()))

In [None]:
# Add one column per lookup; every UDF receives the value of the
# "top_level_domain" column.
domains_df = (
    domains_df
    .withColumn("a_record_checked", udf_getARecords("top_level_domain"))
    .withColumn("aaaa_record", udf_getIPv6Records("top_level_domain"))
    .withColumn("mx_record_checked", udf_getMXRecords("top_level_domain"))
    .withColumn("redirection", udf_getRedirectUrl("top_level_domain"))
    .withColumn("status_code", udf_getStatusCodeUrl("top_level_domain"))
    .withColumn("soa_information", udf_getSOAInformation("top_level_domain"))
    .withColumn("nameservers", udf_getNameServers("top_level_domain"))
)

In [None]:
#domains_df.show(50)

In [None]:
# Join the elements of the "soa_information" array with single spaces so the
# SOA data is available as one string column.
soa_as_text = concat_ws(" ", col("soa_information"))
domains_df = domains_df.withColumn("soa_infos_rep", soa_as_text)

In [None]:
# Split SOA information into separate columns (all String).
# The space-separated string is indexed by position: item 0 is stored as
# "mname", item 3 as "refresh" and item 6 as "minimum".
split_col = split(domains_df['soa_infos_rep'], ' ')
domains_df = domains_df.withColumn('mname', split_col.getItem(0))
domains_df = domains_df.withColumn('refresh', split_col.getItem(3))
domains_df = domains_df.withColumn('minimum', split_col.getItem(6))

# Remove the last sign '.' if available. The dot is escaped: an unescaped '.'
# is a regex wildcard and would cut off the final character of every value.
domains_df = domains_df.withColumn('mname', regexp_replace('mname', r'\.$', ''))

In [None]:
# Cast the SOA timer columns (seconds) from string to integer
domains_df = (
    domains_df
    .withColumn("refresh", col("refresh").cast("int"))
    .withColumn("minimum", col("minimum").cast("int"))
)

# Print the first rows of the data frame
domains_df.show()

#domains_df = domains_df.withColumn("refresh", domains_df["refresh"].cast(TimestampType()))

#domains_df.dtypes
#print(domains_df.head(5))

In [None]:
# Count the values of each column with descending sort and show it
# domains_df.groupBy("Top-Level-Domain").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("MX-Record").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("A-Record").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("A-Record_Checked").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("MX-Record_Checked").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("Redirection").count().sort(desc("count")).show(20, False)

In [None]:
# Frequency tables of the looked-up values, most frequent first.
# Every table is stored in its own variable. Re-assigning `domains_df` to an
# aggregate would leave only the grouped column and "count", so each following
# statement would reference columns that no longer exist and the per-domain
# rows would be lost before they are displayed and exported.

# mx_record_counts = domains_df \
#         .select(explode(col('mx_record')).alias('mx_record_count')) \
#         .groupBy('mx_record_count') \
#         .count() \
#         .sort(desc("count"))

mx_record_checked_counts = domains_df \
        .select(explode(col('mx_record_checked')).alias('mx_record_checked_count')) \
        .groupBy('mx_record_checked_count') \
        .count() \
        .sort(desc("count"))

# a_record_counts = domains_df \
#         .select(explode(col('a_record')).alias('a_record_count')) \
#         .groupBy('a_record_count') \
#         .count() \
#         .sort(desc("count"))

a_record_checked_counts = domains_df \
        .select(explode(col('a_record_checked')).alias('a_record_checked')) \
        .groupBy('a_record_checked') \
        .count() \
        .sort(desc("count"))

# The SOA primary name server is stored in the string column "mname", so it is
# grouped directly; explode() only applies to array or map columns.
mname_counts = domains_df \
        .groupBy('mname') \
        .count() \
        .sort(desc("count"))

nameservers_counts = domains_df \
        .select(explode(col('nameservers')).alias('nameservers_count')) \
        .groupBy('nameservers_count') \
        .count() \
        .sort(desc("count"))

# Remove the columns that are not wanted in the result
# (drop() ignores column names that are not present).
domains_df = domains_df.drop('mx_record', 'a_record', 'soa_information', 'soa_infos_rep')

In [None]:
# Print the first five rows of the data frame
domains_df.show(n=5)

In [None]:
# Append the rows of the data frame to the PostgreSQL table "domain_enhanced"
(
    domains_df.write
    .format("jdbc")
    .options(
        url=connection,
        dbtable="domain_enhanced",
        user=user,
        password=password,
    )
    .mode("append")
    .save()
)