In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# PostgreSQL access data.
# Each value is taken from the standard libpq environment variable when it is
# set, so real credentials never have to be stored in the notebook. The
# fallbacks are the local development values this notebook has always used.
host = os.environ.get("PGHOST", "localhost")
port = os.environ.get("PGPORT", "5432")
database = os.environ.get("PGDATABASE", "domainanalysis")
user = os.environ.get("PGUSER", "postgres")
password = os.environ.get("PGPASSWORD", "postgres")
table = "domain"

# PostgreSQL connection url (JDBC form, as expected by the Spark JDBC source)
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Create a Spark session
# (getOrCreate() returns the session that is already running, if there is one)
spark = (
    SparkSession.builder
    .appName("ETL_DomainAnaylsis")
    .getOrCreate()
)

# Read data from the database
# The JDBC source only describes the query here; rows are fetched when an
# action (show, count, write, ...) is executed on the frame.
domains_df = (
    spark.read
    .format("jdbc")
    .option("url", connection)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .load()
)

# Display the data frame
# domains_df.show()

In [None]:
import dns
from dns import resolver
import requests
from urllib.parse import urlparse

def getARecords(domain):
    """Resolve the A (IPv4 address) records of ``domain``.

    Args:
        domain: Domain name to look up. ``None`` or an empty string is not
            queried.

    Returns:
        str | None: Every address of the DNS answer joined with ``","`` (the
        delimiter the analysis cells split this column on). When the lookup
        fails, one of the sentinel strings ``"Domain not available"``,
        ``"No Answer"``, ``"No Nameservers"``, ``"Timeout"`` or
        ``"DNS Error"`` is returned instead. ``None`` when no domain was
        given or the answer contains no records.
    """
    if not domain:
        # A missing value (e.g. a NULL column passed through the UDF) cannot
        # be resolved; keep it missing instead of failing the whole job.
        return None
    try:
        result = dns.resolver.resolve(domain, 'A')
    except dns.resolver.NXDOMAIN:
        return "Domain not available"
    except dns.resolver.NoAnswer:
        return "No Answer"
    except dns.resolver.NoNameservers:
        return "No Nameservers"
    except dns.resolver.Timeout:
        return "Timeout"
    except dns.exception.DNSException:
        # Any other dnspython failure (for example a malformed name) must not
        # abort the job for all remaining rows.
        return "DNS Error"

    # Report all records, not only the first one the resolver happened to list.
    addresses = [ipval.to_text() for ipval in result]
    return ",".join(addresses) if addresses else None


def getMXRecords(domain):
    """Resolve the MX (mail exchanger) records of ``domain``.

    Args:
        domain: Domain name to look up. ``None`` or an empty string is not
            queried.

    Returns:
        str | None: The exchange host of every MX record in the DNS answer,
        joined with ``","`` (the delimiter the analysis cells split this
        column on). When the lookup fails, one of the sentinel strings
        ``"Domain not available"``, ``"No Answer"``, ``"No Nameservers"``,
        ``"Timeout"`` or ``"DNS Error"`` is returned instead. ``None`` when
        no domain was given or the answer contains no records.
    """
    if not domain:
        # A missing value (e.g. a NULL column passed through the UDF) cannot
        # be resolved; keep it missing instead of failing the whole job.
        return None
    try:
        result = dns.resolver.resolve(domain, 'MX')
    except dns.resolver.NXDOMAIN:
        return "Domain not available"
    except dns.resolver.NoAnswer:
        return "No Answer"
    except dns.resolver.NoNameservers:
        return "No Nameservers"
    except dns.resolver.Timeout:
        # Handled the same way as in the A-record lookup; a slow name server
        # must not abort the job.
        return "Timeout"
    except dns.exception.DNSException:
        # Any other dnspython failure (for example a malformed name).
        return "DNS Error"

    # Report all exchangers, not only the first one the resolver happened to list.
    exchanges = [mail.exchange.to_text() for mail in result]
    return ",".join(exchanges) if exchanges else None

def getRedirectUrl(domain):
    """Fetch ``http://<domain>`` and report the domain the request ended up on.

    Args:
        domain: Host name to request. ``None`` or an empty string is not
            requested.

    Returns:
        str | None: The last two dot-separated labels of the final URL's
        network location (so ``www.example.com`` becomes ``example.com``).
        When the request fails, ``"Connection Error"``, ``"Timeout"`` or
        ``"Request Error"`` is returned instead. ``None`` when no domain was
        given.

    Note:
        Keeping only two labels is a simplification: multi-part public
        suffixes such as ``co.uk`` are reduced to the suffix itself.
    """
    if not domain:
        # Nothing to request; also avoids a TypeError on "http://" + None.
        return None
    try:
        response = requests.get("http://" + domain, timeout=5)
    except requests.exceptions.ConnectionError:
        return "Connection Error"
    except requests.exceptions.Timeout:
        # Read timeouts are not ConnectionErrors and used to escape uncaught.
        return "Timeout"
    except requests.exceptions.RequestException:
        # Redirect loops, invalid URLs, ... - one bad row must not stop the job.
        return "Request Error"

    netloc = urlparse(response.url).netloc
    return '.'.join(netloc.split('.')[-2:])

def getStatusCodeUrl(domain):
    """Fetch ``http://<domain>`` and report the HTTP status code of the response.

    Args:
        domain: Host name to request. ``None`` or an empty string is not
            requested.

    Returns:
        int | str | None: ``response.status_code`` of the response that
        ``requests.get`` returns. When the request fails, ``"Connection Error"``,
        ``"Timeout"`` or ``"Request Error"`` is returned instead. ``None``
        when no domain was given.
    """
    if not domain:
        # Nothing to request; also avoids a TypeError on "http://" + None.
        return None
    try:
        response = requests.get("http://" + domain, timeout=5)
    except requests.exceptions.ConnectionError:
        return "Connection Error"
    except requests.exceptions.Timeout:
        # Read timeouts are not ConnectionErrors and used to escape uncaught.
        return "Timeout"
    except requests.exceptions.RequestException:
        # Redirect loops, invalid URLs, ... - one bad row must not stop the job.
        return "Request Error"
    return response.status_code
        

In [None]:
# Wrap the lookup helpers as Spark UDFs that produce string columns.
# The return type is given as the DDL type string "string" rather than
# StringType(): StringType is never imported explicitly in this notebook and is
# only in scope when the wildcard import from pyspark.sql.functions happens to
# re-export it, which depends on the installed PySpark version.
udf_getARecords = udf(getARecords, "string")
udf_getMXRecords = udf(getMXRecords, "string")
udf_getRedirectUrl = udf(getRedirectUrl, "string")
udf_getStatusCodeUrl = udf(getStatusCodeUrl, "string")

In [None]:
# Add one column per live lookup, each computed from "Top-Level-Domain".
# Spark evaluates these UDF columns lazily and, without caching, again for
# every action that is later run on the frame, so every show()/count()/write
# would repeat all DNS and HTTP requests and could see different answers.
# cache() keeps the computed rows so the lookups are done once.
domains_df = (
    domains_df
    .withColumn("A-Record_Checked", udf_getARecords("Top-Level-Domain"))
    .withColumn("MX-Record_Checked", udf_getMXRecords("Top-Level-Domain"))
    .withColumn("Redirection", udf_getRedirectUrl("Top-Level-Domain"))
    .withColumn("Status_Code", udf_getStatusCodeUrl("Top-Level-Domain"))
    .cache()
)

In [None]:
# Preview the first 20 rows, including the freshly added lookup columns
domains_df.show(n=20)

In [None]:
# Count the values of each column with descending sort and show it
# (top 20 per column, cell contents not truncated)
for counted_column in (
    "Top-Level-Domain",
    "MX-Record",
    "A-Record",
    "A-Record_Checked",
    "MX-Record_Checked",
    "Redirection",
):
    value_counts = domains_df.groupBy(counted_column).count()
    value_counts.sort(desc("count")).show(20, False)

In [None]:
# For each record column: split the comma separated cell into single entries,
# put every entry on its own row and count how often each entry occurs.
# Pairs are (column to split, name of the exploded column shown in the output).
for source_column, exploded_column in (
    ('MX-Record', 'MX-Record_Count'),
    ('MX-Record_Checked', 'MX-Record_Checked_Count'),
    ('A-Record', 'A-Record_Count'),
    ('A-Record_Checked', 'A-Record_Checked'),
):
    single_entries = explode(split(col(source_column), ','))
    entry_counts = (
        domains_df
        .withColumn(exploded_column, single_entries)
        .groupBy(exploded_column)
        .count()
    )
    entry_counts.sort(desc("count")).show(20, False)


In [None]:
# Write the data frame to the PostgreSQL database
#
# `domains_df` is defined lazily on top of the very table that is overwritten
# here. With mode("overwrite") the JDBC writer drops and recreates the table
# before the rows are fetched, so reading and writing in one step would read
# from an already emptied table and lose the data. Therefore compute and cache
# every row first, then write from the cache.
# NOTE: the cache is best effort (rows can be evicted or an executor can be
# lost); where that matters, write to a staging table and swap it in instead.
domains_df = domains_df.cache()
domains_df.count()  # action that forces all partitions to be computed and cached

(
    domains_df.write
    .format("jdbc")
    .option("url", connection)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .mode("overwrite")
    .save()
)