In [34]:
!pip install dnspython
!pip install ipynb

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Import all functions from Funtions.ipynb
from ipynb.fs.full.Functions import *

#PostgreSQL access data
host = "bda_gr4_database"
port = "5432"
database = "domainanalysis"
user = "postgres"
password = "postgres"

# PostgreSQL connection url
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Create a Spark session
spark = SparkSession.builder \
    .appName("domain_analysis") \
    .getOrCreate()

# Read data from the database
domains_df = spark.read \
                .format("jdbc") \
                .option("url", connection) \
                .option("dbtable", "domain") \
                .option("user", user) \
                .option("password", password) \
                .load()



In [36]:
# delete
# Get SOA information error from domain  
def getSOAInformation_error(domain):
    try:
        result = dns.resolver.resolve(domain, 'SOA')
        return 0
    except dns.resolver.NXDOMAIN:
        return 1
    except dns.resolver.NoAnswer:
        return 2
    except dns.resolver.NoNameservers:
        return 3
    except dns.exception.Timeout:
        return 4

In [37]:
udf_getIPv6Records = udf(getIPv6Records, StringType())
udf_getSOAInformation = udf(getSOAInformation, ArrayType(StringType()))
udf_getSOAInformation_error = udf(getSOAInformation_error, ArrayType(StringType()))
udf_getNameServers = udf(getNameServers, ArrayType(StringType()))
udf_getNameServers_error = udf(getNameServers_error, IntegerType())

In [39]:
domains_df = domains_df.withColumn("aaaa_record", udf_getIPv6Records("top_level_domain"))
domains_df = domains_df.withColumn("soa_information", udf_getSOAInformation("top_level_domain"))
domains_df = domains_df.withColumn("soa_information_error", udf_getSOAInformation_error("top_level_domain"))
domains_df = domains_df.withColumn("nameservers", udf_getNameServers("top_level_domain"))
domains_df = domains_df.withColumn("nameservers_error", udf_getNameServers_error("top_level_domain"))

In [44]:
# Change ArrayType<String> into String as preparation for information separation
domains_df = domains_df.withColumn("soa_infos_rep", concat_ws(" ", "soa_information"))

# Split SOA information into separate columns (all String)
split_col = split(domains_df['soa_infos_rep'], ' ')
domains_df = domains_df.withColumn('mname', split_col.getItem(0))
domains_df = domains_df.withColumn('refresh', split_col.getItem(3))
domains_df = domains_df.withColumn('minimum', split_col.getItem(6))

#
def replace_empty_strings(x):
    return when(col(x) == "", None).otherwise(col(x))

domains_df = domains_df.withColumn("mname", replace_empty_strings("mname"))

In [46]:
#domains_df.select('mname').show(20, False)

+------------------------+
|mname                   |
+------------------------+
|ns1.sedoparking.com.    |
|shades07.rzone.de.      |
|ns1.undeveloped.com.    |
|ns1.sedoparking.com.    |
|null                    |
|ns1.sedoparking.com.    |
|ns1.hosting.de.         |
|ns1.sedoparking.com.    |
|dns01-tld.t-online.de.  |
|ns1.bestcpanel.eu.      |
|ns1-tec.de.0-500.de.    |
|dns01-tld.t-online.de.  |
|null                    |
|ns1.sedoparking.com.    |
|ns1.ns67.de.            |
|becky.ns.cloudflare.com.|
|ns.namespace4you.de.    |
|root-dns.netcup.net.    |
|ns1.contabo.net.        |
|root-dns.netcup.net.    |
+------------------------+
only showing top 20 rows



In [47]:
# Remove last dot per soa mname
domains_df = domains_df.withColumn('mname', regexp_replace('mname', '.$', ''))   

# Remove last dot per nameserver entry
lambda_dot = lambda arr: [x[:-1] for x in arr]
def fn_remove_dot(arr): return None if arr == None else lambda_dot(arr)
udf_remove_last_char_in_array = udf(fn_remove_dot, ArrayType(StringType()))

domains_df = domains_df \
    .select("*") \
    .withColumn('nameservers', udf_remove_last_char_in_array(col('nameservers')))

In [48]:
# change dtypes of seconds to int
domains_df = domains_df.withColumn("refresh", domains_df["refresh"].cast(IntegerType()))
domains_df = domains_df.withColumn("minimum", domains_df["minimum"].cast(IntegerType()))

#domains_df.select('nameservers').show(20, False)
#domains_df.show(5)

In [49]:
# Count the values of each column with descending sort and show it
# domains_df.groupBy("Top-Level-Domain").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("MX-Record").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("A-Record").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("A-Record_Checked").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("MX-Record_Checked").count().sort(desc("count")).show(20, False)
# domains_df.groupBy("Redirection").count().sort(desc("count")).show(20, False)

In [50]:
# domains_df = domains_df.withColumn('mx_record_count', explode(col('mx_record'))) \
#         .groupBy('mx_record_count') \
#         .count() \
#         .sort(desc("count"))

domains_df = domains_df.withColumn('mx_record_checked_count', explode(col('mx_record_checked'))) \
        .groupBy('mx_record_checked_count') \
        .count() \
        .sort(desc("count"))

# domains_df = domains_df.withColumn('a_record_count', explode(col('a_record'))) \
#         .groupBy('a_record_count') \
#         .count() \
#         .sort(desc("count"))

domains_df = domains_df.withColumn('a_record_checked', explode(col('a_record_checked'))) \
        .groupBy('a_record_checked') \
        .count() \
        .sort(desc("count"))

domains_df = domains_df.withColumn('m_name', explode(col('m_name'))) \
        .groupBy('m_name') \
        .count() \
        .sort(desc("count"))

domains_df = domains_df.withColumn('nameservers_count', explode(col('nameservers'))) \
        .groupBy('nameservers_count') \
        .count() \
        .sort(desc("count"))

domains_df = domains_df.drop('mx_record').drop('a_record').drop('soa_information').drop('soa_infos_rep')

AnalysisException: cannot resolve '`mx_record_checked`' given input columns: [a_record, aaaa_record, minimum, mname, mx_record, nameservers, nameservers_error, refresh, soa_information, soa_information_error, soa_infos_rep, top_level_domain];
'Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#703, nameservers_error#522, soa_infos_rep#598, mname#677, refresh#716, minimum#729, explode('mx_record_checked) AS mx_record_checked_count#742]
+- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#703, nameservers_error#522, soa_infos_rep#598, mname#677, refresh#716, cast(minimum#637 as int) AS minimum#729]
   +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#703, nameservers_error#522, soa_infos_rep#598, mname#677, cast(refresh#624 as int) AS refresh#716, minimum#637]
      +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, fn_remove_dot(nameservers#512) AS nameservers#703, nameservers_error#522, soa_infos_rep#598, mname#677, refresh#624, minimum#637]
         +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#598, mname#677, refresh#624, minimum#637]
            +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#598, regexp_replace(mname#650, .$, , 1) AS mname#677, refresh#624, minimum#637]
               +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#598, CASE WHEN NOT (mname#611 = ) THEN mname#611 ELSE cast(null as string) END AS mname#650, refresh#624, minimum#637]
                  +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#598, mname#611, refresh#624, split(soa_infos_rep#598,  , -1)[6] AS minimum#637]
                     +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#598, mname#611, split(soa_infos_rep#598,  , -1)[3] AS refresh#624, minimum#564]
                        +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#598, split(soa_infos_rep#598,  , -1)[0] AS mname#611, refresh#552, minimum#564]
                           +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, concat_ws( , soa_information#492) AS soa_infos_rep#598, mname#541, refresh#552, minimum#564]
                              +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#531, mname#541, refresh#552, split(soa_infos_rep#531,  , -1)[6] AS minimum#564]
                                 +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#531, mname#541, split(soa_infos_rep#531,  , -1)[3] AS refresh#552]
                                    +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, soa_infos_rep#531, split(soa_infos_rep#531,  , -1)[0] AS mname#541]
                                       +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, nameservers_error#522, concat_ws( , soa_information#492) AS soa_infos_rep#531]
                                          +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, nameservers#512, getNameServers_error(top_level_domain#435) AS nameservers_error#522]
                                             +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, soa_information_error#502, getNameServers(top_level_domain#435) AS nameservers#512, nameservers_error#472]
                                                +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, soa_information#492, getSOAInformation_error(top_level_domain#435) AS soa_information_error#502, nameservers#463, nameservers_error#472]
                                                   +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#482, getSOAInformation(top_level_domain#435) AS soa_information#492, soa_information_error#455, nameservers#463, nameservers_error#472]
                                                      +- Project [top_level_domain#435, mx_record#436, a_record#437, getIPv6Records(top_level_domain#435) AS aaaa_record#482, soa_information#448, soa_information_error#455, nameservers#463, nameservers_error#472]
                                                         +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#442, soa_information#448, soa_information_error#455, nameservers#463, getNameServers_error(top_level_domain#435) AS nameservers_error#472]
                                                            +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#442, soa_information#448, soa_information_error#455, getNameServers(top_level_domain#435) AS nameservers#463]
                                                               +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#442, soa_information#448, getSOAInformation_error(top_level_domain#435) AS soa_information_error#455]
                                                                  +- Project [top_level_domain#435, mx_record#436, a_record#437, aaaa_record#442, getSOAInformation(top_level_domain#435) AS soa_information#448]
                                                                     +- Project [top_level_domain#435, mx_record#436, a_record#437, getIPv6Records(top_level_domain#435) AS aaaa_record#442]
                                                                        +- Relation[top_level_domain#435,mx_record#436,a_record#437] JDBCRelation(domain) [numPartitions=1]


In [None]:
domains_df.show(5)

In [None]:
# Write the data frame to the PostgreSQL database
domains_df.write \
    .format("jdbc") \
    .option("url", connection) \
    .option("dbtable", "domain_enhanced") \
    .option("user", user) \
    .option("password", password) \
    .mode("append") \
    .save()