In [1]:
!pip install dnspython
!pip install ipynb

import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import requests
import dns.exception
import dns.resolver
# Import all functions from Functions.ipynb
from ipynb.fs.full.Functions import *


# PostgreSQL access data.
# Every value can be overridden through an environment variable (the names
# follow the usual libpq convention) so that credentials do not have to be
# edited into the notebook. The fallbacks are the values that used to be
# hard-coded here, so behaviour is unchanged when no variable is set.
host = os.environ.get("PGHOST", "bda_gr4_database")
port = os.environ.get("PGPORT", "5432")
database = os.environ.get("PGDATABASE", "domainanalysis")
user = os.environ.get("PGUSER", "postgres")
password = os.environ.get("PGPASSWORD", "postgres")

# PostgreSQL connection url (JDBC format)
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Create a Spark session (getOrCreate() reuses an already running one)
spark = SparkSession.builder \
    .appName("domain_analysis") \
    .getOrCreate()

# Read the "domain" table from the database via JDBC
domains_df = spark.read \
                .format("jdbc") \
                .option("url", connection) \
                .option("dbtable", "domain") \
                .option("user", user) \
                .option("password", password) \
                .load()



In [2]:
# Wrap the lookup helpers pulled in from Functions.ipynb as Spark UDFs.
# The second argument declares the Spark type of the column each UDF produces.

# AAAA / IPv6 lookup
udf_IPv6Record = udf(IPv6Record, returnType=BooleanType())
udf_IPv6Record_error = udf(IPv6Record_error, returnType=StringType())

# SOA lookup
udf_getSOAInformation = udf(getSOAInformation, returnType=ArrayType(StringType()))
udf_getSOAInformation_error = udf(getSOAInformation_error, returnType=ArrayType(StringType()))

# Name server lookup
# NOTE(review): this error UDF is declared as integer while the other error
# UDFs return strings / string arrays -- confirm against Functions.ipynb.
udf_getNameServers = udf(getNameServers, returnType=ArrayType(StringType()))
udf_getNameServers_error = udf(getNameServers_error, returnType=IntegerType())

In [3]:
# Apply every DNS UDF to the "top_level_domain" column; each result column is
# followed by its matching *_error column.
domains_df = (
    domains_df
    .withColumn("ipv6_available", udf_IPv6Record("top_level_domain"))
    .withColumn("ipv6_error", udf_IPv6Record_error("top_level_domain"))
    .withColumn("soa_information", udf_getSOAInformation("top_level_domain"))
    .withColumn("soa_information_error", udf_getSOAInformation_error("top_level_domain"))
    .withColumn("nameservers", udf_getNameServers("top_level_domain"))
    .withColumn("nameservers_error", udf_getNameServers_error("top_level_domain"))
)
#domains_df.limit(30).toPandas()

In [4]:
# Join the SOA array (ArrayType<String>) into one space-separated string so
# that the individual fields can be picked by position afterwards.
domains_df = domains_df.withColumn("soa_infos_rep", concat_ws(" ", "soa_information"))

# Split the joined string again and keep three fields, all still strings:
# item 0 -> mname, item 3 -> refresh, item 6 -> minimum
split_col = split(col("soa_infos_rep"), " ")
domains_df = (
    domains_df
    .withColumn("mname", split_col.getItem(0))
    .withColumn("refresh", split_col.getItem(3))
    .withColumn("minimum", split_col.getItem(6))
)

# Helper that maps empty strings of a column to NULL
def replace_empty_strings(x):
    """Build a column expression in which "" is replaced by NULL.

    x: name of the column to clean.
    Returns the expression; every other value of the column is kept as is.
    """
    column = col(x)
    return when(column == "", None).otherwise(column)


# An empty mname becomes NULL
domains_df = domains_df.withColumn("mname", replace_empty_strings("mname"))

In [5]:
# Remove the trailing dot of the SOA mname.
# The dot has to be escaped: an unescaped '.' matches any character, so the
# pattern '.$' would cut the last character of names that do not end in a dot.
domains_df = domains_df.withColumn('mname', regexp_replace('mname', r'\.$', ''))

# Remove last dot per nameserver entry
def lambda_dot_remove(arr):
    """Return a new list in which one trailing dot is removed from each entry.

    Entries that do not end with a dot, and None entries, are passed through
    unchanged instead of losing their last character or raising an error.

    arr: iterable of strings (entries may be None).
    """
    return [x[:-1] if x is not None and x.endswith(".") else x for x in arr]


def fn_remove_dot(arr):
    """None-safe variant of lambda_dot_remove: returns None when arr is None."""
    return None if arr is None else lambda_dot_remove(arr)
# Expose the helper as a Spark UDF that returns an array of strings
udf_remove_last_char_in_array = udf(fn_remove_dot, returnType=ArrayType(StringType()))

# Overwrite the nameservers column with the cleaned entries
domains_df = (
    domains_df
    .select("*")
    .withColumn('nameservers', udf_remove_last_char_in_array(col('nameservers')))
)

In [6]:
# The SOA timers were extracted as strings; convert them to integers
for seconds_col in ("refresh", "minimum"):
    domains_df = domains_df.withColumn(seconds_col, col(seconds_col).cast(IntegerType()))

#domains_df.select('mname').show(20, False)
#domains_df.limit(30).toPandas()

In [7]:
# Count the occurrence of each SOA mname
soa_mname_count = (
    domains_df
    .withColumn('mname', col('mname'))
    .groupBy('mname')
    .count()
)

# Keep the ten most frequent mnames (highest count first)
soa_mname_count_top_ten_df = soa_mname_count.orderBy(col('count').desc()).limit(10)

In [8]:
def getSOAIPv4(mname):
    """Resolve the IPv4 address (A record) of an SOA primary name server.

    Args:
        mname: Host name to look up, or None.

    Returns:
        The last A record of the DNS answer as a string, or None when
        ``mname`` is None/empty, the lookup fails, or the answer holds no
        records.
    """
    if not mname:
        return None
    resolver = dns.resolver.Resolver()
    # Resolver.query() is deprecated since dnspython 2.0 in favour of
    # resolve(); fall back to query() for older installations.
    lookup = getattr(resolver, "resolve", None) or resolver.query
    try:
        response = lookup(mname, "A")
    except dns.exception.DNSException:
        # NXDOMAIN, empty answer, timeout, ...: one failed lookup must not
        # abort the whole Spark job, so the row simply gets no address.
        return None
    addresses = [str(item) for item in response]
    # The previous loop effectively returned only the last record; keep that
    # so the column still holds a single address.
    return addresses[-1] if addresses else None

In [9]:
# Wrap the resolver helper as a UDF producing a string column
udf_getSOAIPv4 = udf(getSOAIPv4, returnType=StringType())

# Look up the IPv4 address of each of the ten most frequent mnames
soa_mname_count_top_ten_df = soa_mname_count_top_ten_df.withColumn(
    "ipv4",
    udf_getSOAIPv4("mname"),
)

In [10]:
# UDFs for the organisation / location helpers, each returning a string.
# NOTE(review): an earlier marker here asked whether these can be removed;
# they are still applied to the "ipv4" column of the top-ten SOA table.
udf_getOrganisation = udf(getOrg, returnType=StringType())
udf_getPostal = udf(getPostal, returnType=StringType())
udf_getCity = udf(getCity, returnType=StringType())
udf_getCountry = udf(getCountry, returnType=StringType())

In [11]:
# Add company information to SOA server (all derived from the "ipv4" column)
soa_mname_count_top_ten_df = (
    soa_mname_count_top_ten_df
    .withColumn("organization", udf_getOrganisation("ipv4"))
    .withColumn("postal", udf_getPostal("ipv4"))
    .withColumn("city", udf_getCity("ipv4"))
    .withColumn("country", udf_getCountry("ipv4"))
)
# Last expression of the cell: render the enriched table
soa_mname_count_top_ten_df.limit(10).toPandas()

Unnamed: 0,mname,count,ipv4,organization,postal,city,country
0,ns1.sedoparking.com,6,3.130.216.63,"AS16509 Amazon.com, Inc.",43026,Hilliard,US
1,,4,,,,,
2,dns01-tld.t-online.de,3,212.185.250.107,AS3320 Deutsche Telekom AG,50676,Köln,DE
3,root-dns.netcup.net,2,46.38.225.225,AS197540 netcup GmbH,76461,Muggensturm,DE
4,ns1.hosting.de,1,134.0.30.178,AS48823 Hosting.de GmbH,33519,Bielefeld,DE
5,ns1.antagus.de,1,195.191.92.10,AS25504 Vautron Rechenzentrum AG,93047,Regensburg,DE
6,brit.ns.cloudflare.com,1,108.162.192.78,"AS13335 Cloudflare, Inc.",94107,San Francisco,US
7,ns1.contabo.net,1,79.143.182.242,AS51167 Contabo GmbH,50676,Köln,DE
8,shades07.rzone.de,1,217.160.82.134,AS8560 1&1 IONOS SE,76137,Karlsruhe,DE
9,ns1.undeveloped.com,1,52.209.184.250,"AS16509 Amazon.com, Inc.",D02,Dublin,IE


In [12]:
# Drop the columns that are no longer needed (raw records and the SOA helper columns)
domains_df = domains_df.drop(
    'mx_record',
    'a_record',
    'soa_information',
    'soa_infos_rep',
)

In [13]:
#domains_df.limit(10).toPandas()

In [14]:
# Write the data frame to the PostgreSQL database
# domains_df.repartition(8).write \
#     .format("jdbc") \
#     .option("url", connection) \
#     .option("dbtable", "domain_enhanced") \
#     .option("user", user) \
#     .option("batchsize", 10000) \
#     .option("password", password) \
#     .mode("append") \
#     .save()