In [None]:
# Imports
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# PostgreSQL access data.
# Each setting is taken from an environment variable so that real credentials
# do not have to be stored in the notebook. The fallbacks are the original
# development values, so behaviour is unchanged when the variables are unset.
host = os.environ.get("POSTGRES_HOST", "bda_gr4_database")
port = os.environ.get("POSTGRES_PORT", "5432")
database = os.environ.get("POSTGRES_DB", "domainanalysis")
user = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "postgres")

# PostgreSQL JDBC connection url (reused by the write cell further down)
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Create the Spark session, or reuse the one already running in this kernel
spark = SparkSession.builder \
    .appName("domain_analysis") \
    .getOrCreate()

# Read the "domain_records_checked" table over JDBC and drop the three
# A-record / error columns in a single call
domains_df = spark.read \
                .format("jdbc") \
                .option("url", connection) \
                .option("dbtable", "domain_records_checked") \
                .option("user", user) \
                .option("password", password) \
                .load() \
                .drop("a_record_checked", "a_record_checked_error", "mx_record_checked_error")

# Display the first 20 rows of the data frame
domains_df.limit(20).toPandas()

In [None]:
# Import all functions from Functions.ipynb
from ipynb.fs.full.Functions import *

# Wrap the helper functions imported from the Functions notebook as Spark UDFs.

# The A-record lookup is declared to return an array of strings ...
udf_getARecords = udf(getARecords, returnType=ArrayType(StringType()))

# ... while every location helper is declared to return a single string.
udf_getCity = udf(getCity, returnType=StringType())
udf_getRegion = udf(getRegion, returnType=StringType())
udf_getCountry = udf(getCountry, returnType=StringType())
udf_getCoordinates = udf(getCoordinates, returnType=StringType())
udf_getOrg = udf(getOrg, returnType=StringType())
udf_getPostal = udf(getPostal, returnType=StringType())
udf_getTimezone = udf(getTimezone, returnType=StringType())

In [None]:
# Build one row per (top_level_domain, MX record) pair by exploding the
# mx_record_checked array.
domains_mx_record_df = domains_df.select(
    domains_df.top_level_domain,
    explode(domains_df.mx_record_checked).alias("mx_record_checked"),
)

# Resolve each MX record. udf_getARecords returns an array of strings.
domains_mx_record_df = domains_mx_record_df.withColumn(
    "mx_record_ip", udf_getARecords("mx_record_checked")
)

# Reduce the array to a single string for the lookups below.
# Joining with an empty separator (the previous approach) fused several
# entries into one unusable value such as "1.2.3.45.6.7.8". Joining with ","
# and keeping the first piece yields the first non-null entry instead, and
# still yields "" for an empty array and the entry itself for a one-element
# array, exactly as before.
# NOTE(review): assumes the entries are IP addresses and therefore never
# contain a comma - confirm against getARecords in the Functions notebook.
domains_mx_record_df = domains_mx_record_df.withColumn(
    "mx_record_ip", split(concat_ws(",", "mx_record_ip"), ",").getItem(0)
)

# Create the new columns with the lookup results for the resolved address
domains_mx_record_df = (
    domains_mx_record_df
    .withColumn("mx_record_city", udf_getCity("mx_record_ip"))
    .withColumn("mx_record_region", udf_getRegion("mx_record_ip"))
    .withColumn("mx_record_country", udf_getCountry("mx_record_ip"))
    .withColumn("mx_record_coordinates", udf_getCoordinates("mx_record_ip"))
    .withColumn("mx_record_org", udf_getOrg("mx_record_ip"))
    .withColumn("mx_record_postal", udf_getPostal("mx_record_ip"))
    .withColumn("mx_record_timezone", udf_getTimezone("mx_record_ip"))
)

In [None]:
# Preview: collect the first 20 enriched rows as a pandas DataFrame
(
    domains_mx_record_df
    .limit(20)
    .toPandas()
)

In [None]:
# Persist the enriched MX-record rows to the PostgreSQL table
# "domain_mx_record_location" over JDBC, after repartitioning into 8
# partitions and with a JDBC batch size of 10000.
# NOTE(review): mode "append" adds the rows again on every run of this cell.
(
    domains_mx_record_df
    .repartition(8)
    .write
    .format("jdbc")
    .mode("append")
    .option("url", connection)
    .option("dbtable", "domain_mx_record_location")
    .option("user", user)
    .option("password", password)
    .option("batchsize", 10000)
    .save()
)