In [None]:
# Required installations
!pip install ipynb
!pip install dnspython
!pip install geoip2

# Imports
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# PostgreSQL access data.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be stored in the notebook. The defaults are the
# values that were previously hard-coded here, so existing setups keep working.
host = os.environ.get("DOMAIN_DB_HOST", "bda_gr4_database")
port = os.environ.get("DOMAIN_DB_PORT", "5432")
database = os.environ.get("DOMAIN_DB_NAME", "domainanalysis")
user = os.environ.get("DOMAIN_DB_USER", "postgres")
password = os.environ.get("DOMAIN_DB_PASSWORD", "postgres")

# PostgreSQL connection url (JDBC form: jdbc:postgresql://<host>:<port>/<database>)
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Get the Spark session named "domain_analysis" (getOrCreate returns an
# existing session if one is already active, otherwise it starts a new one)
spark = (
    SparkSession.builder
    .appName("domain_analysis")
    .getOrCreate()
)

# Load the table "domain_records_checked" from PostgreSQL through JDBC
domains_df = (
    spark.read.format("jdbc")
    .option("url", connection)
    .option("user", user)
    .option("password", password)
    .option("dbtable", "domain_records_checked")
    .load()
)

# Remove the A-record result column and the two error columns in one call
domains_df = domains_df.drop(
    "a_record_checked",
    "a_record_checked_error",
    "mx_record_checked_error",
)

# Preview the first 20 rows as a pandas data frame
(
    domains_df
    .limit(20)
    .toPandas()
)

In [None]:
# Import all functions from Functions.ipynb
from ipynb.fs.full.Functions import *

# Struct returned by the location UDF: six nullable string fields
schema_location = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("iso_code", "country", "city", "postal", "latitude", "longitude")
])

# Struct returned by the ASN UDF: two nullable string fields
schema_asn = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("autonomous_system_number", "autonomous_system_organization")
])

# Wrap the helper functions as Spark UDFs together with their return types.
#
# Spark treats UDFs as deterministic by default and may therefore eliminate or
# duplicate invocations while optimizing the query. The A-record UDF is marked
# as non-deterministic so that Spark does not make that assumption.
# NOTE(review): getARecords is defined in Functions.ipynb and is not visible
# here; presumably it performs live DNS lookups whose result can change
# between calls - confirm.
udf_getARecords = udf(getARecords, ArrayType(StringType())).asNondeterministic()
udf_getGeoLite2_Location = udf(getGeoLite2_Location, schema_location)
udf_getGeoLite2_ASN = udf(getGeoLite2_ASN, schema_asn)


In [None]:
# Create the new columns with the results

# One row per entry of the mx_record_checked array of every domain.
# explode() produces no row for a null or empty array, so domains without
# checked MX records do not appear in the result.
domains_mx_record_geolite2_df = domains_df.select(
    domains_df.top_level_domain,
    explode(domains_df.mx_record_checked).alias("mx_record_checked"),
)

domains_mx_record_geolite2_df = (
    domains_mx_record_geolite2_df
    # The JDBC read above uses no partitioning options and therefore yields a
    # single partition; without this step every UDF call below would run in
    # one task. 8 is the same partition count that is used for the write.
    .repartition(8)
    # Array of values returned by the A-record UDF for the MX entry
    .withColumn("mx_record_ip", udf_getARecords("mx_record_checked"))
    # One row per returned value (rows with a null/empty array are dropped)
    .withColumn("mx_record_ip", explode(col("mx_record_ip")))
    # Struct columns with the GeoLite2 lookup results
    .withColumn("location", udf_getGeoLite2_Location("mx_record_ip"))
    .withColumn("asn", udf_getGeoLite2_ASN("mx_record_ip"))
    # Flatten both structs into top-level columns
    .select("top_level_domain", "mx_record_checked", "mx_record_ip", "location.*", "asn.*")
)


In [None]:
# Preview the first 50 result rows as a pandas data frame
(
    domains_mx_record_geolite2_df
    .limit(50)
    .toPandas()
)

In [None]:
# Append the result rows to the PostgreSQL table "domain_mx_record_geolite2".
# The data frame is split into 8 partitions before writing and the JDBC
# batch size is set to 10000.
(
    domains_mx_record_geolite2_df
    .repartition(8)
    .write.format("jdbc")
    .option("url", connection)
    .option("dbtable", "domain_mx_record_geolite2")
    .option("user", user)
    .option("password", password)
    .option("batchsize", 10000)
    .mode("append")
    .save()
)