In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# PostgreSQL access data.
# Every setting can be overridden through the corresponding libpq-style
# environment variable. The literal fallbacks are the previous
# local-development values, so the notebook behaves exactly as before when
# nothing is set. Supplying PGUSER / PGPASSWORD from the environment keeps
# real credentials out of the notebook and out of version control.
host = os.environ.get("PGHOST", "0.0.0.0")
port = os.environ.get("PGPORT", "5432")
database = os.environ.get("PGDATABASE", "domainanalysis")
user = os.environ.get("PGUSER", "postgres")
password = os.environ.get("PGPASSWORD", "postgres")
# Name of the table the cleaned data is written to
table = "domain"

# PostgreSQL connection url (JDBC form: host, port and database name)
connection = f"jdbc:postgresql://{host}:{port}/{database}"

# Obtain the Spark session for this ETL job; getOrCreate() hands back an
# already active session instead of building a second one
spark = (
    SparkSession.builder
    .appName("ETL_DomainAnaylsis")
    .getOrCreate()
)

# Load the csv file into a Spark data frame (a double quote acts as the escape
# character), name its four columns and discard the timestamp column
domains_df = (
    spark.read
    .csv('../data/real_domains.csv', escape="\"")
    .toDF("Top-Level-Domain", "MX-Record", "A-Record", "Timestamp")
    .drop('Timestamp')
)

# Print the leading rows of the data frame
domains_df.show()

In [None]:
# Helper that rewrites one column of a data frame via a regex substitution
def cleanData(df, column, toDelete, toReplace):
    """Apply a regular-expression substitution to a single column.

    Args:
        df: Data frame to transform; it must provide ``withColumn``.
        column: Name of the column to rewrite. The substituted values are
            stored under this same name.
        toDelete: Regular-expression pattern whose matches are replaced.
        toReplace: Replacement text inserted for every match.

    Returns:
        The data frame produced by ``withColumn``, in which ``column`` holds
        the substituted values.
    """
    return df.withColumn(column, regexp_replace(column, toDelete, toReplace))

In [None]:
# Remember the column names of the data frame
colNames = domains_df.schema.names

# Strip square brackets and double quotes from the values of every column
strip_pattern = '\\[|\\]|\\"'
for column in colNames:
    domains_df = cleanData(domains_df, column, strip_pattern, "")

In [None]:
# Preview the first five rows of the cleaned data frame
domains_df.head(n=5)

In [None]:
# Substitute the placeholder "No information" for every empty-string value,
# column by column; all other values are kept as they are.
# NOTE(review): NULL values are not equal to "" and therefore pass through
# unchanged - confirm that this is intended.
domains_df = domains_df.select(
    [
        when(col(name) == "", "No information").otherwise(col(name)).alias(name)
        for name in domains_df.columns
    ]
)

# Print the leading rows of the data frame
domains_df.show()

In [None]:
# Persist the data frame to the PostgreSQL table over JDBC; save mode
# "overwrite" replaces whatever the target table held before
(
    domains_df.write
    .format("jdbc")
    .option("url", connection)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .mode("overwrite")
    .save()
)