In [None]:
!pip install psycopg2-binary

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
import psycopg2
import pandas as pd
import io
import os

# Spark session used by every cell of this ETL notebook.
spark = (
    SparkSession.builder
    .appName("etl_domains")
    .getOrCreate()
)

In [None]:
# Read the raw CSV (double quote as the escape character), name its four
# columns, and drop the timestamp column.
domains_df = (
    spark.read
    .csv('../data/real_domains.csv', escape="\"")
    .toDF("top_level_domain", "mx_record", "a_record", "timestamp")
    .drop('timestamp')
)

domains_df.show()

In [None]:
def clean_data(df, column, to_delete, to_replace):
    """Return ``df`` with a regex substitution applied to a single column.

    Args:
        df: DataFrame to transform.
        column: Name of the column to rewrite.
        to_delete: Regular-expression pattern to search for.
        to_replace: Replacement text inserted for every match.

    Returns:
        The result of ``df.withColumn`` with ``column`` replaced by the
        substituted values.
    """
    substituted = regexp_replace(column, to_delete, to_replace)
    return df.withColumn(column, substituted)

# Clean up each column: square brackets become curly braces and double quotes
# are removed (presumably so list values match PostgreSQL array-literal
# syntax; confirm against the target schema). The substitutions go through
# the clean_data helper instead of repeating the regexp_replace call inline.
cleanup_rules = [
    ('\\[', "{"),
    ('\\]', "}"),
    ('\\"', ""),
]

for column in domains_df.schema.names:
    for pattern, replacement in cleanup_rules:
        domains_df = clean_data(domains_df, column, pattern, replacement)

domains_df.show(5)

In [None]:
# Swap the empty-list marker "{}" for the literal text "null"; the COPY step
# later in the notebook is told to load that text as SQL NULL.
domains_df = domains_df.replace(to_replace="{}", value="null")

# Display the data frame
domains_df.show()

In [None]:
# Connection settings for the target PostgreSQL database. Every value can be
# overridden through an environment variable so that real credentials do not
# have to be stored in the notebook; the defaults are the previous hard-coded
# values, so behaviour is unchanged when no variable is set.
# NOTE: values are interpolated into the URL verbatim, so URL-special
# characters (e.g. "@" or "/" in a password) must be percent-encoded.
host = os.environ.get("DOMAINS_DB_HOST", "bda_gr4_database")
port = os.environ.get("DOMAINS_DB_PORT", "5432")
database = os.environ.get("DOMAINS_DB_NAME", "domainanalysis")
user = os.environ.get("DOMAINS_DB_USER", "postgres")
password = os.environ.get("DOMAINS_DB_PASSWORD", "postgres")
# table = "domain"
connection_url = f"postgres://{user}:{password}@{host}:{port}/{database}"

def copy_to_db(df, table):
    """Bulk-load a pandas DataFrame into a PostgreSQL table with COPY.

    The frame is serialised to an in-memory CSV (no header, no index, ``;`` as
    separator) and streamed to the server through ``cursor.copy_from``. Fields
    whose text is exactly ``null`` are loaded as SQL NULL.

    Args:
        df: pandas DataFrame to upload. Its column order presumably has to
            match the column order of ``table`` (no column list is passed to
            ``copy_from``).
        table: Name of the destination table.

    Returns:
        ``None`` when the rows were committed, ``1`` when the load failed; in
        that case the error is printed and the transaction is rolled back.
    """
    # Serialise before connecting so a failure here cannot leak a connection.
    buffer = io.StringIO()
    df.to_csv(buffer, header=False, index=False, sep=";")
    buffer.seek(0)

    conn = psycopg2.connect(connection_url)
    try:
        cursor = conn.cursor()
        try:
            cursor.copy_from(buffer, table, sep=";", null="null")
            conn.commit()
        except (Exception, psycopg2.DatabaseError) as error:
            print("Error: %s" % error)
            conn.rollback()
            return 1
        finally:
            cursor.close()
    finally:
        # Always release the connection, including on the error path.
        conn.close()


# Upload the cleaned frame in fixed-size slices so that only one slice at a
# time is converted to a pandas DataFrame.
slice_size = 100000
count = domains_df.count()

# Attach a 0-based row index used to cut the slices. The indexed frame is
# persisted because every slice filter below would otherwise re-run the whole
# lineage, including the un-partitioned window sort; persisting also keeps the
# ids from monotonically_increasing_id() stable from one slice to the next.
domains_df = domains_df.withColumn(
    "index",
    row_number().over(Window.orderBy(monotonically_increasing_id())) - 1,
).persist()

failed_offsets = []
for i in range(0, count, slice_size):
    df = domains_df.filter((domains_df["index"] >= i) & (domains_df["index"] < i + slice_size))
    # copy_to_db signals a failed load by returning 1; remember the slice
    # instead of silently moving on to the next one.
    if copy_to_db(df.drop("index").toPandas(), "domain") == 1:
        failed_offsets.append(i)

# Release the cached data once all slices have been processed.
domains_df.unpersist()

if failed_offsets:
    print("Failed to load slices starting at rows: %s" % failed_offsets)

# TODO: Discuss whether an empty mx_record ("{}") should really be treated as null.

# see: https://www.mikulskibartosz.name/how-to-speed-up-pyspark/
