[Reference](https://itnext.io/how-to-create-a-simple-etl-job-locally-with-pyspark-postgresql-and-docker-ea53cd43311d)

In [1]:
def initialize_Spark():
    """Return the SparkSession used by this ETL job.

    The session is configured with the ``local[*]`` master and the
    application name ``simple etl job`` and is obtained through the
    builder's ``getOrCreate()``.

    Returns:
        The object returned by ``SparkSession.builder...getOrCreate()``.
    """
    return (
        SparkSession.builder
        .master("local[*]")
        .appName("simple etl job")
        .getOrCreate()
    )

In [2]:
def loadDFWithoutSchema(spark):
    """Read ``autos.csv`` as CSV without supplying an explicit schema.

    The file path is built as ``environ["HOME"] + "/Downloads/autos.csv"``
    and the reader is configured with the ``header`` option set to
    ``"true"``.

    Args:
        spark: Object exposing ``read.format(...).option(...).load(...)``
            (the SparkSession in this notebook).

    Returns:
        Whatever ``load`` returns for that path.
    """
    # NOTE(review): `environ` is presumably os.environ, imported in a cell
    # that is not visible here -- confirm.
    csv_reader = spark.read.format("csv").option("header", "true")
    csv_path = environ["HOME"] + "/Downloads/autos.csv"
    return csv_reader.load(csv_path)

In [3]:
def loadDFWithSchema(spark):
    """Read ``autos.csv`` as CSV using an explicit, fully nullable schema.

    The file path is built as ``environ["HOME"] + "/Downloads/autos.csv"``
    and the reader is configured with the ``header`` option set to
    ``"true"``.

    Args:
        spark: Object exposing ``read.format(...).schema(...).option(...)
            .load(...)`` (the SparkSession in this notebook).

    Returns:
        Whatever ``load`` returns for that path.
    """
    # (column name, Spark type) pairs, in the order the fields are declared
    # to Spark. Every field is created with nullable=True below.
    column_types = [
        ("dateCrawled", TimestampType()),
        ("name", StringType()),
        ("seller", StringType()),
        ("offerType", StringType()),
        ("price", LongType()),
        ("abtest", StringType()),
        ("vehicleType", StringType()),
        ("yearOfRegistration", StringType()),
        ("gearbox", StringType()),
        ("powerPS", ShortType()),
        ("model", StringType()),
        ("kilometer", LongType()),
        ("monthOfRegistration", StringType()),
        ("fuelType", StringType()),
        ("brand", StringType()),
        ("notRepairedDamage", StringType()),
        ("dateCreated", DateType()),
        ("nrOfPictures", ShortType()),
        ("postalCode", StringType()),
        ("lastSeen", TimestampType()),
    ]
    schema = StructType(
        [StructField(name, dtype, True) for name, dtype in column_types]
    )

    csv_reader = spark.read.format("csv").schema(schema).option("header", "true")
    return csv_reader.load(environ["HOME"] + "/Downloads/autos.csv")

In [4]:
def clean_drop_data(df):
    """Drop unused columns and filter rows by seller and offer type.

    Steps, applied in this order:
      1. Drop ``dateCrawled``, ``nrOfPictures`` and ``lastSeen``.
      2. Keep rows where ``seller != "gewerblich"``, then drop ``seller``.
      3. Keep rows where ``offerType != "Gesuch"``, then drop ``offerType``.

    Args:
        df: DataFrame-like object providing ``drop`` and ``where``.

    Returns:
        The result of the final ``drop`` call.
    """
    return (
        df.drop("dateCrawled", "nrOfPictures", "lastSeen")
        .where(col("seller") != "gewerblich")
        .drop("seller")
        .where(col("offerType") != "Gesuch")
        .drop("offerType")
    )

In [6]:
import psycopg2

# Open the connection to the "cars" database on localhost.
# NOTE(review): the admin/admin credentials are hard-coded in the notebook;
# presumably acceptable only for a local demo database -- confirm.
conn = psycopg2.connect(
    host="localhost",
    database="cars",
    user="admin",
    password="admin",
)

# Module-level cursor; a later cell runs the INSERT through it.
cur = conn.cursor()

In [7]:
def create_table(cursor):
    """Execute ``CREATE TABLE IF NOT EXISTS cars_table`` on ``cursor``.

    A single statement is executed; this function does not commit.

    Args:
        cursor: Object with an ``execute(sql)`` method (a psycopg2 cursor
            in this notebook).
    """
    # Adjacent literals reproduce the statement text exactly, including the
    # indentation that is embedded in the SQL string.
    ddl = (
        "CREATE TABLE IF NOT EXISTS cars_table "
        "    (   name VARCHAR(255) NOT NULL, "
        "        price integer NOT NULL, "
        "        abtest VARCHAR(255) NOT NULL, "
        "        vehicleType VARCHAR(255), "
        "        yearOfRegistration VARCHAR(4) NOT NULL, "
        "        gearbox VARCHAR(255), "
        "        powerPS integer NOT NULL, "
        "        model VARCHAR(255), "
        "        kilometer integer, "
        "        monthOfRegistration VARCHAR(255) NOT NULL, "
        "        fuelType VARCHAR(255), "
        "        brand VARCHAR(255) NOT NULL, "
        "        notRepairedDamage VARCHAR(255), "
        "        dateCreated DATE NOT NULL, "
        "        postalCode VARCHAR(255) NOT NULL);"
    )
    cursor.execute(ddl)

In [8]:
def write_postgresql(df):
    """Build the multi-row INSERT statement and its parameters from ``df``.

    Despite the name, nothing is written here: the function only collects
    the rows and prepares what a caller passes to ``cursor.execute``.

    Args:
        df: Object whose ``collect()`` returns an iterable of row sequences.

    Returns:
        A ``(insert_query, cars_seq)`` pair. ``insert_query`` contains one
        ``%s`` placeholder per collected row after ``VALUES``; ``cars_seq``
        is the list of rows, each converted to a tuple. With no rows,
        nothing follows ``VALUES`` in the statement.
    """
    row_tuples = list(map(tuple, df.collect()))

    # One placeholder per row, not per column.
    placeholders = ','.join('%s' for _ in row_tuples)

    # Adjacent literals reproduce the statement text exactly, including the
    # indentation that is embedded in the SQL string.
    statement = (
        "INSERT INTO cars_table (name, price, abtest, vehicleType, yearOfRegistration, gearbox, powerPS, "
        "                        model, kilometer, monthOfRegistration, fuelType, brand, notRepairedDamage, dateCreated, postalCode "
        "                           ) VALUES {}"
    ).format(placeholders)

    return statement, row_tuples

In [9]:
# Run the INSERT statement on the module-level cursor, passing the rows as
# query parameters.
# NOTE(review): insert_query and cars_seq are not assigned in any cell shown
# here; presumably they come from write_postgresql(...) in a cell that is
# not visible -- confirm.
cur.execute(insert_query, cars_seq)

In [10]:
def get_insterted_data(cursor):
    """Print brand, model and price for the rows returned by ``fetchmany(2)``.

    Runs ``select brand, model, price from cars_table`` on ``cursor``,
    fetches with ``fetchmany(2)`` and prints a header line followed by three
    lines per fetched row.

    Args:
        cursor: Object with ``execute(sql)`` and ``fetchmany(size)`` (a
            psycopg2 cursor in this notebook).
    """
    cursor.execute("select brand, model, price from cars_table")

    # Fetch before printing the header, so a failing fetch prints nothing.
    sample_rows = cursor.fetchmany(2)

    print("Printing 2 rows")
    for record in sample_rows:
        print("Brand = ", record[0])
        print("Model = ", record[1])
        print("Price  = ", record[2], "\n")

In [11]:
# Commit the current transaction on the module-level connection.
conn.commit()