In [1]:
from pyspark.sql import SparkSession

# Connector jars for PostgreSQL JDBC and the Neo4j Spark connector:
# comma-separated for spark.jars, colon-separated for the classpath settings.
jar_path = "/home/jovyan/jars/postgresql-42.7.5.jar,/home/jovyan/jars/neo4j-connector-apache-spark_2.12-5.3.7_for_spark_3.jar"
class_path = "/home/jovyan/jars/postgresql-42.7.5.jar:/home/jovyan/jars/neo4j-connector-apache-spark_2.12-5.3.7_for_spark_3.jar"

# Settings applied to the session builder, in insertion order.
spark_settings = {
    "spark.jars": jar_path,
    "spark.driver.extraClassPath": class_path,
    "spark.executor.extraClassPath": class_path,
}

session_builder = SparkSession.builder.appName("PostgreSQL + Neo4j Integration")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [None]:
import os

# PostgreSQL connection settings. Every value can be overridden through an
# environment variable so credentials do not have to be edited (or committed)
# in the notebook; the fallbacks are the values this notebook used before.
url = os.environ.get(
    "POSTGRES_JDBC_URL",
    "jdbc:postgresql://host.docker.internal:65432/mydatabase",
)
properties = {
    "user": os.environ.get("POSTGRES_USER", "myuser"),
    "password": os.environ.get("POSTGRES_PASSWORD", "mysecretpassword"),
    "driver": "org.postgresql.Driver",
}

# JDBC-backed DataFrames for the two tables used by the product analysis.
products_df = spark.read.jdbc(url=url, table="products", properties=properties)
sales_df = spark.read.jdbc(url=url, table="sales", properties=properties)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, avg, desc

# Source tables for the product analysis.
products_df = spark.read.jdbc(url=url, table="products", properties=properties)
sales_df = spark.read.jdbc(url=url, table="sales", properties=properties)

# Both joins below match a sale to its product on sale_product_id.
sale_matches_product = col("sales.sale_product_id") == col("products.sale_product_id")

# Quantity, revenue and number of sales per product.
product_totals_df = sales_df.groupBy("sale_product_id").agg(
    sum("sale_quantity").alias("total_quantity_sold"),
    sum("sale_total_price").alias("total_revenue"),
    count("*").alias("sales_count"),
)

# Ten products with the highest sold quantity, enriched with name and category.
top_products_df = (
    product_totals_df.alias("sales")
    .join(products_df.alias("products"), sale_matches_product, "left")
    .select(
        col("sales.sale_product_id").alias("product_id"),
        col("products.product_name"),
        col("products.product_category"),
        col("total_quantity_sold"),
        col("total_revenue"),
        col("sales_count"),
    )
    .orderBy(desc("total_quantity_sold"))
    .limit(10)
)

# Revenue per product category.
# The left join yields a null category for sales without a matching product
# (or products without a category). "category" is later used as the Neo4j
# node key, and a null key cannot be merged, so that group is dropped here,
# the same way the supplier analysis drops its null keys.
revenue_by_category_df = (
    sales_df.alias("sales")
    .join(products_df.alias("products"), sale_matches_product, "left")
    .groupBy("products.product_category")
    .agg(sum("sales.sale_total_price").alias("total_revenue"))
    .withColumnRenamed("product_category", "category")
    .filter(col("category").isNotNull())
    .orderBy(desc("total_revenue"))
)

# Rating and review columns for every product that has a rating.
ratings_df = products_df.select(
    col("sale_product_id").alias("product_id"),
    "product_name",
    "product_category",
    "product_rating",
    "product_reviews",
).filter(col("product_rating").isNotNull())

import os

# Connection options shared by every Neo4j write in this notebook.
# Each value can be overridden through an environment variable so the
# credentials do not have to live in the notebook; the fallbacks are the
# values that were previously hardcoded here.
neo4j_options = {
    "url": os.environ.get("NEO4J_URL", "bolt://neo4j:7687"),
    "authentication.basic.username": os.environ.get("NEO4J_USER", "neo4j"),
    "authentication.basic.password": os.environ.get("NEO4J_PASSWORD", "password"),
}

# (DataFrame, node label, node key columns) for each product result set,
# written to Neo4j in this order.
product_exports = [
    (top_products_df, ":TopSellingProduct", "product_id"),
    (revenue_by_category_df, ":RevenueByCategory", "category"),
    (ratings_df, ":ProductRating", "product_id"),
]

for export_df, node_label, key_columns in product_exports:
    neo4j_writer = (
        export_df.write
        .format("org.neo4j.spark.DataSource")
        .mode("overwrite")
        .options(**neo4j_options)
    )
    neo4j_writer.option("labels", node_label).option("node.keys", key_columns).save()

In [None]:
from pyspark.sql.functions import col, sum, count, desc

# Source tables for the customer analysis.
customers_df = spark.read.jdbc(url=url, table="customers", properties=properties)
sales_df = spark.read.jdbc(url=url, table="sales", properties=properties)

# Join condition and customer attributes shared by the per-customer reports.
sale_matches_customer = col("sales.sale_customer_id") == col("customers.sale_customer_id")
customer_columns = [
    col("sales.sale_customer_id").alias("customer_id"),
    col("customers.customer_first_name"),
    col("customers.customer_last_name"),
    col("customers.customer_email"),
    col("customers.customer_country"),
]

# Ten customers with the highest total spend.
top_customers_df = (
    sales_df.groupBy("sale_customer_id")
    .agg(sum("sale_total_price").alias("total_spent"))
    .alias("sales")
    .join(customers_df.alias("customers"), sale_matches_customer, "left")
    .select(*customer_columns, col("total_spent"))
    .orderBy(desc("total_spent"))
    .limit(10)
)

# Number of customers per country.
customers_by_country_df = (
    customers_df.groupBy("customer_country")
    .agg(count("*").alias("customer_count"))
    .withColumnRenamed("customer_country", "country")
    .orderBy(desc("customer_count"))
)

# Average check per customer: revenue divided by the number of sales rows.
# The previous version divided by sum("sale_quantity"), which yields the
# average price per unit rather than per purchase and disagreed with the
# per-store and per-month averages computed elsewhere in this notebook.
avg_check_df = (
    sales_df.groupBy("sale_customer_id")
    .agg((sum("sale_total_price") / count("*")).alias("avg_check"))
    .alias("sales")
    .join(customers_df.alias("customers"), sale_matches_customer, "left")
    .select(*customer_columns, col("avg_check"))
    .orderBy(desc("avg_check"))
)

# (DataFrame, node label, node key columns) for each customer result set,
# written to Neo4j in this order.
customer_exports = [
    (top_customers_df, ":TopCustomer", "customer_id"),
    (customers_by_country_df, ":CustomerCountryStats", "country"),
    (avg_check_df, ":AverageCheck", "customer_id"),
]

for export_df, node_label, key_columns in customer_exports:
    neo4j_writer = (
        export_df.write
        .format("org.neo4j.spark.DataSource")
        .mode("overwrite")
        .options(**neo4j_options)
    )
    neo4j_writer.option("labels", node_label).option("node.keys", key_columns).save()

In [None]:
from pyspark.sql.functions import year, month, sum, count

# Source table for the time-based analysis.
sales_df = spark.read.jdbc(url=url, table="sales", properties=properties)

# Calendar columns derived from sale_date, shared by the reports below.
sales_with_year_df = sales_df.withColumn("year", year("sale_date"))
sales_with_month_df = sales_with_year_df.withColumn("month", month("sale_date"))
sales_per_month = sales_with_month_df.groupBy("year", "month")

# Revenue and sold quantity per calendar month.
monthly_trends_df = sales_per_month.agg(
    sum("sale_total_price").alias("monthly_revenue"),
    sum("sale_quantity").alias("monthly_quantity"),
).orderBy("year", "month")

# Revenue and number of sales per calendar year.
yearly_comparison_df = (
    sales_with_year_df.groupBy("year")
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        count("*").alias("sales_count"),
    )
    .orderBy("year")
)

# Average order value per calendar month: revenue divided by number of sales.
avg_order_monthly_df = sales_per_month.agg(
    (sum("sale_total_price") / count("*")).alias("avg_order_value")
).orderBy("year", "month")

# (DataFrame, node label, node key columns) for each time-based result set,
# written to Neo4j in this order.
time_exports = [
    (monthly_trends_df, ":MonthlySalesTrend", "year,month"),
    (yearly_comparison_df, ":YearlySalesSummary", "year"),
    (avg_order_monthly_df, ":MonthlyAvgOrderValue", "year,month"),
]

for export_df, node_label, key_columns in time_exports:
    neo4j_writer = (
        export_df.write
        .format("org.neo4j.spark.DataSource")
        .mode("overwrite")
        .options(**neo4j_options)
    )
    neo4j_writer.option("labels", node_label).option("node.keys", key_columns).save()

In [None]:
# Source tables for the store analysis.
stores_df = spark.read.jdbc(url=url, table="stores", properties=properties)
sales_df = spark.read.jdbc(url=url, table="sales", properties=properties)

# Sales grouped by the store name recorded on each sale.
sales_per_store = sales_df.groupBy("store_name")

# Five stores with the highest revenue.
top_stores_df = (
    sales_per_store.agg(
        sum("sale_total_price").alias("total_revenue"),
        count("*").alias("sales_count"),
    )
    .orderBy(desc("total_revenue"))
    .limit(5)
)

# Revenue per store city and country.
# The left join leaves city/country null for sales whose store has no match
# (or whose store lacks a location). Both columns are later used as the Neo4j
# node key, and a null key cannot be merged, so those groups are dropped here,
# the same way the supplier analysis drops its null keys.
geo_sales_df = (
    sales_df.alias("sales")
    .join(stores_df.alias("stores"), col("sales.store_name") == col("stores.store_name"), "left")
    .groupBy("stores.store_city", "stores.store_country")
    .agg(sum("sales.sale_total_price").alias("total_revenue"))
    .withColumnRenamed("store_city", "city")
    .withColumnRenamed("store_country", "country")
    .filter(col("city").isNotNull() & col("country").isNotNull())
    .orderBy(desc("total_revenue"))
)

# Average receipt per store: revenue divided by number of sales.
avg_check_per_store_df = sales_per_store.agg(
    (sum("sale_total_price") / count("*")).alias("avg_receipt")
).orderBy(desc("avg_receipt"))

# (DataFrame, node label, node key columns) for each store result set,
# written to Neo4j in this order.
store_exports = [
    (top_stores_df, ":TopStoresByRevenue", "store_name"),
    (geo_sales_df, ":GeoSales", "city,country"),
    (avg_check_per_store_df, ":StoreAvgCheck", "store_name"),
]

for export_df, node_label, key_columns in store_exports:
    neo4j_writer = (
        export_df.write
        .format("org.neo4j.spark.DataSource")
        .mode("overwrite")
        .options(**neo4j_options)
    )
    neo4j_writer.option("labels", node_label).option("node.keys", key_columns).save()

In [None]:
# Source tables for the supplier analysis.
suppliers_df = spark.read.jdbc(url=url, table="suppliers", properties=properties)
products_df = spark.read.jdbc(url=url, table="products", properties=properties)
sales_df = spark.read.jdbc(url=url, table="sales", properties=properties)

# A product is linked to a supplier through its brand name.
brand_matches_supplier = col("products.product_brand") == col("suppliers.supplier_name")

# Every sale with its product and (where the brand matches) its supplier.
sales_with_suppliers_df = (
    sales_df.alias("sales")
    .join(products_df.alias("products"), col("sales.sale_product_id") == col("products.sale_product_id"), "left")
    .join(suppliers_df.alias("suppliers"), brand_matches_supplier, "left")
)

# Five suppliers with the highest revenue.
# Sales without a matching supplier form a null group; it is removed BEFORE
# ordering and limiting. Filtering after limit(5), as was done previously,
# lets the null group occupy one of the five slots and leaves only four
# suppliers in the result.
supplier_revenue_df = (
    sales_with_suppliers_df
    .groupBy("suppliers.supplier_name")
    .agg(sum("sales.sale_total_price").alias("total_revenue"))
    .filter(col("supplier_name").isNotNull())
    .orderBy(desc("total_revenue"))
    .limit(5)
)

# Average product price per supplier.
avg_price_by_supplier_df = (
    products_df.alias("products")
    .join(suppliers_df.alias("suppliers"), brand_matches_supplier, "left")
    .groupBy("suppliers.supplier_name")
    .agg(avg("products.product_price").alias("avg_product_price"))
    .orderBy(desc("avg_product_price"))
)

# Revenue per supplier country.
sales_by_supplier_country_df = (
    sales_with_suppliers_df
    .groupBy("suppliers.supplier_country")
    .agg(sum("sales.sale_total_price").alias("total_revenue"))
    .withColumnRenamed("supplier_country", "country")
    .orderBy(desc("total_revenue"))
)

# Frames that are written to Neo4j: rows with a null node key are excluded.
# supplier_revenue_df is already free of null supplier names (see above).
supplier_revenue_df_clean = supplier_revenue_df
avg_price_by_supplier_df_clean = avg_price_by_supplier_df.filter(col("supplier_name").isNotNull())
sales_by_supplier_country_df_clean = sales_by_supplier_country_df.filter(col("country").isNotNull())

# (DataFrame, node label, node key columns) for each supplier result set,
# written to Neo4j in this order.
supplier_exports = [
    (supplier_revenue_df_clean, ":TopSuppliersByRevenue", "supplier_name"),
    (avg_price_by_supplier_df_clean, ":AvgPriceBySupplier", "supplier_name"),
    (sales_by_supplier_country_df_clean, ":SalesBySupplierCountry", "country"),
]

for export_df, node_label, key_columns in supplier_exports:
    neo4j_writer = (
        export_df.write
        .format("org.neo4j.spark.DataSource")
        .mode("overwrite")
        .options(**neo4j_options)
    )
    neo4j_writer.option("labels", node_label).option("node.keys", key_columns).save()


In [None]:
from pyspark.sql.functions import col, desc, asc, sum

# NOTE: products_df and sales_df are the frames loaded by earlier cells.

# Products that have a rating, reduced to the columns reported below.
rated_products_df = products_df.select(
    "sale_product_id", "product_name", "product_category", "product_rating"
).filter(col("product_rating").isNotNull())

# Five highest-rated products followed by the five lowest-rated ones.
extreme_ratings_df = (
    rated_products_df.orderBy(desc("product_rating")).limit(5)
    .union(rated_products_df.orderBy(asc("product_rating")).limit(5))
)

# Sold quantity per rating value.
# The left join produces a null rating for sales without a matching product,
# and products themselves may have no rating. "product_rating" is later used
# as the Neo4j node key, and a null key cannot be merged, so that group is
# dropped, mirroring the null-rating filter applied to the other frames here.
rating_sales_corr_df = (
    sales_df.alias("sales")
    .join(products_df.alias("products"), col("sales.sale_product_id") == col("products.sale_product_id"), "left")
    .groupBy("products.product_rating")
    .agg(sum("sales.sale_quantity").alias("total_quantity_sold"))
    .filter(col("product_rating").isNotNull())
    .orderBy("product_rating")
)

# Ten products with the most reviews.
most_reviewed_df = (
    products_df.select(
        "sale_product_id", "product_name", "product_category", "product_reviews"
    )
    .filter(col("product_reviews").isNotNull())
    .orderBy(desc("product_reviews"))
    .limit(10)
)

# (DataFrame, node label, node key columns) for each rating/review result set,
# written to Neo4j in this order.
rating_exports = [
    (extreme_ratings_df, ":ExtremeRatedProducts", "sale_product_id"),
    (rating_sales_corr_df, ":RatingVsSalesVolume", "product_rating"),
    (most_reviewed_df, ":MostReviewedProducts", "sale_product_id"),
]

for export_df, node_label, key_columns in rating_exports:
    neo4j_writer = (
        export_df.write
        .format("org.neo4j.spark.DataSource")
        .mode("overwrite")
        .options(**neo4j_options)
    )
    neo4j_writer.option("labels", node_label).option("node.keys", key_columns).save()
