In [1]:
from pyspark.sql import SparkSession

# Local jar files handed to Spark through "spark.jars" as one comma-separated string.
# NOTE(review): the two "-javadoc" jars look like documentation artifacts rather
# than runtime class jars -- confirm these are the intended files.
local_jars = (
    r"C:\Users\chopp\mongo-spark-connector_2.12-10.4.0-all.jar,"
    r"C:\Users\chopp\mongodb-driver-core-5.2.0-javadoc.jar,"
    r"C:\Users\chopp\bson-5.2.0-javadoc.jar,"
    r"C:\Users\chopp\postgresql-42.2.20.jar"
)

# The same local MongoDB URI is used for both reads and writes.
local_mongo_uri = "mongodb://localhost:27017/mydb"

spark = (
    SparkSession.builder
    .appName("MongoDBIntegration")
    .master("local")
    .config("spark.jars", local_jars)
    .config("spark.mongodb.read.connection.uri", local_mongo_uri)
    .config("spark.mongodb.write.connection.uri", local_mongo_uri)
    .getOrCreate()
)

# This session is stopped immediately after it is created.
spark.stop()

In [6]:
import os

from pyspark.sql import SparkSession

# The MongoDB connection string embeds a username and password, so it is read
# from the environment instead of being stored in the notebook.
# Expected shape: mongodb+srv://USER:PASSWORD@CLUSTER_HOST/?retryWrites=true&w=majority
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string "
        "before running this cell."
    )

# Create the SparkSession. The MongoDB connector and the PostgreSQL JDBC driver
# are requested by Maven coordinates through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("MongoDBIntegration")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:10.0.0,org.postgresql:postgresql:42.3.1",
    )
    .config("spark.mongodb.read.connection.uri", mongodb_uri)
    .config("spark.mongodb.write.connection.uri", mongodb_uri)
    .getOrCreate()
)

# Load the user_data collection of the user_database database into a DataFrame.
df = (
    spark.read
    .format("mongodb")
    .option("database", "user_database")
    .option("collection", "user_data")
    .load()
)

# Display the loaded data.
df.show()



+--------------------+------------+------------+--------------------+----------+
|                 _id|customername|phone_number|             product| sale_date|
+--------------------+------------+------------+--------------------+----------+
|67023b9bc1816958d...|  thanathorn|  0821588636|[{ASUS, 1, 2.0}, ...|2024-10-06|
|67023f84e36a1dfed...|    chaninat|  0812588636|[{SAMSUNG, 1, 200...|2024-10-05|
|67024ba390f757aa4...|   pongpatin|  0814523682|[{SAMSUNG, 14, 52...|2024-10-06|
|6702504b939259ddd...|    kongthai|  0971456202|[{SAMSUNG, 1, 300...|2024-10-31|
|670251b0939259ddd...|   kongthai2|  0945603473|[{SAMSUNG, 1, 500...|2024-10-06|
|670252cd939259ddd...|   kongthai3|  0823654869| [{OPPO, 1, 1000.0}]|2024-10-06|
|6703fe6e9e66d7833...|  thanathorn|  0945603471|[{SAMSUNG, 20, 50...|2024-10-07|
+--------------------+------------+------------+--------------------+----------+



In [7]:
from pyspark.sql.functions import explode, split, col
# Flatten the nested "product" array so each array element becomes its own row,
# then pull the element's fields out into top-level columns.
exploded_sales = df.withColumn("product", explode(col("product")))
df_product = exploded_sales.select(
    "customername",
    "sale_date",
    col("product.product").alias("band"),
    col("product.unit").alias("unit"),
    col("product.price").alias("price"),
)
df_product.show(truncate=False)

# Sale date plus the customer's name and phone number, one row per source document.
df_information = df.select("sale_date", "customername", "phone_number")
df_information.show(truncate=False)

+------------+----------+-------+----+-------+
|customername|sale_date |band   |unit|price  |
+------------+----------+-------+----+-------+
|thanathorn  |2024-10-06|ASUS   |1   |2.0    |
|thanathorn  |2024-10-06|OPPO   |3   |2000.0 |
|chaninat    |2024-10-05|SAMSUNG|1   |2000.0 |
|pongpatin   |2024-10-06|SAMSUNG|14  |52120.0|
|kongthai    |2024-10-31|SAMSUNG|1   |3000.0 |
|kongthai    |2024-10-31|RAZER  |5   |20000.0|
|kongthai    |2024-10-31|IPHONE |1   |29000.0|
|kongthai2   |2024-10-06|SAMSUNG|1   |5000.0 |
|kongthai3   |2024-10-06|OPPO   |1   |1000.0 |
|thanathorn  |2024-10-07|SAMSUNG|20  |50000.0|
+------------+----------+-------+----+-------+

+----------+------------+------------+
|sale_date |customername|phone_number|
+----------+------------+------------+
|2024-10-06|thanathorn  |0821588636  |
|2024-10-05|chaninat    |0812588636  |
|2024-10-06|pongpatin   |0814523682  |
|2024-10-31|kongthai    |0971456202  |
|2024-10-06|kongthai2   |0945603473  |
|2024-10-06|kongthai3   |0823

In [8]:
def write_to_postgresql(df, table_name, mode="append"):
    """Write a Spark DataFrame to a PostgreSQL table over JDBC.

    Connection settings come from environment variables so that no credentials
    are stored in the notebook:

    * ``POSTGRES_JDBC_URL`` -- JDBC URL
      (default ``jdbc:postgresql://localhost:5432/projectmongodb``)
    * ``POSTGRES_USER`` -- database user (default ``postgres``)
    * ``POSTGRES_PASSWORD`` -- database password (required)

    Args:
        df: DataFrame to write; only its ``write`` attribute is used.
        table_name: Name of the destination table (JDBC ``dbtable`` option).
        mode: Save mode passed to the writer. Defaults to ``"append"``,
            so calling the function again adds the same rows again.

    Raises:
        RuntimeError: If ``POSTGRES_PASSWORD`` is not set.
    """
    import os

    # Fail before touching the writer so a missing secret gives a clear message.
    password = os.environ.get("POSTGRES_PASSWORD")
    if password is None:
        raise RuntimeError(
            "Set the POSTGRES_PASSWORD environment variable before writing to PostgreSQL."
        )

    jdbc_url = os.environ.get(
        "POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/projectmongodb"
    )
    user = os.environ.get("POSTGRES_USER", "postgres")

    (
        df.write
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .mode(mode)
        .save()
    )
    
# Persist each DataFrame to its PostgreSQL table, products first.
for frame, target_table in (
    (df_product, "product"),
    (df_information, "information"),
):
    write_to_postgresql(frame, target_table)



In [None]:
# PostgreSQL DDL kept for reference. Every line is commented out, so this cell
# executes nothing.
# The function is meant for a BEFORE INSERT row trigger on `product`: it skips a
# new row when a row with the same customername, sale_date, band, unit and price
# already exists.
# NOTE(review): the comparisons use `=`, so a new row with NULL in any of these
# columns will not match an existing row -- confirm whether NULLs can occur.
#
# CREATE OR REPLACE FUNCTION prevent_duplicate_product()
# RETURNS TRIGGER AS $$
# BEGIN
#     -- Check whether an identical row already exists in the table
#     IF EXISTS (
#         SELECT 1 FROM product
#         WHERE customername = NEW.customername
#         AND sale_date = NEW.sale_date
#         AND band = NEW.band
#         AND unit = NEW.unit
#         AND price = NEW.price
#     ) THEN
#         -- Duplicate found: discard this row
#         RETURN NULL; 
#     END IF;
    
#     -- No duplicate found: insert the new row
#     RETURN NEW; 
# END;
# $$ LANGUAGE plpgsql;



# CREATE TRIGGER trigger_prevent_duplicate_product
# BEFORE INSERT ON product
# FOR EACH ROW EXECUTE FUNCTION prevent_duplicate_product();
