In [2]:
# findspark adds the Spark installation found at the given path to sys.path,
# which is what lets the `pyspark` imports in the following cells resolve.
# This cell therefore has to run before anything imports pyspark.
import findspark
findspark.init("/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [4]:
import configparser

In [5]:
# Create the Spark session used by the rest of the notebook (or reuse one that
# already exists in this kernel). "local[2]" runs Spark in-process with two
# worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("CleanDataSaveToPostGre")
    .getOrCreate()
)

2023-02-22 16:12:53,102 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Load the database connection settings from an INI-style file so that the
# credentials never have to appear in the notebook itself.
config = configparser.RawConfigParser()

# `config.read()` silently skips files it cannot open, so a missing or
# unreadable file would only surface below as a confusing NoSectionError.
# Opening the file explicitly and using `read_file()` fails right here with
# the real cause (FileNotFoundError / PermissionError) instead.
with open('/opt/db_conn') as db_conn_file:
    config.read_file(db_conn_file)

# Expected layout: a [DB] section with the keys user_name, password and db_ip.
user_name = config.get('DB', 'user_name')
password = config.get('DB', 'password')
db_ip = config.get('DB', 'db_ip')

# Read Data

In [None]:
#! wget -P /home/train/datasets/ \
#https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv

In [26]:
# Load the raw ("dirty") transactions CSV from the local filesystem.
# The first line is used as the header and Spark infers the column types.
df = (
    spark.read
    .format("csv")
    .options(header=True, sep=",", inferSchema=True)
    .load("file:///home/train/datasets/dirty_store_transactions.csv")
)

# Clean Data

In [8]:
# Clean the raw columns in place (each withColumn overwrites the column of the
# same name, so the column order is unchanged):
#   * Date                 -> parsed to a date using the "y-M-d" pattern
#   * CP, DISCOUNT, SP, MRP -> "$" characters removed, then cast to a numeric type
#   * STORE_LOCATION       -> every character that is not an ASCII letter/digit removed
#   * PRODUCT_ID           -> every non-digit character removed
df1 = (
    df
    .withColumn("Date", F.to_date(F.col("Date"), "y-M-d"))
    .withColumn("CP", F.translate(F.col("CP"), "$", "").cast(DoubleType()))
    .withColumn("DISCOUNT", F.translate(F.col("DISCOUNT"), "$", "").cast(FloatType()))
    .withColumn("SP", F.translate(F.col("SP"), "$", "").cast(FloatType()))
    .withColumn("MRP", F.translate(F.col("MRP"), "$", "").cast(IntegerType()))
    .withColumn("STORE_LOCATION", F.regexp_replace(F.col("STORE_LOCATION"), "[^a-zA-Z0-9]", ""))
    .withColumn("PRODUCT_ID", F.regexp_replace(F.col("PRODUCT_ID"), "[^0-9]", ""))
)

In [11]:
# Confirm the column types produced by the casts in the cleaning step.
df1.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: double (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- Date: date (nullable = true)



In [12]:
# Preview the cleaned rows (show() prints the first 20 rows by default).
df1.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      Date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

In [16]:
# Re-select every column under its lowercase name.
# NOTE(review): presumably so the PostgreSQL columns can be referenced without
# quoting (the query further down uses lowercase names) — confirm.
df2 = df1.select(
    *[F.col(column_name).alias(column_name.lower()) for column_name in df1.columns]
)

In [14]:
# Check that every column name is lowercase and the types are unchanged.
df2.printSchema()

root
 |-- store_id: string (nullable = true)
 |-- store_location: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- mrp: integer (nullable = true)
 |-- cp: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- sp: float (nullable = true)
 |-- date: date (nullable = true)



# Save Data to PostgreSQL

In [20]:
# Write the cleaned, lowercase-column frame to the `clean_transactions` table
# over JDBC. Mode "overwrite" replaces whatever the table currently contains.
# Host and credentials come from the config file loaded earlier.
(
    df2.write
    .format("jdbc")
    .options(
        driver="org.postgresql.Driver",
        url=f"jdbc:postgresql://{db_ip}:5432/traindb",
        dbtable="clean_transactions",
        user=user_name,
        password=password,
    )
    .mode("overwrite")
    .save()
)

                                                                                

In [11]:
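# NOTE: earlier draft of the JDBC write, kept for reference only. The literal
# credentials were redacted; use the values loaded from the config file and
# never commit real passwords to a notebook.
#(df1
#.write.format("jdbc")
#.mode("overwrite")
#.option("driver", "org.postgresql.Driver")
#.option("url", "jdbc:postgresql://127.0.0.1:5432/traindb")
#.option("dbtable", "clean_transactions")
#.option("user", user_name)
#.option("password", password)
#.save())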

                                                                                

# Read Data from PostgreSQL

In [24]:
# Run the aggregation inside PostgreSQL and load only its result: the average
# CP and average discount per store location from `clean_transactions`.
df_analysis_query = (
    spark.read
    .format("jdbc")
    .options(
        driver="org.postgresql.Driver",
        url=f"jdbc:postgresql://{db_ip}:5432/traindb",
        query=(
            "select store_location, avg(cp) as avg_cp, avg(discount) AS avg_discount "
            "from clean_transactions group by store_location"
        ),
        user=user_name,
        password=password,
    )
    .load()
)

In [25]:
# Display the per-location averages returned by the PostgreSQL query.
df_analysis_query.show()

+--------------+------------------+------------------+
|store_location|            avg_cp|      avg_discount|
+--------------+------------------+------------------+
|    Washington|37.717426778242505| 3.414211990766362|
|       Houston|37.754843601895644| 3.377819901337556|
|        Denver|37.361677360219716|3.2846746087675243|
|         Miami|  36.9078081058724| 3.349412734617962|
|       NewYork| 37.01221354166632|3.2593142362182133|
+--------------+------------------+------------------+



In [None]:
#spark.stop()