In [13]:
import findspark

# Root directory of the Spark installation that findspark is pointed at.
# NOTE(review): this cell presumably has to run before the pyspark imports
# below so that pyspark is importable — confirm on a fresh kernel.
SPARK_HOME = "/opt/manual/spark/"
findspark.init(SPARK_HOME)

In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [2]:
# Create (or reuse) the notebook's SparkSession: submitted to YARN, with Hive
# support enabled, under the application name "CleanDataSaveToHdfs".
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("CleanDataSaveToHdfs")
    .enableHiveSupport()
    .getOrCreate()
)

2023-02-22 17:37:37,384 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-02-22 17:37:42,023 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


# Read Data

In [4]:
# One-time download of the raw dataset (uncomment both lines to run):
#! wget -P /home/train/datasets/ \
#https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv

In [3]:
# Configure a CSV reader: comma-separated, first line is the header, and
# column types are inferred from the data.
csv_reader = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
)

# The explicit file:// scheme loads the raw file from the local filesystem.
df = csv_reader.load("file:///home/train/datasets/dirty_store_transactions.csv")

                                                                                

# Clean Data

In [4]:
def _strip_dollar(column_name, target_type):
    """Remove every "$" from the named column and cast the result to ``target_type``."""
    return F.translate(column_name, "$", "").cast(target_type)


# Clean the raw frame column by column; each withColumn replaces the column
# of the same name.
df1 = (
    df
    # Parse the date text with the "y-M-d" pattern into a date column.
    .withColumn("Date", F.to_date("Date", "y-M-d"))
    # Price-like columns: drop the "$" and cast to a numeric type.
    .withColumn("CP", _strip_dollar("CP", DoubleType()))
    .withColumn("DISCOUNT", _strip_dollar("DISCOUNT", FloatType()))
    .withColumn("SP", _strip_dollar("SP", FloatType()))
    .withColumn("MRP", _strip_dollar("MRP", IntegerType()))
    # Keep only ASCII letters and digits in the store location.
    .withColumn("STORE_LOCATION", F.regexp_replace("STORE_LOCATION", "[^a-zA-Z0-9]", ""))
    # Keep only digits in the product id.
    .withColumn("PRODUCT_ID", F.regexp_replace("PRODUCT_ID", "[^0-9]", ""))
)

In [7]:
# Print the schema of the cleaned frame to check the column types after the casts.
df1.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: double (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- Date: date (nullable = true)



In [5]:
# Re-select every column of df1 under its lower-cased name.
lowercased_columns = [F.col(name).alias(name.lower()) for name in df1.columns]
df2 = df1.select(lowercased_columns)

In [6]:
# Print the schema again to confirm the column names are now lower case.
df2.printSchema()

root
 |-- store_id: string (nullable = true)
 |-- store_location: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- mrp: integer (nullable = true)
 |-- cp: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- sp: float (nullable = true)
 |-- date: date (nullable = true)



In [7]:
# Preview the first rows of the cleaned, lower-cased frame.
df2.show()

                                                                                

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|store_id|store_location|product_category|product_id|mrp|   cp|discount|   sp|      date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

# Write to HDFS

In [24]:
# Persist the cleaned frame as Parquet, replacing whatever already exists at
# the target directory ("overwrite" mode).
parquet_writer = df2.write.format("parquet").mode("overwrite")
parquet_writer.save("/user/train/spark_odev_transaction")

                                                                                

In [25]:
# List the output directory on HDFS (-t: sort by modification time) to verify the write.
!hdfs dfs -ls -t /user/train/spark_odev_transaction

Found 2 items
-rw-r--r--   1 train supergroup          0 2023-02-22 17:47 /user/train/spark_odev_transaction/_SUCCESS
-rw-r--r--   1 train supergroup     241687 2023-02-22 17:47 /user/train/spark_odev_transaction/part-00000-44cf0685-c36c-46ad-b6a4-0e4ec56e7e66-c000.snappy.parquet


# Read from HDFS

In [15]:
# Read the Parquet dataset back from the same scheme-less path the write step
# saved to. The path is resolved against the cluster's default filesystem, so
# no NameNode host/port (previously "hdfs://localhost:9000") is hardcoded and
# the read always targets the location that was just written.
df_read = spark.read.format("parquet").load("/user/train/spark_odev_transaction")

                                                                                

In [16]:
# Preview the rows read back from HDFS; they should match the frame that was written.
df_read.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|store_id|store_location|product_category|product_id|mrp|   cp|discount|   sp|      date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

                                                                                

In [12]:
# Stop the session once the notebook is done so the YARN application and its
# executors are released instead of lingering until the kernel dies.
spark.stop()