In [1]:
import findspark
findspark.init("/opt/manual/spark/")

In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [2]:
spark = SparkSession.builder \
.master("yarn") \
.enableHiveSupport() \
.appName("CleanDataSaveToHive") \
.getOrCreate()

2023-02-22 17:01:29,206 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-02-22 17:01:34,631 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


# Read Data

In [4]:
#! wget -P /home/train/datasets/ \
#https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv

In [3]:
df = spark.read \
.format("csv") \
.option("header", True) \
.option("sep", ",") \
.option("inferSchema", True) \
.load("file:///home/train/datasets/dirty_store_transactions.csv")

                                                                                

# Clean Data

In [4]:
df1=df.withColumn("Date", F.to_date("Date", "y-M-d")) \
.withColumn("CP", F.translate("CP", "$", "").cast(DoubleType())) \
.withColumn("DISCOUNT",  F.translate("DISCOUNT", "$", "").cast(FloatType())) \
.withColumn("SP",  F.translate("SP", "$", "").cast(FloatType())) \
.withColumn("MRP",  F.translate("MRP", "$", "").cast(IntegerType())) \
.withColumn("STORE_LOCATION", F.regexp_replace(F.col("STORE_LOCATION"), "[^a-zA-Z0-9]", "")) \
.withColumn("PRODUCT_ID", F.regexp_replace(F.col("PRODUCT_ID"), "[^0-9]", ""))

In [5]:
df1.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: double (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- Date: date (nullable = true)



In [23]:
hive_df=df1.select([F.col(x).alias(x.lower()) for x in df1.columns])

In [24]:
hive_df.printSchema()

root
 |-- store_id: string (nullable = true)
 |-- store_location: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- mrp: integer (nullable = true)
 |-- cp: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- sp: float (nullable = true)
 |-- date: date (nullable = true)



In [25]:
hive_df.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|store_id|store_location|product_category|product_id|mrp|   cp|discount|   sp|      date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

In [26]:
hive_df.count()

37853

# Save data to Hive

In [None]:
spark.sql("CREATE DATABASE IF NOT EXISTS test1")

In [27]:
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  company|
|  default|
|hive_odev|
|    test1|
+---------+



# ORC Format

In [36]:
hive_df.write.format("orc") \
.mode("overwrite") \
.saveAsTable("test1.clean_transactions1")

                                                                                

In [48]:
!hdfs dfs -ls /user/hive/warehouse/test1.db/clean_transactions

Found 2 items
-rw-r--r--   1 train hive          0 2023-02-22 17:04 /user/hive/warehouse/test1.db/clean_transactions/_SUCCESS
-rw-r--r--   1 train hive     194855 2023-02-22 17:04 /user/hive/warehouse/test1.db/clean_transactions/part-00000-9551138c-6532-401d-9b35-4682cb104196-c000.snappy.orc


# Default Format Parquet

In [37]:
hive_df.write. \
.mode("overwrite") \
.saveAsTable("test1.clean_transactions_default")

                                                                                

In [49]:
!hdfs dfs -ls /user/hive/warehouse/test1.db/clean_transactions_default

Found 2 items
-rw-r--r--   1 train hive          0 2023-02-22 17:12 /user/hive/warehouse/test1.db/clean_transactions_default/_SUCCESS
-rw-r--r--   1 train hive     241687 2023-02-22 17:12 /user/hive/warehouse/test1.db/clean_transactions_default/part-00000-02df1282-0e19-4ade-8131-f140819ff36f-c000.snappy.parquet


# Txt Format

In [47]:
hive_df.write \
.mode("overwrite") \
.option("fileFormat", "textfile") \
.saveAsTable("test1.clean_transactions_text")

                                                                                

In [51]:
!hdfs dfs -ls /user/hive/warehouse/test1.db/clean_transactions_text

Found 2 items
-rw-r--r--   1 train hive          0 2023-02-22 17:26 /user/hive/warehouse/test1.db/clean_transactions_text/_SUCCESS
-rw-r--r--   1 train hive     241687 2023-02-22 17:26 /user/hive/warehouse/test1.db/clean_transactions_text/part-00000-67abe4ad-345a-4b78-a80e-4731b710bcc4-c000.snappy.parquet


# Read data from Hive

In [30]:
df_read1 = spark.read.table("test1.clean_transactions1")

In [31]:
df_read1.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|store_id|store_location|product_category|product_id|mrp|   cp|discount|   sp|      date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

In [32]:
df_read2 = spark.sql("select distinct store_location from test1.clean_transactions1")

In [33]:
df_read2.show()



+--------------+
|store_location|
+--------------+
|    Washington|
|       Houston|
|       NewYork|
|         Miami|
|        Denver|
+--------------+



                                                                                

In [34]:
df_read3 = spark.sql("select * from test1.clean_transactions_default limit 2")

In [35]:
df_read3.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|store_id|store_location|product_category|product_id|mrp|   cp|discount|   sp|      date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+



                                                                                

In [None]:
spark.stop()