In [1]:
import findspark
# Initialise findspark with the Spark installation at /opt/manual/spark/
# before the pyspark imports in the following cell.
# NOTE(review): this is a machine-specific absolute path; adjust it when the
# notebook is run on another host.
findspark.init("/opt/manual/spark/")

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [3]:
# Create (or reuse) the notebook's Spark session: application name "CleanData",
# local master with two worker threads, Hive support enabled.
spark = (
    SparkSession.builder
    .appName("CleanData")
    .master("local[2]")
    .enableHiveSupport()
    .getOrCreate()
)

2023-02-22 16:42:22,757 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Optional one-time download of the dataset into /home/train/datasets/ (kept commented out):
#! wget -P /home/train/datasets/ \
#https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv

In [4]:
# Read the raw transactions CSV from the local filesystem. The first row is
# treated as the header, fields are comma-separated, and column types are
# inferred by Spark.
df = (
    spark.read
    .format("csv")
    .options(header=True, sep=",", inferSchema=True)
    .load("file:///home/train/datasets/dirty_store_transactions.csv")
)

                                                                                

In [5]:
# Cleaning pass over the raw frame (columns are replaced in place):
#   Date            -> parsed into a date using the "y-M-d" pattern
#   CP              -> "$" removed, cast to double
#   DISCOUNT, SP    -> "$" removed, cast to float
#   MRP             -> "$" removed, cast to integer
#   STORE_LOCATION  -> every character outside a-z, A-Z, 0-9 removed
#   PRODUCT_ID      -> every non-digit character removed
# NOTE(review): CP is double precision while DISCOUNT and SP are single
# precision; confirm the mixed precision is intentional.
df1 = (
    df
    .withColumn("Date", F.to_date(F.col("Date"), "y-M-d"))
    .withColumn("CP", F.translate(F.col("CP"), "$", "").cast(DoubleType()))
    .withColumn("DISCOUNT", F.translate(F.col("DISCOUNT"), "$", "").cast(FloatType()))
    .withColumn("SP", F.translate(F.col("SP"), "$", "").cast(FloatType()))
    .withColumn("MRP", F.translate(F.col("MRP"), "$", "").cast(IntegerType()))
    .withColumn("STORE_LOCATION", F.regexp_replace("STORE_LOCATION", "[^a-zA-Z0-9]", ""))
    .withColumn("PRODUCT_ID", F.regexp_replace("PRODUCT_ID", "[^0-9]", ""))
)

In [6]:
# Print the schema of the cleaned frame to check the column types after the casts.
df1.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: double (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- Date: date (nullable = true)



In [7]:
# Preview rows of the cleaned frame.
df1.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      Date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

In [8]:
# Same data as df1, with every column name lower-cased.
df2 = df1.select(
    *(F.col(name).alias(name.lower()) for name in df1.columns)
)

In [9]:
# Preview rows of the frame with lower-cased column names.
df2.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|store_id|store_location|product_category|product_id|mrp|   cp|discount|   sp|      date|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|       NewYork|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|       NewYork|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|       NewYork|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|       NewYork|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|       NewYork|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|       NewYork|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

In [10]:
# Register df2 as the temporary view "data_temp" so it can be queried with spark.sql.
df2.createOrReplaceTempView("data_temp")

In [11]:
# Aggregate the "data_temp" view per STORE_LOCATION, summing CP, DISCOUNT and SP.
# Presumably a sanity check on the cleaned numeric columns (inferred from the
# variable name; confirm with the author).
# The query spells the column names in upper case although the view's columns
# are lower-case; the recorded output shows that they resolve.
df_check = spark.sql("""
    
    
    SELECT STORE_LOCATION, SUM(CP), SUM(DISCOUNT), SUM(SP)
    FROM data_temp 
    GROUP BY STORE_LOCATION;

""")

In [12]:
# Display the per-location sums.
df_check.show()

                                                                                

+--------------+------------------+------------------+------------------+
|STORE_LOCATION|           sum(CP)|     sum(DISCOUNT)|           sum(SP)|
+--------------+------------------+------------------+------------------+
|    Washington|324520.73999999854| 29375.87996855378|450798.11982536316|
|       Houston| 278819.5199999993| 24945.19997137785|388271.79981708527|
|       NewYork| 298466.4899999972| 26283.11000086367| 415423.8898162842|
|         Miami|267729.23999999836|24296.639976918697|371295.35949897766|
|        Denver|244569.53999999826|21501.479988992214| 340802.5197200775|
+--------------+------------------+------------------+------------------+



                                                                                

In [13]:
# Number of rows in df2.
df2.count()

37853

In [14]:
# Stop the Spark session now that the notebook is finished.
spark.stop()