In [13]:
from pyspark.sql import SparkSession as SS
# Start (or reuse) the Spark session for this hands-on exercise.
spark = (
    SS.builder
    .appName("HandsOnPertemuan3")
    .getOrCreate()
)

# Inventory rows, one tuple per item:
# (item ID, name, price, stock, category).
data = [
    ("1", "Sabun", 5000, 20, "Alat Mandi"),
    ("2", "Pasta Gigi", 10000, 15, "Alat Mandi"),
    ("3", "Pensil", 3000, 25, "Alat Tulis"),
    ("4", "Bolpoin", 6000, 10, "Alat Tulis"),
    ("5", "Shampoo", 1000, 30, "Alat Mandi"),
]

# Column labels, listed in the same order as the tuple fields above.
columns = ["ID Barang", "Nama", "Harga", "Stok", "Kategori"]

# Build the DataFrame from the rows and column labels, then print it.
df = spark.createDataFrame(data, columns)
df.show()

25/09/04 18:50:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+---------+----------+-----+----+----------+
|ID Barang|      Nama|Harga|Stok|  Kategori|
+---------+----------+-----+----+----------+
|        1|     Sabun| 5000|  20|Alat Mandi|
|        2|Pasta Gigi|10000|  15|Alat Mandi|
|        3|    Pensil| 3000|  25|Alat Tulis|
|        4|   Bolpoin| 6000|  10|Alat Tulis|
|        5|   Shampoo| 1000|  30|Alat Mandi|
+---------+----------+-----+----+----------+



In [14]:
# Show only the item name and stock columns.
df.select(df.Nama, df.Stok).show()

# Show the items priced above 5000.
df.where(df.Harga > 5000).show()

# Show the total price within each category.
df.groupBy("Kategori").agg({"Harga": "sum"}).show()

# Show the average price across all items.
df.agg({"Harga": "avg"}).show()

# Show the highest stock value.
df.agg({"Stok": "max"}).show()

# Show the total stock across all items.
df.agg({"Stok": "sum"}).show()

+----------+----+
|      Nama|Stok|
+----------+----+
|     Sabun|  20|
|Pasta Gigi|  15|
|    Pensil|  25|
|   Bolpoin|  10|
|   Shampoo|  30|
+----------+----+

+---------+----------+-----+----+----------+
|ID Barang|      Nama|Harga|Stok|  Kategori|
+---------+----------+-----+----+----------+
|        2|Pasta Gigi|10000|  15|Alat Mandi|
|        4|   Bolpoin| 6000|  10|Alat Tulis|
+---------+----------+-----+----+----------+

+----------+----------+
|  Kategori|sum(Harga)|
+----------+----------+
|Alat Mandi|     16000|
|Alat Tulis|      9000|
+----------+----------+

+----------+
|avg(Harga)|
+----------+
|    5000.0|
+----------+

+---------+
|max(Stok)|
+---------+
|       30|
+---------+

+---------+
|sum(Stok)|
+---------+
|      100|
+---------+



In [15]:
# Tunable values for this cell, named so they are not buried as magic
# numbers inside the column expressions below.
TAX_RATE = 0.12        # fraction of the price reported in the "Pajak" (tax) column
STOCK_INCREMENT = 3    # amount added to every item's stock

# NOTE: withColumn returns a new DataFrame. The results below are only
# displayed, not assigned, so `df` itself is left unchanged.

# Add a new column named "Pajak" computed as price * TAX_RATE.
df.withColumn("Pajak", df["Harga"] * TAX_RATE).show()

# Replace the "Stok" column with the stock raised by STOCK_INCREMENT.
df.withColumn("Stok", df["Stok"] + STOCK_INCREMENT).show()

+---------+----------+-----+----+----------+------+
|ID Barang|      Nama|Harga|Stok|  Kategori| Pajak|
+---------+----------+-----+----+----------+------+
|        1|     Sabun| 5000|  20|Alat Mandi| 600.0|
|        2|Pasta Gigi|10000|  15|Alat Mandi|1200.0|
|        3|    Pensil| 3000|  25|Alat Tulis| 360.0|
|        4|   Bolpoin| 6000|  10|Alat Tulis| 720.0|
|        5|   Shampoo| 1000|  30|Alat Mandi| 120.0|
+---------+----------+-----+----+----------+------+

+---------+----------+-----+----+----------+
|ID Barang|      Nama|Harga|Stok|  Kategori|
+---------+----------+-----+----+----------+
|        1|     Sabun| 5000|  23|Alat Mandi|
|        2|Pasta Gigi|10000|  18|Alat Mandi|
|        3|    Pensil| 3000|  28|Alat Tulis|
|        4|   Bolpoin| 6000|  13|Alat Tulis|
|        5|   Shampoo| 1000|  33|Alat Mandi|
+---------+----------+-----+----+----------+



In [16]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Rank the items by price within each category. The window is ordered by
# ascending price, so the cheapest item of a category receives rank 1.
windowSpec = Window.partitionBy("Kategori").orderBy(F.col("Harga").asc())

# Keep every existing column and append the rank as a new "Rank" column.
df.select("*", F.rank().over(windowSpec).alias("Rank")).show()

+---------+----------+-----+----+----------+----+
|ID Barang|      Nama|Harga|Stok|  Kategori|Rank|
+---------+----------+-----+----+----------+----+
|        5|   Shampoo| 1000|  30|Alat Mandi|   1|
|        1|     Sabun| 5000|  20|Alat Mandi|   2|
|        2|Pasta Gigi|10000|  15|Alat Mandi|   3|
|        3|    Pensil| 3000|  25|Alat Tulis|   1|
|        4|   Bolpoin| 6000|  10|Alat Tulis|   2|
+---------+----------+-----+----+----------+----+

