In [None]:
pip install pandas==2.2.2 numpy pyspark==3.4.1

Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pyspark==3.4.1
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285391 sha256=b0ccbe64a0d187cbec06daa0018458fc97cc66b1d0b0852a4beaf7b255484f07
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspa

In [None]:
# Example: build a small DataFrame and display it
from pyspark.sql import SparkSession

# getOrCreate() hands back a session for this app name, creating one if needed
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each row tuple
columns = ['Nama', 'Pekerjaan', 'Umur', 'Gaji']
data = [
    ('Siti', 'Sales', 26, 1500),
    ('Dave the Void Wizard', 'Penyihir', 100, 3000),
    ('Burgundi', 'Pelukis', 41, 5000),
    ('Skrong', 'Pokemon Trainer', 20, 2000),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+--------------------+---------------+----+----+
|                Nama|      Pekerjaan|Umur|Gaji|
+--------------------+---------------+----+----+
|                Siti|          Sales|  26|1500|
|Dave the Void Wizard|       Penyihir| 100|3000|
|            Burgundi|        Pelukis|  41|5000|
|              Skrong|Pokemon Trainer|  20|2000|
+--------------------+---------------+----+----+



In [None]:
# Example DataFrame transformations: column projection, row filter, grouped average
df.select(df['Nama'], df['Umur']).show()
df.where(df['Umur'] > 30).show()
df.groupBy('Pekerjaan').agg({'Umur': 'avg'}).show()

+--------------------+----+
|                Nama|Umur|
+--------------------+----+
|                Siti|  26|
|Dave the Void Wizard| 100|
|            Burgundi|  41|
|              Skrong|  20|
+--------------------+----+

+--------------------+---------+----+----+
|                Nama|Pekerjaan|Umur|Gaji|
+--------------------+---------+----+----+
|Dave the Void Wizard| Penyihir| 100|3000|
|            Burgundi|  Pelukis|  41|5000|
+--------------------+---------+----+----+

+---------------+---------+
|      Pekerjaan|avg(Umur)|
+---------------+---------+
|          Sales|     26.0|
|       Penyihir|    100.0|
|Pokemon Trainer|     20.0|
|        Pelukis|     41.0|
+---------------+---------+



In [None]:
from pyspark.sql.functions import col

# Add a BonusGaji column equal to half of Gaji. `df` is rebound to the result,
# so cells that run after this one see the extra column.
df = df.withColumn(
    "BonusGaji",
    col("Gaji") * 0.5,
)
df.show()


+--------------------+---------------+----+----+---------+
|                Nama|      Pekerjaan|Umur|Gaji|BonusGaji|
+--------------------+---------------+----+----+---------+
|                Siti|          Sales|  26|1500|    750.0|
|Dave the Void Wizard|       Penyihir| 100|3000|   1500.0|
|            Burgundi|        Pelukis|  41|5000|   2500.0|
|              Skrong|Pokemon Trainer|  20|2000|   1000.0|
+--------------------+---------------+----+----+---------+



In [None]:
# Example of derived columns: a bonus (half of Gaji) and the total pay
# (Gaji + BonusGaji).
# The frame with the bonus is kept in its own variable so the TotalGaji step
# reads the BonusGaji column computed right here, instead of discarding that
# result and relying on an earlier cell having rebound `df` with the column.
df_bonus = df.withColumn('BonusGaji', df['Gaji'] * 0.5)
df_bonus.show()
df_bonus.withColumn('TotalGaji', df_bonus['Gaji'] + df_bonus['BonusGaji']).show()

+--------------------+---------------+----+----+---------+
|                Nama|      Pekerjaan|Umur|Gaji|BonusGaji|
+--------------------+---------------+----+----+---------+
|                Siti|          Sales|  26|1500|    750.0|
|Dave the Void Wizard|       Penyihir| 100|3000|   1500.0|
|            Burgundi|        Pelukis|  41|5000|   2500.0|
|              Skrong|Pokemon Trainer|  20|2000|   1000.0|
+--------------------+---------------+----+----+---------+

+--------------------+---------------+----+----+---------+---------+
|                Nama|      Pekerjaan|Umur|Gaji|BonusGaji|TotalGaji|
+--------------------+---------------+----+----+---------+---------+
|                Siti|          Sales|  26|1500|    750.0|   2250.0|
|Dave the Void Wizard|       Penyihir| 100|3000|   1500.0|   4500.0|
|            Burgundi|        Pelukis|  41|5000|   2500.0|   7500.0|
|              Skrong|Pokemon Trainer|  20|2000|   1000.0|   3000.0|
+--------------------+---------------+----+-

In [None]:
# Example of a window function: rank rows by ascending BonusGaji
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# No partitionBy is given, so a single window spans every row of the DataFrame
windowSpec = Window.orderBy(F.col('BonusGaji').asc())
df.withColumn(
    'Rank',
    F.rank().over(windowSpec),
).show()

+--------------------+---------------+----+----+---------+----+
|                Nama|      Pekerjaan|Umur|Gaji|BonusGaji|Rank|
+--------------------+---------------+----+----+---------+----+
|                Siti|          Sales|  26|1500|    750.0|   1|
|              Skrong|Pokemon Trainer|  20|2000|   1000.0|   2|
|Dave the Void Wizard|       Penyihir| 100|3000|   1500.0|   3|
|            Burgundi|        Pelukis|  41|5000|   2500.0|   4|
+--------------------+---------------+----+----+---------+----+



Tinjau kembali apa yang telah dipelajari tentang pemrosesan data menggunakan Spark, lalu jelajahi teknik lebih lanjut untuk mengoptimalkan pemrosesan data Anda.

Tugas 5: Buat ringkasan dari semua operasi yang telah dilakukan dan jelaskan bagaimana teknik ini dapat diterapkan pada proyek data Anda.