# 1. Pengenalan Spark DataFrames

In [None]:
pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used for this hands-on exercise.
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names first, then one tuple per employee in the same field order.
columns = ['EmployeeName', 'Department', 'Salary']
data = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

# Turn the Python rows into a Spark DataFrame and print it.
df = spark.createDataFrame(data, columns)
df.show()

Tugas 1: Buat DataFrame sederhana di Spark dan eksplorasi beberapa fungsi dasar yang tersedia.

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used for this hands-on exercise.
spark = (
    SparkSession.builder
    .appName("HandsOnPertemuan6")
    .getOrCreate()
)

# Student records, one tuple per row, matching the column names below.
columns1 = ['Nama', 'Fakultas', 'Nilai']
data1 = [
    ('Ferdi', 'FT', 100),
    ('Maulana', 'Faperta', 90),
    ('Ikhsan', 'FT', 80),
    ('Sadi', 'FKIP', 95),
]

df = spark.createDataFrame(data1, columns1)

# Print the rows (show() displays at most the first 20 by default).
df.show()

# Print the column names together with the types Spark inferred for them.
df.printSchema()

# Sort by score ('Nilai'), highest first.
df.orderBy('Nilai', ascending=False).show()

# 2. Transformasi Dasar dengan DataFrames

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used for this hands-on exercise.
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names first, then one tuple per employee in the same field order.
columns2 = ['EmployeeName', 'Department', 'Salary']
data2 = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

df = spark.createDataFrame(data2, columns2)
df.show()

# Projection: keep only the name and salary columns.
df.select(df.EmployeeName, df.Salary).show()

# Row filter: employees whose salary is above 3000.
df.filter(df.Salary > 3000).show()

# Aggregation: average salary within each department.
df.groupBy('Department').avg('Salary').show()

Tugas 2: Gunakan operasi filter, select, groupBy untuk mengekstrak informasi dari data, serta lakukan agregasi data untuk mendapatkan insight tentang dataset menggunakan perintah seperti mean, max, sum.

In [None]:
from pyspark.sql import SparkSession
# Import the functions module under an alias instead of pulling `max` and
# `sum` into the notebook namespace, where they would replace the Python
# built-ins of the same name for every cell that runs afterwards.
from pyspark.sql import functions as F

# Obtain the Spark session used for this hands-on exercise.
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names first, then one tuple per employee in the same field order.
columns3 = ['EmployeeName', 'Department', 'Salary']
data3 = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

df = spark.createDataFrame(data3, schema=columns3)
df.show()

# 1. Select specific columns (EmployeeName and Salary).
df.select('EmployeeName', 'Salary').show()

# 2. Keep only the employees whose salary is above 3000.
df.filter(df['Salary'] > 3000).show()

# Group once and reuse the grouping for the three per-department aggregates.
by_department = df.groupBy('Department')

# 3. Average salary per department.
by_department.avg('Salary').show()

# 4. Total salary per department.
by_department.sum('Salary').show()

# 5. Highest salary per department.
by_department.max('Salary').show()

# 6. Average salary over the whole dataset.
df.select(F.mean('Salary')).show()

# 7. Total salary over the whole dataset.
df.select(F.sum('Salary')).show()

# 3. Bekerja dengan Tipe Data Kompleks

In [None]:
from pyspark.sql import SparkSession
# Use the aliased functions module; importing `max` and `sum` by name would
# replace the Python built-ins of the same name for the rest of the notebook.
from pyspark.sql import functions as F

# Obtain the Spark session used for this hands-on exercise.
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Fraction of the salary that is paid as a bonus.
BONUS_RATE = 0.1

# Column names first, then one tuple per employee in the same field order.
columns4 = ['EmployeeName', 'Department', 'Salary']
data4 = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

# Build the frame and add the two derived columns in a single chain:
# the bonus first, then salary plus bonus (which reads the new column).
df = (
    spark.createDataFrame(data4, schema=columns4)
    .withColumn('SalaryBonus', F.col('Salary') * BONUS_RATE)
    .withColumn('TotalCompensation', F.col('Salary') + F.col('SalaryBonus'))
)
df.show()

# 4. Operasi Data Lanjutan

Tugas 4: Implementasikan window function untuk menghitung running totals atau rankings.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Obtain the Spark session used for this hands-on exercise.
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names first, then one tuple per employee in the same field order.
columns5 = ['EmployeeName', 'Department', 'Salary']
data5 = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

df = spark.createDataFrame(data5, columns5)

# Rank employees inside their own department, ordered by ascending salary,
# so rank 1 is the lowest salary of each department.
windowSpec = Window.partitionBy('Department').orderBy('Salary')
rank_in_department = F.rank().over(windowSpec)

ranked = df.withColumn('Rank', rank_in_department)
ranked.show()