**Pengenalan**

In [32]:
# Example: build a small DataFrame to demonstrate basic operations
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan3')
    .getOrCreate()
)

# Column names, followed by one (name, department, salary) tuple per employee
columns = ['EmployeeName', 'Department', 'Salary']
data = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

df = spark.createDataFrame(data, columns)
df.show()

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       James|     Sales|  3000|
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
|       Maria|   Finance|  3000|
+------------+----------+------+



**Transformasi dasar**

In [33]:
# Basic transformations on the employee DataFrame `df`.
# `F` is imported in this cell because the only other import of it sits in a
# later cell; without it a fresh-kernel "Run All" fails here with NameError.
from pyspark.sql import functions as F

# Show only the EmployeeName and Salary columns
df.select('EmployeeName', 'Salary').show()
# Show the rows whose Salary is greater than 3000
df.filter(df['Salary'] > 3000).show()
# Group by Department and compute the average Salary of each department
df.groupBy('Department').avg('Salary').show()
# Group by Department and compute several aggregates in one pass
df.groupBy("Department").agg(
    # Average salary per department
    F.mean("Salary").alias("AvgSalary"),
    # Highest salary per department
    F.max("Salary").alias("MaxSalary"),
    # Lowest salary per department
    F.min("Salary").alias("MinSalary"),
    # Sum of all salaries per department
    F.sum("Salary").alias("TotalSalary"),
    # Number of non-null Salary values per department
    F.count("Salary").alias("CountSalary")).show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|       James|  3000|
|     Michael|  4600|
|      Robert|  4100|
|       Maria|  3000|
+------------+------+

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
+------------+----------+------+

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3900.0|
|   Finance|     3000.0|
+----------+-----------+

+----------+---------+---------+---------+-----------+-----------+
|Department|AvgSalary|MaxSalary|MinSalary|TotalSalary|CountSalary|
+----------+---------+---------+---------+-----------+-----------+
|     Sales|   3900.0|     4600|     3000|      11700|          3|
|   Finance|   3000.0|     3000|     3000|       3000|          1|
+----------+---------+---------+---------+-----------+-----------+



In [35]:
# SalaryBonus is 10% of Salary
bonus_rate = 0.1
with_bonus = df.withColumn("SalaryBonus", df["Salary"] * bonus_rate)

# TotalCompensation is Salary plus the bonus computed above
df = with_bonus.withColumn(
    "TotalCompensation",
    with_bonus["Salary"] + with_bonus["SalaryBonus"],
)
df.show()

+------------+----------+------+-----------+-----------------+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|
+------------+----------+------+-----------+-----------------+
|       James|     Sales|  3000|      300.0|           3300.0|
|     Michael|     Sales|  4600|      460.0|           5060.0|
|      Robert|     Sales|  4100|      410.0|           4510.0|
|       Maria|   Finance|  3000|      300.0|           3300.0|
+------------+----------+------+-----------+-----------------+



In [16]:
# Using window functions: rank employees inside each department by salary
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# One partition per Department, rows ordered by Salary (ascending by default),
# so rank 1 goes to the lowest salary in each department
windowSpec = (
    Window
    .partitionBy('Department')
    .orderBy('Salary')
)
salary_rank = F.rank().over(windowSpec)
df.withColumn('Rank', salary_rank).show()

+------------+---------------+------+-----------+-----------------+----+
|EmployeeName|     Department|Salary|SalaryBonus|TotalCompensation|Rank|
+------------+---------------+------+-----------+-----------------+----+
|        Lina|     Accounting|  5200|      520.0|           5720.0|   1|
|       Maria|        Finance|  3000|      300.0|           3300.0|   1|
|       David|        Manager|  7000|      700.0|           7700.0|   1|
|        Andy|Market Analysis|  4800|      480.0|           5280.0|   1|
|       James|          Sales|  3000|      300.0|           3300.0|   1|
|      Robert|          Sales|  4100|      410.0|           4510.0|   2|
|     Michael|          Sales|  4600|      460.0|           5060.0|   3|
+------------+---------------+------+-----------+-----------------+----+



In [30]:


# Library imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count

# Obtain the SparkSession
spark = (
    SparkSession.builder
    .appName('Analisis Diabetes')
    .getOrCreate()
)

# Read the diabetes dataset: first row is the header, column types are inferred
df = spark.read.csv("diabetes.csv", header=True, inferSchema=True)

# Inspect the schema and the first five rows
df.printSchema()
df.show(5)

# Descriptive statistics for the main columns
key_columns = ["Pregnancies", "Glucose", "BloodPressure", "BMI", "Age"]
df.describe(key_columns).show()

# Average of each main column per Outcome (0 = no diabetes, 1 = diabetes);
# each result column is named "Avg" + the source column name
outcome_avg = df.groupBy("Outcome").agg(
    *[avg(name).alias("Avg" + name) for name in key_columns]
)
outcome_avg.show()

# Number of patients per Outcome (diabetic vs non-diabetic)
outcome_count = df.groupBy("Outcome").agg(count("*").alias("JumlahPasien"))
outcome_count.show()

# Average Glucose per age group; ages are bucketed into 10-year bands
# by dividing by 10, truncating to an integer and multiplying back by 10
age_decade = (col("Age") / 10).cast("int") * 10
age_group = df.withColumn("AgeGroup", age_decade)
age_group_avg = age_group.groupBy("AgeGroup").agg(avg("Glucose").alias("AvgGlucose"))
age_group_avg.orderBy("AgeGroup").show()


root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          