In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession used by every cell in this notebook
spark = (
    SparkSession.builder
    .appName('HandsOn9')
    .getOrCreate()
)

# Sample employee records, one tuple per employee:
# (Name, Age, Gender, Salary, DeptID)
data = [
    ('James', 34, 'M', 3000, 1),
    ('Anna', 28, 'F', 4100, 2),
    ('Lee', 23, 'M', 2700, 1),
]

# Column names, listed in the same order as the tuple fields above
columns = ['Name', 'Age', 'Gender', 'Salary', 'DeptID']

# Build the DataFrame and register it as the temp view `employees`
# so it can be queried with Spark SQL
df = spark.createDataFrame(data, schema=columns)
df.createOrReplaceTempView('employees')

# Run the queries in order: every row, name/age of employees earning
# strictly more than 3000, and the overall average salary
for statement in (
    'SELECT * FROM employees',
    'SELECT Name, Age FROM employees WHERE Salary > 3000',
    'SELECT AVG(Salary) FROM employees',
):
    spark.sql(statement).show()

+-----+---+------+------+------+
| Name|Age|Gender|Salary|DeptID|
+-----+---+------+------+------+
|James| 34|     M|  3000|     1|
| Anna| 28|     F|  4100|     2|
|  Lee| 23|     M|  2700|     1|
+-----+---+------+------+------+

+----+---+
|Name|Age|
+----+---+
|Anna| 28|
+----+---+

+------------------+
|       avg(Salary)|
+------------------+
|3266.6666666666665|
+------------------+



In [3]:
# Total salary for each (gender, age) combination, ordered by age
salary_by_gender_age_sql = '''
SELECT Gender, SUM(Salary) as TotalSalary, Age
FROM employees 
GROUP BY Gender, Age
ORDER BY Age
'''
spark.sql(salary_by_gender_age_sql).show()

+------+-----------+---+
|Gender|TotalSalary|Age|
+------+-----------+---+
|     M|       2700| 23|
|     F|       4100| 28|
|     M|       3000| 34|
+------+-----------+---+



In [36]:
# Top 3 highest-paid employees within each age group.
# ROW_NUMBER() numbers the rows of each Age partition starting from the
# highest salary. The numbering is computed in a subquery and filtered in the
# outer query, because a window function's result cannot be referenced in the
# WHERE clause of the same SELECT.
spark.sql('''
SELECT Name, Age, Salary, rank
FROM (
    SELECT Name, Age, Salary,
           ROW_NUMBER() OVER (PARTITION BY Age ORDER BY Salary DESC) AS rank
    FROM employees
) ranked
WHERE rank <= 3
''').show()

+-----+---+------+----+
| Name|Age|Salary|rank|
+-----+---+------+----+
|  Lee| 23|  2700|   1|
| Anna| 28|  4100|   1|
|James| 34|  3000|   1|
+-----+---+------+----+



In [6]:
# Employee ranking: position of each employee by salary (highest first)
# within their own department
dept_salary_rank_sql = '''
SELECT Name, Salary, DeptID,
RANK() OVER (PARTITION BY DeptID ORDER BY Salary DESC) AS Rank
FROM employees
'''
spark.sql(dept_salary_rank_sql).show()

+-----+------+------+----+
| Name|Salary|DeptID|Rank|
+-----+------+------+----+
|James|  3000|     1|   1|
|  Lee|  2700|     1|   2|
| Anna|  4100|     2|   1|
+-----+------+------+----+



In [4]:
# Average salary per department.
# The column is spelled `Salary`, exactly as it is declared in the DataFrame
# schema, so the query still resolves when spark.sql.caseSensitive is enabled.
# The output alias `Rata_rata_gaji` (Indonesian for "average salary") is kept
# so the result column name is unchanged.
spark.sql('''
SELECT DeptID, AVG(Salary) as Rata_rata_gaji
FROM employees
GROUP BY DeptID
''').show()

+------+--------------+
|DeptID|Rata_rata_gaji|
+------+--------------+
|     1|        2850.0|
|     2|        4100.0|
+------+--------------+



In [5]:
# Employees whose salary is strictly above the average salary of their own
# gender; the correlated subquery recomputes that average per outer row
above_gender_avg_sql = '''
SELECT Name, Gender, Salary
FROM employees e
WHERE Salary > (SELECT AVG(Salary)
FROM employees e2
WHERE e2.Gender = e.Gender)
'''
spark.sql(above_gender_avg_sql).show()

+-----+------+------+
| Name|Gender|Salary|
+-----+------+------+
|James|     M|  3000|
+-----+------+------+

