In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Latihan_MLlib") \
    .getOrCreate()
print("‚úì Spark Session berhasil dibuat!\n")

‚úì Spark Session berhasil dibuat!



In [3]:
print("=" * 60)
print("BAGIAN 2: REGRESI LINIER - PREDIKSI GAJI")
print("=" * 60)

BAGIAN 2: REGRESI LINIER - PREDIKSI GAJI


In [4]:
data_gaji = [
    (1.0, 20, 5000),
    (2.0, 22, 6000),
    (3.0, 25, 7000),
    (4.0, 26, 8500),
    (5.0, 30, 10000),
    (6.0, 31, 11500)
]
columns = ["pengalaman", "umur", "gaji"]
df_regresi = spark.createDataFrame(data_gaji, columns)
print("\nüìä Data Awal:")
df_regresi.show()


üìä Data Awal:
+----------+----+-----+
|pengalaman|umur| gaji|
+----------+----+-----+
|       1.0|  20| 5000|
|       2.0|  22| 6000|
|       3.0|  25| 7000|
|       4.0|  26| 8500|
|       5.0|  30|10000|
|       6.0|  31|11500|
+----------+----+-----+



In [5]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["pengalaman", "umur"],
    outputCol="features"
)
data_siap_reg = assembler.transform(df_regresi).select("features", "gaji")
print("üì¶ Data dalam format Vector:")
data_siap_reg.show(truncate=False)

üì¶ Data dalam format Vector:
+----------+-----+
|features  |gaji |
+----------+-----+
|[1.0,20.0]|5000 |
|[2.0,22.0]|6000 |
|[3.0,25.0]|7000 |
|[4.0,26.0]|8500 |
|[5.0,30.0]|10000|
|[6.0,31.0]|11500|
+----------+-----+



In [6]:
from pyspark.ml.regression import LinearRegression

train_data, test_data = data_siap_reg.randomSplit([0.7, 0.3], seed=42)

lr = LinearRegression(featuresCol="features", labelCol="gaji")
model_lr = lr.fit(train_data)

hasil_prediksi = model_lr.transform(test_data)
print("üéØ Hasil Prediksi Gaji:")
hasil_prediksi.select("features", "gaji", "prediction").show()

print(f"\nüìà Koefisien: {model_lr.coefficients}")
print(f"üìç Intercept: {model_lr.intercept}\n")


üéØ Hasil Prediksi Gaji:
+----------+----+------------------+
|  features|gaji|        prediction|
+----------+----+------------------+
|[3.0,25.0]|7000|7439.3939393939345|
|[4.0,26.0]|8500| 8848.484848484854|
+----------+----+------------------+


üìà Koefisien: [1484.8484848485002,-75.75757575758196]
üìç Intercept: 4878.787878787984



In [7]:
print("=" * 60)
print("BAGIAN 3: KLASIFIKASI - PREDIKSI CHURN")
print("=" * 60)

# Langkah 6: Persiapan Data Klasifikasi
data_churn = [
    (2.0, 5, 1),
    (1.0, 4, 1),
    (10.0, 0, 0),
    (12.0, 1, 0),
    (3.0, 3, 1),
    (15.0, 0, 0)
]
df_churn = spark.createDataFrame(data_churn, ["durasi", "komplain", "label"])

assembler_churn = VectorAssembler(
    inputCols=["durasi", "komplain"],
    outputCol="features"
)
data_siap_class = assembler_churn.transform(df_churn).select("features", "label")

BAGIAN 3: KLASIFIKASI - PREDIKSI CHURN


In [8]:
from pyspark.ml.classification import LogisticRegression

log_reg = LogisticRegression(featuresCol="features", labelCol="label")
model_churn = log_reg.fit(data_siap_class)

hasil_churn = model_churn.transform(data_siap_class)
print("\nüéØ Hasil Prediksi Churn (Perhatikan kolom prediction vs label):")
hasil_churn.select("features", "label", "prediction", "probability").show(truncate=False)


üéØ Hasil Prediksi Churn (Perhatikan kolom prediction vs label):
+----------+-----+----------+-------------------------------------------+
|features  |label|prediction|probability                                |
+----------+-----+----------+-------------------------------------------+
|[2.0,5.0] |1    |1.0       |[2.7486447839081227E-14,0.9999999999999725]|
|[1.0,4.0] |1    |1.0       |[4.729924826979886E-13,0.999999999999527]  |
|[10.0,0.0]|0    |0.0       |[0.999999988084208,1.1915792019756566E-8]  |
|[12.0,1.0]|0    |0.0       |[0.9999999863708826,1.3629117390223655E-8] |
|[3.0,3.0] |1    |1.0       |[2.771802222998734E-8,0.9999999722819778]  |
|[15.0,0.0]|0    |0.0       |[0.9999999999999845,1.554312234475219E-14] |
+----------+-----+----------+-------------------------------------------+



In [9]:
print("=" * 60)
print("BAGIAN 4: CLUSTERING - K-MEANS")
print("=" * 60)


BAGIAN 4: CLUSTERING - K-MEANS


In [10]:
from pyspark.ml.clustering import KMeans

data_mall = [
    (15, 39), (16, 81), (17, 6), (18, 77), (19, 40),
    (50, 50), (55, 55), (60, 60),
    (100, 90), (110, 95), (120, 88)
]
df_mall = spark.createDataFrame(data_mall, ["pendapatan", "skor"])

assembler_cluster = VectorAssembler(
    inputCols=["pendapatan", "skor"],
    outputCol="features"
)
data_siap_cluster = assembler_cluster.transform(df_mall)

kmeans = KMeans().setK(3).setSeed(1)
model_km = kmeans.fit(data_siap_cluster)

prediksi_cluster = model_km.transform(data_siap_cluster)
print("\nüéØ Hasil Pengelompokan (Prediction adalah nomor cluster):")
prediksi_cluster.show()


üéØ Hasil Pengelompokan (Prediction adalah nomor cluster):
+----------+----+------------+----------+
|pendapatan|skor|    features|prediction|
+----------+----+------------+----------+
|        15|  39| [15.0,39.0]|         1|
|        16|  81| [16.0,81.0]|         1|
|        17|   6|  [17.0,6.0]|         1|
|        18|  77| [18.0,77.0]|         1|
|        19|  40| [19.0,40.0]|         1|
|        50|  50| [50.0,50.0]|         2|
|        55|  55| [55.0,55.0]|         2|
|        60|  60| [60.0,60.0]|         2|
|       100|  90|[100.0,90.0]|         0|
|       110|  95|[110.0,95.0]|         0|
|       120|  88|[120.0,88.0]|         0|
+----------+----+------------+----------+



In [11]:
centers = model_km.clusterCenters()
print("üìç Pusat Cluster (Centroids):")
for i, center in enumerate(centers):
    print(f"   Cluster {i}: Pendapatan={center[0]:.2f}, Skor={center[1]:.2f}")

üìç Pusat Cluster (Centroids):
   Cluster 0: Pendapatan=110.00, Skor=91.00
   Cluster 1: Pendapatan=17.00, Skor=48.60
   Cluster 2: Pendapatan=55.00, Skor=55.00


In [12]:
print("\n" + "=" * 60)
print("BAGIAN 5: TUGAS LATIHAN")
print("=" * 60)

# TUGAS 1: Prediksi Gaji untuk Data Baru
print("\nüìù TUGAS 1: Prediksi Gaji Data Baru")
print("-" * 60)

# Tambahkan data baru
data_gaji_baru = [
    (1.0, 20, 5000),
    (2.0, 22, 6000),
    (3.0, 25, 7000),
    (4.0, 26, 8500),
    (5.0, 30, 10000),
    (6.0, 31, 11500),
    (10.0, 40, None)  # Data baru yang akan diprediksi
]

df_regresi_baru = spark.createDataFrame(data_gaji_baru, ["pengalaman", "umur", "gaji"])
print("üìä Data dengan tambahan baru:")
df_regresi_baru.show()

# Transformasi dengan assembler
data_siap_reg_baru = assembler.transform(df_regresi_baru).select("features", "gaji")

# Training ulang dengan semua data lama (kecuali data baru)
data_training = data_siap_reg_baru.filter(data_siap_reg_baru.gaji.isNotNull())
model_lr_baru = lr.fit(data_training)

# Prediksi untuk semua data
hasil_prediksi_baru = model_lr_baru.transform(data_siap_reg_baru)
print("üéØ Hasil Prediksi (termasuk data baru):")
hasil_prediksi_baru.select("features", "gaji", "prediction").show()

print(f"üí° Prediksi gaji untuk Pengalaman=10 tahun, Umur=40:")
prediksi_terakhir = hasil_prediksi_baru.filter(hasil_prediksi_baru.gaji.isNull()).select("prediction").collect()
if prediksi_terakhir:
    print(f"   Gaji diprediksi: ${prediksi_terakhir[0]['prediction']:.2f}")



BAGIAN 5: TUGAS LATIHAN

üìù TUGAS 1: Prediksi Gaji Data Baru
------------------------------------------------------------
üìä Data dengan tambahan baru:
+----------+----+-----+
|pengalaman|umur| gaji|
+----------+----+-----+
|       1.0|  20| 5000|
|       2.0|  22| 6000|
|       3.0|  25| 7000|
|       4.0|  26| 8500|
|       5.0|  30|10000|
|       6.0|  31|11500|
|      10.0|  40| NULL|
+----------+----+-----+

üéØ Hasil Prediksi (termasuk data baru):
+-----------+-----+------------------+
|   features| gaji|        prediction|
+-----------+-----+------------------+
| [1.0,20.0]| 5000| 4712.500000000006|
| [2.0,22.0]| 6000|  6037.49999999996|
| [3.0,25.0]| 7000| 7325.000000000075|
| [4.0,26.0]| 8500| 8687.499999999867|
| [5.0,30.0]|10000| 9937.500000000146|
| [6.0,31.0]|11500|11299.999999999936|
|[10.0,40.0]| NULL|16562.499999999913|
+-----------+-----+------------------+

üí° Prediksi gaji untuk Pengalaman=10 tahun, Umur=40:
   Gaji diprediksi: $16562.50


In [13]:
print("\nüìù TUGAS 2: K-Means dengan 2 Cluster")
print("-" * 60)

kmeans_2 = KMeans().setK(2).setSeed(1)
model_km_2 = kmeans_2.fit(data_siap_cluster)

prediksi_cluster_2 = model_km_2.transform(data_siap_cluster)
print("\nüéØ Hasil Pengelompokan dengan 2 Cluster:")
prediksi_cluster_2.show()

centers_2 = model_km_2.clusterCenters()
print("üìç Pusat 2 Cluster:")
for i, center in enumerate(centers_2):
    print(f"   Cluster {i}: Pendapatan={center[0]:.2f}, Skor={center[1]:.2f}")

print("\nüí° Analisis:")
print("   Dengan K=2, data cenderung terbagi menjadi:")
print("   - Cluster 0: Kelompok pendapatan rendah-menengah")
print("   - Cluster 1: Kelompok pendapatan tinggi")
print("   Ini mencerminkan pembagian 'Kaya' vs 'Tidak Kaya'")

print("\n" + "=" * 60)
print("‚úÖ PRAKTIKUM SELESAI!")
print("=" * 60)

# Tutup Spark Session
# spark.stop()


üìù TUGAS 2: K-Means dengan 2 Cluster
------------------------------------------------------------

üéØ Hasil Pengelompokan dengan 2 Cluster:
+----------+----+------------+----------+
|pendapatan|skor|    features|prediction|
+----------+----+------------+----------+
|        15|  39| [15.0,39.0]|         1|
|        16|  81| [16.0,81.0]|         1|
|        17|   6|  [17.0,6.0]|         1|
|        18|  77| [18.0,77.0]|         1|
|        19|  40| [19.0,40.0]|         1|
|        50|  50| [50.0,50.0]|         1|
|        55|  55| [55.0,55.0]|         1|
|        60|  60| [60.0,60.0]|         1|
|       100|  90|[100.0,90.0]|         0|
|       110|  95|[110.0,95.0]|         0|
|       120|  88|[120.0,88.0]|         0|
+----------+----+------------+----------+

üìç Pusat 2 Cluster:
   Cluster 0: Pendapatan=110.00, Skor=91.00
   Cluster 1: Pendapatan=31.25, Skor=51.00

üí° Analisis:
   Dengan K=2, data cenderung terbagi menjadi:
   - Cluster 0: Kelompok pendapatan rendah-menengah
