In [4]:
#mengimport modul yang dibutuhkan
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import hash, abs

# Create (or reuse) the SparkSession that the rest of the notebook runs against.
appName = "Sistem Penskoran Otomatis pada Soal Essay 2"
spark = (
    SparkSession.builder
    .appName(appName)
    # Key/value pair carried over unchanged from the original configuration.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [5]:
# Echo the session object so the notebook renders it as this cell's output.
spark

# Import Dataset

In [8]:
# Load the training answers from a semicolon-delimited CSV that has a header
# row; column types are inferred by Spark.
df_pyspark = spark.read.csv(
    'training_data_essay.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [9]:
# Preview the first 20 rows of the loaded training data.
df_pyspark.show(20)

+----------+------------+--------------------+----+-------------+
|       npm|nama_peserta|             jawaban|soal|skor_per_soal|
+----------+------------+--------------------+----+-------------+
|         0|       Admin|Tidak, Hanya memb...|   1|          100|
|         0|       Admin|Biaya dihitung be...|   2|          100|
|         0|       Admin|Hak cipta adalah ...|   3|          100|
|         0|       Admin|Dijelaskan kepada...|   4|          100|
|         0|       Admin|1. Melindungi dan...|   5|          100|
|         0|       Admin|Ruang Komputer, P...|   6|          100|
|         0|       Admin|Aturlah posisi pe...|   7|          100|
|         0|       Admin|Posisi Kepala dan...|   8|          100|
|         0|       Admin|1. Kecocokan soft...|   9|          100|
|         0|       Admin|1. Fokus dan expo...|  10|          100|
|         0|       Admin|1. Peralatan yang...|  11|          100|
|         0|       Admin|1. Dibuat grafik ...|  12|          100|
|112102003

# Pre-Processing

In [10]:
# Swap decimal commas for dots in "skor_per_soal"; the column is still a
# string after this step.
df_pyspark = df_pyspark.withColumn(
    "skor_per_soal",
    regexp_replace(col("skor_per_soal"), ",", "."),
)

# Preview the frame after the replacement.
df_pyspark.show()

+----------+------------+--------------------+----+-------------+
|       npm|nama_peserta|             jawaban|soal|skor_per_soal|
+----------+------------+--------------------+----+-------------+
|         0|       Admin|Tidak, Hanya memb...|   1|          100|
|         0|       Admin|Biaya dihitung be...|   2|          100|
|         0|       Admin|Hak cipta adalah ...|   3|          100|
|         0|       Admin|Dijelaskan kepada...|   4|          100|
|         0|       Admin|1. Melindungi dan...|   5|          100|
|         0|       Admin|Ruang Komputer, P...|   6|          100|
|         0|       Admin|Aturlah posisi pe...|   7|          100|
|         0|       Admin|Posisi Kepala dan...|   8|          100|
|         0|       Admin|1. Kecocokan soft...|   9|          100|
|         0|       Admin|1. Fokus dan expo...|  10|          100|
|         0|       Admin|1. Peralatan yang...|  11|          100|
|         0|       Admin|1. Dibuat grafik ...|  12|          100|
|112102003

In [11]:
# Print the current schema (column names and types) of the training frame.
df_pyspark.printSchema()

root
 |-- npm: integer (nullable = true)
 |-- nama_peserta: string (nullable = true)
 |-- jawaban: string (nullable = true)
 |-- soal: integer (nullable = true)
 |-- skor_per_soal: string (nullable = true)



In [12]:
# Convert the "skor_per_soal" column from string to float.
df_pyspark = df_pyspark.withColumn(
    "skor_per_soal",
    col("skor_per_soal").cast(FloatType()),
)

# Show the resulting schema, then a sample of the rows.
df_pyspark.printSchema()
df_pyspark.show()

root
 |-- npm: integer (nullable = true)
 |-- nama_peserta: string (nullable = true)
 |-- jawaban: string (nullable = true)
 |-- soal: integer (nullable = true)
 |-- skor_per_soal: float (nullable = true)

+----------+------------+--------------------+----+-------------+
|       npm|nama_peserta|             jawaban|soal|skor_per_soal|
+----------+------------+--------------------+----+-------------+
|         0|       Admin|Tidak, Hanya memb...|   1|        100.0|
|         0|       Admin|Biaya dihitung be...|   2|        100.0|
|         0|       Admin|Hak cipta adalah ...|   3|        100.0|
|         0|       Admin|Dijelaskan kepada...|   4|        100.0|
|         0|       Admin|1. Melindungi dan...|   5|        100.0|
|         0|       Admin|Ruang Komputer, P...|   6|        100.0|
|         0|       Admin|Aturlah posisi pe...|   7|        100.0|
|         0|       Admin|Posisi Kepala dan...|   8|        100.0|
|         0|       Admin|1. Kecocokan soft...|   9|        100.0|
| 

# Menyiapkan Data

In [13]:
# Keep only the columns used for modelling: question number ("soal"),
# answer text ("jawaban") and score ("skor_per_soal").
data = df_pyspark.select(
    col("soal"),
    col("jawaban"),
    col("skor_per_soal"),
)
data.show()

+----+--------------------+-------------+
|soal|             jawaban|skor_per_soal|
+----+--------------------+-------------+
|   1|Tidak, Hanya memb...|        100.0|
|   2|Biaya dihitung be...|        100.0|
|   3|Hak cipta adalah ...|        100.0|
|   4|Dijelaskan kepada...|        100.0|
|   5|1. Melindungi dan...|        100.0|
|   6|Ruang Komputer, P...|        100.0|
|   7|Aturlah posisi pe...|        100.0|
|   8|Posisi Kepala dan...|        100.0|
|   9|1. Kecocokan soft...|        100.0|
|  10|1. Fokus dan expo...|        100.0|
|  11|1. Peralatan yang...|        100.0|
|  12|1. Dibuat grafik ...|        100.0|
|   1|tidak, cuma mengi...|         52.7|
|   2|biaya dihitung be...|        42.86|
|   3|hak membuat merup...|        42.16|
|   4|dipaparkan pada k...|        27.19|
|   5|1. mencegah serta...|        44.14|
|   6|ruang komputer, p...|        100.0|
|   7|aturlah posisi fi...|        57.68|
|   8|posisi kepala ser...|        45.71|
+----+--------------------+-------

In [14]:
# Derive an integer "hashedValue" column from the answer text. `hash` here is
# the Spark SQL function imported from pyspark.sql.functions, not Python's
# built-in; identical answer strings therefore map to the same value.
hashedData = data.withColumn("hashedValue", hash(col("jawaban")))

# Display question, hash and score without truncating the cell contents.
hashedData.select(
    "soal",
    "hashedValue",
    "skor_per_soal",
).show(truncate=False)


+----+-----------+-------------+
|soal|hashedValue|skor_per_soal|
+----+-----------+-------------+
|1   |-2059296905|100.0        |
|2   |1183180174 |100.0        |
|3   |1232762403 |100.0        |
|4   |-2035408785|100.0        |
|5   |1588395990 |100.0        |
|6   |339970513  |100.0        |
|7   |50850002   |100.0        |
|8   |-945877996 |100.0        |
|9   |1576366224 |100.0        |
|10  |-1905649442|100.0        |
|11  |550139146  |100.0        |
|12  |1727767227 |100.0        |
|1   |1947733435 |52.7         |
|2   |-1139863335|42.86        |
|3   |122676417  |42.16        |
|4   |-1054163002|27.19        |
|5   |1990940339 |44.14        |
|6   |1770907636 |100.0        |
|7   |-463479969 |57.68        |
|8   |-412537011 |45.71        |
+----+-----------+-------------+
only showing top 20 rows



# Splitting Data

In [15]:
# Split the data: 70% for training and 30% for testing.
# A fixed seed keeps the split (and the row counts printed below) identical on
# every "Restart & Run All"; without it each run trains on different rows.
SPLIT_SEED = 42
splits = hashedData.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train = splits[0].withColumnRenamed("skor_per_soal", "Label")
test = splits[1].withColumnRenamed("skor_per_soal", "trueLabel")

# Count the rows that ended up in each split.
train_rows = train.count()
test_rows = test.count()
print ("Jumlah baris data training:", train_rows,
       ", jumlah baris data testing:", test_rows)

Jumlah baris data training: 79 , jumlah baris data testing: 41


In [16]:
# Preview the training split (score column is named "Label" here).
train.show()

+----+--------------------+-----+-----------+
|soal|             jawaban|Label|hashedValue|
+----+--------------------+-----+-----------+
|   1|Tidak, Hanya memb...|100.0|-2059296905|
|   1|tidak, cuma mengi...| 52.7| 1947733435|
|   1|tidak, hanya memb...|100.0| -256638840|
|   1|tidak, hanya memb...|100.0| -256638840|
|   1|tidak, hanya memb...|100.0| -256638840|
|   1|tidak, hanya memb...|100.0| -256638840|
|   1|tidak, hanya memb...|100.0| -256638840|
|   1|tidak, hanya memb...|100.0| -256638840|
|   1|tidak, hanya memb...|100.0| -256638840|
|   2|Biaya dihitung be...|100.0| 1183180174|
|   2|biaya dihitung be...|84.52| -219318287|
|   2|biaya dihitung be...|100.0| 1176853507|
|   2|biaya dihitung be...|100.0| 1176853507|
|   2|biaya dihitung be...|100.0| 1176853507|
|   2|biaya dihitung be...|100.0| 1176853507|
|   2|biaya dihitung be...|42.86|-1139863335|
|   2|   perhitungan biaya|26.73| -808224749|
|   3|hak cipta adalah ...|83.43|-1876419705|
|   3|hak cipta adalah ...|91.71| 

In [17]:
# Print the schema of the training split to confirm the column types fed to the model.
train.printSchema()

root
 |-- soal: integer (nullable = true)
 |-- jawaban: string (nullable = true)
 |-- Label: float (nullable = true)
 |-- hashedValue: integer (nullable = false)



# Mendefinisikan Model

In [None]:
# Define the ALS (alternating least squares) recommender:
#   userCol   = "soal"         (question number)
#   itemCol   = "hashedValue"  (integer hash of the answer text)
#   ratingCol = "Label"        (score given to that answer)
# A fixed seed makes the randomly initialised factors, and therefore the
# predictions and RMSE, reproducible between runs.
# NOTE(review): answers whose hash never occurred in training come back as NaN
# from transform() (visible in the prediction output further down), so this
# model can only score answers it has already seen verbatim.
# NOTE(review): maxIter=120 is high for ALS; long runs may need a Spark
# checkpoint directory to keep the lineage manageable — confirm on real data.
# NOTE(review): this cell's marker is "In [None]", i.e. it was not executed in
# the saved run even though later cells use `model`; re-run from a fresh kernel.
als = ALS(maxIter=120, regParam=0.01, userCol="soal",
          itemCol="hashedValue", ratingCol="Label",
          seed=42)
# Train the model with ".fit()".
model = als.fit(train)
print("Model telah selesai ditraining!")

# Menyiapkan Data Baru

In [71]:
# Load the new, unseen answers from a semicolon-delimited CSV with a header
# row; column types are inferred by Spark.
data_baru = spark.read.csv(
    'dataset_baru.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [72]:
# Preview the new dataset as loaded (scores still use decimal commas).
data_baru.show()

+-----------+------------+--------------------+----+-------------+
|        npm|nama_peserta|             jawaban|soal|skor_per_soal|
+-----------+------------+--------------------+----+-------------+
|21083010032|      Angela|Ya, semakin banya...|   1|         20,5|
|21083010032|      Angela|Jumlah uang yang ...|   2|           45|
|21083010032|      Angela|hak membuat merup...|   3|        43,18|
|21083010032|      Angela|bila graf sangat ...|   4|        24,56|
|21083010032|      Angela|1. mencegah serta...|   5|         46,9|
|21083010032|      Angela|ruang komputer, p...|   6|          100|
|21083010032|      Angela|aturlah posisi  k...|   7|         63,4|
|21083010032|      Angela|posisi kepala ser...|   8|           48|
|21083010032|      Angela|1.kesesuaian apli...|   9|        51,33|
|21083010032|      Angela|fokus serta apa a...|  10|        39,08|
|21083010032|      Angela|1. perlengkapan y...|  11|        39,88|
|21083010032|      Angela|metode artwork 2d...|  12|        25

In [73]:
# Swap decimal commas for dots in "skor_per_soal" of the new dataset.
data_baru = data_baru.withColumn(
    "skor_per_soal",
    regexp_replace(col("skor_per_soal"), ",", "."),
)

# Preview the frame after the replacement.
data_baru.show()

+-----------+------------+--------------------+----+-------------+
|        npm|nama_peserta|             jawaban|soal|skor_per_soal|
+-----------+------------+--------------------+----+-------------+
|21083010032|      Angela|Ya, semakin banya...|   1|         20.5|
|21083010032|      Angela|Jumlah uang yang ...|   2|           45|
|21083010032|      Angela|hak membuat merup...|   3|        43.18|
|21083010032|      Angela|bila graf sangat ...|   4|        24.56|
|21083010032|      Angela|1. mencegah serta...|   5|         46.9|
|21083010032|      Angela|ruang komputer, p...|   6|          100|
|21083010032|      Angela|aturlah posisi  k...|   7|         63.4|
|21083010032|      Angela|posisi kepala ser...|   8|           48|
|21083010032|      Angela|1.kesesuaian apli...|   9|        51.33|
|21083010032|      Angela|fokus serta apa a...|  10|        39.08|
|21083010032|      Angela|1. perlengkapan y...|  11|        39.88|
|21083010032|      Angela|metode artwork 2d...|  12|        25

In [74]:
# Convert the "skor_per_soal" column of the new dataset from string to float.
data_baru_2 = data_baru.withColumn(
    "skor_per_soal",
    col("skor_per_soal").cast(FloatType()),
)

# Show the resulting schema, then a sample of the rows.
data_baru_2.printSchema()
data_baru_2.show()

root
 |-- npm: long (nullable = true)
 |-- nama_peserta: string (nullable = true)
 |-- jawaban: string (nullable = true)
 |-- soal: integer (nullable = true)
 |-- skor_per_soal: float (nullable = true)

+-----------+------------+--------------------+----+-------------+
|        npm|nama_peserta|             jawaban|soal|skor_per_soal|
+-----------+------------+--------------------+----+-------------+
|21083010032|      Angela|Ya, semakin banya...|   1|         20.5|
|21083010032|      Angela|Jumlah uang yang ...|   2|         45.0|
|21083010032|      Angela|hak membuat merup...|   3|        43.18|
|21083010032|      Angela|bila graf sangat ...|   4|        24.56|
|21083010032|      Angela|1. mencegah serta...|   5|         46.9|
|21083010032|      Angela|ruang komputer, p...|   6|        100.0|
|21083010032|      Angela|aturlah posisi  k...|   7|         63.4|
|21083010032|      Angela|posisi kepala ser...|   8|         48.0|
|21083010032|      Angela|1.kesesuaian apli...|   9|        

In [75]:
# Keep the same three columns as for the training data: question number,
# answer text and score.
data2 = data_baru_2.select(
    col("soal"),
    col("jawaban"),
    col("skor_per_soal"),
)
data2.show()

+----+--------------------+-------------+
|soal|             jawaban|skor_per_soal|
+----+--------------------+-------------+
|   1|Ya, semakin banya...|         20.5|
|   2|Jumlah uang yang ...|         45.0|
|   3|hak membuat merup...|        43.18|
|   4|bila graf sangat ...|        24.56|
|   5|1. mencegah serta...|         46.9|
|   6|ruang komputer, p...|        100.0|
|   7|aturlah posisi  k...|         63.4|
|   8|posisi kepala ser...|         48.0|
|   9|1.kesesuaian apli...|        51.33|
|  10|fokus serta apa a...|        39.08|
|  11|1. perlengkapan y...|        39.88|
|  12|metode artwork 2d...|        25.67|
+----+--------------------+-------------+



In [76]:
# Hash the answer text of the new dataset with the same Spark SQL `hash`
# function used for the training data, so equal answers get equal ids.
hashedData2 = data2.withColumn("hashedValue", hash(col("jawaban")))

# Display question, hash and score without truncating the cell contents.
hashedData2.select(
    "soal",
    "hashedValue",
    "skor_per_soal",
).show(truncate=False)

+----+-----------+-------------+
|soal|hashedValue|skor_per_soal|
+----+-----------+-------------+
|1   |1019100933 |20.5         |
|2   |1481524314 |45.0         |
|3   |122676417  |43.18        |
|4   |76487259   |24.56        |
|5   |1990940339 |46.9         |
|6   |1770907636 |100.0        |
|7   |400780623  |63.4         |
|8   |-412537011 |48.0         |
|9   |-55989520  |51.33        |
|10  |670920752  |39.08        |
|11  |723150141  |39.88        |
|12  |343114756  |25.67        |
+----+-----------+-------------+



# Melakukan Prediksi dengan Dataset baru

In [78]:
# Score the new answers with the trained ALS model; the result carries an
# extra "prediction" column.
# NOTE(review): rows whose hashedValue did not occur in the training data
# appear to come back with prediction = NaN (see the output) — confirm.
predictions2 = model.transform(hashedData2)
predictions2.show()

+----+--------------------+-------------+-----------+----------+
|soal|             jawaban|skor_per_soal|hashedValue|prediction|
+----+--------------------+-------------+-----------+----------+
|   1|Ya, semakin banya...|         20.5| 1019100933|       NaN|
|   5|1. mencegah serta...|         46.9| 1990940339| 44.139896|
|   2|Jumlah uang yang ...|         45.0| 1481524314|       NaN|
|   9|1.kesesuaian apli...|        51.33|  -55989520|       NaN|
|   7|aturlah posisi  k...|         63.4|  400780623|       NaN|
|   3|hak membuat merup...|        43.18|  122676417| 42.159916|
|  10|fokus serta apa a...|        39.08|  670920752|       NaN|
|  12|metode artwork 2d...|        25.67|  343114756|       NaN|
|  11|1. perlengkapan y...|        39.88|  723150141|       NaN|
|   6|ruang komputer, p...|        100.0| 1770907636|     100.0|
|   8|posisi kepala ser...|         48.0| -412537011|       NaN|
|   4|bila graf sangat ...|        24.56|   76487259|       NaN|
+----+-------------------

# Evaluasi Model

In [79]:
# RegressionEvaluator is used to compute the RMSE; the formula is the same
# for a recommender as for any other regression task.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="skor_per_soal",
    predictionCol="prediction",
)
# Evaluated on the raw predictions, NaN rows included.
rmse = evaluator.evaluate(predictions2)
print ("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): nan


In [80]:
# Row count before removing rows that have no usable prediction.
a = predictions2.count()
print("jumlah baris sebelum di hapus data kosong: ", a)

# Discard every row whose "prediction" is missing, then count again.
cleanPred = predictions2.dropna(subset=["prediction"], how="any")
b = cleanPred.count()
print("jumlah baris setelah di hapus data kosong: ", b)

# Number of rows that were discarded.
print("jumlah baris data kosong: ", a - b)

jumlah baris sebelum di hapus data kosong:  12
jumlah baris setelah di hapus data kosong:  3
jumlah baris data kosong:  9


In [81]:
# Show the rows that still have a prediction after the cleanup.
cleanPred.show()

+----+--------------------+-------------+-----------+----------+
|soal|             jawaban|skor_per_soal|hashedValue|prediction|
+----+--------------------+-------------+-----------+----------+
|   5|1. mencegah serta...|         46.9| 1990940339| 44.139896|
|   3|hak membuat merup...|        43.18|  122676417| 42.159916|
|   6|ruang komputer, p...|        100.0| 1770907636|     100.0|
+----+--------------------+-------------+-----------+----------+



In [82]:
# Recompute the RMSE on the cleaned predictions only.
# NOTE(review): per the counts printed earlier, this covers just 3 of the 12
# new answers, so the figure says nothing about the 9 answers left unscored.
rmse = evaluator.evaluate(cleanPred)
print ("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 1.6988969450888771
