In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, IDF
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Application name handed to the session builder.
appName = "task automated scoring system"

# Build the session with a parenthesised chain instead of line continuations.
# NOTE(review): "spark.some.config.option" looks like a placeholder key rather
# than a real Spark setting -- confirm whether it can be dropped.
spark = (
    SparkSession.builder
    .appName(appName)
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [4]:
import os

# `hash` below is Spark's column hash function, not the Python builtin.
from pyspark.sql.functions import hash, abs

# Location of the answers CSV. The default is the original author's local
# path; set the SCORING_DATA_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_CSV = os.environ.get(
    "SCORING_DATA_CSV",
    "C:/Users/LENOVO/OneDrive/Documents/Tugasku/big data b/rtm5/data.csv",
)

# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv(DATA_CSV, inferSchema=True, header=True)
df.show()

# Keep only the columns used downstream: student id, answer text, question
# number and the per-question score.
df = df.select('npm', 'jawaban', 'soal', 'skor_per_soal')
df.show()

# Derive an integer id from the answer text; it is used later as the ALS
# "user" column. Identical answer strings receive the same HashValue.
df = df.withColumn("HashValue", hash("jawaban"))
df.show()

+----------+------------+--------------------+----+-------------+
|       npm|nama_peserta|             jawaban|soal|skor_per_soal|
+----------+------------+--------------------+----+-------------+
|         0|       Admin|Tidak, Hanya memb...|   1|        100.0|
|         0|       Admin|Biaya dihitung be...|   2|        100.0|
|         0|       Admin|Hak cipta adalah ...|   3|        100.0|
|         0|       Admin|Dijelaskan kepada...|   4|        100.0|
|         0|       Admin|1. Melindungi dan...|   5|        100.0|
|         0|       Admin|Ruang Komputer, P...|   6|        100.0|
|         0|       Admin|Aturlah posisi pe...|   7|        100.0|
|         0|       Admin|Posisi Kepala dan...|   8|        100.0|
|         0|       Admin|1. Kecocokan soft...|   9|        100.0|
|         0|       Admin|1. Fokus dan expo...|  10|        100.0|
|         0|       Admin|1. Peralatan yang...|  11|        100.0|
|         0|       Admin|1. Dibuat grafik ...|  12|        100.0|
|112102003

In [5]:
# Seeded 80/20 random split of the prepared frame.
splits = df.randomSplit(weights=[0.8, 0.2], seed=1234)
train_part, test_part = splits

# The score column is renamed differently on each side: "label" for the
# training rows and "trueLabel" for the held-out rows.
train = train_part.withColumnRenamed("skor_per_soal", "label")
test = test_part.withColumnRenamed("skor_per_soal", "trueLabel")

In [6]:
# Count the rows in each split and report both sizes on one line.
train_rows, test_rows = train.count(), test.count()
print(
    "Jumlah baris data training:", train_rows,
    ", jumlah baris data testing:", test_rows,
)

Jumlah baris data training: 89 , jumlah baris data testing: 31


In [7]:
# Preview the training split first, then the test split.
for split_df in (train, test):
    split_df.show()

+----------+--------------------+----+-----+-----------+
|       npm|             jawaban|soal|label|  HashValue|
+----------+--------------------+----+-----+-----------+
|         0|1. Dibuat grafik ...|  12|100.0| 1727767227|
|         0|1. Kecocokan soft...|   9|100.0| 1576366224|
|         0|1. Melindungi dan...|   5|100.0| 1588395990|
|         0|Aturlah posisi pe...|   7|100.0|   50850002|
|         0|Biaya dihitung be...|   2|100.0| 1183180174|
|         0|Dijelaskan kepada...|   4|100.0|-2035408785|
|         0|Hak cipta adalah ...|   3|100.0| 1232762403|
|         0|Posisi Kepala dan...|   8|100.0| -945877996|
|         0|Ruang Komputer, P...|   6|100.0|  339970513|
|         0|Tidak, Hanya memb...|   1|100.0|-2059296905|
|1120020017|aturlah posisi pe...|   7|86.22|-1392782412|
|1120020017|biaya dihitung be...|   2|84.52| -219318287|
|1120020017|dibuat grafik yan...|  12|86.53| -902409772|
|1120020017|dijelaskan kepada...|   4|72.06| -683553012|
|1120020017|emperbanyak cipta..

In [10]:
# Fit an ALS model with the hashed answer as "user", the question number as
# "item" and the score as "rating".
# - coldStartStrategy="drop" is set on the estimator (the fitted model inherits
#   it), so test rows whose HashValue or soal was not seen during training are
#   removed from the predictions instead of carrying NaN.
# - seed pins the random initialisation so reruns produce the same model.
# NOTE(review): the same (HashValue, soal) pair can occur in both train and
# test (identical answer text from different students), so the held-out
# predictions may simply reproduce memorised scores -- confirm this split is
# the intended evaluation.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='HashValue',
    itemCol='soal',
    ratingCol='label',
    coldStartStrategy="drop",
    seed=1234,
)
model = als.fit(train)
predictions = model.transform(test)

# Fallback: replace any missing prediction with the mean training score.
# ALS reports cold-start predictions as NaN, which isNull() does not match,
# so both NaN and null are tested. With the "drop" strategy above this is only
# a safety net; it takes effect if the strategy is switched to "nan".
avg_score = train.select(mean("label")).first()[0]
prediction_missing = isnan(col("prediction")) | col("prediction").isNull()
predictions = predictions.withColumn(
    "prediction",
    when(prediction_missing, avg_score).otherwise(col("prediction"))
)

predictions.show()


+----------+--------------------+----+---------+-----------+-----------------+
|       npm|             jawaban|soal|trueLabel|  HashValue|       prediction|
+----------+--------------------+----+---------+-----------+-----------------+
|1220020018|dibuat grafik yan...|  12|    86.53| -902409772|86.52997589111328|
|1121020032|tidak, hanya memb...|   1|    100.0| -256638840|99.99993896484375|
|1121020035|tidak, hanya memb...|   1|    100.0| -256638840|99.99993896484375|
|1121020033|ruang komputer, p...|   6|    100.0| 1770907636|99.99995422363281|
|1220020018|ruang komputer, p...|   6|    100.0| 1770907636|99.99995422363281|
|1121020035|hak cipta adalah ...|   3|    83.43|-1876419705|83.42996978759766|
|1220020018|hak cipta adalah ...|   3|    91.71|  770340049|91.70994567871094|
|1220020018|melindungi dan me...|   5|    74.73|-1932865057|   74.72998046875|
|1220020029|kecocokan softwar...|   9|    84.88|-1092404005|84.87995910644531|
|1220020023|kecocokan softwar...|   9|    65.89|  44

In [13]:
# Evaluate the predictions against the true scores with three regression
# metrics. One evaluator is built per metric name; all of them compare the
# "trueLabel" column with the "prediction" column.
metric_names = ("rmse", "mae", "r2")
metric_evaluators = {
    metric: RegressionEvaluator(
        metricName=metric, labelCol="trueLabel", predictionCol="prediction"
    )
    for metric in metric_names
}
evaluator_rmse = metric_evaluators["rmse"]
evaluator_mae = metric_evaluators["mae"]
evaluator_r2 = metric_evaluators["r2"]

# Same evaluation order as the metric list: RMSE, then MAE, then R2.
rmse, mae, r2 = (
    metric_evaluators[metric].evaluate(predictions) for metric in metric_names
)

print(f"Root Mean Square Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

Root Mean Square Error (RMSE): 4.167011583199324e-05
Mean Absolute Error (MAE): 3.976821899431826e-05
R-squared (R2): 0.9999999999818238
