In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

In [2]:
# ====================== INITIALISATION ======================
# Build the SparkSession for this notebook (or reuse one that already exists),
# named "TrainALSModel" and using YARN as the master.
spark = (
    SparkSession.builder
    .appName("TrainALSModel")
    .master("yarn")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/02 11:46:38 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# ====================== DATA LOADING ======================
# Explicit schema for the ratings file; every column is declared nullable.
ratings_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in (
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", FloatType()),
        ("timestamp", StringType()),  # read as a plain string, not parsed
    )
])

# Explicit schema for the movies file; every column is declared nullable.
movies_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in (
        ("movieId", IntegerType()),
        ("title", StringType()),
        ("genres", StringType()),
    )
])

print("ðŸ“¥ Lecture des fichiers CSV depuis HDFS...")
# Both files have a header row and are read with the schemas declared above.
ratings_df = spark.read.csv(
    "hdfs:///input/rating.csv",
    schema=ratings_schema,
    header=True,
)
movies_df = spark.read.csv(
    "hdfs:///input/movie.csv",
    schema=movies_schema,
    header=True,
)

ðŸ“¥ Lecture des fichiers CSV depuis HDFS...


In [4]:
# ====================== CLEANING ======================
# Keep only the ratings where the user id, movie id and rating are all present.
ratings_df = ratings_df.na.drop(how="any", subset=["userId", "movieId", "rating"])

In [5]:
# ====================== TRAIN / TEST SPLIT ======================
# Random split with weights 0.8 (train) and 0.2 (test), using a fixed seed.
train_df, test_df = ratings_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [6]:
# ====================== TRAINING ======================
# Fit an explicit-feedback ALS recommender on the training split.
print("ðŸ¤– EntraÃ®nement du modÃ¨le ALS...")
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",  # avoids NaN predictions on the test split
    rank=12,
    maxIter=15,
    regParam=0.05,
    # Pin the seed so the fitted model (and the RMSE computed from it) is
    # reproducible across runs; same value as the train/test split seed.
    seed=42,
)

model = als.fit(train_df)

ðŸ¤– EntraÃ®nement du modÃ¨le ALS...


                                                                                

In [None]:
# ====================== EVALUATION ======================
print("ðŸ“Š Ã‰valuation du modÃ¨le...")
# Score the held-out ratings with the fitted model.
predictions = model.transform(test_df)

# RMSE between the observed "rating" column and the "prediction" column.
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"âœ… RMSE sur l'ensemble test : {rmse:.4f}")

ðŸ“Š Ã‰valuation du modÃ¨le...


[Stage 74:=>  (2 + 2) / 6][Stage 108:> (0 + 0) / 10][Stage 109:> (0 + 0) / 10]

In [None]:
# ====================== SAVING ======================
print("ðŸ’¾ Sauvegarde du modÃ¨le dans HDFS (/models/als)...")
# Persist the fitted model to HDFS; overwrite() is requested on the writer
# before saving to the target path.
(
    model.write()
    .overwrite()
    .save("hdfs:///models/als")
)

print("ðŸŽ‰ ModÃ¨le entraÃ®nÃ© et sauvegardÃ© avec succÃ¨s.")

In [None]:
# Stop the SparkSession created at the start of the notebook.
spark.stop()