In [1]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-4.0.0.tar.gz (434.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.1/434.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.9 (from pyspark)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Building wheels for collected packages: pyspark
[33m  DEPRECATION: Building 'pyspark' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'pyspark'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for pyspark (setup.py) ... [?25

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MovieLensRecommendation") \
    .config("spark.driver.memory", "50g") \
    .config("spark.executor.memory", "50g") \
    .getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/15 12:39:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
ratings = spark.read.csv("ml-32m/ratings.csv", header=True, inferSchema=True)
movies = spark.read.csv("ml-32m/movies.csv", header=True, inferSchema=True)


                                                                                

In [6]:
ratings.show(10)
movies.show(10)


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     17|   4.0|944249077|
|     1|     25|   1.0|944250228|
|     1|     29|   2.0|943230976|
|     1|     30|   5.0|944249077|
|     1|     32|   5.0|943228858|
|     1|     34|   2.0|943228491|
|     1|     36|   1.0|944249008|
|     1|     80|   5.0|944248943|
|     1|    110|   3.0|943231119|
|     1|    111|   5.0|944249008|
+------+-------+------+---------+
only showing top 10 rows
+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)| 

In [7]:
from pyspark.sql.functions import col

ratings = ratings.dropna().dropDuplicates()
movies = movies.dropna().dropDuplicates()


In [8]:
from pyspark.sql.functions import avg, count

top_movies = ratings.groupBy("movieId") \
    .agg(avg("rating").alias("avg_rating"), count("rating").alias("count_rating")) \
    .filter("count_rating >= 50") \
    .join(movies, "movieId") \
    .orderBy("avg_rating", ascending=False)

top_movies.show(10)


                                                                                

+-------+------------------+------------+--------------------+------------------+
|movieId|        avg_rating|count_rating|               title|            genres|
+-------+------------------+------------+--------------------+------------------+
| 171011|4.4468302658486705|        1956|Planet Earth II (...|       Documentary|
| 159817| 4.444369063772049|        2948| Planet Earth (2006)|       Documentary|
| 170705| 4.426538598363572|        2811|Band of Brothers ...|  Action|Drama|War|
|    318| 4.404613860039444|      102929|Shawshank Redempt...|       Crime|Drama|
| 171495| 4.330081300813008|         615|              Cosmos|(no genres listed)|
|    858| 4.317030403371463|       66440|Godfather, The (1...|       Crime|Drama|
| 202439| 4.312253641816624|       11670|     Parasite (2019)|      Comedy|Drama|
| 179135| 4.300085984522786|        1163|Blue Planet II (2...|       Documentary|
| 198185| 4.298684210526316|        1140|   Twin Peaks (1989)|     Drama|Mystery|
| 220528|  4.286

In [9]:
from pyspark.sql.functions import explode, split

genre_df = movies.withColumn("genre", explode(split(col("genres"), "\\|")))
genre_counts = genre_df.groupBy("genre").count().orderBy("count", ascending=False)
genre_counts.show()




+------------------+-----+
|             genre|count|
+------------------+-----+
|             Drama|34175|
|            Comedy|23123|
|          Thriller|11823|
|           Romance|10369|
|            Action| 9668|
|       Documentary| 9363|
|            Horror| 8654|
|(no genres listed)| 7080|
|             Crime| 6976|
|         Adventure| 5402|
|            Sci-Fi| 4907|
|         Animation| 4617|
|          Children| 4520|
|           Mystery| 4013|
|           Fantasy| 3851|
|               War| 2325|
|           Western| 1696|
|           Musical| 1059|
|         Film-Noir|  353|
|              IMAX|  195|
+------------------+-----+
only showing top 20 rows


                                                                                

In [10]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import rand

# Séparer en train/test
(training, test) = ratings.randomSplit([0.8, 0.2])

als = ALS(
    maxIter=10,
    regParam=0.1,
    rank=10,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)

model = als.fit(training)

predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("RMSE =", rmse)




RMSE = 0.7991872523969626


                                                                                

In [13]:
from pyspark.sql.functions import col, regexp_replace
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

# Remplacer les "|" par des espaces dans la colonne "genres"
movie_genres = movies.withColumn("genres_text", regexp_replace(col("genres"), r"\|", " "))

# Tokenisation du texte des genres
tokenizer = Tokenizer(inputCol="genres_text", outputCol="words")
words_data = tokenizer.transform(movie_genres)


In [None]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurized = hashingTF.transform(words_data)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized)
tfidf_movies = idf_model.transform(featurized)

[Stage 167:>                                                        (0 + 3) / 3]

In [12]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

movie_genres = movies.withColumn("genres_text", col("genres").replace("|", " "))

tokenizer = Tokenizer(inputCol="genres_text", outputCol="words")
words_data = tokenizer.transform(movie_genres)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurized = hashingTF.transform(words_data)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized)
tfidf_movies = idf_model.transform(featurized)


TypeError: 'Column' object is not callable

In [15]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

movie_genres = movies.withColumn("genres_text", col("genres").replace("|", " "))

tokenizer = Tokenizer(inputCol="genres_text", outputCol="words")
words_data = tokenizer.transform(movie_genres)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurized = hashingTF.transform(words_data)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized)
tfidf_movies = idf_model.transform(featurized)


TypeError: 'Column' object is not callable

In [16]:
user_ids = [1, 42, 100, 120, 150]

for user_id in user_ids:
    print(f"Recommandations pour l'utilisateur {user_id} :")
    user_recs = model.recommendForAllUsers(5).filter(col("userId") == user_id)
    user_recs.show(truncate=False)


Recommandations pour l'utilisateur 1 :


                                                                                

+------+-----------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                      |
+------+-----------------------------------------------------------------------------------------------------+
|1     |[{217747, 5.2593064}, {194434, 5.180606}, {289897, 5.154702}, {227066, 5.103722}, {89403, 5.0937047}]|
+------+-----------------------------------------------------------------------------------------------------+

Recommandations pour l'utilisateur 42 :


                                                                                

+------+-------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                        |
+------+-------------------------------------------------------------------------------------------------------+
|42    |[{205453, 5.9349294}, {194278, 5.7084913}, {171849, 5.5113006}, {217747, 5.480564}, {265908, 5.479037}]|
+------+-------------------------------------------------------------------------------------------------------+

Recommandations pour l'utilisateur 100 :


                                                                                

+------+------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                       |
+------+------------------------------------------------------------------------------------------------------+
|100   |[{194278, 5.2350717}, {289897, 5.203506}, {205453, 5.143176}, {275847, 5.1248984}, {194434, 5.069031}]|
+------+------------------------------------------------------------------------------------------------------+

Recommandations pour l'utilisateur 120 :


                                                                                

+------+-------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                        |
+------+-------------------------------------------------------------------------------------------------------+
|120   |[{194434, 6.4531164}, {227066, 6.216584}, {289897, 6.0695543}, {275847, 5.897304}, {193817, 5.8357997}]|
+------+-------------------------------------------------------------------------------------------------------+

Recommandations pour l'utilisateur 150 :




+------+------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                       |
+------+------------------------------------------------------------------------------------------------------+
|150   |[{193817, 6.01458}, {194434, 5.9709315}, {289897, 5.800478}, {227066, 5.7791195}, {282453, 5.4832706}]|
+------+------------------------------------------------------------------------------------------------------+



                                                                                