In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import StringIndexerModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit, split


# Initialize Spark session
spark = SparkSession.builder.appName("MovieGenreEncoding").getOrCreate()

spark.conf.set("spark.hadoop.io.nativeio.enabled", "false")

# Sample selected movie array from JavaScript
selected_movies = [1726, 10138, 10386, 68721, 9313]

# Load movie dataset. No schema is supplied and `inferSchema` is not enabled,
# so the CSV reader returns every column as a string.
df = spark.read.option("header", "true").csv("movies_data.csv")  # Change format if JSON

# Keep only the selected movies. The dataset column is named `movieId`; it is
# exposed as `tmdbId` in the metadata projection below.
df_selected = df.filter(col("movieId").isin(selected_movies))

# Extract movie ID, name, and popularity
df_metadata = df_selected.select(
    col("movieId").alias("tmdbId"),
    col("name"),
    col("popularity"),
)

# Split the ", "-separated `genres` string into an array and explode it so
# that each (movie, genre) pair becomes its own row.
df_genres = df_selected.select(
    col("movieId"),
    explode(split(col("genres"), ", ")).alias("genre_name"),
)

# Map genre names to numeric indices, then one-hot encode those indices.
indexer = StringIndexer(inputCol="genre_name", outputCol="genre_index")
encoder = OneHotEncoder(inputCol="genre_index", outputCol="genre_vec")

# NOTE(review): the pipeline is fitted on the selected movies only, so the
# genre -> index mapping (and the vector size) presumably changes with the
# selection. If stable indices are needed, fit once on the full dataset and
# persist/load the fitted model (e.g. under "genre_indexer_model") - confirm
# the intended behaviour with the consumer of `genre_vec`.
pipeline = Pipeline(stages=[indexer, encoder])
genre_model = pipeline.fit(df_genres)
df_genres_encoded = genre_model.transform(df_genres)

# Join metadata with the genre-encoded rows (one output row per movie/genre),
# then attach constant integer `userId` (0) and `rating` (10) columns.
# `lit` is used instead of arithmetic on `tmdbId`, which would depend on the
# string id being castable to a number and would yield NULL otherwise.
df_final = (
    df_metadata.join(
        df_genres_encoded,
        df_metadata.tmdbId == df_genres_encoded.movieId,
        "inner",
    )
    .select(
        col("tmdbId"),
        col("name"),
        col("popularity"),
        col("genre_vec"),
    )
    .withColumn("userId", lit(0).cast("int"))
    .withColumn("rating", lit(10).cast("int"))
)

# Show final DataFrame
df_final.show(truncate=False)

+------+------------------------+----------+-------------+------+------+
|tmdbId|name                    |popularity|genre_vec    |userId|rating|
+------+------------------------+----------+-------------+------+------+
|1726  |Iron Man                |6.034     |(9,[1],[1.0])|0     |10    |
|1726  |Iron Man                |6.034     |(9,[2],[1.0])|0     |10    |
|1726  |Iron Man                |6.034     |(9,[4],[1.0])|0     |10    |
|10138 |Iron Man 2              |5.903     |(9,[3],[1.0])|0     |10    |
|10138 |Iron Man 2              |5.903     |(9,[0],[1.0])|0     |10    |
|10138 |Iron Man 2              |5.903     |(9,[5],[1.0])|0     |10    |
|68721 |Iron Man 3              |5.74      |(9,[3],[1.0])|0     |10    |
|68721 |Iron Man 3              |5.74      |(9,[6],[1.0])|0     |10    |
|68721 |Iron Man 3              |5.74      |(9,[4],[1.0])|0     |10    |
|10386 |The Iron Giant          |4.954     |(9,[1],[1.0])|0     |10    |
|10386 |The Iron Giant          |4.954     |(9,[2],

In [2]:
# Report whether a path named "genre_indexer_model" exists relative to the
# current working directory.
import os

indexer_model_exists = os.path.exists("genre_indexer_model")
print(indexer_model_exists)

True
