In [6]:
from pyspark.sql import SparkSession

# Step 1: Get (or create) the SparkSession used by the rest of the example.
spark = (
    SparkSession.builder
    .appName("BroadcastVariablesExample")
    .master("local[*]")
    .getOrCreate()
)

# Step 2: Build the ratings DataFrame; each tuple is (MovieID, Rating).
# MovieID 1 appears twice on purpose, so one lookup key maps to several rows.
movie_data = [(1, 4.5), (2, 3.0), (1, 5.0), (3, 4.0)]
columns = ["MovieID", "Rating"]
ratings_df = spark.createDataFrame(movie_data, columns)

# Step 3: Lookup table from MovieID to movie title.
movie_names = {
    1: "Inception",
    2: "Interstellar",
    3: "The Dark Knight",
}

# Step 4: Broadcast the lookup table; the handle's `.value` is read later
# by the function that maps an id to its title.
movie_names_broadcast = spark.sparkContext.broadcast(movie_names)

# Step 5: Map MovieID to Movie Name using the broadcast variable
def map_movie_name(movie_id):
    """Look up the title for a movie id in the broadcast dictionary.

    Args:
        movie_id: Key to look up in ``movie_names_broadcast.value``.

    Returns:
        The value stored under ``movie_id``, or the string ``"Unknown"``
        when the id is not a key of the broadcast dictionary.
    """
    titles_by_id = movie_names_broadcast.value
    if movie_id in titles_by_id:
        return titles_by_id[movie_id]
    return "Unknown"

# Register the function as a UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the lookup function as a UDF whose declared return type is a string.
map_movie_name_udf = udf(map_movie_name, StringType())

try:
    # Add a new column with movie names, computed from the MovieID column.
    result_df = ratings_df.withColumn(
        "MovieName", map_movie_name_udf(ratings_df["MovieID"])
    )

    # Step 6: Show the result
    result_df.show()
finally:
    # Stop the SparkSession even if building or showing the result raises,
    # so a failed run does not leave the session running.
    spark.stop()

+-------+------+---------------+
|MovieID|Rating|      MovieName|
+-------+------+---------------+
|      1|   4.5|      Inception|
|      2|   3.0|   Interstellar|
|      1|   5.0|      Inception|
|      3|   4.0|The Dark Knight|
+-------+------+---------------+

