In [22]:
from pyspark.sql import SparkSession
import requests
import time

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("JSONLoading")
    .getOrCreate()
)

In [23]:
# Load the movie-id export. The JSON reader's default mode expects one JSON
# object per line, which is what "single-line JSON" refers to here.
df_id = spark.read.format("json").load("movie_ids_03_09_2025.json")

# Expose the DataFrame to Spark SQL under the name "movies_view".
df_id.createOrReplaceTempView("movies_view")

# Preview the first rows.
df_id.show()

+-----+-----+--------------------+----------+-----+
|adult|   id|      original_title|popularity|video|
+-----+-----+--------------------+----------+-----+
|false| 3924|             Blondie|     1.423|false|
|false| 6124| Der Mann ohne Namen|      0.15|false|
|false| 8773| L'Amour à vingt ans|     1.913|false|
|false|25449|New World Disorde...|     0.344|false|
|false|31975|Sesame Street: El...|     0.002| true|
|false|    2|               Ariel|      3.88|false|
|false|    3|Varjoja paratiisissa|     4.587|false|
|false|    5|          Four Rooms|     5.541|false|
|false|    6|      Judgment Night|     3.872|false|
|false|    8|Life in Loops (A ...|     1.098|false|
|false|    9|   Sonntag im August|     0.975|false|
|false|   11|           Star Wars|    28.732|false|
|false|   12|        Finding Nemo|    23.678|false|
|false|   13|        Forrest Gump|    27.685|false|
|false|   14|     American Beauty|    11.036|false|
|false|   15|        Citizen Kane|     8.272|false|
|false|   16

In [24]:
# Keep only movies whose popularity exceeds 5, most popular first.
popular_movies_query = """
SELECT *
FROM movies_view
where popularity > 5
order by popularity desc
"""
df_result = spark.sql(popular_movies_query)

# Preview the top rows; the trailing count() is the cell's displayed value
# (number of movies that passed the filter).
df_result.show()
df_result.count()

+-----+-------+--------------------+----------+-----+
|adult|     id|      original_title|popularity|video|
+-----+-------+--------------------+----------+-----+
|false| 950396|           The Gorge|   584.464|false|
|false|1126166|         Flight Risk|   555.547|false|
|false|1064213|               Anora|   448.424|false|
|false| 762509|Mufasa: The Lion ...|   391.149|false|
|false|1241982|             Moana 2|    381.44|false|
|false| 939243|Sonic the Hedgehog 3|   364.976|false|
|false| 822119|Captain America: ...|   309.068|false|
|false| 823219|             Straume|   289.707|false|
|false| 927342|               அமரன்|   272.848|false|
|false| 426889|         Le Clitoris|   244.972|false|
|false| 696506|           Mickey 17|   236.935|false|
|false| 926670|Henry Danger: The...|   228.234|false|
|false|1084199|           Companion|     213.4|false|
|false|1160956|            熊猫计划|   201.464|false|
|false| 912649|Venom: The Last D...|   182.035|false|
|false| 539972|   Kraven the Hun

7616

In [25]:
import json
import time


# Create movie_results.json if it does not exist and truncate it to zero
# length, so the appending writer defined below starts from an empty file.
with open("movie_results.json", "w"):
    pass

# Function to query the API
def fetch_movie_details(movie_id, timeout=10):
    """Fetch the TMDB details record for a single movie.

    The API token is read from the ``TMDB_API_TOKEN`` environment variable
    rather than being embedded in the notebook. Because this function is
    invoked from ``process_partition`` inside Spark tasks, the variable must
    be visible to whichever process runs those tasks.

    NOTE(review): a bearer token used to be committed in this cell; treat it
    as leaked and rotate it.

    Args:
        movie_id: TMDB movie id; interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        dict: The decoded JSON body when the API answers 200. Otherwise an
        error record ``{"id": movie_id, "error": ...}`` where ``error`` is
        the HTTP status code (int) for a non-200 answer, or the exception
        message (str) when the request itself failed (timeout, connection
        error, ...).

    Raises:
        RuntimeError: If ``TMDB_API_TOKEN`` is unset or empty.
    """
    # Local import keeps this cell self-contained; the notebook's import
    # cells do not bring in `os`.
    import os

    token = os.environ.get("TMDB_API_TOKEN")
    if not token:
        raise RuntimeError(
            "TMDB_API_TOKEN environment variable is not set; "
            "export your TMDB API read access token before running this cell."
        )

    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}",
    }

    try:
        # Without a timeout a stalled connection would block the Spark task
        # indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Record the failure like any other per-movie error instead of
        # aborting the whole partition.
        return {"id": movie_id, "error": str(exc)}

    if response.status_code == 200:
        return response.json()
    return {"id": movie_id, "error": response.status_code}

# Process each partition
def process_partition(partition):
    """Fetch details for every row of one partition and append them as JSON lines.

    Args:
        partition: Iterable of rows; each row must support ``row["id"]``.

    Side effects:
        Appends one line per row to ``movie_results.json``: the JSON-encoded
        return value of ``fetch_movie_details`` followed by a newline.
        Requests are issued back-to-back with no throttling.

    NOTE(review): this is handed to ``foreachPartition``, so several
    partitions may append to the same file at once - confirm that lines
    cannot interleave in the deployment this runs on.
    """
    # A single file handle serves the whole partition.
    with open("movie_results.json", "a") as sink:
        sink.writelines(
            json.dumps(fetch_movie_details(record["id"])) + "\n"
            for record in partition
        )

# Project the filtered movies down to their ids and run process_partition
# once per partition of the underlying RDD.
movie_id_rows = df_result.select("id").rdd
movie_id_rows.foreachPartition(process_partition)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

# Obtain (or reuse) a Spark session for the analysis part of the notebook.
spark = (
    SparkSession.builder
    .appName("MoviesData")
    .getOrCreate()
)

# Read the fetched movie records from movie_results.json (one JSON object
# per line) into a DataFrame.
df = spark.read.format("json").load("movie_results.json")

# Columns kept for the analysis, renamed where a shorter name reads better.
selected_columns = [
    col("title").alias("name"),
    # Reduce each genre struct to just its name.
    expr("transform(genres, x -> x.name)").alias("genres"),
    # First four characters of release_date, i.e. the year, kept as a string.
    col("release_date").substr(1, 4).alias("year"),
    col("popularity"),
    col("vote_average").alias("rating"),
    col("overview"),
]
df_selected = df.select(*selected_columns)

# Print the rows without truncating long cells; the trailing count() is the
# cell's displayed value.
df_selected.show(truncate=False)
df_selected.count()

+--------------------------------+-----------------------------------------+----+----------+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|name                            |genres                                   |year|popularity|rating|overview                                                                                                                                                                                                                                                                                                                                                                                    |
+--------------------------------+----

7616