In [11]:
from pathlib import Path
import sys 
import pandas as pd

sys.path.append(str(Path.cwd().parent))

from pyspark.sql import SparkSession
import os

# Path to log4j.properties
notebook_dir = Path().resolve()
log4j_path = log4j_path = notebook_dir.parent / "conf" / "log4j.properties"

spark = (
    SparkSession.builder
    .appName("MovieProject")
    .config("spark.driver.extraJavaOptions", f"-Dlog4j.configuration=file:{log4j_path}")
    .config("spark.executor.extraJavaOptions", f"-Dlog4j.configuration=file:{log4j_path}")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN") 

%reload_ext autoreload
%autoreload 2

import scripts.extraction.extractor_tmdb as tmdb
import scripts.transform.clean_data as cln
# import analysis.kpi_analysis as kpi 
# import visualize.visualizations as viz

## Fetch all movies and store in a Database

In [3]:
movie_ids = [0, 299534, 19995, 140607, 299536, 597, 135397, 420818, 24428, 168259, 99861, 284054, 12445, 181808, 330457, 351286, 109445, 321612, 260513]

In [4]:
movies = tmdb.get_all_movies_by_ids(movie_ids)

2026-01-14 16:11:11,170 - INFO - extractor_tmdb - Fetching movie ID: 0
2026-01-14 16:11:11,638 - ERROR - extractor_tmdb - HTTP error on attempt 0: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=d855a01d02babf081991d455eff39348&language=en-US&movie_id=0
2026-01-14 16:11:14,018 - ERROR - extractor_tmdb - HTTP error on attempt 1: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=d855a01d02babf081991d455eff39348&language=en-US&movie_id=0
2026-01-14 16:11:16,380 - ERROR - extractor_tmdb - HTTP error on attempt 2: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=d855a01d02babf081991d455eff39348&language=en-US&movie_id=0
2026-01-14 16:11:18,836 - ERROR - extractor_tmdb - HTTP error on attempt 3: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=d855a01d02ba

In [12]:
df = spark.createDataFrame(movies)

In [6]:
df.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- budget: long (nullable = true)
 |-- credits: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: map (containsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: boolean (valueContainsNull = true)
 |-- genres: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: long (valueContainsNull = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 

In [7]:
df.limit(10).toPandas()

                                                                                

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,credits,genres,homepage,id,imdb_id,origin_country,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",356000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",https://www.marvel.com/movies/avengers-endgame,299534,tt4154796,[US],...,2019-04-24,2799439100,181,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Avenge the fallen.,Avengers: Endgame,False,8.237,27117
1,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"{'backdrop_path': None, 'name': None, 'id': 87...",237000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",https://www.avatar.com/movies/avatar,19995,tt0499549,[US],...,2009-12-16,2923706026,162,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Enter the world of Pandora.,Avatar,False,7.6,33232
2,False,/8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg,"{'backdrop_path': None, 'name': None, 'id': 10...",245000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",http://www.starwars.com/films/star-wars-episod...,140607,tt2488496,[US],...,2015-12-15,2068223624,136,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Every generation has a story.,Star Wars: The Force Awakens,False,7.254,20193
3,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",300000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,[US],...,2018-04-25,2052415039,149,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Destiny arrives all the same.,Avengers: Infinity War,False,8.2,31330
4,False,/xnHVX37XZEp33hhCbYlQFq7ux1J.jpg,,200000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 18}, {'name': None, 'id'...",https://www.paramountmovies.com/movies/titanic,597,tt0120338,[US],...,1997-12-18,2264162353,194,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Nothing on earth could come between them.,Titanic,False,7.903,26650
5,False,/s5QfDFqRO6sjgPtKkjxD0WqXQef.jpg,"{'backdrop_path': None, 'name': None, 'id': 32...",150000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",https://www.jurassicworld.com/,135397,tt0369610,[US],...,2015-06-06,1671537444,124,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,The park is open.,Jurassic World,False,6.7,21212
6,False,/1TUg5pO1VZ4B0Q1amk3OlXvlpXV.jpg,"{'backdrop_path': None, 'name': None, 'id': 76...",260000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",https://movies.disney.com/the-lion-king-2019,420818,tt6105098,[US],...,2019-07-12,1662020819,118,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,The king has returned.,The Lion King,False,7.099,10602
7,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",220000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 878}, {'name': None, 'id...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,[US],...,2012-04-25,1518815515,143,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Some assembly required.,The Avengers,False,7.903,35049
8,False,/ehzI1mVcnHqB58NqPyQwpMqcVoz.jpg,"{'backdrop_path': None, 'name': None, 'id': 94...",190000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",https://www.uphe.com/movies/furious-7,168259,tt2820852,[US],...,2015-04-01,1515400000,139,"[{'name': 'العربية', 'iso_639_1': 'ar', 'engli...",Released,Vengeance hits home.,Furious 7,False,7.219,11084
9,False,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",235000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",https://www.marvel.com/movies/avengers-age-of-...,99861,tt2395427,[US],...,2015-04-22,1405403694,141,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,A new age has come.,Avengers: Age of Ultron,False,7.271,23997


In [8]:
df.columns

['adult',
 'backdrop_path',
 'belongs_to_collection',
 'budget',
 'credits',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'origin_country',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

## Drop irrelevant columns

In [13]:
cols_to_drop = ['adult', 'imdb_id', 'original_title', 'video', 'homepage', 	'success', 'status_code', 'status_message']
df = cln.drop_irrelevant_columns(df, cols_to_drop)

In [15]:
df.toPandas()

Unnamed: 0,backdrop_path,belongs_to_collection,budget,credits,genres,id,origin_country,original_language,overview,popularity,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",356000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",299534,[US],en,After the devastating events of Avengers: Infi...,19.5676,...,"[{'name': 'United States of America', 'iso_316...",2019-04-24,2799439100,181,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Avenge the fallen.,Avengers: Endgame,8.237,27117
1,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"{'backdrop_path': None, 'name': None, 'id': 87...",237000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",19995,[US],en,"In the 22nd century, a paraplegic Marine is di...",67.9234,...,"[{'name': 'United States of America', 'iso_316...",2009-12-16,2923706026,162,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Enter the world of Pandora.,Avatar,7.6,33232
2,/8BTsTfln4jlQrLXUBquXJ0ASQy9.jpg,"{'backdrop_path': None, 'name': None, 'id': 10...",245000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",140607,[US],en,Thirty years after defeating the Galactic Empi...,10.8439,...,"[{'name': 'United States of America', 'iso_316...",2015-12-15,2068223624,136,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Every generation has a story.,Star Wars: The Force Awakens,7.254,20193
3,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",300000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",299536,[US],en,As the Avengers and their allies have continue...,28.4734,...,"[{'name': 'United States of America', 'iso_316...",2018-04-25,2052415039,149,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Destiny arrives all the same.,Avengers: Infinity War,8.2,31330
4,/xnHVX37XZEp33hhCbYlQFq7ux1J.jpg,,200000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 18}, {'name': None, 'id'...",597,[US],en,101-year-old Rose DeWitt Bukater tells the sto...,32.4768,...,"[{'name': 'United States of America', 'iso_316...",1997-12-18,2264162353,194,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Nothing on earth could come between them.,Titanic,7.903,26650
5,/s5QfDFqRO6sjgPtKkjxD0WqXQef.jpg,"{'backdrop_path': None, 'name': None, 'id': 32...",150000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",135397,[US],en,Twenty-two years after the events of Jurassic ...,10.5154,...,"[{'name': 'United States of America', 'iso_316...",2015-06-06,1671537444,124,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,The park is open.,Jurassic World,6.7,21212
6,/1TUg5pO1VZ4B0Q1amk3OlXvlpXV.jpg,"{'backdrop_path': None, 'name': None, 'id': 76...",260000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 12}, {'name': None, 'id'...",420818,[US],en,"Simba idolizes his father, King Mufasa, and ta...",11.117,...,"[{'name': 'United States of America', 'iso_316...",2019-07-12,1662020819,118,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,The king has returned.,The Lion King,7.099,10602
7,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",220000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 878}, {'name': None, 'id...",24428,[US],en,When an unexpected enemy emerges and threatens...,65.9961,...,"[{'name': 'United States of America', 'iso_316...",2012-04-25,1518815515,143,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,Some assembly required.,The Avengers,7.903,35049
8,/ehzI1mVcnHqB58NqPyQwpMqcVoz.jpg,"{'backdrop_path': None, 'name': None, 'id': 94...",190000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",168259,[US],en,Deckard Shaw seeks revenge against Dominic Tor...,11.4091,...,"[{'name': 'United States of America', 'iso_316...",2015-04-01,1515400000,139,"[{'name': 'العربية', 'iso_639_1': 'ar', 'engli...",Released,Vengeance hits home.,Furious 7,7.219,11084
9,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"{'backdrop_path': None, 'name': None, 'id': 86...",235000000,"{'cast': [{'cast_id': None, 'character': None,...","[{'name': None, 'id': 28}, {'name': None, 'id'...",99861,[US],en,When Tony Stark tries to jumpstart a dormant p...,16.741,...,"[{'name': 'United States of America', 'iso_316...",2015-04-22,1405403694,141,"[{'name': 'English', 'iso_639_1': 'en', 'engli...",Released,A new age has come.,Avengers: Age of Ultron,7.271,23997


In [16]:
df.columns

['backdrop_path',
 'belongs_to_collection',
 'budget',
 'credits',
 'genres',
 'id',
 'origin_country',
 'original_language',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count']

## Evaluate JSON-like columns

In [17]:
categorical_columns = ['belongs_to_collection', 'genres', 'production_countries', 'production_companies', 'spoken_languages', 'origin_country','credits']

In [19]:
df.select(*categorical_columns).limit(10).toPandas()

Unnamed: 0,belongs_to_collection,genres,production_countries,production_companies,spoken_languages,origin_country,credits
0,"{'backdrop_path': None, 'name': None, 'id': 86...","[{'name': None, 'id': 12}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 420, 'logo_path': None, ...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
1,"{'backdrop_path': None, 'name': None, 'id': 87...","[{'name': None, 'id': 28}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 444, 'logo_path': None, ...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
2,"{'backdrop_path': None, 'name': None, 'id': 10...","[{'name': None, 'id': 12}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 1, 'logo_path': None, 'o...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
3,"{'backdrop_path': None, 'name': None, 'id': 86...","[{'name': None, 'id': 12}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 420, 'logo_path': None, ...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
4,,"[{'name': None, 'id': 18}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 4, 'logo_path': None, 'o...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
5,"{'backdrop_path': None, 'name': None, 'id': 32...","[{'name': None, 'id': 28}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 56, 'logo_path': None, '...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
6,"{'backdrop_path': None, 'name': None, 'id': 76...","[{'name': None, 'id': 12}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 2, 'logo_path': None, 'o...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
7,"{'backdrop_path': None, 'name': None, 'id': 86...","[{'name': None, 'id': 878}, {'name': None, 'id...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 420, 'logo_path': None, ...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
8,"{'backdrop_path': None, 'name': None, 'id': 94...","[{'name': None, 'id': 28}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 333, 'logo_path': None, ...","[{'name': 'العربية', 'iso_639_1': 'ar', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."
9,"{'backdrop_path': None, 'name': None, 'id': 86...","[{'name': None, 'id': 28}, {'name': None, 'id'...","[{'name': 'United States of America', 'iso_316...","[{'name': None, 'id': 420, 'logo_path': None, ...","[{'name': 'English', 'iso_639_1': 'en', 'engli...",[US],"{'cast': [{'cast_id': None, 'character': None,..."


## Extracting and cleaning json columns

In [21]:
df = cln.clean_movie_data(df)

TypeError: 'str' object is not callable