In [1]:
from pathlib import Path
import sys 
import pandas as pd
import json

sys.path.append(str(Path.cwd().parent))

from pyspark.sql import SparkSession
import os

# Locate conf/log4j.properties, expected one directory above the notebook's
# working directory.
notebook_dir = Path().resolve()
log4j_path = notebook_dir.parent / "conf" / "log4j.properties"

# The same JVM flag is passed to the driver and to the executors so both use
# one logging configuration.
# NOTE(review): Spark 3.3+ ships Log4j 2, which reads -Dlog4j.configurationFile
# (log4j2.properties) instead — confirm this flag matches the installed Spark.
log4j_java_option = f"-Dlog4j.configuration=file:{log4j_path}"

spark = (
    SparkSession.builder
    .appName("MovieProject")
    .config("spark.driver.extraJavaOptions", log4j_java_option)
    .config("spark.executor.extraJavaOptions", log4j_java_option)
    .getOrCreate()
)

# Limit Spark's own console logging to warnings and errors.
spark.sparkContext.setLogLevel("WARN")

%reload_ext autoreload
%autoreload 2

import scripts.extraction.extractor_tmdb as tmdb
import scripts.transform.clean_data as cln
import scripts.transform.tmdb_data_preprocessor as dp
import scripts.transform.tmdb_schema as ts
# import analysis.kpi_analysis as kpi 
# import visualize.visualizations as viz

Setting default log level to "INFO".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Fetch all movies and store them in a JSON Lines file

In [2]:
# TMDB movie IDs to fetch.
# NOTE(review): ID 0 returns 404 in the recorded run — presumably kept on
# purpose to exercise the extractor's error handling; confirm.
movie_ids = [
    0,
    299534,
    19995,
    140607,
    299536,
    597,
    135397,
    420818,
    24428,
    168259,
    99861,
    284054,
    12445,
    181808,
    330457,
    351286,
    109445,
    321612,
    260513,
]

In [3]:
# Fetch the details for every ID through the project's TMDB extractor module.
# The recorded log shows failing requests being retried (attempts 0-3 for ID 0).
movies = tmdb.get_all_movies_by_ids(movie_ids)

2026-01-16 12:02:25,866 - INFO - extractor_tmdb - Fetching movie ID: 0
2026-01-16 12:02:26,547 - ERROR - extractor_tmdb - HTTP error on attempt 0: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=<REDACTED>&language=en-US&movie_id=0
2026-01-16 12:02:28,977 - ERROR - extractor_tmdb - HTTP error on attempt 1: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=<REDACTED>&language=en-US&movie_id=0
2026-01-16 12:02:31,374 - ERROR - extractor_tmdb - HTTP error on attempt 2: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=<REDACTED>&language=en-US&movie_id=0
2026-01-16 12:02:33,790 - ERROR - extractor_tmdb - HTTP error on attempt 3: 404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/0?append_to_response=credits&api_key=<REDACTED>

In [4]:
# Destination for the raw API payloads, relative to the working directory.
output_path = Path("tmdb_raw_movies.jsonl")

# Write one JSON document per line (JSON Lines). Falsy entries are skipped, so
# the number of written records is tracked separately from len(movies) to keep
# the summary message accurate.
saved_count = 0
with output_path.open("w", encoding="utf-8") as f:
    for movie in movies:
        if not movie:
            continue
        json.dump(movie, f, ensure_ascii=False)
        f.write("\n")
        saved_count += 1

print(f"Saved {saved_count} movies to {output_path}")

Saved 18 movies to tmdb_raw_movies.jsonl


In [9]:
# Load the JSON Lines dump with the project's explicit schema; multiLine is
# disabled because every line of the file holds one complete JSON document.
df = (
    spark.read
    .option("multiLine", "False")
    .schema(ts.data_schema)
    .json(str(output_path))
)

In [10]:
# Print the schema of the loaded DataFrame as a tree.
df.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- poster_path: string (nullable = true)
 |    |-- backdrop_path: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = tr

In [11]:
# Nested (struct/array) columns to inspect, including the two sub-fields of
# `credits` addressed with dotted paths.
categorical_columns = [
    "belongs_to_collection",
    "genres",
    "production_countries",
    "production_companies",
    "spoken_languages",
    "origin_country",
    "credits.cast",
    "credits.crew",
]

In [12]:
# Preview the nested columns for the first 3 rows, one field per line
# (vertical layout), truncating each value at 100 characters.
df.select(*categorical_columns).show(3, truncate=100, vertical=True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------
 belongs_to_collection | {86311, The Avengers Collection, /yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg, /zuW6fOiusv4X9nnW3paHGfXcSll.jpg} 
 genres                | [{12, Adventure}, {878, Science Fiction}, {28, Action}]                                              
 production_countries  | [{US, United States of America}]                                                                     
 production_companies  | [{420, /hUzeosd33nzE5MCNsZxCGEKTXaQ.png, Marvel Studios, US}]                                        
 spoken_languages      | [{English, en, English}, {Japanese, ja, 日本語}, {Xhosa, xh, }]                                      
 origin_country        | [US]                                                                                                 
 cast                  | [{false, 2, 3223, Acting, Robert Downey Jr., Robert Downey Jr., 9.9269, /5qHNjhtjMD4YWH3U

In [13]:
# Bring only the first 10 rows to the driver as a pandas DataFrame for a
# readable tabular preview.
df.limit(10).toPandas()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits
0,False,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",356000000,"[(12, Adventure), (878, Science Fiction), (28,...",https://www.marvel.com/movies/avengers-endgame,299534,tt4154796,[US],en,...,2799439100,181,"[(English, en, English), (Japanese, ja, 日本語), ...",Released,Avenge the fallen.,Avengers: Endgame,False,8.237,27130,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
1,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"(87096, Avatar Collection, /3C5brXxnBxfkeKWwA1...",237000000,"[(28, Action), (12, Adventure), (14, Fantasy),...",https://www.avatar.com/movies/avatar,19995,tt0499549,[US],en,...,2923706026,162,"[(English, en, English), (Spanish, es, Español)]",Released,Enter the world of Pandora.,Avatar,False,7.6,33251,"([(False, 2, 65731, Acting, Sam Worthington, S..."
2,False,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,"(10, Star Wars Collection, /22dj38IckjzEEUZwN1...",245000000,"[(12, Adventure), (28, Action), (878, Science ...",http://www.starwars.com/films/star-wars-episod...,140607,tt2488496,[US],en,...,2068223624,136,"[(English, en, English)]",Released,Every generation has a story.,Star Wars: The Force Awakens,False,7.254,20200,"([(False, 2, 3, Acting, Harrison Ford, Harriso..."
3,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",300000000,"[(12, Adventure), (28, Action), (878, Science ...",https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,[US],en,...,2052415039,149,"[(English, en, English), (Xhosa, xh, )]",Released,Destiny arrives all the same.,Avengers: Infinity War,False,8.235,31339,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
4,False,/xnHVX37XZEp33hhCbYlQFq7ux1J.jpg,,200000000,"[(18, Drama), (10749, Romance)]",https://www.paramountmovies.com/movies/titanic,597,tt0120338,[US],en,...,2264162353,194,"[(English, en, English), (French, fr, Français...",Released,Nothing on earth could come between them.,Titanic,False,7.903,26666,"([(False, 2, 6193, Acting, Leonardo DiCaprio, ..."
5,False,/s5QfDFqRO6sjgPtKkjxD0WqXQef.jpg,"(328, Jurassic Park Collection, /qIm2nHXLpBBdM...",150000000,"[(28, Action), (12, Adventure), (878, Science ...",https://www.jurassicworld.com/,135397,tt0369610,[US],en,...,1671537444,124,"[(English, en, English)]",Released,The park is open.,Jurassic World,False,6.7,21219,"([(False, 2, 73457, Acting, Chris Pratt, Chris..."
6,False,/1TUg5pO1VZ4B0Q1amk3OlXvlpXV.jpg,"(762512, The Lion King (Reboot) Collection, /d...",260000000,"[(12, Adventure), (18, Drama), (10751, Family)...",https://movies.disney.com/the-lion-king-2019,420818,tt6105098,[US],en,...,1662020819,118,"[(English, en, English)]",Released,The king has returned.,The Lion King,False,7.099,10602,"([(False, 2, 5294, Acting, Chiwetel Ejiofor, C..."
7,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",220000000,"[(878, Science Fiction), (28, Action), (12, Ad...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,[US],en,...,1518815515,143,"[(English, en, English), (Hindi, hi, हिन्दी), ...",Released,Some assembly required.,The Avengers,False,7.905,35090,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
8,False,/ehzI1mVcnHqB58NqPyQwpMqcVoz.jpg,"(9485, The Fast and the Furious Collection, /z...",190000000,"[(28, Action), (80, Crime), (53, Thriller)]",https://www.uphe.com/movies/furious-7,168259,tt2820852,[US],en,...,1515400000,139,"[(Arabic, ar, العربية), (English, en, English)...",Released,Vengeance hits home.,Furious 7,False,7.219,11089,"([(False, 2, 12835, Acting, Vin Diesel, Vin Di..."
9,False,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",235000000,"[(28, Action), (12, Adventure), (878, Science ...",https://www.marvel.com/movies/avengers-age-of-...,99861,tt2395427,[US],en,...,1405403694,141,"[(English, en, English)]",Released,A new age has come.,Avengers: Age of Ultron,False,7.271,24009,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."


In [14]:
# List the column names of the loaded DataFrame.
df.columns

['adult',
 'backdrop_path',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'origin_country',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'credits']

## Drop irrelevant columns

In [15]:
# Columns to remove before analysis.
# NOTE(review): 'success', 'status_code' and 'status_message' do not appear in
# the df.columns listing above — presumably fields of failed API responses;
# confirm the helper ignores names that are absent.
cols_to_drop = [
    "adult",
    "imdb_id",
    "original_title",
    "video",
    "homepage",
    "success",
    "status_code",
    "status_message",
]
df = cln.drop_irrelevant_columns(df, cols_to_drop)

In [16]:
# Preview the result of the column drop. Only the first 10 rows are collected:
# toPandas() loads everything it is given into driver memory, so the preview is
# bounded in the same way as the other preview cells in this notebook.
df.limit(10).toPandas()

Unnamed: 0,backdrop_path,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,credits
0,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",356000000,"[(12, Adventure), (878, Science Fiction), (28,...",299534,[US],en,After the devastating events of Avengers: Infi...,19.2389,/bR8ISy1O9XQxqiy0fQFw2BX72RQ.jpg,...,2019-04-24,2799439100,181,"[(English, en, English), (Japanese, ja, 日本語), ...",Released,Avenge the fallen.,Avengers: Endgame,8.237,27130,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
1,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"(87096, Avatar Collection, /3C5brXxnBxfkeKWwA1...",237000000,"[(28, Action), (12, Adventure), (14, Fantasy),...",19995,[US],en,"In the 22nd century, a paraplegic Marine is di...",62.8235,/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg,...,2009-12-16,2923706026,162,"[(English, en, English), (Spanish, es, Español)]",Released,Enter the world of Pandora.,Avatar,7.6,33251,"([(False, 2, 65731, Acting, Sam Worthington, S..."
2,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,"(10, Star Wars Collection, /22dj38IckjzEEUZwN1...",245000000,"[(12, Adventure), (28, Action), (878, Science ...",140607,[US],en,Thirty years after defeating the Galactic Empi...,11.1639,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,...,2015-12-15,2068223624,136,"[(English, en, English)]",Released,Every generation has a story.,Star Wars: The Force Awakens,7.254,20200,"([(False, 2, 3, Acting, Harrison Ford, Harriso..."
3,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",300000000,"[(12, Adventure), (28, Action), (878, Science ...",299536,[US],en,As the Avengers and their allies have continue...,28.6259,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,...,2018-04-25,2052415039,149,"[(English, en, English), (Xhosa, xh, )]",Released,Destiny arrives all the same.,Avengers: Infinity War,8.235,31339,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
4,/xnHVX37XZEp33hhCbYlQFq7ux1J.jpg,,200000000,"[(18, Drama), (10749, Romance)]",597,[US],en,101-year-old Rose DeWitt Bukater tells the sto...,29.4892,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,...,1997-12-18,2264162353,194,"[(English, en, English), (French, fr, Français...",Released,Nothing on earth could come between them.,Titanic,7.903,26666,"([(False, 2, 6193, Acting, Leonardo DiCaprio, ..."
5,/s5QfDFqRO6sjgPtKkjxD0WqXQef.jpg,"(328, Jurassic Park Collection, /qIm2nHXLpBBdM...",150000000,"[(28, Action), (12, Adventure), (878, Science ...",135397,[US],en,Twenty-two years after the events of Jurassic ...,10.564,/rhr4y79GpxQF9IsfJItRXVaoGs4.jpg,...,2015-06-06,1671537444,124,"[(English, en, English)]",Released,The park is open.,Jurassic World,6.7,21219,"([(False, 2, 73457, Acting, Chris Pratt, Chris..."
6,/1TUg5pO1VZ4B0Q1amk3OlXvlpXV.jpg,"(762512, The Lion King (Reboot) Collection, /d...",260000000,"[(12, Adventure), (18, Drama), (10751, Family)...",420818,[US],en,"Simba idolizes his father, King Mufasa, and ta...",9.8378,/dzBtMocZuJbjLOXvrl4zGYigDzh.jpg,...,2019-07-12,1662020819,118,"[(English, en, English)]",Released,The king has returned.,The Lion King,7.099,10602,"([(False, 2, 5294, Acting, Chiwetel Ejiofor, C..."
7,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",220000000,"[(878, Science Fiction), (28, Action), (12, Ad...",24428,[US],en,When an unexpected enemy emerges and threatens...,65.2961,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,...,2012-04-25,1518815515,143,"[(English, en, English), (Hindi, hi, हिन्दी), ...",Released,Some assembly required.,The Avengers,7.905,35090,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
8,/ehzI1mVcnHqB58NqPyQwpMqcVoz.jpg,"(9485, The Fast and the Furious Collection, /z...",190000000,"[(28, Action), (80, Crime), (53, Thriller)]",168259,[US],en,Deckard Shaw seeks revenge against Dominic Tor...,10.437,/ktofZ9Htrjiy0P6LEowsDaxd3Ri.jpg,...,2015-04-01,1515400000,139,"[(Arabic, ar, العربية), (English, en, English)...",Released,Vengeance hits home.,Furious 7,7.219,11089,"([(False, 2, 12835, Acting, Vin Diesel, Vin Di..."
9,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,"(86311, The Avengers Collection, /yFSIUVTCvgYr...",235000000,"[(28, Action), (12, Adventure), (878, Science ...",99861,[US],en,When Tony Stark tries to jumpstart a dormant p...,17.1821,/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg,...,2015-04-22,1405403694,141,"[(English, en, English)]",Released,A new age has come.,Avengers: Age of Ultron,7.271,24009,"([(False, 2, 3223, Acting, Robert Downey Jr., ..."


In [17]:
# List the remaining column names after the drop step.
df.columns

['backdrop_path',
 'belongs_to_collection',
 'budget',
 'genres',
 'id',
 'origin_country',
 'original_language',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count',
 'credits']

## Evaluate JSON-like columns

In [19]:
# Nested columns to inspect before flattening; `credits` is selected as a
# whole here rather than through its cast/crew sub-fields.
categorical_columns = [
    "belongs_to_collection",
    "genres",
    "production_countries",
    "production_companies",
    "spoken_languages",
    "origin_country",
    "credits",
]

In [20]:
# Preview the nested columns for the first 10 rows as a pandas DataFrame.
df.select(*categorical_columns).limit(10).toPandas()

Unnamed: 0,belongs_to_collection,genres,production_countries,production_companies,spoken_languages,origin_country,credits
0,"(86311, The Avengers Collection, /yFSIUVTCvgYr...","[(12, Adventure), (878, Science Fiction), (28,...","[(US, United States of America)]","[(420, /hUzeosd33nzE5MCNsZxCGEKTXaQ.png, Marve...","[(English, en, English), (Japanese, ja, 日本語), ...",[US],"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
1,"(87096, Avatar Collection, /3C5brXxnBxfkeKWwA1...","[(28, Action), (12, Adventure), (14, Fantasy),...","[(US, United States of America), (GB, United K...","[(444, None, Dune Entertainment, US), (574, /8...","[(English, en, English), (Spanish, es, Español)]",[US],"([(False, 2, 65731, Acting, Sam Worthington, S..."
2,"(10, Star Wars Collection, /22dj38IckjzEEUZwN1...","[(12, Adventure), (28, Action), (878, Science ...","[(US, United States of America)]","[(1, /tlVSws0RvvtPBwViUyOFAO0vcQS.png, Lucasfi...","[(English, en, English)]",[US],"([(False, 2, 3, Acting, Harrison Ford, Harriso..."
3,"(86311, The Avengers Collection, /yFSIUVTCvgYr...","[(12, Adventure), (28, Action), (878, Science ...","[(US, United States of America)]","[(420, /hUzeosd33nzE5MCNsZxCGEKTXaQ.png, Marve...","[(English, en, English), (Xhosa, xh, )]",[US],"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
4,,"[(18, Drama), (10749, Romance)]","[(US, United States of America)]","[(4, /jay6WcMgagAklUt7i9Euwj1pzTF.png, Paramou...","[(English, en, English), (French, fr, Français...",[US],"([(False, 2, 6193, Acting, Leonardo DiCaprio, ..."
5,"(328, Jurassic Park Collection, /qIm2nHXLpBBdM...","[(28, Action), (12, Adventure), (878, Science ...","[(US, United States of America)]","[(56, /cEaxANEisCqeEoRvODv2dO1I0iI.png, Amblin...","[(English, en, English)]",[US],"([(False, 2, 73457, Acting, Chris Pratt, Chris..."
6,"(762512, The Lion King (Reboot) Collection, /d...","[(12, Adventure), (18, Drama), (10751, Family)...","[(US, United States of America)]","[(2, /wdrCwmRnLFJhEoH8GSfymY85KHT.png, Walt Di...","[(English, en, English)]",[US],"([(False, 2, 5294, Acting, Chiwetel Ejiofor, C..."
7,"(86311, The Avengers Collection, /yFSIUVTCvgYr...","[(878, Science Fiction), (28, Action), (12, Ad...","[(US, United States of America)]","[(420, /hUzeosd33nzE5MCNsZxCGEKTXaQ.png, Marve...","[(English, en, English), (Hindi, hi, हिन्दी), ...",[US],"([(False, 2, 3223, Acting, Robert Downey Jr., ..."
8,"(9485, The Fast and the Furious Collection, /z...","[(28, Action), (80, Crime), (53, Thriller)]","[(US, United States of America)]","[(333, /5xUJfzPZ8jWJUDzYtIeuPO4qPIa.png, Origi...","[(Arabic, ar, العربية), (English, en, English)...",[US],"([(False, 2, 12835, Acting, Vin Diesel, Vin Di..."
9,"(86311, The Avengers Collection, /yFSIUVTCvgYr...","[(28, Action), (12, Adventure), (878, Science ...","[(US, United States of America)]","[(420, /hUzeosd33nzE5MCNsZxCGEKTXaQ.png, Marve...","[(English, en, English)]",[US],"([(False, 2, 3223, Acting, Robert Downey Jr., ..."


## Extract and clean JSON columns

In [21]:
# Run the project's cleaning helper over the DataFrame.
# NOTE(review): `df` is rebound to the cleaned result, so re-running this cell
# feeds already-cleaned data back into clean_movie_data — confirm the helper
# tolerates that, or re-run from the load step.
df = cln.clean_movie_data(df)

In [22]:
# Preview the first 10 cleaned rows as a pandas DataFrame.
df.limit(10).toPandas()

Unnamed: 0,backdrop_path,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,cast_size,director,crew_size
0,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,The Avengers Collection,356000000.0,Adventure|Science Fiction|Action,299534.0,US,en,After the devastating events of Avengers: Infi...,19.2389,/bR8ISy1O9XQxqiy0fQFw2BX72RQ.jpg,...,English|Japanese|Xhosa,Released,Avenge the fallen.,Avengers: Endgame,8.237,27130.0,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,107,Anthony Russo|Joe Russo,608
1,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,Avatar Collection,237000000.0,Action|Adventure|Fantasy|Science Fiction,19995.0,US,en,"In the 22nd century, a paraplegic Marine is di...",62.8235,/gKY6q7SjCkAU6FqvqWybDYgUKIF.jpg,...,English|Spanish,Released,Enter the world of Pandora.,Avatar,7.6,33251.0,Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,67,James Cameron,991
2,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,Star Wars Collection,245000000.0,Adventure|Action|Science Fiction,140607.0,US,en,Thirty years after defeating the Galactic Empi...,11.1639,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,...,English,Released,Every generation has a story.,Star Wars: The Force Awakens,7.254,20200.0,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,183,J.J. Abrams,264
3,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,The Avengers Collection,300000000.0,Adventure|Action|Science Fiction,299536.0,US,en,As the Avengers and their allies have continue...,28.6259,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,...,English|Xhosa,Released,Destiny arrives all the same.,Avengers: Infinity War,8.235,31339.0,Robert Downey Jr.|Chris Evans|Chris Hemsworth|...,69,Joe Russo|Anthony Russo,734
4,/xnHVX37XZEp33hhCbYlQFq7ux1J.jpg,,200000000.0,Drama|Romance,597.0,US,en,101-year-old Rose DeWitt Bukater tells the sto...,29.4892,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,...,English|French|German|Swedish|Italian|Russian,Released,Nothing on earth could come between them.,Titanic,7.903,26666.0,Leonardo DiCaprio|Kate Winslet|Billy Zane|Kath...,116,James Cameron,262
5,/s5QfDFqRO6sjgPtKkjxD0WqXQef.jpg,Jurassic Park Collection,150000000.0,Action|Adventure|Science Fiction|Thriller,135397.0,US,en,Twenty-two years after the events of Jurassic ...,10.564,/rhr4y79GpxQF9IsfJItRXVaoGs4.jpg,...,English,Released,The park is open.,Jurassic World,6.7,21219.0,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,53,Colin Trevorrow,428
6,/1TUg5pO1VZ4B0Q1amk3OlXvlpXV.jpg,The Lion King (Reboot) Collection,260000000.0,Adventure|Drama|Family|Animation,420818.0,US,en,"Simba idolizes his father, King Mufasa, and ta...",9.8378,/dzBtMocZuJbjLOXvrl4zGYigDzh.jpg,...,English,Released,The king has returned.,The Lion King,7.099,10602.0,Chiwetel Ejiofor|John Oliver|Donald Glover|Jam...,20,Jon Favreau,50
7,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,The Avengers Collection,220000000.0,Science Fiction|Action|Adventure,24428.0,US,en,When an unexpected enemy emerges and threatens...,65.2961,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,...,English|Hindi|Russian,Released,Some assembly required.,The Avengers,7.905,35090.0,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,113,Joss Whedon,642
8,/ehzI1mVcnHqB58NqPyQwpMqcVoz.jpg,The Fast and the Furious Collection,190000000.0,Action|Crime|Thriller,168259.0,US,en,Deckard Shaw seeks revenge against Dominic Tor...,10.437,/ktofZ9Htrjiy0P6LEowsDaxd3Ri.jpg,...,Arabic|English|Spanish|Thai,Released,Vengeance hits home.,Furious 7,7.219,11089.0,Vin Diesel|Paul Walker|Dwayne Johnson|Michelle...,49,James Wan,228
9,/kIBK5SKwgqIIuRKhhWrJn3XkbPq.jpg,The Avengers Collection,235000000.0,Action|Adventure|Science Fiction,99861.0,US,en,When Tony Stark tries to jumpstart a dormant p...,17.1821,/4ssDuvEDkSArWEdyBl2X5EHvYKU.jpg,...,English,Released,A new age has come.,Avengers: Age of Ultron,7.271,24009.0,Robert Downey Jr.|Chris Hemsworth|Mark Ruffalo...,74,Joss Whedon,653


In [23]:
# Inspect the column types after cleaning.
# NOTE(review): the recorded output lists id, budget, revenue, runtime and
# vote_count as double — confirm whether integer types were intended
# (particularly for id).
df.dtypes

[('backdrop_path', 'string'),
 ('belongs_to_collection', 'string'),
 ('budget', 'double'),
 ('genres', 'string'),
 ('id', 'double'),
 ('origin_country', 'string'),
 ('original_language', 'string'),
 ('overview', 'string'),
 ('popularity', 'double'),
 ('poster_path', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'date'),
 ('revenue', 'double'),
 ('runtime', 'double'),
 ('spoken_languages', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('vote_average', 'double'),
 ('vote_count', 'double'),
 ('cast', 'string'),
 ('cast_size', 'int'),
 ('director', 'string'),
 ('crew_size', 'int')]

In [16]:
# Schema exploration walkthrough.
# NOTE(review): none of the helpers called below are defined or imported in
# the visible cells, and this cell's execution count is out of order with its
# neighbours. Its recorded output reports map/array column types, while the
# df.dtypes cell above reports strings — so the output presumably comes from
# an earlier kernel state. Confirm where the helpers live and re-run on a
# fresh kernel.

# 1) quick overview
show_schema_overview(df)

# 2) get extractor suggestions
suggest_extractors(df)

# 3) explore a specific map col (e.g. belongs_to_collection)
explore_map_column(df, "belongs_to_collection")

# 4) explore an array of maps (e.g. genres or production_companies)
explore_array_column(df, "genres")

# 5) safely apply an extractor only when schema matches
df2 = safe_apply_array_map_extractor(df, "genres", "name", out_col="genres_names")

# 6) run a full exploration of nested columns (be careful on very large datasets)
explore_all_nested(df, sample_limit=5)


Column | SparkType | Nested?
backdrop_path | string | False
belongs_to_collection | map<string,bigint> | True
budget | bigint | False
credits | map<string,array<map<string,boolean>>> | True
genres | array<map<string,bigint>> | True
id | bigint | False
origin_country | array<string> | True
original_language | string | False
overview | string | False
popularity | double | False
poster_path | string | False
production_companies | array<map<string,bigint>> | True
production_countries | array<map<string,string>> | True
release_date | string | False
revenue | bigint | False
runtime | bigint | False
spoken_languages | array<map<string,string>> | True
status | string | False
tagline | string | False
title | string | False
vote_average | double | False
vote_count | bigint | False
backdrop_path                  : SCALAR -> leave or cast
belongs_to_collection          : MAP -> use col.getItem(key) or explode(map_entries(col))
budget                         : SCALAR -> leave or cast
credits       