# The Movie Database: Data Extraction & Cleaning

In [1]:
#  %pip install cpi

In [2]:
# Import dependencies
import pandas as pd
import json
import requests
from pprint import pprint
import numpy as np
import cpi

from datetime import datetime

# Import config
from config import api_key, db_user, db_password, db_host, db_port, db_name



In [3]:
# Refresh the cpi package's dataset before any cpi.get() lookups are made.
# NOTE(review): presumably this downloads data over the network — confirm it
# needs to run on every execution of the notebook.
cpi.update()

### Testing:

In [4]:
# Single Test: Discover Most Popular Movies
# Fetches one page of the discover endpoint to inspect the response shape.
page_number = 1

# Endpoint for finding most popular movies (sorted by popularity, descending)
discover_movies = "https://api.themoviedb.org/3/discover/movie"
most_popular_url = f"{discover_movies}?api_key={api_key}&page={page_number}&sort_by=popularity.desc"

# Most popular movies: the movie records are under the "results" key
tmdb_response = requests.get(most_popular_url).json()
results = tmdb_response["results"]

# Round-trip through a JSON string so pandas parses the records into a frame
json_string = json.dumps(results)
df = pd.read_json(json_string)
df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"[16, 35, 10751, 10402]",438695,en,Sing 2,Buster and his new cast now have their sights ...,9211.33,/aWeKITRFbbwY8txG5uCj4rMCfSP.jpg,2021-12-01,Sing 2,False,7.5,264
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"[35, 14, 12]",425909,en,Ghostbusters: Afterlife,When a single mom and her two kids arrive in a...,7227.726,/sg4xJaufDiQl7caFEskBtQXfD4x.jpg,2021-11-11,Ghostbusters: Afterlife,False,7.2,753
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"[28, 12, 878]",634649,en,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,6708.051,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,2021-12-15,Spider-Man: No Way Home,False,8.4,4020
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,"[16, 35, 10751, 14]",568124,en,Encanto,"The tale of an extraordinary family, the Madri...",6071.14,/4j0PNHkMr5ax3IA8tjtxcmPU3QT.jpg,2021-11-24,Encanto,False,7.8,2616
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,"[27, 28, 878]",460458,en,Resident Evil: Welcome to Raccoon City,Once the booming home of pharmaceutical giant ...,5449.78,/7uRbWOXxpWDMtnsd2PF3clu65jc.jpg,2021-11-24,Resident Evil: Welcome to Raccoon City,False,6.0,824
5,False,/eNI7PtK6DEYgZmHWP9gQNuff8pv.jpg,"[878, 28, 12]",624860,en,The Matrix Resurrections,"Plagued by strange memories, Neo's life takes ...",4189.812,/gZlZLxJMfnSeS60abFZMh1IvODQ.jpg,2021-12-16,The Matrix Resurrections,False,7.0,2056
6,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,"[878, 28, 12]",580489,en,Venom: Let There Be Carnage,After finding a host body in investigative rep...,3458.355,/rjkmN1dniUHVYAtwuV3Tji7FsDO.jpg,2021-09-30,Venom: Let There Be Carnage,False,7.2,5749
7,False,/dK12GIdhGP6NPGFssK2Fh265jyr.jpg,"[28, 35, 80, 53]",512195,en,Red Notice,An Interpol-issued Red Notice is a global aler...,3286.87,/lAXONuqg41NwUMuzMiFvicDET9Y.jpg,2021-11-04,Red Notice,False,6.8,2611
8,False,/lyvszvJJqqI8aqBJ70XzdCNoK0y.jpg,"[28, 12, 14, 878]",524434,en,Eternals,The Eternals are a team of ancient aliens who ...,3098.344,/5L7bclqxXtsqsitP83KpkZbgTQ3.jpg,2021-11-03,Eternals,False,7.1,1726
9,False,/cinER0ESG0eJ49kXlExM0MEWGxW.jpg,"[28, 12, 14]",566525,en,Shang-Chi and the Legend of the Ten Rings,Shang-Chi must confront the past he thought he...,1871.434,/1BIoJGKbXjdFDAqUEiA2VHqkK1Z.jpg,2021-09-01,Shang-Chi and the Legend of the Ten Rings,False,7.8,5058


In [5]:
# Single Test: Movie Details
# The request below hits /movie/{movie_id} (the details endpoint), not the
# credits endpoint, so the printed payload contains no crew data.
movie_id = 672582

# Endpoint & response
movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
movie_response = requests.get(movie_url).json()

# Pretty-print the full details payload for this one movie
pprint(movie_response)

{'adult': False,
 'backdrop_path': '/yL9RRZbDVbptqLwiZcK304ck4PL.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 27, 'name': 'Horror'}],
 'homepage': '',
 'id': 672582,
 'imdb_id': 'tt11686490',
 'original_language': 'en',
 'original_title': 'The Deep House',
 'overview': 'While diving in a remote French lake, a couple of YouTubers who '
             'specialize in underwater exploration videos discover a house '
             'submerged in the deep waters. What was initially a unique '
             'finding soon turns into a nightmare when they discover that the '
             'house was the scene of atrocious crimes. Trapped, with their '
             'oxygen reserves falling dangerously, they realize the worst is '
             'yet to come: they are not alone in the house.',
 'popularity': 387.29,
 'poster_path': '/5xhAPxRr64oQPEFnUOrttuI4ZEU.jpg',
 'production_companies': [{'id': 12689,
                           'logo_path': None,
                           

# Functions: Define API calls to extract key data points

In [6]:
# Start Timer Function (check on API call performance)
def start_timer():
    """Return the current local datetime, to be passed to ``stop_timer`` later."""
    return datetime.now()

In [7]:
# Stop Timer Function (check on API call performance)
def stop_timer(start):
    """Print how many seconds have passed since ``start``.

    Args:
        start: The datetime returned by ``start_timer``.

    Returns:
        None. The elapsed time is only printed.
    """
    seconds = (datetime.now() - start).total_seconds()
    print(f"Total Time Elapsed:  {seconds} seconds")

### API CALLS:

In [34]:
# Returns most popular movies
def get_most_popular_movies(api_key, num_pages=300):
    """Collect TMDB discover results, most popular first.

    Requests pages 1..num_pages of the /discover/movie endpoint sorted by
    popularity (descending) and concatenates each page's "results" list.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        num_pages: Number of result pages to request, starting at page 1.
            Defaults to 300, the page count that used to be hard-coded.

    Returns:
        list: The movie dicts from every requested page, in request order.
        Empty when ``num_pages`` is 0 or negative.

    Raises:
        KeyError: If a response body has no "results" key.
    """
    # The endpoint is the same for every page, so build it once.
    discover_movies = "https://api.themoviedb.org/3/discover/movie"

    movies = []
    for page_number in range(1, num_pages + 1):
        most_popular_url = f"{discover_movies}?api_key={api_key}&page={page_number}&sort_by=popularity.desc"
        tmdb_response = requests.get(most_popular_url).json()
        movies.extend(tmdb_response["results"])

    return movies

In [35]:
# Returns movie details as a list
def get_movie_details(api_key, movie_ids):
    """Fetch the TMDB /movie/{id} payload for each id.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        movie_ids: Iterable of movie ids; one request is made per id.

    Returns:
        list: The decoded JSON response for each id, in input order.
    """
    return [
        requests.get(f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}").json()
        for movie_id in movie_ids
    ]

In [36]:
# Returns keywords as a list
def get_movie_keywords(api_key, movie_ids):
    """Fetch the TMDB /movie/{id}/keywords payload for each id.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        movie_ids: Iterable of movie ids; one request is made per id.

    Returns:
        list: The full decoded JSON response for each id (not only its
        "keywords" entry), in input order.

    Raises:
        KeyError: If a response has no "keywords" key.
    """
    collected = []

    for movie_id in movie_ids:
        url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}"
        payload = requests.get(url).json()

        # The value is not used; the lookup is what makes a payload without
        # a "keywords" key raise KeyError instead of being stored.
        _ = payload["keywords"]

        collected.append(payload)

    return collected

In [37]:
# Returns credits as a list
def get_credits(api_key, movie_ids):
    """Fetch the TMDB /movie/{id}/credits payload for each id.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        movie_ids: Iterable of movie ids; one request is made per id.

    Returns:
        list: The decoded JSON response for each id, in input order.
    """
    credit_details = []

    for movie_id in movie_ids:
        # Build the path in one piece. Joining a "/movie/..." fragment onto a
        # base that already ends in "/" produced ".../3//movie/..." URLs.
        credits_url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}"

        # Get the json response for the credits
        credits_response = requests.get(credits_url).json()

        credit_details.append(credits_response)

    return credit_details

In [38]:
# Returns certifications as a list (rating: G, PG, etc.)
def get_certifications(api_key, movie_ids):
    """Fetch the TMDB /movie/{id}/release_dates payload for each id.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        movie_ids: Iterable of movie ids; one request is made per id.

    Returns:
        list: The decoded JSON response for each id, in input order.
    """
    return [
        requests.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}/release_dates?api_key={api_key}"
        ).json()
        for movie_id in movie_ids
    ]

### GET TITLES & IDS:

In [39]:
# Returns single title for specified index number
def get_title(results, idx):
    """Return the "title" value of the movie at position ``idx`` in ``results``."""
    return results[idx]["title"]

In [40]:
# Returns ids list
def get_ids(results):
    """Return the "id" value of every movie dict in ``results``, in order."""
    return [movie["id"] for movie in results]

In [41]:
# Returns titles list
def get_titles(results):
    """Return the "title" value of every movie dict in ``results``, in order."""
    return [movie["title"] for movie in results]

### EXTRACT NEEDED DETAILS FROM COLUMNS: Cast, Crew, Gender, Production Countries etc. 

In [42]:
# Extract the certification (ratings)
def extract_certification(x):
    """Return the US certification (rating) found in per-country release data.

    Args:
        x: List of per-country dicts. Each has an ``iso_3166_1`` country code
            and a ``release_dates`` list whose items carry a
            ``certification`` string. Any non-list value is treated as
            "no data".

    Returns:
        str: The first non-empty certification among the US entry's release
        dates. Returns "" when there is no US entry, when it has no release
        dates, or when all of its certifications are blank.
    """
    if not isinstance(x, list):
        return ""

    for country in x:
        # Get ratings for US
        if country['iso_3166_1'] != 'US':
            continue

        # A country can list several releases, and some of them carry a
        # blank certification, so scan them all rather than reading only
        # the first one.
        for release in country['release_dates']:
            certification = release['certification']
            if certification:
                return certification

        # Only the first US entry is considered.
        break

    return ""

In [43]:
# Function to get the director out of the crew
def get_director(x):
    """Return the names of all crew members whose job is 'Director'.

    Args:
        x: Iterable of crew dicts with 'job' and 'name' keys.

    Returns:
        list of names, or ``np.nan`` when no crew member is a director.
    """
    directors = [member['name'] for member in x if member['job'] == 'Director']
    return directors if directors else np.nan

In [44]:
# Function to get the director gender out of the crew
def get_director_gender(x):
    """Return the 'gender' value of every crew member whose job is 'Director'.

    Args:
        x: Iterable of crew dicts with 'job' and 'gender' keys.

    Returns:
        list of gender values, or ``np.nan`` when no crew member is a
        director.
    """
    genders = [member['gender'] for member in x if member['job'] == 'Director']
    return genders if genders else np.nan

In [45]:
def fm_percentage(results):
    """Return the share of gender-1 entries among gender-1 and gender-2 entries.

    Entries whose 'gender' is neither 1 nor 2 are left out of the total.
    NOTE(review): presumably 1 = female and 2 = male in the source data —
    confirm against the API documentation.

    Args:
        results: Iterable of dicts with a 'gender' key.

    Returns:
        The percentage as formatted by ``percentage_format``, or ``np.nan``
        when no entry has gender 1 or 2.
    """
    genders = [person['gender'] for person in results]
    fm_count = genders.count(1)
    total_count = fm_count + genders.count(2)

    if not total_count:
        return np.nan

    return percentage_format(100 * fm_count/total_count)

In [46]:
# Function to get the producers out of the crew
def get_producers(x):
    """Return the names of crew members holding a producer job.

    The jobs counted are 'Producer', 'Executive Producer' and 'Co-Producer'.

    Args:
        x: Iterable of crew dicts with 'job' and 'name' keys.

    Returns:
        list of names in crew order, or ``np.nan`` when none match.
    """
    producer_jobs = ('Producer', "Executive Producer", 'Co-Producer')
    producers = [member['name'] for member in x if member['job'] in producer_jobs]
    return producers if producers else np.nan

In [47]:
# Function to get the screenplay writers out of the crew
def get_writers(x):
    """Return the names of crew members whose department is 'Writing'.

    Args:
        x: Iterable of crew dicts with 'department' and 'name' keys.

    Returns:
        list of names in crew order, or ``np.nan`` when none match.
    """
    writers = [member['name'] for member in x if member['department'] == 'Writing']
    return writers if writers else np.nan

In [48]:
# Function to create cast list
def get_cast_list(x):
    """Return the names of the first five cast members.

    Args:
        x: List of cast dicts with a 'name' key. Any non-list value yields
            an empty list.

    Returns:
        list: At most five names, in the order given.
    """
    if not isinstance(x, list):
        return []

    # Names are collected for the whole list first, then trimmed to five.
    everyone = [member['name'] for member in x]
    return everyone[:5]

In [49]:
# Function to create lists of each feature
def get_list(x):
    """Return the 'name' value of every dict in ``x``.

    Args:
        x: List of dicts with a 'name' key. Any non-list value yields an
            empty list.

    Returns:
        list: The names, in input order.
    """
    if not isinstance(x, list):
        return []

    return [entry['name'] for entry in x]

In [50]:
# Function to get production companies
def get_production_companies(x):
    """Return the 'name' of each production company.

    Args:
        x: List of company dicts with a 'name' key. Any non-list value
            yields an empty list.

    Returns:
        list: Company names, in input order.
    """
    return [company['name'] for company in x] if isinstance(x, list) else []

In [51]:
# Get each production company's country of origin
def get_production_company_country(x):
    """Return the 'origin_country' of each production company.

    Args:
        x: List of company dicts with an 'origin_country' key. Any non-list
            value yields an empty list.

    Returns:
        list: One country value per company, in input order. Duplicates are
        kept.
    """
    return [company['origin_country'] for company in x] if isinstance(x, list) else []

In [52]:
# Get list of all languages available for film
def get_languages(x):
    """Return the 'english_name' of every language entry.

    Args:
        x: Iterable of language dicts with an 'english_name' key.

    Returns:
        list of language names in input order, or ``np.nan`` when ``x`` is
        empty.
    """
    languages = [language['english_name'] for language in x]
    return languages if languages else np.nan

In [53]:
# Create binary column for foreign language films
def original_language_binary(x):
    """Flag foreign-language films.

    Args:
        x: Original-language code such as 'en' or 'fr'.

    Returns:
        int: 0 when ``x`` is 'en', an empty or whitespace-only string, or
        not a string at all (missing data). 1 for any other code.
    """
    # A blank or missing code says nothing about the language, so it is not
    # flagged as foreign. This must be checked before the 'en' comparison:
    # "anything other than 'en'" also matches blank values.
    if not isinstance(x, str) or not x.strip():
        return 0

    return 0 if x == 'en' else 1

In [54]:
def percentage_format(percentage):
    """Format a number with thousands separators, two decimals and a '%' suffix."""
    return f"{percentage:,.2f}%"

### CLEAN DATA FOR SOUP & CREATE SOUP: Used for Machine Learning 

In [55]:
# Convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Lower-case text and remove every space character from it.

    Args:
        x: A list of strings, a single string, or anything else.

    Returns:
        A list of cleaned strings for a list input, a cleaned string for a
        string input, and '' for any other type.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]

    if isinstance(x, str):
        return x.replace(" ", "").lower()

    return ''

In [56]:
# Clean the overview column (by converting to lowercase)
def clean_overview(x):
    """Return ``x`` lower-cased when it is a string, otherwise ''."""
    return x.lower() if isinstance(x, str) else ''

In [57]:
# Create soup of words
def create_soup(x):
    """Join a row's cleaned feature columns into one space-separated string.

    The entries of each column are joined with single spaces, and the
    columns are joined with single spaces in the order listed below. An
    empty column therefore leaves two adjacent spaces.

    Args:
        x: Mapping (for example a DataFrame row) that holds the
            ``*_cleaned`` columns named below.

    Returns:
        str: The combined text.
    """
    columns = (
        'keywords_cleaned',
        'cast_cleaned',
        'director_cleaned',
        'producers_cleaned',
        'writers_cleaned',
        'genres_cleaned',
        'production_companies_cleaned',
    )
    return ' '.join(' '.join(x[column]) for column in columns)

In [58]:
# Create soup of words - 2 keywords, 2 genres & overview
def create_soup_overview(x):
    """Build the word soup with keywords and genres doubled, plus the overview.

    The entries of each list column are joined with single spaces. The
    keywords and genres columns each appear twice in a row. The cleaned
    overview string is appended last, as it is.

    Args:
        x: Mapping (for example a DataFrame row) that holds the
            ``*_cleaned`` columns named below. ``overview_cleaned`` must be
            a string.

    Returns:
        str: The combined text, with the pieces separated by single spaces.
    """
    columns = (
        'keywords_cleaned',
        'keywords_cleaned',
        'cast_cleaned',
        'director_cleaned',
        'producers_cleaned',
        'writers_cleaned',
        'genres_cleaned',
        'genres_cleaned',
        'production_companies_cleaned',
    )
    pieces = [' '.join(x[column]) for column in columns]
    pieces.append(x['overview_cleaned'])
    return ' '.join(pieces)

# CALL API

### Get Most Popular Movies JSON Results:

In [59]:
# Start the timer
start = start_timer()

In [60]:
most_popular_movies = get_most_popular_movies(api_key)
# print(most_popular_movies)

In [61]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  98.044416 seconds


### List of Movie IDs:

In [62]:
movie_ids = get_ids(most_popular_movies)
#print(movie_ids)

In [63]:
len(movie_ids)

6000

### List of Titles:

In [64]:
titles = get_titles(most_popular_movies)
# print(titles)

In [65]:
len(titles)

6000

### Print Individual Title:

In [66]:
print(get_title(most_popular_movies, 0))

Sing 2


### Get Movie Details:

In [67]:
# Start the timer
start = start_timer()

In [69]:
details = get_movie_details(api_key, movie_ids)

In [71]:
pprint(details[0:10])

[{'adult': False,
  'backdrop_path': '/70nxSw3mFBsGmtkvcs91PbjerwD.jpg',
  'belongs_to_collection': {'backdrop_path': '/rhLspFB1B8ZCkWEHFYmc3NKagzq.jpg',
                            'id': 558216,
                            'name': 'Venom Collection',
                            'poster_path': '/670x9sf0Ru8y6ezBggmYudx61yB.jpg'},
  'budget': 110000000,
  'genres': [{'id': 878, 'name': 'Science Fiction'},
             {'id': 28, 'name': 'Action'},
             {'id': 12, 'name': 'Adventure'}],
  'homepage': 'https://www.venom.movie',
  'id': 580489,
  'imdb_id': 'tt7097896',
  'original_language': 'en',
  'original_title': 'Venom: Let There Be Carnage',
  'overview': 'After finding a host body in investigative reporter Eddie '
              'Brock, the alien symbiote must face a new enemy, Carnage, the '
              'alter ego of serial killer Cletus Kasady.',
  'popularity': 8633.976,
  'poster_path': '/rjkmN1dniUHVYAtwuV3Tji7FsDO.jpg',
  'production_companies': [{'id': 7505,
       

In [70]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  2925.561604 seconds


### Get Keywords:

In [71]:
# Start the timer
start = start_timer()

In [72]:
keywords = get_movie_keywords(api_key, movie_ids)

In [73]:
# print(keywords)

In [74]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  1603.671694 seconds


### Get Credits:

In [75]:
# Start the timer
start = start_timer()

In [76]:
credits = get_credits(api_key, movie_ids)

In [77]:
# print(credits)

In [78]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  1750.395193 seconds


### Get Certifications:

In [79]:
# Start the timer
start = start_timer()

In [80]:
# Get the certifications for each movie (rating: G, PG, etc.)
certifications = get_certifications(api_key, movie_ids)
# certifications = get_certifications(api_key, [32657, 672582])

In [81]:
# print(certifications)

In [82]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  2468.58483 seconds


## Movie Details DataFrame:

In [83]:
# Convert details to json
# `details` is the list of per-movie payloads returned by get_movie_details.
json_details_string = json.dumps(details)
# Convert json to dataframe (one row per payload)
movie_details_df = pd.read_json(json_details_string)

# Export to save
# movie_details_df.to_csv("./static/data/movie_details.csv", index=False)

# Preview only the first rows; the full frame is large
movie_details_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,2021-12-01,190860000,110.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Where will your dreams take you?,Sing 2,False,7.6,322
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,2021-11-11,191000000,124.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Discover the past. Protect the future.,Ghostbusters: Afterlife,False,7.3,806
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,2021-12-15,1538282364,148.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The Multiverse unleashed.,Spider-Man: No Way Home,False,8.4,4053
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,2021-11-24,215000000,102.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a little magic in all of us ...almost ...,Encanto,False,7.8,2660
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,2021-11-24,31000000,107.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Witness the beginning of evil.,Resident Evil: Welcome to Raccoon City,False,6.0,833


In [84]:
# Extract year from release date
# Rows whose release_date cannot be parsed into a date end up with a missing year.
movie_details_df["year"] = pd.DatetimeIndex(movie_details_df['release_date']).year
# Drop any N/A from year
# This mutates movie_details_df in place and must run before the int cast
# below, because a column holding missing values cannot be cast to int.
movie_details_df.dropna(inplace=True, how="any", subset=['year','release_date'])

# Convert to int
movie_details_df["year"] = movie_details_df["year"].astype(int)

# Rename columns to prepare for inflation calculation
# The later adjustment step reads the "original_*" names and writes "adjusted_*".
movie_details_df = movie_details_df.rename(columns = {
    "revenue": "original_revenue",
    "budget": "original_budget"
})

In [85]:
len(movie_details_df)

5970

## Adjust Budget & Revenue for Inflation

In [86]:
# Create dataframe for cpi
cpi_df = pd.DataFrame(columns=['year', 'cpi_2021', 'cpi_old'])

In [87]:
# Create list of years (1913-2021)
# range() excludes its stop value, so 2022 makes 2021 the last year.
years = list(range(1913, 2022))

In [88]:
# Define 2021 cpi
cpi_2021 = 269.489

# Collect one record per year, then build the frame in a single call.
# Growing a DataFrame with DataFrame.append copies the whole frame on every
# iteration, and the method no longer exists in pandas 2.x.
cpi_records = []
for year in years:
    # 2021 is the reference year, so its CPI is the constant above. Every
    # other year is looked up through the cpi package.
    cpi_old = cpi_2021 if year == 2021 else cpi.get(year)
    cpi_records.append({"year": year, "cpi_2021": cpi_2021, "cpi_old": cpi_old})

cpi_df = pd.DataFrame(cpi_records, columns=["year", "cpi_2021", "cpi_old"])

In [89]:
# Convert year type to int
cpi_df['year'] = cpi_df['year'].astype(int)
cpi_df.head()

Unnamed: 0,year,cpi_2021,cpi_old
0,1913,269.489,9.9
1,1914,269.489,10.0
2,1915,269.489,10.1
3,1916,269.489,10.9
4,1917,269.489,12.8


In [90]:
# Merge movie details df with cpi df
movie_details_df = movie_details_df.merge(cpi_df, on="year")
movie_details_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,cpi_2021,cpi_old
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Where will your dreams take you?,Sing 2,False,7.6,322,2021,269.489,269.489
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Discover the past. Protect the future.,Ghostbusters: Afterlife,False,7.3,806,2021,269.489,269.489
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The Multiverse unleashed.,Spider-Man: No Way Home,False,8.4,4053,2021,269.489,269.489
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a little magic in all of us ...almost ...,Encanto,False,7.8,2660,2021,269.489,269.489
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Witness the beginning of evil.,Resident Evil: Welcome to Raccoon City,False,6.0,833,2021,269.489,269.489


In [91]:
# Create adjusted column for each feature
features = ["revenue", "budget"]

for feature in features:
    movie_details_df[f'adjusted_{feature}'] = (movie_details_df[f'original_{feature}'] * movie_details_df['cpi_2021']) / movie_details_df['cpi_old']

movie_details_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,year,cpi_2021,cpi_old,adjusted_revenue,adjusted_budget
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,Where will your dreams take you?,Sing 2,False,7.6,322,2021,269.489,269.489,190860000.0,85000000.0
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,Discover the past. Protect the future.,Ghostbusters: Afterlife,False,7.3,806,2021,269.489,269.489,191000000.0,75000000.0
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,The Multiverse unleashed.,Spider-Man: No Way Home,False,8.4,4053,2021,269.489,269.489,1538282000.0,200000000.0
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,There's a little magic in all of us ...almost ...,Encanto,False,7.8,2660,2021,269.489,269.489,215000000.0,50000000.0
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,Witness the beginning of evil.,Resident Evil: Welcome to Raccoon City,False,6.0,833,2021,269.489,269.489,31000000.0,40000000.0


In [92]:
# Check values
movie_details_df[['original_budget', 'adjusted_budget', 'original_revenue', 'adjusted_revenue', 'year']]

Unnamed: 0,original_budget,adjusted_budget,original_revenue,adjusted_revenue,year
0,85000000,8.500000e+07,190860000,1.908600e+08,2021
1,75000000,7.500000e+07,191000000,1.910000e+08,2021
2,200000000,2.000000e+08,1538282364,1.538282e+09,2021
3,50000000,5.000000e+07,215000000,2.150000e+08,2021
4,40000000,4.000000e+07,31000000,3.100000e+07,2021
...,...,...,...,...,...
5850,2479000,2.311637e+07,7797728,7.271287e+07,1958
5851,1300000,2.013424e+07,650422,1.007365e+07,1927
5852,0,0.000000e+00,0,0.000000e+00,1935
5853,923000,1.421362e+07,4000000,6.159749e+07,1925


## Low Budget:

In [93]:
# Create budget bins
bins = [1, 15000000, 50000000, 150000000, 380000000] 
bin_names = ["1 to 15m", "16m to 50m", "51m to 150m", "151m to 380m"]

# Append a budget bin column
movie_details_df["budget_bins"] = pd.cut(movie_details_df["adjusted_budget"], bins, labels=bin_names)
movie_details_df["budget_bins"].value_counts()

16m to 50m      1189
51m to 150m     1006
1 to 15m         836
151m to 380m     343
Name: budget_bins, dtype: int64

In [94]:
# Low Budget
movie_details_df.loc[movie_details_df["budget_bins"] == "1 to 15m"].head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,title,video,vote_average,vote_count,year,cpi_2021,cpi_old,adjusted_revenue,adjusted_budget,budget_bins
14,False,/weneJTnAb1IFI94SKcaXzBFmPKH.jpg,,12400000,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",,818192,tt11388416,en,Ida Red,...,Ida Red,False,6.0,20,2021,269.489,269.489,0.0,12400000.0,1 to 15m
15,False,/xGrTm3J0FTafmuQ85vF7ZCw94x6.jpg,,9100000,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,589761,tt10648714,ru,Чернобыль,...,Chernobyl: Abyss,False,6.3,264,2021,269.489,269.489,5370393.0,9100000.0,1 to 15m
16,False,/gg2w8QYf6o5elN95RHtikQaVIsc.jpg,,13000000,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,592508,tt9531772,hi,Sooryavanshi,...,Sooryavanshi,False,5.8,68,2021,269.489,269.489,37700000.0,13000000.0,1 to 15m
33,False,/lV3UFPPxDIPelh46G9oySXN9Mcz.jpg,"{'id': 702624, 'name': 'After Collection', 'po...",14000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,744275,tt13069986,en,After We Fell,...,After We Fell,False,7.2,1324,2021,269.489,269.489,19000000.0,14000000.0,1 to 15m
34,False,/hAv1GwwatyWV1RFXOfaASxgUVm4.jpg,,2800000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,657644,tt7268738,ml,മിന്നൽ മുരളി,...,Minnal Murali,False,7.2,128,2021,269.489,269.489,0.0,2800000.0,1 to 15m


## Keywords DataFrame:

In [95]:
# Convert keywords to json
json_keywords_string = json.dumps(keywords)
# Convert json to dataframe
movie_keywords_df = pd.read_json(json_keywords_string)

# Export to save
# movie_keywords_df.to_csv("./static/data/movie_keywords.csv", index=False)
pprint(json_keywords_string)

('[{"id": 438695, "keywords": [{"id": 11477, "name": "anthropomorphism"}, '
 '{"id": 12990, "name": "singing"}]}, {"id": 425909, "keywords": [{"id": 1415, '
 '"name": "small town"}, {"id": 3093, "name": "ghostbuster"}, {"id": 5609, '
 '"name": "nostalgia"}, {"id": 6155, "name": "afterlife"}, {"id": 9663, '
 '"name": "sequel"}, {"id": 162846, "name": "ghost"}, {"id": 179430, "name": '
 '"aftercreditsstinger"}, {"id": 179431, "name": "duringcreditsstinger"}, '
 '{"id": 196960, "name": "dead grandfather"}, {"id": 229863, "name": "ancient '
 'evil"}, {"id": 245207, "name": "otherworldly beings"}, {"id": 276130, '
 '"name": "paranormal events"}, {"id": 280259, "name": "father absence"}, '
 '{"id": 288025, "name": "ghostbusters"}]}, {"id": 634649, "keywords": [{"id": '
 '1701, "name": "hero"}, {"id": 5451, "name": "comic book"}, {"id": 9715, '
 '"name": "superhero"}, {"id": 9717, "name": "based on comic"}, {"id": 180547, '
 '"name": "marvel cinematic universe (mcu)"}]}, {"id": 568124, "keywo

 '"one man army"}, {"id": 206041, "name": "professional assassin"}, {"id": '
 '252153, "name": "baba yaga"}, {"id": 257090, "name": "dog man '
 'friendship"}]}, {"id": 168259, "keywords": [{"id": 830, "name": "car race"}, '
 '{"id": 3428, "name": "speed"}, {"id": 9666, "name": "street race"}, {"id": '
 '9748, "name": "revenge"}, {"id": 40870, "name": "race"}, {"id": 205399, '
 '"name": "muscle car"}]}, {"id": 793147, "keywords": []}, {"id": 809968, '
 '"keywords": [{"id": 2334, "name": "nightclub"}, {"id": 9844, "name": "car '
 'crash"}, {"id": 163764, "name": "death of brother"}, {"id": 269428, "name": '
 '"mma"}]}, {"id": 10674, "keywords": [{"id": 478, "name": "china"}, {"id": '
 '2156, "name": "homeland"}, {"id": 3289, "name": "villain"}, {"id": 4344, '
 '"name": "musical"}, {"id": 4411, "name": "sexism"}, {"id": 4613, "name": '
 '"training"}, {"id": 5600, "name": "daughter"}, {"id": 5719, "name": '
 '"cricket"}, {"id": 7376, "name": "princess"}, {"id": 9672, "name": "based on '
 '

 '194404, "name": "supervillain"}, {"id": 232939, "name": "masked '
 'supervillain"}]}, {"id": 792657, "keywords": [{"id": 15111, "name": '
 '"drinking"}, {"id": 163053, "name": "found footage"}, {"id": 184712, "name": '
 '"online chat"}, {"id": 186523, "name": "lockdown"}, {"id": 188973, "name": '
 '"pandemic"}, {"id": 263237, "name": "covid-19"}, {"id": 284165, "name": '
 '"video chat"}]}, {"id": 37645, "keywords": [{"id": 818, "name": "based on '
 'novel or book"}, {"id": 1227, "name": "cemetery"}, {"id": 1930, "name": '
 '"kidnapping"}, {"id": 3713, "name": "chase"}, {"id": 5340, "name": '
 '"investigation"}, {"id": 9748, "name": "revenge"}, {"id": 9826, "name": '
 '"murder"}, {"id": 10391, "name": "mafia"}, {"id": 11578, "name": "mobster"}, '
 '{"id": 11612, "name": "hospital"}, {"id": 14707, "name": "brutality"}, '
 '{"id": 220969, "name": "left for dead"}]}, {"id": 431819, "keywords": '
 '[{"id": 2407, "name": "fireworks"}, {"id": 3230, "name": "male friendship"}, '
 '{"id": 407

 '{"id": 2587, "name": "married couple"}, {"id": 6611, "name": "tahiti"}, '
 '{"id": 8728, "name": "couples therapy"}, {"id": 9673, "name": "love"}, '
 '{"id": 9767, "name": "beautiful woman"}, {"id": 15160, "name": "divorce"}, '
 '{"id": 18053, "name": "divorcee"}, {"id": 18480, "name": "tropical"}, {"id": '
 '179430, "name": "aftercreditsstinger"}, {"id": 179431, "name": '
 '"duringcreditsstinger"}, {"id": 215533, "name": "french polynesia"}]}, '
 '{"id": 20662, "keywords": [{"id": 4147, "name": "robin hood"}, {"id": 4393, '
 '"name": "archer"}, {"id": 10466, "name": "knight"}, {"id": 11525, "name": '
 '"sherwood forest"}, {"id": 12995, "name": "historical fiction"}, {"id": '
 '18101, "name": "bow and arrow"}, {"id": 41406, "name": "middle ages"}, '
 '{"id": 161257, "name": "medieval"}, {"id": 186686, "name": "king of '
 'england"}, {"id": 220707, "name": "12th century"}]}, {"id": 654028, '
 '"keywords": [{"id": 1991, "name": "santa claus"}, {"id": 9663, "name": '
 '"sequel"}, {"id":

 '"saving the world"}, {"id": 248, "name": "date"}, {"id": 378, "name": '
 '"prison"}, {"id": 1308, "name": "secret identity"}, {"id": 1357, "name": '
 '"fish"}, {"id": 1419, "name": "gun"}, {"id": 1718, "name": "dna"}, {"id": '
 '1919, "name": "mayor"}, {"id": 2095, "name": "anti hero"}, {"id": 2217, '
 '"name": "rain"}, {"id": 2598, "name": "museum"}, {"id": 4127, "name": '
 '"one-sided love"}, {"id": 9715, "name": "superhero"}, {"id": 11477, "name": '
 '"anthropomorphism"}, {"id": 12193, "name": "reporter"}, {"id": 179431, '
 '"name": "duringcreditsstinger"}, {"id": 209033, "name": "alien baby"}, '
 '{"id": 219826, "name": "stronger villain"}, {"id": 275546, "name": '
 '"invincible"}]}, {"id": 10545, "keywords": [{"id": 90, "name": "paris, '
 'france"}, {"id": 818, "name": "based on novel or book"}, {"id": 934, "name": '
 '"judge"}, {"id": 1523, "name": "obsession"}, {"id": 1691, "name": "dance"}, '
 '{"id": 1938, "name": "sword"}, {"id": 2422, "name": "mockery"}, {"id": 2544, '
 '"

 '{"id": 207844, "name": "college student"}]}, {"id": 776797, "keywords": []}, '
 '{"id": 242828, "keywords": [{"id": 6054, "name": "friendship"}, {"id": 7942, '
 '"name": "imaginary friend"}, {"id": 9856, "name": "flashback"}, {"id": '
 '13094, "name": "photograph"}, {"id": 210024, "name": "anime"}, {"id": '
 '212755, "name": "personal diary"}]}, {"id": 127521, "keywords": []}, {"id": '
 '11917, "keywords": [{"id": 2652, "name": "nazi"}, {"id": 4426, "name": '
 '"sadism"}, {"id": 8087, "name": "horror"}, {"id": 8636, "name": "blood '
 'splatter"}, {"id": 10714, "name": "serial killer"}, {"id": 13006, "name": '
 '"torture"}, {"id": 18058, "name": "pig mask"}, {"id": 50009, "name": '
 '"survival horror"}, {"id": 157376, "name": "death match"}, {"id": 184312, '
 '"name": "mind game"}, {"id": 234766, "name": "jigsaw"}]}, {"id": 41513, '
 '"keywords": [{"id": 305, "name": "moon"}, {"id": 2343, "name": "magic"}, '
 '{"id": 9717, "name": "based on comic"}, {"id": 15073, "name": "blue"}, '
 '

 'civilisation"}, {"id": 6092, "name": "army"}, {"id": 12988, "name": '
 '"pirate"}, {"id": 13014, "name": "orphan"}, {"id": 163049, "name": '
 '"government agent"}, {"id": 165455, "name": "floating"}, {"id": 169362, '
 '"name": "pendant"}, {"id": 182600, "name": "blue sky"}, {"id": 188747, '
 '"name": "air pirate"}, {"id": 188847, "name": "crystal"}, {"id": 210024, '
 '"name": "anime"}]}, {"id": 405775, "keywords": [{"id": 720, "name": '
 '"helicopter"}, {"id": 1589, "name": "sniper"}, {"id": 5939, "name": "iraq"}, '
 '{"id": 9826, "name": "murder"}, {"id": 13065, "name": "soldier"}, {"id": '
 '15087, "name": "iraq war"}, {"id": 18029, "name": "trapped"}, {"id": 18034, '
 '"name": "desert"}]}, {"id": 9793, "keywords": [{"id": 1508, "name": "new '
 'mexico"}, {"id": 1852, "name": "mutant"}, {"id": 6093, "name": "assault"}, '
 '{"id": 10349, "name": "survival"}, {"id": 193698, "name": "torture porn"}]}, '
 '{"id": 10661, "keywords": [{"id": 242, "name": "new york city"}, {"id": 536, '
 

 '{"id": 2509, "name": "hippopotamus"}, {"id": 2510, "name": "giraffe"}, '
 '{"id": 3645, "name": "madagascar"}, {"id": 6513, "name": "cartoon"}, {"id": '
 '7323, "name": "savannah"}, {"id": 7639, "name": "zebra"}, {"id": 9253, '
 '"name": "slapstick"}, {"id": 15149, "name": "monkey"}, {"id": 15285, "name": '
 '"spin off"}, {"id": 18165, "name": "animal"}, {"id": 160404, "name": '
 '"valentine\'s day"}, {"id": 179542, "name": "love potion"}, {"id": 181033, '
 '"name": "lemur"}, {"id": 263548, "name": "short film"}, {"id": 267848, '
 '"name": "talking animals"}]}, {"id": 9473, "keywords": [{"id": 471, "name": '
 '"mount rushmore national memorial"}, {"id": 6154, "name": "hell"}, {"id": '
 '6778, "name": "world supremacy"}, {"id": 8102, "name": "elementary school"}, '
 '{"id": 8421, "name": "saddam hussein"}, {"id": 8778, "name": "atheist"}, '
 '{"id": 9887, "name": "surrealism"}, {"id": 10138, "name": "satan"}, {"id": '
 '11317, "name": "visions of hell"}, {"id": 11324, "name": "u.s. ca

 '"alien"}, {"id": 155030, "name": "superhero team"}]}, {"id": 14003, '
 '"keywords": [{"id": 2213, "name": "tornado"}, {"id": 13141, "name": "based '
 'on manga"}, {"id": 161395, "name": "alchemy"}, {"id": 201363, "name": '
 '"weimar, germany"}, {"id": 202335, "name": "alchemist"}, {"id": 210024, '
 '"name": "anime"}]}, {"id": 22705, "keywords": [{"id": 596, "name": '
 '"adultery"}, {"id": 924, "name": "italian"}, {"id": 3182, "name": '
 '"seduction"}, {"id": 180340, "name": "voyeur"}, {"id": 190370, "name": '
 '"erotic movie"}, {"id": 211121, "name": "exhibitionist"}]}, {"id": 11253, '
 '"keywords": [{"id": 2096, "name": "auction"}, {"id": 7005, "name": "northern '
 'ireland"}, {"id": 9403, "name": "resignation"}, {"id": 9715, "name": '
 '"superhero"}, {"id": 9717, "name": "based on comic"}, {"id": 11196, "name": '
 '"rebellion"}, {"id": 18096, "name": "spear"}, {"id": 155030, "name": '
 '"superhero team"}, {"id": 163074, "name": "super villain"}, {"id": 174203, '
 '"name": "remorse"

 '41411, "name": "vengeful ghost"}, {"id": 159472, "name": "computer screen"}, '
 '{"id": 162846, "name": "ghost"}, {"id": 162914, "name": "one night"}, {"id": '
 '163053, "name": "found footage"}, {"id": 210112, "name": "skype"}, {"id": '
 '220831, "name": "facebook"}, {"id": 222333, "name": "blender"}]}, {"id": '
 '11001, "keywords": [{"id": 642, "name": "robbery"}, {"id": 1384, "name": '
 '"diamond"}, {"id": 4668, "name": "police operation"}, {"id": 7477, "name": '
 '"police everyday life"}, {"id": 9727, "name": "thief"}, {"id": 15090, '
 '"name": "police officer"}, {"id": 15321, "name": "police station"}]}, {"id": '
 '408355, "keywords": [{"id": 603, "name": "elves"}, {"id": 170362, "name": '
 '"fantasy world"}]}, {"id": 209964, "keywords": []}, {"id": 95993, '
 '"keywords": []}, {"id": 38303, "keywords": [{"id": 380, "name": "sibling '
 'relationship"}, {"id": 6038, "name": "marriage"}, {"id": 6733, "name": '
 '"bullying"}, {"id": 9823, "name": "rivalry"}, {"id": 10624, "name": '


 '{"id": 338189, "keywords": [{"id": 2398, "name": "narration"}, {"id": 10041, '
 '"name": "dysfunctional family"}, {"id": 10180, "name": "male '
 'homosexuality"}, {"id": 10181, "name": "based on play or musical"}, {"id": '
 '40895, "name": "illness"}, {"id": 158024, "name": "critically bashed"}, '
 '{"id": 158718, "name": "lgbt"}, {"id": 171670, "name": "resentment"}, {"id": '
 '232129, "name": "incommunicability"}, {"id": 232130, "name": "small talk"}, '
 '{"id": 264384, "name": "gay"}]}, {"id": 2332, "keywords": [{"id": 90, '
 '"name": "paris, france"}, {"id": 1794, "name": "yakuza"}, {"id": 4668, '
 '"name": "police operation"}, {"id": 5422, "name": "special unit"}]}, {"id": '
 '9336, "keywords": [{"id": 1482, "name": "trainer"}, {"id": 2999, "name": '
 '"recruit"}, {"id": 4358, "name": "shenanigan"}, {"id": 40957, "name": '
 '"police academy"}]}, {"id": 520466, "keywords": [{"id": 1889, "name": '
 '"lake"}, {"id": 5160, "name": "animal attack"}, {"id": 13031, "name": '
 '"creatur

 '"photographer"}, {"id": 1523, "name": "obsession"}, {"id": 2669, "name": '
 '"motel"}, {"id": 5919, "name": "necrophilia"}, {"id": 8181, "name": '
 '"swimming pool"}, {"id": 9826, "name": "murder"}, {"id": 10809, "name": '
 '"teenage girl"}, {"id": 12670, "name": "los angeles, california"}, {"id": '
 '14558, "name": "bathtub"}, {"id": 14794, "name": "lust"}, {"id": 14818, '
 '"name": "model"}, {"id": 14895, "name": "cannibal"}, {"id": 15479, "name": '
 '"fashion"}, {"id": 157465, "name": "eyeball"}, {"id": 185351, "name": "neon '
 'light"}]}, {"id": 460885, "keywords": [{"id": 1930, "name": "kidnapping"}, '
 '{"id": 3030, "name": "nightmare"}, {"id": 9748, "name": "revenge"}, {"id": '
 '9887, "name": "surrealism"}, {"id": 10084, "name": "rescue"}, {"id": 11321, '
 '"name": "animated scene"}, {"id": 12394, "name": "chainsaw"}, {"id": 13209, '
 '"name": "cabin in the woods"}, {"id": 14735, "name": "motorcycle"}, {"id": '
 '156019, "name": "lumberjack"}, {"id": 159028, "name": "cult lea

 '"hippie"}, {"id": 585, "name": "casino"}, {"id": 2380, "name": "audio '
 'tape"}, {"id": 2669, "name": "motel"}, {"id": 2957, "name": "vietnam war"}, '
 '{"id": 4431, "name": "jukebox"}, {"id": 9376, "name": "richard nixon"}, '
 '{"id": 9433, "name": "concierge"}, {"id": 11737, "name": "payphone"}, {"id": '
 '14527, "name": "storm"}, {"id": 164364, "name": "vacuum cleaner"}, {"id": '
 '176516, "name": "slot machine"}, {"id": 184426, "name": "ruse"}, {"id": '
 '200574, "name": "motel room"}, {"id": 207268, "name": "neo-noir"}, {"id": '
 '208992, "name": "1960s"}, {"id": 249977, "name": "lake tahoe"}, {"id": '
 '249979, "name": "j edgar hoover"}]}, {"id": 283591, "keywords": [{"id": 818, '
 '"name": "based on novel or book"}, {"id": 14985, "name": "ireland"}, {"id": '
 '206716, "name": "unfulfilled love"}, {"id": 207876, "name": "1920s"}, {"id": '
 '214548, "name": "1930s"}]}, {"id": 650783, "keywords": [{"id": 4617, "name": '
 '"insurance fraud"}, {"id": 10391, "name": "mafia"}, {"id"

 'gang"}, {"id": 214548, "name": "1930s"}, {"id": 215332, "name": "bonnie and '
 'clyde"}, {"id": 226499, "name": "manhunt"}, {"id": 230656, "name": '
 '"legendary hero"}, {"id": 253222, "name": "retired lawman"}]}, {"id": '
 '589681, "keywords": []}, {"id": 89492, "keywords": [{"id": 1157, "name": '
 '"husband wife relationship"}, {"id": 9673, "name": "love"}, {"id": 10235, '
 '"name": "family relationships"}, {"id": 15025, "name": "parenting"}, {"id": '
 '159947, "name": "parents"}, {"id": 169358, "name": "financial problem"}, '
 '{"id": 179431, "name": "duringcreditsstinger"}]}, {"id": 64586, "keywords": '
 '[{"id": 549, "name": "prostitute"}, {"id": 1664, "name": "eroticism"}, '
 '{"id": 5641, "name": "limousine"}, {"id": 6373, "name": "sadomasochism"}, '
 '{"id": 13059, "name": "prostitution"}, {"id": 187056, "name": "woman '
 'director"}]}, {"id": 241863, "keywords": [{"id": 18249, "name": "game"}, '
 '{"id": 235790, "name": "japanese high school"}]}, {"id": 53064, "keywords": '


 'scientist"}, {"id": 11222, "name": "prophet"}, {"id": 14544, "name": '
 '"robot"}, {"id": 14796, "name": "destruction"}, {"id": 154802, "name": '
 '"silent film"}, {"id": 179874, "name": "expressionism"}, {"id": 185459, '
 '"name": "seven deadly sins"}, {"id": 201028, "name": "depravity"}, {"id": '
 '232988, "name": "mob justice"}, {"id": 234109, "name": "downtrodden"}, '
 '{"id": 234110, "name": "saviour"}, {"id": 239175, "name": "social unrest"}, '
 '{"id": 244516, "name": "german expressionism"}, {"id": 255786, "name": '
 '"mediator"}]}, {"id": 85621, "keywords": []}, {"id": 506528, "keywords": '
 '[{"id": 531, "name": "southern usa"}, {"id": 2831, "name": "slavery"}, '
 '{"id": 5565, "name": "biography"}, {"id": 194469, "name": "harriet tubman"}, '
 '{"id": 207928, "name": "19th century"}, {"id": 209430, "name": "underground '
 'railroad"}]}, {"id": 283564, "keywords": []}, {"id": 4944, "keywords": '
 '[{"id": 521, "name": "washington dc, usa"}, {"id": 1936, "name": '
 '"blackmai

 'sea"}]}, {"id": 246790, "keywords": [{"id": 187056, "name": "woman '
 'director"}, {"id": 208349, "name": "child"}]}, {"id": 348697, "keywords": '
 '[{"id": 3182, "name": "seduction"}]}, {"id": 184, "keywords": [{"id": 818, '
 '"name": "based on novel or book"}, {"id": 822, "name": "airport"}, {"id": '
 '1449, "name": "underworld"}, {"id": 2001, "name": "arms deal"}, {"id": 2080, '
 '"name": "stewardess"}, {"id": 6149, "name": "police"}, {"id": 10051, "name": '
 '"heist"}, {"id": 10594, "name": "money"}, {"id": 12670, "name": "los '
 'angeles, california"}, {"id": 14964, "name": "drugs"}, {"id": 156121, '
 '"name": "ex-con"}, {"id": 207268, "name": "neo-noir"}]}, {"id": 60293, '
 '"keywords": [{"id": 1454, "name": "treasure"}, {"id": 6513, "name": '
 '"cartoon"}, {"id": 11477, "name": "anthropomorphism"}, {"id": 12988, "name": '
 '"pirate"}, {"id": 18165, "name": "animal"}]}, {"id": 787723, "keywords": '
 '[{"id": 2213, "name": "tornado"}, {"id": 3725, "name": "pregnancy"}, {"id": '


## Credits DataFrame:

In [96]:
# Convert credits to json
from io import StringIO

json_credits_string = json.dumps(credits)
# Convert json to dataframe.
# The string is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated (pandas >= 2.1); a file-like buffer works on every version.
movie_credits_df = pd.read_json(StringIO(json_credits_string))

# Export to save
# movie_credits_df.to_csv("./static/data/movie_credits.csv", index=False)
movie_credits_df.head()

Unnamed: 0,id,cast,crew
0,438695,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn..."
1,425909,"[{'adult': False, 'gender': 1, 'id': 1308445, ...","[{'adult': False, 'gender': 2, 'id': 561, 'kno..."
2,634649,"[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn..."
3,568124,"[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn..."
4,460458,"[{'adult': False, 'gender': 1, 'id': 115150, '...","[{'adult': False, 'gender': 2, 'id': 4014, 'kn..."


In [97]:
# Join each movie's cast list and crew list element-wise into one combined list
combined_people = movie_credits_df["cast"].add(movie_credits_df["crew"])
movie_credits_df["cast_crew"] = combined_people
movie_credits_df

Unnamed: 0,id,cast,crew,cast_crew
0,438695,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn...","[{'adult': False, 'gender': 2, 'id': 10297, 'k..."
1,425909,"[{'adult': False, 'gender': 1, 'id': 1308445, ...","[{'adult': False, 'gender': 2, 'id': 561, 'kno...","[{'adult': False, 'gender': 1, 'id': 1308445, ..."
2,634649,"[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn...","[{'adult': False, 'gender': 2, 'id': 1136406, ..."
3,568124,"[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn...","[{'adult': False, 'gender': 1, 'id': 968367, '..."
4,460458,"[{'adult': False, 'gender': 1, 'id': 115150, '...","[{'adult': False, 'gender': 2, 'id': 4014, 'kn...","[{'adult': False, 'gender': 1, 'id': 115150, '..."
...,...,...,...,...
5995,513302,"[{'adult': False, 'gender': 2, 'id': 77928, 'k...","[{'adult': False, 'gender': 2, 'id': 66866, 'k...","[{'adult': False, 'gender': 2, 'id': 77928, 'k..."
5996,534259,"[{'adult': False, 'gender': 2, 'id': 1968476, ...","[{'adult': False, 'gender': 1, 'id': 1302, 'kn...","[{'adult': False, 'gender': 2, 'id': 1968476, ..."
5997,52212,"[{'adult': False, 'gender': 1, 'id': 41163, 'k...","[{'adult': False, 'gender': 2, 'id': 33804, 'k...","[{'adult': False, 'gender': 1, 'id': 41163, 'k..."
5998,659959,"[{'adult': False, 'gender': 2, 'id': 1859007, ...","[{'adult': False, 'gender': 2, 'id': 4387, 'kn...","[{'adult': False, 'gender': 2, 'id': 1859007, ..."


In [98]:
# Run fm_percentage over every movie's combined cast/crew list and store the result
percent_fm_values = movie_credits_df['cast_crew'].apply(fm_percentage)
movie_credits_df['percent_fm'] = percent_fm_values

In [99]:
# percent_fm has been computed from cast_crew, so the combined column can be removed
movie_credits_df = movie_credits_df.drop('cast_crew', axis=1)
movie_credits_df

Unnamed: 0,id,cast,crew,percent_fm
0,438695,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn...",25.00%
1,425909,"[{'adult': False, 'gender': 1, 'id': 1308445, ...","[{'adult': False, 'gender': 2, 'id': 561, 'kno...",25.40%
2,634649,"[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn...",25.81%
3,568124,"[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn...",42.86%
4,460458,"[{'adult': False, 'gender': 1, 'id': 115150, '...","[{'adult': False, 'gender': 2, 'id': 4014, 'kn...",31.58%
...,...,...,...,...
5995,513302,"[{'adult': False, 'gender': 2, 'id': 77928, 'k...","[{'adult': False, 'gender': 2, 'id': 66866, 'k...",27.27%
5996,534259,"[{'adult': False, 'gender': 2, 'id': 1968476, ...","[{'adult': False, 'gender': 1, 'id': 1302, 'kn...",37.04%
5997,52212,"[{'adult': False, 'gender': 1, 'id': 41163, 'k...","[{'adult': False, 'gender': 2, 'id': 33804, 'k...",23.53%
5998,659959,"[{'adult': False, 'gender': 2, 'id': 1859007, ...","[{'adult': False, 'gender': 2, 'id': 4387, 'kn...",30.43%


## Certifications DataFrame:

In [100]:
# Convert certifications to json
from io import StringIO

json_certifications_string = json.dumps(certifications)
# Convert json to dataframe.
# The string is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated (pandas >= 2.1); a file-like buffer works on every version.
movie_certifications_df = pd.read_json(StringIO(json_certifications_string))

# Export to save
# movie_certifications_df.to_csv("./static/data/movie_certifications.csv", index=False)
movie_certifications_df.head()

Unnamed: 0,id,results
0,438695,"[{'iso_3166_1': 'CA', 'release_dates': [{'cert..."
1,425909,"[{'iso_3166_1': 'SE', 'release_dates': [{'cert..."
2,634649,"[{'iso_3166_1': 'MX', 'release_dates': [{'cert..."
3,568124,"[{'iso_3166_1': 'FI', 'release_dates': [{'cert..."
4,460458,"[{'iso_3166_1': 'CZ', 'release_dates': [{'cert..."


In [101]:
# Extract certification information and append to new column
movie_certifications_df['certification'] = movie_certifications_df['results'].apply(extract_certification)
# Keep only the merge key and the extracted certification.
# .copy() makes the result an independent frame, so the later in-place
# `movie_certifications_df['id'] = ...` cast does not write to a slice
# (which would raise SettingWithCopyWarning).
movie_certifications_df = movie_certifications_df[['id', 'certification']].copy()

In [102]:
# Preview the first rows of the id / certification frame
movie_certifications_df.head(n=5)

Unnamed: 0,id,certification
0,438695,
1,425909,
2,634649,PG-13
3,568124,
4,460458,R


In [103]:
# Cast the id key of every frame to int so the merge keys share one dtype
for frame in (movie_keywords_df, movie_details_df, movie_credits_df, movie_certifications_df):
    frame['id'] = frame['id'].astype('int')

In [104]:
# Join details with keywords, credits and certifications on the shared movie id.
# merge() defaults to an inner join, so only ids present in every frame are kept.
movie_df = (
    movie_details_df
    .merge(movie_keywords_df, on='id')
    .merge(movie_credits_df, on='id')
    .merge(movie_certifications_df, on='id')
)

# Export to save
# movie_df.to_csv("./static/data/movies_merged.csv", index=False)

In [105]:
# Inspect the first 20 merged rows
movie_df.head(n=20)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,cpi_2021,cpi_old,adjusted_revenue,adjusted_budget,budget_bins,keywords,cast,crew,percent_fm,certification
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,269.489,269.489,190860000.0,85000000.0,51m to 150m,"[{'id': 11477, 'name': 'anthropomorphism'}, {'...","[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn...",25.00%,
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,269.489,269.489,191000000.0,75000000.0,51m to 150m,"[{'id': 1415, 'name': 'small town'}, {'id': 30...","[{'adult': False, 'gender': 1, 'id': 1308445, ...","[{'adult': False, 'gender': 2, 'id': 561, 'kno...",25.40%,
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,269.489,269.489,1538282000.0,200000000.0,151m to 380m,"[{'id': 1701, 'name': 'hero'}, {'id': 5451, 'n...","[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn...",25.81%,PG-13
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,269.489,269.489,215000000.0,50000000.0,16m to 50m,"[{'id': 2343, 'name': 'magic'}, {'id': 4344, '...","[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn...",42.86%,
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,269.489,269.489,31000000.0,40000000.0,16m to 50m,"[{'id': 1852, 'name': 'mutant'}, {'id': 1865, ...","[{'adult': False, 'gender': 1, 'id': 115150, '...","[{'adult': False, 'gender': 2, 'id': 4014, 'kn...",31.58%,R
5,False,/eNI7PtK6DEYgZmHWP9gQNuff8pv.jpg,"{'id': 2344, 'name': 'The Matrix Collection', ...",190000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.whatisthematrix.com,624860,tt10838180,en,The Matrix Resurrections,...,269.489,269.489,124000000.0,190000000.0,151m to 380m,"[{'id': 310, 'name': 'artificial intelligence'...","[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 1071, 'kn...",25.86%,
6,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,"{'id': 558216, 'name': 'Venom Collection', 'po...",110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,580489,tt7097896,en,Venom: Let There Be Carnage,...,269.489,269.489,500000000.0,110000000.0,51m to 150m,"[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n...","[{'adult': False, 'gender': 2, 'id': 2524, 'kn...","[{'adult': False, 'gender': 2, 'id': 149, 'kno...",23.68%,PG-13
7,False,/dK12GIdhGP6NPGFssK2Fh265jyr.jpg,,160000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",https://www.netflix.com/us/title/81161626,512195,tt7991608,en,Red Notice,...,269.489,269.489,178143.0,160000000.0,151m to 380m,"[{'id': 1812, 'name': 'fbi'}, {'id': 6710, 'na...","[{'adult': False, 'gender': 2, 'id': 18918, 'k...","[{'adult': False, 'gender': 2, 'id': 9543, 'kn...",14.58%,PG-13
8,False,/lyvszvJJqqI8aqBJ70XzdCNoK0y.jpg,,200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,269.489,269.489,399437200.0,200000000.0,151m to 380m,"[{'id': 3925, 'name': 'boredom'}, {'id': 6152,...","[{'adult': False, 'gender': 1, 'id': 97576, 'k...","[{'adult': False, 'gender': 2, 'id': 1722, 'kn...",26.47%,
9,False,/cinER0ESG0eJ49kXlExM0MEWGxW.jpg,"{'id': 912503, 'name': 'Shang-Chi Collection',...",150000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.marvel.com/movies/shang-chi-and-th...,566525,tt9376612,en,Shang-Chi and the Legend of the Ten Rings,...,269.489,269.489,430238400.0,150000000.0,51m to 150m,"[{'id': 779, 'name': 'martial arts'}, {'id': 9...","[{'adult': False, 'gender': 2, 'id': 1489211, ...","[{'adult': False, 'gender': 1, 'id': 7232, 'kn...",34.29%,PG-13


## Call Functions to Extract Details:

In [106]:
# (target column, source column, extractor) triples, applied in exactly this order.
# production_company_origin is computed from production_companies before the
# get_list pass at the end overwrites that column.
extraction_steps = [
    ('director', 'crew', get_director),                      # Director
    ('director_gender', 'crew', get_director_gender),        # Director gender
    ('producers', 'crew', get_producers),                    # Producers
    ('writers', 'crew', get_writers),                        # Screenplay writers
    ('cast', 'cast', get_cast_list),                         # Cast (overwritten in place)
    ('production_company_origin', 'production_companies',
     get_production_company_country),                        # Production company country of origin
    ('spoken_languages', 'spoken_languages', get_languages), # Spoken languages (overwritten in place)
    ('foreign_language', 'original_language',
     original_language_binary),                              # Flag derived from the original language
]

for target_column, source_column, extractor in extraction_steps:
    movie_df[target_column] = movie_df[source_column].apply(extractor)

# Get List: keywords, genres, & production companies
features = ['keywords', 'genres', 'production_companies']

for feature in features:
    movie_df[feature] = movie_df[feature].apply(get_list)

In [107]:
# Show the results: the title plus every column touched by the extraction step
preview_columns = [
    'title', 'cast', 'director', 'director_gender', 'percent_fm',
    'producers', 'writers', 'keywords', 'genres',
    'production_companies', 'production_company_origin',
    'spoken_languages', 'original_language', 'foreign_language',
]
movie_df[preview_columns].head()

Unnamed: 0,title,cast,director,director_gender,percent_fm,producers,writers,keywords,genres,production_companies,production_company_origin,spoken_languages,original_language,foreign_language
0,Sing 2,"[Matthew McConaughey, Reese Witherspoon, Scarl...",[Garth Jennings],[2],25.00%,"[Christopher Meledandri, Janet Healy, Dana Kru...","[Garth Jennings, Garth Jennings]","[anthropomorphism, singing]","[Animation, Comedy, Family, Music]","[Illumination Entertainment, Universal Pictures]","[US, US]",[English],en,0
1,Ghostbusters: Afterlife,"[Carrie Coon, Finn Wolfhard, Mckenna Grace, Pa...",[Jason Reitman],[2],25.40%,"[Dan Aykroyd, Ivan Reitman, Gil Kenan, Michael...","[Dan Aykroyd, Harold Ramis, Jason Reitman, Gil...","[small town, ghostbuster, nostalgia, afterlife...","[Comedy, Fantasy, Adventure]","[Columbia Pictures, Bron Studios, The Montecit...","[US, CA, US, US]",[English],en,0
2,Spider-Man: No Way Home,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",[Jon Watts],[2],25.81%,"[Avi Arad, Kevin Feige, Louis D'Esposito, JoAn...","[Stan Lee, Stan Lee, Steve Ditko, Steve Ditko,...","[hero, comic book, superhero, based on comic, ...","[Action, Adventure, Science Fiction]","[Marvel Studios, Pascal Pictures, Columbia Pic...","[US, US, US]","[English, Tagalog]",en,0
3,Encanto,"[Stephanie Beatriz, María Cecilia Botero, John...","[Byron Howard, Jared Bush]","[2, 2]",42.86%,"[Clark Spencer, Jennifer Lee, Yvett Merino Flo...","[Jared Bush, Charise Castro Smith]","[magic, musical, forest, family relationships,...","[Animation, Comedy, Family, Fantasy]","[Walt Disney Animation Studios, Walt Disney Pi...","[US, US]","[English, Spanish]",en,0
4,Resident Evil: Welcome to Raccoon City,"[Kaya Scodelario, Robbie Amell, Hannah John-Ka...",[Johannes Roberts],[2],31.58%,"[Paul W. S. Anderson, Jeremy Bolt, Robert Kulz...","[Johannes Roberts, Noboru Sugimura, Kenichi Iw...","[mutant, biological weapon, quarantine, zombie...","[Horror, Action, Science Fiction]","[Constantin Film, Tea Shop & Film Company, Dav...","[DE, GB, FR, GB, US]",[English],en,0


In [108]:
# Drop rows that are missing a value in any of the extracted feature columns
required_columns = [
    'genres', 'production_companies', 'keywords', 'cast',
    'director', 'producers', 'writers',
]
movie_df.dropna(subset=required_columns, how="any", inplace=True)

In [109]:
# Number of movies left after dropping incomplete rows
len(movie_df.index)

4992

In [110]:
# Clean Data: run clean_data (lowercase, spaces removed) over each list feature
# and store the result next to the original as "<feature>_cleaned"
features = ['cast', 'director', 'producers', 'writers', 'keywords', 'genres', 'production_companies']

for column_name in features:
    cleaned_values = movie_df[column_name].apply(clean_data)
    movie_df[column_name + "_cleaned"] = cleaned_values

In [111]:
# Apply clean_overview to every overview and keep the result in its own column
overview_cleaned = movie_df['overview'].apply(clean_overview)
movie_df['overview_cleaned'] = overview_cleaned

In [112]:
# Create soup columns: each builder receives one full row (axis=1).
# Order is preserved, so 'soup' exists before 'soup_overview' is built.
soup_builders = {'soup': create_soup, 'soup_overview': create_soup_overview}
for soup_column, builder in soup_builders.items():
    movie_df[soup_column] = movie_df.apply(builder, axis=1)

# Export to save
# movie_df.to_csv("./static/data/movies_cleaned_soup.csv", index=False)

In [113]:
# Preview the first five soup strings
movie_df.loc[:, ['soup']].head(n=5)

Unnamed: 0,soup
0,anthropomorphism singing matthewmcconaughey re...
1,smalltown ghostbuster nostalgia afterlife sequ...
2,hero comicbook superhero basedoncomic marvelci...
3,magic musical forest familyrelationships femal...
4,mutant biologicalweapon quarantine zombie base...


In [114]:
# Print the column list with non-null counts and dtypes to check the cleaned frame
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4992 entries, 0 to 5854
Data columns (total 52 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   adult                         4992 non-null   bool    
 1   backdrop_path                 4936 non-null   object  
 2   belongs_to_collection         1624 non-null   object  
 3   original_budget               4992 non-null   int64   
 4   genres                        4992 non-null   object  
 5   homepage                      4990 non-null   object  
 6   id                            4992 non-null   int64   
 7   imdb_id                       4985 non-null   object  
 8   original_language             4992 non-null   object  
 9   original_title                4992 non-null   object  
 10  overview                      4992 non-null   object  
 11  popularity                    4992 non-null   float64 
 12  poster_path                   4991 non-null   ob

## Configuration for Posters:

In [115]:
# Fetch the TMDB API configuration (its 'images' section is read in the following cells)
configuration_url = f"https://api.themoviedb.org/3/configuration?api_key={api_key}"
# A timeout stops the notebook from hanging forever on a stalled connection
# (10 seconds is an arbitrary but generous limit for one small request).
config_http_response = requests.get(configuration_url, timeout=10)
# Fail here, with the HTTP status, instead of later with a confusing KeyError
# when the API returns an error payload (e.g. invalid api_key).
config_http_response.raise_for_status()
config_response = config_http_response.json()
# config_response

In [116]:
# Base HTTPS URL for images, taken from the configuration's image settings
secure_base_url = config_response['images']['secure_base_url']
# Keep the whole image-settings dict as well (the poster size list is read from it)
images_url = config_response['images']
# secure_base_url

In [117]:
# Position of the wanted entry in images_url['poster_sizes']
# (2: w185px, 4: w500px, 5: 780px)
poster_size_index = 5
poster_size = images_url['poster_sizes'][poster_size_index]
# poster_size

In [118]:
# Independent single-column frame holding only the poster paths
poster_df = movie_df.loc[:, ['poster_path']].copy()

In [119]:
# Full poster URL = base URL + size segment + per-movie poster path
poster_url_prefix = secure_base_url + poster_size
poster_df['poster_url'] = poster_url_prefix + poster_df['poster_path']

In [120]:
# Export to separate csv (disabled) and preview the poster URLs
# poster_df.to_csv("./static/data/poster_path.csv", index=False)
poster_df.head(n=5)

Unnamed: 0,poster_path,poster_url
0,/aWeKITRFbbwY8txG5uCj4rMCfSP.jpg,https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8t...
1,/sg4xJaufDiQl7caFEskBtQXfD4x.jpg,https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7c...
2,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY1...
3,/4j0PNHkMr5ax3IA8tjtxcmPU3QT.jpg,https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3I...
4,/7uRbWOXxpWDMtnsd2PF3clu65jc.jpg,https://image.tmdb.org/t/p/w780/7uRbWOXxpWDMtn...


In [121]:
# Same poster URL construction as for poster_df, this time stored on movie_df itself
poster_url_prefix = secure_base_url + poster_size
movie_df['poster_url'] = poster_url_prefix + movie_df['poster_path']

# TESTING

## Export CSV:

In [122]:
# Create a lowercase column for easier search.
# The vectorised .str.lower() replaces apply(lambda x: x.lower()); it leaves a
# missing title as NaN instead of raising AttributeError on a float.
movie_df["lowercase_title"] = movie_df['title'].str.lower()

In [123]:
# Save file - used for calling for information
movie_db_csv_path = "../static/data/movie_db.csv"
movie_df.to_csv(movie_db_csv_path, index=False)

In [124]:
# Preview the exported frame
movie_df.head(n=5)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,producers_cleaned,writers_cleaned,keywords_cleaned,genres_cleaned,production_companies_cleaned,overview_cleaned,soup,soup_overview,poster_url,lowercase_title
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[Animation, Comedy, Family, Music]",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,"[christophermeledandri, janethealy, danakrupin...","[garthjennings, garthjennings]","[anthropomorphism, singing]","[animation, comedy, family, music]","[illuminationentertainment, universalpictures]",buster and his new cast now have their sights ...,anthropomorphism singing matthewmcconaughey re...,anthropomorphism singing anthropomorphism sing...,https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8t...,sing 2
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[Comedy, Fantasy, Adventure]",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,"[danaykroyd, ivanreitman, gilkenan, michaelbeu...","[danaykroyd, haroldramis, jasonreitman, gilkenan]","[smalltown, ghostbuster, nostalgia, afterlife,...","[comedy, fantasy, adventure]","[columbiapictures, bronstudios, themontecitopi...",when a single mom and her two kids arrive in a...,smalltown ghostbuster nostalgia afterlife sequ...,smalltown ghostbuster nostalgia afterlife sequ...,https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7c...,ghostbusters: afterlife
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[Action, Adventure, Science Fiction]",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,"[aviarad, kevinfeige, louisd'esposito, joannpe...","[stanlee, stanlee, steveditko, steveditko, chr...","[hero, comicbook, superhero, basedoncomic, mar...","[action, adventure, sciencefiction]","[marvelstudios, pascalpictures, columbiapictures]",peter parker is unmasked and no longer able to...,hero comicbook superhero basedoncomic marvelci...,hero comicbook superhero basedoncomic marvelci...,https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY1...,spider-man: no way home
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[Animation, Comedy, Family, Fantasy]",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,"[clarkspencer, jenniferlee, yvettmerinoflores]","[jaredbush, charisecastrosmith]","[magic, musical, forest, familyrelationships, ...","[animation, comedy, family, fantasy]","[waltdisneyanimationstudios, waltdisneypictures]","the tale of an extraordinary family, the madri...",magic musical forest familyrelationships femal...,magic musical forest familyrelationships femal...,https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3I...,encanto
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[Horror, Action, Science Fiction]",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,"[paulw.s.anderson, jeremybolt, robertkulzer, m...","[johannesroberts, noborusugimura, kenichiiwao,...","[mutant, biologicalweapon, quarantine, zombie,...","[horror, action, sciencefiction]","[constantinfilm, teashop&filmcompany, davisfil...",once the booming home of pharmaceutical giant ...,mutant biologicalweapon quarantine zombie base...,mutant biologicalweapon quarantine zombie base...,https://image.tmdb.org/t/p/w780/7uRbWOXxpWDMtn...,resident evil: welcome to raccoon city


# Machine Learning Recommender:

In [125]:
from sklearn.feature_extraction.text import CountVectorizer

# Use Count Vectorizer to create counts for each word in the 'soup_overview'
# text of every movie, skipping scikit-learn's built-in English stop words.
count = CountVectorizer(stop_words='english')
# Document-term matrix: one row per movie_df row, one column per vocabulary term.
count_matrix = count.fit_transform(movie_df['soup_overview'])

In [126]:
# (number of movies, number of distinct terms in the vocabulary)
count_matrix.shape

(4992, 61382)

In [127]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between every pair of rows of count_matrix;
# cosine_sim[i][j] compares the movie in row i with the movie in row j.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [128]:
# Create series mapping each movie title to its row label in movie_df, so a
# title can be turned into a row of cosine_sim.
# NOTE(review): this assumes movie_df still has its default 0..n-1 index, so the
# row label doubles as the position inside cosine_sim -- confirm upstream.
indices = pd.Series(movie_df.index, index=movie_df['title'])
# Keep only the first row for each title. Series.drop_duplicates() compares the
# values (the row labels), not the title index, so it never removed repeated
# titles and indices[title] could return several rows instead of one.
indices = indices[~indices.index.duplicated(keep='first')]

In [129]:
# Preview the first rows of the table the recommender is built on.
movie_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,producers_cleaned,writers_cleaned,keywords_cleaned,genres_cleaned,production_companies_cleaned,overview_cleaned,soup,soup_overview,poster_url,lowercase_title
0,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[Animation, Comedy, Family, Music]",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,"[christophermeledandri, janethealy, danakrupin...","[garthjennings, garthjennings]","[anthropomorphism, singing]","[animation, comedy, family, music]","[illuminationentertainment, universalpictures]",buster and his new cast now have their sights ...,anthropomorphism singing matthewmcconaughey re...,anthropomorphism singing anthropomorphism sing...,https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8t...,sing 2
1,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[Comedy, Fantasy, Adventure]",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,"[danaykroyd, ivanreitman, gilkenan, michaelbeu...","[danaykroyd, haroldramis, jasonreitman, gilkenan]","[smalltown, ghostbuster, nostalgia, afterlife,...","[comedy, fantasy, adventure]","[columbiapictures, bronstudios, themontecitopi...",when a single mom and her two kids arrive in a...,smalltown ghostbuster nostalgia afterlife sequ...,smalltown ghostbuster nostalgia afterlife sequ...,https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7c...,ghostbusters: afterlife
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[Action, Adventure, Science Fiction]",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,"[aviarad, kevinfeige, louisd'esposito, joannpe...","[stanlee, stanlee, steveditko, steveditko, chr...","[hero, comicbook, superhero, basedoncomic, mar...","[action, adventure, sciencefiction]","[marvelstudios, pascalpictures, columbiapictures]",peter parker is unmasked and no longer able to...,hero comicbook superhero basedoncomic marvelci...,hero comicbook superhero basedoncomic marvelci...,https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY1...,spider-man: no way home
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[Animation, Comedy, Family, Fantasy]",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,"[clarkspencer, jenniferlee, yvettmerinoflores]","[jaredbush, charisecastrosmith]","[magic, musical, forest, familyrelationships, ...","[animation, comedy, family, fantasy]","[waltdisneyanimationstudios, waltdisneypictures]","the tale of an extraordinary family, the madri...",magic musical forest familyrelationships femal...,magic musical forest familyrelationships femal...,https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3I...,encanto
4,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[Horror, Action, Science Fiction]",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,"[paulw.s.anderson, jeremybolt, robertkulzer, m...","[johannesroberts, noborusugimura, kenichiiwao,...","[mutant, biologicalweapon, quarantine, zombie,...","[horror, action, sciencefiction]","[constantinfilm, teashop&filmcompany, davisfil...",once the booming home of pharmaceutical giant ...,mutant biologicalweapon quarantine zombie base...,mutant biologicalweapon quarantine zombie base...,https://image.tmdb.org/t/p/w780/7uRbWOXxpWDMtn...,resident evil: welcome to raccoon city


## Function to Recommend Titles:

In [130]:
def get_similarity_scores(title, cosine_sim, top_n=10):
    """Return the movies most similar to ``title`` with their similarity scores.

    The title is resolved through the notebook-level ``indices`` Series
    (title -> row position in ``cosine_sim``).

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in the index of ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of movie ``i``
        to every movie.
    top_n : int, default 10
        Number of most similar movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``"index"`` (row position of the similar movie) and
        ``"similarity_score"``, ordered from most to least similar. The queried
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first matching movie instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's position with its similarity to the chosen movie.
    # The movie itself is excluded by position rather than by assuming it sorts
    # first: ties at the top (or an all-zero vector) would otherwise let it
    # slip into its own recommendations.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, so ties keep row order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Keep the scores of the most similar movies
    sim_scores = sim_scores[:top_n]

    # Convert list to DataFrame
    return pd.DataFrame(sim_scores, columns=["index", "similarity_score"])

In [131]:
def get_recommendations(original_df, score_df):
    """Attach similarity scores to the matching movies, best match first.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Movie table. Rows are matched to ``score_df["index"]`` through an
        ``"index"`` column when one exists, otherwise through the DataFrame's
        own index.
    score_df : pandas.DataFrame
        Frame with the columns ``"index"`` and ``"similarity_score"``, as
        returned by ``get_similarity_scores``.

    Returns
    -------
    pandas.DataFrame
        Only the rows of ``original_df`` that appear in ``score_df``, with the
        ``"index"`` and ``"similarity_score"`` columns added, sorted by
        ``"similarity_score"`` in descending order.
    """
    if "index" in original_df.columns:
        merged = original_df.merge(score_df, on="index")
    else:
        # Without a prior reset_index() the movie table has no "index" column
        # and merging on="index" raises KeyError; match on the row labels instead.
        merged = original_df.merge(score_df, left_index=True, right_on="index")

    return merged.sort_values("similarity_score", ascending=False)

In [132]:
# Test out function: score every movie against one known title.
# The title must match an entry of `indices` exactly; an unknown title raises KeyError.
movie_title = "Get Out"
similarity_scores_df = get_similarity_scores(movie_title, cosine_sim)

In [None]:
# Call Get Recommendations Function: join the similarity scores back onto the
# movie table so each score is shown next to the movie it belongs to.
recommendations = get_recommendations(movie_df, similarity_scores_df)

In [None]:
# Show the first 10 recommendations with their similarity score and `id`
# (get_recommendations returns the rows sorted by similarity_score, highest first).
# recommendations[['title', 'similarity_score']].to_csv("./soup_test_3.csv")
recommendations[['title', 'similarity_score', 'id']].head(10)

### Serialize column values to JSON strings for the SQL database

In [135]:
# List every column of movie_df so the fields about to be stored can be reviewed.
movie_df_columns = list(movie_df.columns)
movie_df_columns

['adult',
 'backdrop_path',
 'belongs_to_collection',
 'original_budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'original_revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'year',
 'cpi_2021',
 'cpi_old',
 'adjusted_revenue',
 'adjusted_budget',
 'budget_bins',
 'keywords',
 'cast',
 'crew',
 'percent_fm',
 'certification',
 'director',
 'director_gender',
 'producers',
 'writers',
 'production_company_origin',
 'foreign_language',
 'cast_cleaned',
 'director_cleaned',
 'producers_cleaned',
 'writers_cleaned',
 'keywords_cleaned',
 'genres_cleaned',
 'production_companies_cleaned',
 'overview_cleaned',
 'soup',
 'soup_overview',
 'poster_url',
 'lowercase_title']

In [136]:
# Replace every value in every column with its JSON string representation
# (presumably so nested dict/list values can be stored as text -- confirm).
# NOTE(review): this also encodes plain strings, which end up wrapped in double
# quotes (visible in the table read back from the database further down), and
# it rewrites movie_df in place, so re-running this cell encodes the values a
# second time.
for column_name in list(movie_df.columns):
    movie_df[column_name] = movie_df[column_name].apply(json.dumps)

In [137]:
# For each column, report whether every value is a dict or a list.
# Despite its name, movie_dict is a boolean Series indexed by column name.
# NOTE(review): .all() only flags columns made up entirely of dicts/lists; if the
# goal is to catch any leftover nested value, .any() is the stricter check -- confirm intent.
movie_dict = movie_df.applymap(lambda x: isinstance(x, (dict, list))).all()
print(movie_dict)

adult                           False
backdrop_path                   False
belongs_to_collection           False
original_budget                 False
genres                          False
homepage                        False
id                              False
imdb_id                         False
original_language               False
original_title                  False
overview                        False
popularity                      False
poster_path                     False
production_companies            False
production_countries            False
release_date                    False
original_revenue                False
runtime                         False
spoken_languages                False
status                          False
tagline                         False
title                           False
video                           False
vote_average                    False
vote_count                      False
year                            False
cpi_2021    

## Database:

In [138]:
from sqlalchemy import create_engine, inspect

In [139]:
# configure the connection string from the credentials imported from config
# NOTE(review): the values are interpolated verbatim; a user name or password
# containing URL-reserved characters such as '@', ':' or '/' has to be
# percent-encoded before it is placed in the URL -- confirm how config stores them.
rds_connection_string = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'
        
# connect to the database
# `conn` is reused by the cells below and is not closed anywhere in this section.
engine = create_engine(rds_connection_string)
conn = engine.connect()

In [140]:
from sqlalchemy import text

# Drop any previous copy of the table so the load below starts from scratch.
# Engine.execute() with a raw SQL string is the legacy 1.x API and no longer
# exists in SQLAlchemy 2.0; engine.begin() + text() runs the statement in a
# transaction that is committed on exit, on both 1.x and 2.x.
with engine.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS movies"))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fbf7cbd7a90>

In [141]:
# Write movie_df to the 'movies' table. 'replace' recreates the table on every
# run, so re-running this cell cannot append a second copy of all rows
# (which if_exists='append' did whenever the table already existed).
movie_df.to_sql(name='movies', con=conn, if_exists='replace', index=False)

In [142]:
# Use inspector to find table names and confirm that 'movies' exists
# in the database `engine` is connected to.
Inspector = inspect(engine)
Inspector.get_table_names()

['movies',
 'duplicate_search',
 'low_budget_filter',
 'no_filter',
 'female_filter',
 'international_filter']

In [145]:
# Check movies table: read every row back into a DataFrame to inspect what was stored.
pd.read_sql_query('select * from movies', con=conn)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,producers_cleaned,writers_cleaned,keywords_cleaned,genres_cleaned,production_companies_cleaned,overview_cleaned,soup,soup_overview,poster_url,lowercase_title
0,false,"""/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg""","{""id"": 544670, ""name"": ""Sing Collection"", ""pos...",85000000,"[""Animation"", ""Comedy"", ""Family"", ""Music""]","""https://www.illumination.com/movie/sing-2/""",438695,"""tt6467266""","""en""","""Sing 2""",...,"[""christophermeledandri"", ""janethealy"", ""danak...","[""garthjennings"", ""garthjennings""]","[""anthropomorphism"", ""singing""]","[""animation"", ""comedy"", ""family"", ""music""]","[""illuminationentertainment"", ""universalpictur...","""buster and his new cast now have their sights...","""anthropomorphism singing matthewmcconaughey r...","""anthropomorphism singing anthropomorphism sin...","""https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8...","""sing 2"""
1,false,"""/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg""","{""id"": 2980, ""name"": ""Ghostbusters Collection""...",75000000,"[""Comedy"", ""Fantasy"", ""Adventure""]","""https://ghostbusters.com/""",425909,"""tt4513678""","""en""","""Ghostbusters: Afterlife""",...,"[""danaykroyd"", ""ivanreitman"", ""gilkenan"", ""mic...","[""danaykroyd"", ""haroldramis"", ""jasonreitman"", ...","[""smalltown"", ""ghostbuster"", ""nostalgia"", ""aft...","[""comedy"", ""fantasy"", ""adventure""]","[""columbiapictures"", ""bronstudios"", ""themontec...","""when a single mom and her two kids arrive in ...","""smalltown ghostbuster nostalgia afterlife seq...","""smalltown ghostbuster nostalgia afterlife seq...","""https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7...","""ghostbusters: afterlife"""
2,false,"""/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg""","{""id"": 531241, ""name"": ""Spider-Man (Avengers) ...",200000000,"[""Action"", ""Adventure"", ""Science Fiction""]","""https://www.spidermannowayhome.movie""",634649,"""tt10872600""","""en""","""Spider-Man: No Way Home""",...,"[""aviarad"", ""kevinfeige"", ""louisd'esposito"", ""...","[""stanlee"", ""stanlee"", ""steveditko"", ""stevedit...","[""hero"", ""comicbook"", ""superhero"", ""basedoncom...","[""action"", ""adventure"", ""sciencefiction""]","[""marvelstudios"", ""pascalpictures"", ""columbiap...","""peter parker is unmasked and no longer able t...","""hero comicbook superhero basedoncomic marvelc...","""hero comicbook superhero basedoncomic marvelc...","""https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY...","""spider-man: no way home"""
3,false,"""/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg""",,50000000,"[""Animation"", ""Comedy"", ""Family"", ""Fantasy""]","""https://movies.disney.com/encanto""",568124,"""tt2953050""","""en""","""Encanto""",...,"[""clarkspencer"", ""jenniferlee"", ""yvettmerinofl...","[""jaredbush"", ""charisecastrosmith""]","[""magic"", ""musical"", ""forest"", ""familyrelation...","[""animation"", ""comedy"", ""family"", ""fantasy""]","[""waltdisneyanimationstudios"", ""waltdisneypict...","""the tale of an extraordinary family, the madr...","""magic musical forest familyrelationships fema...","""magic musical forest familyrelationships fema...","""https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3...","""encanto"""
4,false,"""/o76ZDm8PS9791XiuieNB93UZcRV.jpg""",,40000000,"[""Horror"", ""Action"", ""Science Fiction""]","""https://www.residentevil.movie""",460458,"""tt6920084""","""en""","""Resident Evil: Welcome to Raccoon City""",...,"[""paulw.s.anderson"", ""jeremybolt"", ""robertkulz...","[""johannesroberts"", ""noborusugimura"", ""kenichi...","[""mutant"", ""biologicalweapon"", ""quarantine"", ""...","[""horror"", ""action"", ""sciencefiction""]","[""constantinfilm"", ""teashop&filmcompany"", ""dav...","""once the booming home of pharmaceutical giant...","""mutant biologicalweapon quarantine zombie bas...","""mutant biologicalweapon quarantine zombie bas...","""https://image.tmdb.org/t/p/w780/7uRbWOXxpWDMt...","""resident evil: welcome to raccoon city"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,false,"""/97Nv4cIsVaf8nesmpjNW5uWWQY3.jpg""",,2479000,"[""Mystery"", ""Romance"", ""Thriller""]","""""",426,"""tt0052357""","""en""","""Vertigo""",...,"[""alfredhitchcock""]","[""pierreboileau"", ""thomasnarcejac"", ""aleccoppe...","[""sanfrancisco,california"", ""detective"", ""sens...","[""mystery"", ""romance"", ""thriller""]","[""paramount"", ""alfredj.hitchcockproductions""]","""a retired san francisco detective suffering f...","""sanfrancisco,california detective senseofguil...","""sanfrancisco,california detective senseofguil...","""https://image.tmdb.org/t/p/w780/dg9escdBIAAAT...","""vertigo"""
4988,false,"""/r8pUcom5Mw8igtBpj3AHSAUvH0R.jpg""",,1300000,"[""Drama"", ""Science Fiction""]","""""",19,"""tt0017136""","""de""","""Metropolis""",...,"[""erichpommer""]","[""fritzlang"", ""theavonharbou"", ""theavonharbou""]","[""manvsmachine"", ""basedonnovelorbook"", ""underg...","[""drama"", ""sciencefiction""]","[""ufa""]","""in a futuristic city sharply divided between ...","""manvsmachine basedonnovelorbook undergroundwo...","""manvsmachine basedonnovelorbook undergroundwo...","""https://image.tmdb.org/t/p/w780/hUK9rewffKGqt...","""metropolis"""
4989,false,"""/2SCZfhWvFgjmVrpGWN24UpXZVbf.jpg""",,0,"[""Mystery"", ""Thriller""]","""""",260,"""tt0026029""","""en""","""The 39 Steps""",...,"[""michaelbalcon""]","[""charlesbennett"", ""ianhay"", ""johnbuchan""]","[""london,england"", ""scotland"", ""falselyaccused...","[""mystery"", ""thriller""]","[""gaumont-britishpicturecorporation""]","""richard hanney has a rude awakening when a gl...","""london,england scotland falselyaccused scotla...","""london,england scotland falselyaccused scotla...","""https://image.tmdb.org/t/p/w780/paI9Tmqm2cZG6...","""the 39 steps"""
4990,false,"""/jIqp5xXY2aw1kxnBPyCXkKQBOJ3.jpg""",,923000,"[""Adventure"", ""Comedy"", ""Drama""]","""""",962,"""tt0015864""","""en""","""The Gold Rush""",...,"[""charliechaplin""]","[""charliechaplin""]","[""river"", ""gold"", ""dance"", ""worker"", ""cabin"", ...","[""adventure"", ""comedy"", ""drama""]","[""charleschaplinproductions"", ""unitedartists""]","""a lone prospector ventures into alaska lookin...","""river gold dance worker cabin goldrush thanks...","""river gold dance worker cabin goldrush thanks...","""https://image.tmdb.org/t/p/w780/eQRFo1qwRREYw...","""the gold rush"""
