# The Movie Database: Data Extraction & Cleaning

In [1]:
#  %pip install cpi

In [2]:
# Import dependencies
import pandas as pd
import json
import requests
from pprint import pprint
import numpy as np
import cpi
from datetime import datetime

# Import config
from config import api_key, db_user, db_password, db_host, db_port, db_name

In [3]:
cpi.update()

### Testing:

In [4]:
# Single Test: Discover Most Popular Movies
page_number = 1

# Endpoint for finding most popular movies
discover_movies = "https://api.themoviedb.org/3/discover/movie"
most_popular_url = f"{discover_movies}?api_key={api_key}&page={page_number}&sort_by=popularity.desc"

# Most popular movies
tmdb_response = requests.get(most_popular_url).json()
results = tmdb_response["results"]

json_string = json.dumps(results)
df = pd.read_json(json_string)
df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,"[28, 12, 14, 878]",524434,en,Eternals,The Eternals are a team of ancient aliens who ...,11424.242,/b6qUu00iIIkXX13szFy7d0CyNcg.jpg,2021-11-03,Eternals,False,7.3,3139
1,False,/ur19n7Zabzf7wfhK4MrlFDJiZPI.jpg,"[16, 10751, 14, 35, 12]",585083,en,Hotel Transylvania: Transformania,"When Van Helsing's mysterious invention, the ""...",10200.442,/teCy1egGQa0y8ULJvlrDHQKnxBL.jpg,2022-01-14,Hotel Transylvania: Transformania,False,7.8,770
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"[28, 12, 878]",634649,en,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5874.67,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,2021-12-15,Spider-Man: No Way Home,False,8.4,4367
3,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"[16, 35, 10751, 10402]",438695,en,Sing 2,Buster and his new cast now have their sights ...,5211.181,/aWeKITRFbbwY8txG5uCj4rMCfSP.jpg,2021-12-01,Sing 2,False,8.3,1110
4,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,"[16, 35, 10751, 14]",568124,en,Encanto,"The tale of an extraordinary family, the Madri...",4832.255,/4j0PNHkMr5ax3IA8tjtxcmPU3QT.jpg,2021-11-24,Encanto,False,7.8,3157
5,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"[35, 14, 12]",425909,en,Ghostbusters: Afterlife,When a single mom and her two kids arrive in a...,3894.636,/sg4xJaufDiQl7caFEskBtQXfD4x.jpg,2021-11-11,Ghostbusters: Afterlife,False,7.7,1363
6,False,/yfNbZ34Yt2S0DdgGH38bLDhJPiM.jpg,"[28, 53]",860623,en,Last Man Down,After civilization succumbs to a deadly pandem...,3783.29,/4B7liCxNCZIZGONmAMkCnxVlZQV.jpg,2021-10-19,Last Man Down,False,6.8,124
7,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,"[27, 28, 878]",460458,en,Resident Evil: Welcome to Raccoon City,Once the booming home of pharmaceutical giant ...,3107.294,/7uRbWOXxpWDMtnsd2PF3clu65jc.jpg,2021-11-24,Resident Evil: Welcome to Raccoon City,False,6.1,936
8,False,/eNI7PtK6DEYgZmHWP9gQNuff8pv.jpg,"[878, 28, 12]",624860,en,The Matrix Resurrections,"Plagued by strange memories, Neo's life takes ...",2920.594,/8c4a8kE7PizaGQQnditMmI1xbRp.jpg,2021-12-16,The Matrix Resurrections,False,6.9,2346
9,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,"[878, 28, 12]",580489,en,Venom: Let There Be Carnage,After finding a host body in investigative rep...,2444.605,/rjkmN1dniUHVYAtwuV3Tji7FsDO.jpg,2021-09-30,Venom: Let There Be Carnage,False,7.1,5911


In [5]:
#Single Test: Crew
movie_id = 672582

# Endpoint & response
movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
movie_response = requests.get(movie_url).json()

pprint(movie_response)

{'adult': False,
 'backdrop_path': '/yL9RRZbDVbptqLwiZcK304ck4PL.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 27, 'name': 'Horror'}],
 'homepage': '',
 'id': 672582,
 'imdb_id': 'tt11686490',
 'original_language': 'en',
 'original_title': 'The Deep House',
 'overview': 'While diving in a remote French lake, a couple of YouTubers who '
             'specialize in underwater exploration videos discover a house '
             'submerged in the deep waters. What was initially a unique '
             'finding soon turns into a nightmare when they discover that the '
             'house was the scene of atrocious crimes. Trapped, with their '
             'oxygen reserves falling dangerously, they realize the worst is '
             'yet to come: they are not alone in the house.',
 'popularity': 427.552,
 'poster_path': '/5xhAPxRr64oQPEFnUOrttuI4ZEU.jpg',
 'production_companies': [{'id': 12689,
                           'logo_path': None,
                          

# Functions: Define API calls to extract key data points

In [6]:
# Start Timer Function (check on API call performance)
def start_timer():
    start = datetime.now()
    
    return start

In [7]:
# Stop Timer Function (check on API call performance)
def stop_timer(start):
    end = datetime.now()
    elapsed_time = (end - start)

    print(f"Total Time Elapsed:  {elapsed_time.total_seconds()} seconds")

### API CALLS:

In [8]:
# Returns most popular movies
def get_most_popular_movies(api_key):
    
    movies = []
    
    # Loop through pages to get results for movies
    for x in range(1, 201):
        page_number = x

        # Endpoint for finding most popular movies
        discover_movies = "https://api.themoviedb.org/3/discover/movie"
        most_popular_url = f"{discover_movies}?api_key={api_key}&page={page_number}&sort_by=popularity.desc"
        
        # Most popular movies
        tmdb_response = requests.get(most_popular_url).json()
        results = tmdb_response["results"]

#         for y in range(len(results)):
#             movies.append(results[y]) 
        for result in results:
            movies.append(result)
            
    return movies

In [9]:
# Returns movie details as a list
def get_movie_details(api_key, movie_ids):  
    
    movie_details = []
    
    for movie_id in movie_ids:
        # Endpoint & response
        movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
        movie_response = requests.get(movie_url).json()

        # Append results to lists
        movie_details.append(movie_response)
        
    return movie_details

In [10]:
# Returns keywords as a list
def get_movie_keywords(api_key, movie_ids):
    keyword_details = []
    
    for movie_id in movie_ids:
        # Get keywords for each movie
        keywords_url = f"https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}"
        keywords_response = requests.get(keywords_url).json()
        keywords_results = keywords_response["keywords"]
        
         # Append results to lists
        keyword_details.append(keywords_response)
        
    return keyword_details

In [11]:
# Returns credits as a list
def get_credits(api_key, movie_ids):
    credit_details = []
    
    for movie_id in movie_ids:
        # Generate movie credits
        movie_credits = f"/movie/{movie_id}/credits"
        credits_url = f"https://api.themoviedb.org/3/{movie_credits}?api_key={api_key}"

        # Get the json response for the credits
        credits_response = requests.get(credits_url).json()
        
        credit_details.append(credits_response)
        
    return credit_details

In [12]:
# Returns certifications as a list (rating: G, PG, etc.)
def get_certifications(api_key, movie_ids):
    certifications = []
    
    for movie_id in movie_ids:
        # Get certifications for each movie
        certification_url = f"https://api.themoviedb.org/3/movie/{movie_id}/release_dates?api_key={api_key}"
        
        certification_response = requests.get(certification_url).json()
        
        certifications.append(certification_response)
        
    return certifications

### GET TITLES & IDS:

In [13]:
# Returns single title for specified index number
def get_title(results, idx):
    movie_title = results[idx]["title"]
    
    return movie_title

In [14]:
# Returns ids list
def get_ids(results):
    movie_ids = []
    for x in results:
        movie_ids.append(x["id"])
        
    return movie_ids

In [15]:
# Returns titles list
def get_titles(results):
    movie_titles = []
    for x in results:
        movie_titles.append(x["title"])
        
    return movie_titles

### EXTRACT NEEDED DETAILS FROM COLUMNS: Cast, Crew, Gender, Production Countries etc. 

In [16]:
# Extract the certification (ratings)
def extract_certification(x):
    name = ""
    
    for i in x:
        # Get ratings for US
        if i['iso_3166_1'] == 'US':
             # Append results to lists
            name = i['release_dates'][0]['certification']
            break
    return name

In [17]:
# Function to get the director out of the crew
def get_director(x):
    names = []
    
    for i in x:
        if i['job'] == 'Director':
            name = i['name']
            names.append(name)
            
    if(names):
        return names
    
    return np.nan

In [18]:
# Function to get the director gender out of the crew
def get_director_gender(x):
    names = []
    
    for i in x:
        if i['job'] == 'Director':
            gender = i['gender']
            names.append(gender)
            
    if(names):
        return names
    
    return np.nan

In [19]:
def fm_percentage(results):
    fm_count = 0
    total_count = 0
    for x in results:
        if x['gender'] == 1:
            fm_count += 1
        if x['gender'] == 1 or x['gender'] == 2:
            total_count += 1
    
    if total_count == 0:        
        return np.nan
            
    return percentage_format(100 * fm_count/total_count)

In [20]:
# Function to get the producers out of the crew
def get_producers(x):
    names = []
    
    for i in x:
        if i['job'] == 'Producer':
            name = i['name']
            names.append(name)
        elif i['job'] == "Executive Producer":
            name = i['name']
            names.append(name)
        elif i['job'] == 'Co-Producer':
            name = i['name']
            names.append(name)
            
    if(names):
        return names
    
    return np.nan

In [21]:
# Function to get the screenplay writers out of the crew
def get_writers(x):
    names = []
    for i in x:
        if i['department'] == 'Writing':
            name = i['name']
            names.append(name)
            
    if(names):
        return names
    
    return np.nan

In [22]:
# Function to create cast list
def get_cast_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        
        # Condition for cast (keep top 5)
        if len(names) > 5:
            names = names[:5]
        return names

    return []

In [23]:
# Function to create lists of each feature
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        return names
        
    return []

In [24]:
# Function to get production companies
def get_production_companies(x):
    names = []
    if isinstance(x, list):
        for i in x:
            name = i['name']
            names.append(name)
        return names
        
    return []

In [25]:
# Get each production company's country of origin
def get_production_company_country(x):
    names = []
    if isinstance(x, list):
        for i in x:
            country = i['origin_country']
            names.append(country)
        return names
    return []

In [26]:
# Get list of all languages available for film
def get_languages(x):
    names = []
    for i in x:
        name = i['english_name']
#         iso = i['iso_639_1']
        names.append(name)
    
    if(names):
        return names
    
    return np.nan

In [27]:
# Create binary column for foreign language films
def original_language_binary(x):
    if (x == 'en'):
        return 0
    elif (x != 'en'):
        return 1
    elif (x == "" | x == " "):
        return 0
    else:
        return 0

In [28]:
def percentage_format(percentage):
    return "{:,.2f}%".format(percentage)

### CLEAN DATA FOR SOUP & CREATE SOUP: Used for Machine Learning 

In [29]:
# Convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else: 
            return ''

In [30]:
# Clean the overview column (by converting to lowercase)
def clean_overview(x):
    if isinstance(x, str):
        return x.lower()
    else: 
        return ''

In [31]:
# Create soup of words
def create_soup(x):
    space = ' '
    return (space.join(x['keywords_cleaned']) + space + space.join(x['cast_cleaned']) + space 
            + space.join(x['director_cleaned']) + space + space.join(x['producers_cleaned']) + space 
            + space.join(x['writers_cleaned']) + space + space.join(x['genres_cleaned']) + space 
            + space.join(x['production_companies_cleaned']))

In [32]:
# Create soup of words - 2 keywords, 2 genres & overview
def create_soup_overview(x):
    space = ' '
    return (space.join(x['keywords_cleaned']) + space + space.join(x['keywords_cleaned']) + space 
            + space.join(x['cast_cleaned']) + space + space.join(x['director_cleaned']) + space 
            + space.join(x['producers_cleaned']) + space + space.join(x['writers_cleaned']) + space 
            + space.join(x['genres_cleaned']) + space + space.join(x['genres_cleaned']) + space 
            + space.join(x['production_companies_cleaned']) + space + x['overview_cleaned'])

# CALL API

### Get Most Popular Movies JSON Results:

In [33]:
# Start the timer
start = start_timer()

In [34]:
most_popular_movies = get_most_popular_movies(api_key)
# print(most_popular_movies)

In [35]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  104.428939 seconds


### List of Movie IDs:

In [36]:
movie_ids = get_ids(most_popular_movies)
#print(movie_ids)

In [37]:
len(movie_ids)

4000

### List of Titles:

In [38]:
titles = get_titles(most_popular_movies)
# print(titles)

In [39]:
len(titles)

4000

### Print Individual Title:

In [40]:
print(get_title(most_popular_movies, 0))

Eternals


### Get Movie Details:

In [41]:
# Start the timer
start = start_timer()

In [42]:
details = get_movie_details(api_key, movie_ids)

In [43]:
pprint(details[0:10])

[{'adult': False,
  'backdrop_path': '/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg',
  'belongs_to_collection': None,
  'budget': 200000000,
  'genres': [{'id': 28, 'name': 'Action'},
             {'id': 12, 'name': 'Adventure'},
             {'id': 14, 'name': 'Fantasy'},
             {'id': 878, 'name': 'Science Fiction'}],
  'homepage': 'https://www.marvel.com/movies/the-eternals',
  'id': 524434,
  'imdb_id': 'tt9032400',
  'original_language': 'en',
  'original_title': 'Eternals',
  'overview': 'The Eternals are a team of ancient aliens who have been living '
              'on Earth in secret for thousands of years. When an unexpected '
              'tragedy forces them out of the shadows, they are forced to '
              'reunite against mankind’s most ancient enemy, the Deviants.',
  'popularity': 13124.352,
  'poster_path': '/b6qUu00iIIkXX13szFy7d0CyNcg.jpg',
  'production_companies': [{'id': 420,
                            'logo_path': '/hUzeosd33nzE5MCNsZxCGEKTXaQ.png',
             

In [44]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  1177.901674 seconds


### Get Keywords:

In [45]:
# Start the timer
start = start_timer()

In [46]:
keywords = get_movie_keywords(api_key, movie_ids)

In [47]:
# print(keywords)

In [48]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  739.83646 seconds


### Get Credits:

In [49]:
# Start the timer
start = start_timer()

In [50]:
credits = get_credits(api_key, movie_ids)

In [51]:
# print(credits)

In [52]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  718.748118 seconds


### Get Certifications:

In [53]:
# Start the timer
start = start_timer()

In [54]:
# Get the certifications for each movie (rating: G, PG, etc.)
certifications = get_certifications(api_key, movie_ids)
# certifications = get_certifications(api_key, [32657, 672582])

In [55]:
# print(certifications)

In [56]:
# Stop the timer
stop_timer(start)

Total Time Elapsed:  790.482216 seconds


## Movie Details DataFrame:

In [63]:
from io import StringIO
# Convert details to json
json_details_string = json.dumps(details)
# Convert json to dataframe
movie_details_df = pd.read_json(StringIO(json_details_string))

# Export to save
# movie_details_df.to_csv("./static/data/movie_details.csv", index=False)

movie_details_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,,200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,2021-11-03,401842256,156.0,"[{'english_name': 'Arabic', 'iso_639_1': 'ar',...",Released,In the beginning...,Eternals,False,7.3,3130
1,False,/ur19n7Zabzf7wfhK4MrlFDJiZPI.jpg,"{'id': 185103, 'name': 'Hotel Transylvania Col...",0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",https://www.hoteltmovie.com,585083,tt9848626,en,Hotel Transylvania: Transformania,...,2022-01-14,0,88.0,"[{'english_name': 'Danish', 'iso_639_1': 'da',...",Released,Change can be scary.,Hotel Transylvania: Transformania,False,7.8,763
2,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,2021-12-15,1631853496,148.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The Multiverse unleashed.,Spider-Man: No Way Home,False,8.4,4367
3,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,2021-12-01,215000000,110.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Where will your dreams take you?,Sing 2,False,8.3,1101
4,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,2021-11-24,215000000,102.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a little magic in all of us ...almost ...,Encanto,False,7.8,3154


In [64]:
# Extract year from release date
movie_details_df["year"] = pd.DatetimeIndex(movie_details_df['release_date']).year
# Drop any N/A from year
movie_details_df.dropna(inplace=True, how="any", subset=['year','release_date'])
 
# Convert to int
movie_details_df["year"] = movie_details_df["year"].astype(int)
    
# Rename columns to prepare for inflation calculation
movie_details_df = movie_details_df.rename(columns = {
    "revenue": "original_revenue",
    "budget": "original_budget"
})

In [65]:
len(movie_details_df)

3978

## Adjust Budget & Revenue for Inflation

In [66]:
# Create dataframe for cpi
cpi_df = pd.DataFrame(columns=['year', 'cpi_2021', 'cpi_old'])

In [67]:
# Create list of years (1913-2021)
years = []
for i in range(1913, 2022):
    years.append(i)

In [68]:
# Define 2021 cpi
cpi_2021 = 269.489

# Loop through years to append to dataframe
for year in years:
    if year != 2021:
        cpi_old = cpi.get(year)
        cpi_df = cpi_df.append({"year": year, 'cpi_2021': cpi_2021, 'cpi_old': cpi_old}, ignore_index=True)
    elif year == 2021:
        cpi_df = cpi_df.append({"year": year, 'cpi_2021': cpi_2021, 'cpi_old': cpi_2021}, ignore_index=True)

In [69]:
# Convert year type to int
cpi_df['year'] = cpi_df['year'].astype(int)
cpi_df.head()

Unnamed: 0,year,cpi_2021,cpi_old
0,1913,269.489,9.9
1,1914,269.489,10.0
2,1915,269.489,10.1
3,1916,269.489,10.9
4,1917,269.489,12.8


In [70]:
# Merge movie details df with cpi df
movie_details_df = movie_details_df.merge(cpi_df, on="year")
movie_details_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,cpi_2021,cpi_old
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,,200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,"[{'english_name': 'Arabic', 'iso_639_1': 'ar',...",Released,In the beginning...,Eternals,False,7.3,3130,2021,269.489,269.489
1,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The Multiverse unleashed.,Spider-Man: No Way Home,False,8.4,4367,2021,269.489,269.489
2,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Where will your dreams take you?,Sing 2,False,8.3,1101,2021,269.489,269.489
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a little magic in all of us ...almost ...,Encanto,False,7.8,3154,2021,269.489,269.489
4,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Discover the past. Protect the future.,Ghostbusters: Afterlife,False,7.7,1363,2021,269.489,269.489


In [71]:
# Create adjusted column for each feature
features = ["revenue", "budget"]

for feature in features:
    movie_details_df[f'adjusted_{feature}'] = (movie_details_df[f'original_{feature}'] * movie_details_df['cpi_2021']) / movie_details_df['cpi_old']

movie_details_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,year,cpi_2021,cpi_old,adjusted_revenue,adjusted_budget
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,,200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,In the beginning...,Eternals,False,7.3,3130,2021,269.489,269.489,401842300.0,200000000.0
1,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,The Multiverse unleashed.,Spider-Man: No Way Home,False,8.4,4367,2021,269.489,269.489,1631853000.0,200000000.0
2,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,Where will your dreams take you?,Sing 2,False,8.3,1101,2021,269.489,269.489,215000000.0,85000000.0
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,There's a little magic in all of us ...almost ...,Encanto,False,7.8,3154,2021,269.489,269.489,215000000.0,50000000.0
4,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,Discover the past. Protect the future.,Ghostbusters: Afterlife,False,7.7,1363,2021,269.489,269.489,191000000.0,75000000.0


In [72]:
# Check values
movie_details_df[['original_budget', 'adjusted_budget', 'original_revenue', 'adjusted_revenue', 'year']]

Unnamed: 0,original_budget,adjusted_budget,original_revenue,adjusted_revenue,year
0,200000000,2.000000e+08,401842256,4.018423e+08,2021
1,200000000,2.000000e+08,1631853496,1.631853e+09,2021
2,85000000,8.500000e+07,215000000,2.150000e+08,2021
3,50000000,5.000000e+07,215000000,2.150000e+08,2021
4,75000000,7.500000e+07,191000000,1.910000e+08,2021
...,...,...,...,...,...
3890,0,0.000000e+00,0,0.000000e+00,1969
3891,6500000,4.772966e+07,81974493,6.019407e+08,1969
3892,5500000,4.705363e+07,141195658,1.207958e+09,1965
3893,672000,1.393051e+07,10000000,2.072992e+08,1933


## Low Budget:

In [73]:
# Create budget bins
bins = [1, 15000000, 50000000, 150000000, 380000000] 
bin_names = ["1 to 15m", "16m to 50m", "51m to 150m", "151m to 380m"]

# Append a budget bin column
movie_details_df["budget_bins"] = pd.cut(movie_details_df["adjusted_budget"], bins, labels=bin_names)
movie_details_df["budget_bins"].value_counts()

16m to 50m      709
51m to 150m     694
1 to 15m        520
151m to 380m    314
Name: budget_bins, dtype: int64

In [74]:
# Low Budget
movie_details_df.loc[movie_details_df["budget_bins"] == "1 to 15m"].head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,title,video,vote_average,vote_count,year,cpi_2021,cpi_old,adjusted_revenue,adjusted_budget,budget_bins
18,False,/xGrTm3J0FTafmuQ85vF7ZCw94x6.jpg,,9100000,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,589761,tt10648714,ru,Чернобыль,...,Chernobyl: Abyss,False,6.3,269,2021,269.489,269.489,5370393.0,9100000.0,1 to 15m
24,False,/7w06baRS9VPm5RYz8lawTCLiR4j.jpg,,13000000,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,592508,tt9531772,hi,Sooryavanshi,...,Sooryavanshi,False,5.9,71,2021,269.489,269.489,37700000.0,13000000.0,1 to 15m
38,False,/lV3UFPPxDIPelh46G9oySXN9Mcz.jpg,"{'id': 702624, 'name': 'After Collection', 'po...",14000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,744275,tt13069986,en,After We Fell,...,After We Fell,False,7.2,1362,2021,269.489,269.489,19000000.0,14000000.0,1 to 15m
42,False,/weneJTnAb1IFI94SKcaXzBFmPKH.jpg,,12400000,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",,818192,tt11388416,en,Ida Red,...,Ida Red,False,6.0,20,2021,269.489,269.489,0.0,12400000.0,1 to 15m
50,False,/hAv1GwwatyWV1RFXOfaASxgUVm4.jpg,,2800000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,657644,tt7268738,ml,മിന്നൽ മുരളി,...,Minnal Murali,False,7.1,146,2021,269.489,269.489,0.0,2800000.0,1 to 15m


## Keywords DataFrame:

In [75]:
# Convert keywords to json
json_keywords_string = json.dumps(keywords)
# Convert json to dataframe
movie_keywords_df = pd.read_json(json_keywords_string)

# Export to save
# movie_keywords_df.to_csv("./static/data/movie_keywords.csv", index=False)
# pprint(json_keywords_string)

## Credits DataFrame:

In [76]:
# Convert credits to json
json_credits_string = json.dumps(credits)
# Convert json to dataframe
movie_credits_df = pd.read_json(json_credits_string)

# Export to save
# movie_credits_df.to_csv("./static/data/movie_credits.csv", index=False)
movie_credits_df.head()

Unnamed: 0,id,cast,crew
0,524434,"[{'adult': False, 'gender': 1, 'id': 97576, 'k...","[{'adult': False, 'gender': 2, 'id': 1722, 'kn..."
1,585083,"[{'adult': False, 'gender': 1, 'id': 77948, 'k...","[{'adult': False, 'gender': 2, 'id': 5666, 'kn..."
2,634649,"[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn..."
3,438695,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn..."
4,568124,"[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn..."


In [77]:
movie_credits_df["cast_crew"] = movie_credits_df["cast"] + movie_credits_df["crew"]
movie_credits_df

Unnamed: 0,id,cast,crew,cast_crew
0,524434,"[{'adult': False, 'gender': 1, 'id': 97576, 'k...","[{'adult': False, 'gender': 2, 'id': 1722, 'kn...","[{'adult': False, 'gender': 1, 'id': 97576, 'k..."
1,585083,"[{'adult': False, 'gender': 1, 'id': 77948, 'k...","[{'adult': False, 'gender': 2, 'id': 5666, 'kn...","[{'adult': False, 'gender': 1, 'id': 77948, 'k..."
2,634649,"[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn...","[{'adult': False, 'gender': 2, 'id': 1136406, ..."
3,438695,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn...","[{'adult': False, 'gender': 2, 'id': 10297, 'k..."
4,568124,"[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn...","[{'adult': False, 'gender': 1, 'id': 968367, '..."
...,...,...,...,...
3995,27,"[{'adult': False, 'gender': 2, 'id': 176, 'kno...","[{'adult': False, 'gender': 2, 'id': 172, 'kno...","[{'adult': False, 'gender': 2, 'id': 176, 'kno..."
3996,9654,"[{'adult': False, 'gender': 2, 'id': 13240, 'k...","[{'adult': False, 'gender': 2, 'id': 497, 'kno...","[{'adult': False, 'gender': 2, 'id': 13240, 'k..."
3997,658009,"[{'adult': False, 'gender': 2, 'id': 130597, '...","[{'adult': False, 'gender': 2, 'id': 2578785, ...","[{'adult': False, 'gender': 2, 'id': 130597, '..."
3998,1635,"[{'adult': False, 'gender': 2, 'id': 3061, 'kn...","[{'adult': False, 'gender': 2, 'id': 947, 'kno...","[{'adult': False, 'gender': 2, 'id': 3061, 'kn..."


In [78]:
movie_credits_df['percent_fm'] = movie_credits_df['cast_crew'].apply(fm_percentage)

In [79]:
# cast_crew_fm_percentage
movie_credits_df = movie_credits_df.drop(columns=['cast_crew'])
movie_credits_df

Unnamed: 0,id,cast,crew,percent_fm
0,524434,"[{'adult': False, 'gender': 1, 'id': 97576, 'k...","[{'adult': False, 'gender': 2, 'id': 1722, 'kn...",27.54%
1,585083,"[{'adult': False, 'gender': 1, 'id': 77948, 'k...","[{'adult': False, 'gender': 2, 'id': 5666, 'kn...",40.38%
2,634649,"[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn...",25.26%
3,438695,"[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn...",25.00%
4,568124,"[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn...",42.86%
...,...,...,...,...
3995,27,"[{'adult': False, 'gender': 2, 'id': 176, 'kno...","[{'adult': False, 'gender': 2, 'id': 172, 'kno...",9.09%
3996,9654,"[{'adult': False, 'gender': 2, 'id': 13240, 'k...","[{'adult': False, 'gender': 2, 'id': 497, 'kno...",22.00%
3997,658009,"[{'adult': False, 'gender': 2, 'id': 130597, '...","[{'adult': False, 'gender': 2, 'id': 2578785, ...",28.57%
3998,1635,"[{'adult': False, 'gender': 2, 'id': 3061, 'kn...","[{'adult': False, 'gender': 2, 'id': 947, 'kno...",19.79%


## Certifications DataFrame:

In [81]:
# Convert certifications to json
json_certifications_string = json.dumps(certifications)
# Convert json to dataframe
movie_certifications_df = pd.read_json(StringIO(json_certifications_string))

# Export to save
# movie_certifications_df.to_csv("./static/data/movie_certifications.csv", index=False)
movie_certifications_df.head()

Unnamed: 0,id,results
0,524434,"[{'iso_3166_1': 'IE', 'release_dates': [{'cert..."
1,585083,"[{'iso_3166_1': 'FR', 'release_dates': [{'cert..."
2,634649,"[{'iso_3166_1': 'NZ', 'release_dates': [{'cert..."
3,438695,"[{'iso_3166_1': 'PL', 'release_dates': [{'cert..."
4,568124,"[{'iso_3166_1': 'FR', 'release_dates': [{'cert..."


In [82]:
# Extract certification information and append to new column
movie_certifications_df['certification'] = movie_certifications_df['results'].apply(extract_certification)
movie_certifications_df = movie_certifications_df[['id', 'certification']]

In [83]:
movie_certifications_df.head()

Unnamed: 0,id,certification
0,524434,
1,585083,PG
2,634649,PG-13
3,438695,
4,568124,


In [84]:
# Convert ids to ints (to merge correctly)
movie_keywords_df['id'] = movie_keywords_df['id'].astype('int')
movie_details_df['id'] = movie_details_df['id'].astype('int')
movie_credits_df['id'] = movie_credits_df['id'].astype('int')
movie_certifications_df['id'] = movie_certifications_df['id'].astype('int')

In [85]:
# Merge keywords with details
movie_df = movie_details_df.merge(movie_keywords_df, on='id')
movie_df = movie_df.merge(movie_credits_df, on='id')
movie_df = movie_df.merge(movie_certifications_df, on='id')

# Export to save
# movie_df.to_csv("./static/data/movies_merged.csv", index=False)

In [86]:
movie_df.head(20)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,cpi_2021,cpi_old,adjusted_revenue,adjusted_budget,budget_bins,keywords,cast,crew,percent_fm,certification
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,,200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,269.489,269.489,401842300.0,200000000.0,151m to 380m,"[{'id': 3925, 'name': 'boredom'}, {'id': 6152,...","[{'adult': False, 'gender': 1, 'id': 97576, 'k...","[{'adult': False, 'gender': 2, 'id': 1722, 'kn...",27.54%,
1,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,269.489,269.489,1631853000.0,200000000.0,151m to 380m,"[{'id': 1701, 'name': 'hero'}, {'id': 5451, 'n...","[{'adult': False, 'gender': 2, 'id': 1136406, ...","[{'adult': False, 'gender': 1, 'id': 2519, 'kn...",25.26%,PG-13
2,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,269.489,269.489,215000000.0,85000000.0,51m to 150m,"[{'id': 11477, 'name': 'anthropomorphism'}, {'...","[{'adult': False, 'gender': 2, 'id': 10297, 'k...","[{'adult': False, 'gender': 2, 'id': 5720, 'kn...",25.00%,
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,269.489,269.489,215000000.0,50000000.0,16m to 50m,"[{'id': 2343, 'name': 'magic'}, {'id': 4344, '...","[{'adult': False, 'gender': 1, 'id': 968367, '...","[{'adult': False, 'gender': 0, 'id': 8159, 'kn...",42.86%,
4,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,269.489,269.489,191000000.0,75000000.0,51m to 150m,"[{'id': 1415, 'name': 'small town'}, {'id': 30...","[{'adult': False, 'gender': 1, 'id': 1308445, ...","[{'adult': False, 'gender': 2, 'id': 561, 'kno...",25.40%,
5,False,/yfNbZ34Yt2S0DdgGH38bLDhJPiM.jpg,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,860623,tt12335692,en,Last Man Down,...,269.489,269.489,0.0,0.0,,[],"[{'adult': False, 'gender': 2, 'id': 1481440, ...","[{'adult': False, 'gender': 2, 'id': 1481440, ...",50.00%,R
6,False,/o76ZDm8PS9791XiuieNB93UZcRV.jpg,,40000000,"[{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...",https://www.residentevil.movie,460458,tt6920084,en,Resident Evil: Welcome to Raccoon City,...,269.489,269.489,31000000.0,40000000.0,16m to 50m,"[{'id': 1852, 'name': 'mutant'}, {'id': 1865, ...","[{'adult': False, 'gender': 1, 'id': 115150, '...","[{'adult': False, 'gender': 2, 'id': 4014, 'kn...",31.58%,R
7,False,/eNI7PtK6DEYgZmHWP9gQNuff8pv.jpg,"{'id': 2344, 'name': 'The Matrix Collection', ...",190000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.whatisthematrix.com,624860,tt10838180,en,The Matrix Resurrections,...,269.489,269.489,148000000.0,190000000.0,151m to 380m,"[{'id': 310, 'name': 'artificial intelligence'...","[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 1071, 'kn...",25.86%,
8,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,"{'id': 558216, 'name': 'Venom Collection', 'po...",110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,580489,tt7097896,en,Venom: Let There Be Carnage,...,269.489,269.489,500000000.0,110000000.0,51m to 150m,"[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n...","[{'adult': False, 'gender': 2, 'id': 2524, 'kn...","[{'adult': False, 'gender': 2, 'id': 149, 'kno...",23.08%,PG-13
9,False,/AmLpWYm9R3Ur2FLPgj5CH3wR8wp.jpg,,0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.netflix.com/title/81038410,739413,tt13029044,en,Mother/Android,...,269.489,269.489,0.0,0.0,,"[{'id': 310, 'name': 'artificial intelligence'...","[{'adult': False, 'gender': 1, 'id': 56734, 'k...","[{'adult': False, 'gender': 2, 'id': 32278, 'k...",34.48%,R


## Call Functions to Extract Details:

In [87]:
# Director
movie_df['director'] = movie_df['crew'].apply(get_director)

# Director Gender
movie_df['director_gender'] = movie_df['crew'].apply(get_director_gender)

# Producers
movie_df['producers'] = movie_df['crew'].apply(get_producers)

# Screenplay writers
movie_df['writers'] = movie_df['crew'].apply(get_writers)

# Cast
movie_df['cast'] = movie_df['cast'].apply(get_cast_list)

# Production Company Country of Origin
movie_df['production_company_origin'] = movie_df['production_companies'].apply(get_production_company_country)

# Spoken Languages in Movie
movie_df['spoken_languages']  = movie_df['spoken_languages'].apply(get_languages)

# Create foreign language column for if the original language was not English
movie_df['foreign_language'] = movie_df['original_language'].apply(original_language_binary)

# Get List: Keywords, genres, & production countries
features = ['keywords', 'genres', 'production_companies']
            
for feature in features:
    movie_df[feature] = movie_df[feature].apply(get_list)

In [88]:
# Show the results
movie_df[['title', 'cast', 'director', 'director_gender', 'percent_fm', 'producers', 'writers', 'keywords', 'genres', 'production_companies', 'production_company_origin', 'spoken_languages', 'original_language', 'foreign_language']].head()

Unnamed: 0,title,cast,director,director_gender,percent_fm,producers,writers,keywords,genres,production_companies,production_company_origin,spoken_languages,original_language,foreign_language
0,Eternals,"[Gemma Chan, Richard Madden, Angelina Jolie, K...",[Chloé Zhao],[1],27.54%,"[Kevin Feige, Louis D'Esposito, Victoria Alons...","[Jack Kirby, Patrick Burleigh, Ryan Firpo, Rya...","[boredom, supernatural, superhero, based on co...","[Action, Adventure, Fantasy, Science Fiction]",[Marvel Studios],[US],"[Arabic, English, Hindi, Latin, Spanish]",en,0
1,Spider-Man: No Way Home,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",[Jon Watts],[2],25.26%,"[Avi Arad, Kevin Feige, Louis D'Esposito, JoAn...","[Stan Lee, Stan Lee, Steve Ditko, Steve Ditko,...","[hero, comic book, superhero, based on comic, ...","[Action, Adventure, Science Fiction]","[Marvel Studios, Pascal Pictures, Columbia Pic...","[US, US, US]","[English, Tagalog]",en,0
2,Sing 2,"[Matthew McConaughey, Reese Witherspoon, Scarl...",[Garth Jennings],[2],25.00%,"[Christopher Meledandri, Janet Healy, Dana Kru...","[Garth Jennings, Garth Jennings]","[anthropomorphism, singing]","[Animation, Comedy, Family, Music]","[Illumination Entertainment, Universal Pictures]","[US, US]",[English],en,0
3,Encanto,"[Stephanie Beatriz, María Cecilia Botero, John...","[Byron Howard, Jared Bush]","[2, 2]",42.86%,"[Clark Spencer, Jennifer Lee, Yvett Merino Flo...","[Jared Bush, Charise Castro Smith]","[magic, musical, forest, family relationships,...","[Animation, Comedy, Family, Fantasy]","[Walt Disney Animation Studios, Walt Disney Pi...","[US, US]","[English, Spanish]",en,0
4,Ghostbusters: Afterlife,"[Carrie Coon, Finn Wolfhard, Mckenna Grace, Pa...",[Jason Reitman],[2],25.40%,"[Dan Aykroyd, Ivan Reitman, Gil Kenan, Michael...","[Dan Aykroyd, Harold Ramis, Jason Reitman, Gil...","[small town, ghostbuster, nostalgia, afterlife...","[Comedy, Fantasy, Adventure]","[Columbia Pictures, Bron Studios, The Montecit...","[US, CA, US, US]","[English, Ukrainian]",en,0


In [89]:
# Drop NA from selected columns
movie_df.dropna(inplace=True, how="any", subset=['genres', 'production_companies', 'keywords', 'cast'])
movie_df.dropna(inplace=True, how="any", subset=['director', 'producers', 'writers'])

In [90]:
len(movie_df)

3308

In [91]:
# Clean Data: Convert all features to lowercase and remove spaces
features = ['cast', 'director', 'producers', 'writers', 'keywords', 'genres', 'production_companies']

for feature in features:
    movie_df[f"{feature}_cleaned"] = movie_df[feature].apply(clean_data)

In [92]:
# Call the clean overview function
movie_df['overview_cleaned'] = movie_df['overview'].apply(clean_overview)

In [93]:
# Create soup columns
movie_df['soup'] = movie_df.apply(create_soup, axis = 1)
movie_df['soup_overview'] = movie_df.apply(create_soup_overview, axis = 1)

# Export to save
# movie_df.to_csv("./static/data/movies_cleaned_soup.csv", index=False)

In [94]:
movie_df[['soup']].head(5)

Unnamed: 0,soup
0,boredom supernatural superhero basedoncomic su...
1,hero comicbook superhero basedoncomic marvelci...
2,anthropomorphism singing matthewmcconaughey re...
3,magic musical forest familyrelationships femal...
4,smalltown ghostbuster nostalgia afterlife sequ...


In [95]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3308 entries, 0 to 3894
Data columns (total 52 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   adult                         3308 non-null   bool    
 1   backdrop_path                 3281 non-null   object  
 2   belongs_to_collection         1231 non-null   object  
 3   original_budget               3308 non-null   int64   
 4   genres                        3308 non-null   object  
 5   homepage                      3308 non-null   object  
 6   id                            3308 non-null   int64   
 7   imdb_id                       3303 non-null   object  
 8   original_language             3308 non-null   object  
 9   original_title                3308 non-null   object  
 10  overview                      3308 non-null   object  
 11  popularity                    3308 non-null   float64 
 12  poster_path                   3307 non-null   ob

## Configuration for Posters:

In [96]:
configuration_url = f"https://api.themoviedb.org/3/configuration?api_key={api_key}"
config_response = requests.get(configuration_url).json()
# config_response

In [97]:
# Get images structure
images_url = config_response['images']
# Get Base URL
secure_base_url = images_url['secure_base_url']
# secure_base_url

In [98]:
# Get the size of poster: 2: w185px, 4: w500px, 5: 780px
# images_url['poster_sizes']
poster_size = images_url['poster_sizes'][5]
# poster_size

In [99]:
# Copy poster paths to new df
poster_df = movie_df[['poster_path']].copy()

In [100]:
# Create column with full image path for posters
poster_df['poster_url'] = secure_base_url + poster_size + poster_df['poster_path']

In [101]:
# Export to separate csv
# poster_df.to_csv("./static/data/poster_path.csv", index=False)
poster_df.head()

Unnamed: 0,poster_path,poster_url
0,/b6qUu00iIIkXX13szFy7d0CyNcg.jpg,https://image.tmdb.org/t/p/w780/b6qUu00iIIkXX1...
1,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY1...
2,/aWeKITRFbbwY8txG5uCj4rMCfSP.jpg,https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8t...
3,/4j0PNHkMr5ax3IA8tjtxcmPU3QT.jpg,https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3I...
4,/sg4xJaufDiQl7caFEskBtQXfD4x.jpg,https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7c...


In [102]:
# Create column with full image path for posters (in real df this time)
movie_df['poster_url'] = secure_base_url + poster_size + movie_df['poster_path']

# TESTING

## Export CSV:

In [103]:
# Create a lowercase column for easier search
movie_df["lowercase_title"] = movie_df['title'].apply(lambda x: x.lower())

In [104]:
# # Save file - used for calling for information
movie_df.to_csv("../static/data/movie_db.csv", index=False)

In [105]:
movie_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,producers_cleaned,writers_cleaned,keywords_cleaned,genres_cleaned,production_companies_cleaned,overview_cleaned,soup,soup_overview,poster_url,lowercase_title
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,,200000000,"[Action, Adventure, Fantasy, Science Fiction]",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,"[kevinfeige, louisd'esposito, victoriaalonso, ...","[jackkirby, patrickburleigh, ryanfirpo, ryanfi...","[boredom, supernatural, superhero, basedoncomi...","[action, adventure, fantasy, sciencefiction]",[marvelstudios],the eternals are a team of ancient aliens who ...,boredom supernatural superhero basedoncomic su...,boredom supernatural superhero basedoncomic su...,https://image.tmdb.org/t/p/w780/b6qUu00iIIkXX1...,eternals
1,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[Action, Adventure, Science Fiction]",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,"[aviarad, kevinfeige, louisd'esposito, joannpe...","[stanlee, stanlee, steveditko, steveditko, chr...","[hero, comicbook, superhero, basedoncomic, mar...","[action, adventure, sciencefiction]","[marvelstudios, pascalpictures, columbiapictures]",peter parker is unmasked and no longer able to...,hero comicbook superhero basedoncomic marvelci...,hero comicbook superhero basedoncomic marvelci...,https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY1...,spider-man: no way home
2,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[Animation, Comedy, Family, Music]",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,"[christophermeledandri, janethealy, danakrupin...","[garthjennings, garthjennings]","[anthropomorphism, singing]","[animation, comedy, family, music]","[illuminationentertainment, universalpictures]",buster and his new cast now have their sights ...,anthropomorphism singing matthewmcconaughey re...,anthropomorphism singing anthropomorphism sing...,https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8t...,sing 2
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[Animation, Comedy, Family, Fantasy]",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,"[clarkspencer, jenniferlee, yvettmerinoflores]","[jaredbush, charisecastrosmith]","[magic, musical, forest, familyrelationships, ...","[animation, comedy, family, fantasy]","[waltdisneyanimationstudios, waltdisneypictures]","the tale of an extraordinary family, the madri...",magic musical forest familyrelationships femal...,magic musical forest familyrelationships femal...,https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3I...,encanto
4,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[Comedy, Fantasy, Adventure]",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,"[danaykroyd, ivanreitman, gilkenan, michaelbeu...","[danaykroyd, haroldramis, jasonreitman, gilkenan]","[smalltown, ghostbuster, nostalgia, afterlife,...","[comedy, fantasy, adventure]","[columbiapictures, bronstudios, themontecitopi...",when a single mom and her two kids arrive in a...,smalltown ghostbuster nostalgia afterlife sequ...,smalltown ghostbuster nostalgia afterlife sequ...,https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7c...,ghostbusters: afterlife


# Machine Learning Recommender:

In [106]:
from sklearn.feature_extraction.text import CountVectorizer

# Use Count Vectorizer to create counts for each word
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movie_df['soup_overview'])

In [107]:
count_matrix.shape

(3308, 46996)

In [108]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [109]:
# Reset the index
# movie_df = movie_df.reset_index()
# Create series with index & titles of movies
indices = pd.Series(movie_df.index, index=movie_df['title']).drop_duplicates()

In [110]:
movie_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,producers_cleaned,writers_cleaned,keywords_cleaned,genres_cleaned,production_companies_cleaned,overview_cleaned,soup,soup_overview,poster_url,lowercase_title
0,False,/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg,,200000000,"[Action, Adventure, Fantasy, Science Fiction]",https://www.marvel.com/movies/the-eternals,524434,tt9032400,en,Eternals,...,"[kevinfeige, louisd'esposito, victoriaalonso, ...","[jackkirby, patrickburleigh, ryanfirpo, ryanfi...","[boredom, supernatural, superhero, basedoncomi...","[action, adventure, fantasy, sciencefiction]",[marvelstudios],the eternals are a team of ancient aliens who ...,boredom supernatural superhero basedoncomic su...,boredom supernatural superhero basedoncomic su...,https://image.tmdb.org/t/p/w780/b6qUu00iIIkXX1...,eternals
1,False,/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg,"{'id': 531241, 'name': 'Spider-Man (Avengers) ...",200000000,"[Action, Adventure, Science Fiction]",https://www.spidermannowayhome.movie,634649,tt10872600,en,Spider-Man: No Way Home,...,"[aviarad, kevinfeige, louisd'esposito, joannpe...","[stanlee, stanlee, steveditko, steveditko, chr...","[hero, comicbook, superhero, basedoncomic, mar...","[action, adventure, sciencefiction]","[marvelstudios, pascalpictures, columbiapictures]",peter parker is unmasked and no longer able to...,hero comicbook superhero basedoncomic marvelci...,hero comicbook superhero basedoncomic marvelci...,https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY1...,spider-man: no way home
2,False,/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg,"{'id': 544670, 'name': 'Sing Collection', 'pos...",85000000,"[Animation, Comedy, Family, Music]",https://www.illumination.com/movie/sing-2/,438695,tt6467266,en,Sing 2,...,"[christophermeledandri, janethealy, danakrupin...","[garthjennings, garthjennings]","[anthropomorphism, singing]","[animation, comedy, family, music]","[illuminationentertainment, universalpictures]",buster and his new cast now have their sights ...,anthropomorphism singing matthewmcconaughey re...,anthropomorphism singing anthropomorphism sing...,https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8t...,sing 2
3,False,/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg,,50000000,"[Animation, Comedy, Family, Fantasy]",https://movies.disney.com/encanto,568124,tt2953050,en,Encanto,...,"[clarkspencer, jenniferlee, yvettmerinoflores]","[jaredbush, charisecastrosmith]","[magic, musical, forest, familyrelationships, ...","[animation, comedy, family, fantasy]","[waltdisneyanimationstudios, waltdisneypictures]","the tale of an extraordinary family, the madri...",magic musical forest familyrelationships femal...,magic musical forest familyrelationships femal...,https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3I...,encanto
4,False,/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg,"{'id': 2980, 'name': 'Ghostbusters Collection'...",75000000,"[Comedy, Fantasy, Adventure]",https://ghostbusters.com/,425909,tt4513678,en,Ghostbusters: Afterlife,...,"[danaykroyd, ivanreitman, gilkenan, michaelbeu...","[danaykroyd, haroldramis, jasonreitman, gilkenan]","[smalltown, ghostbuster, nostalgia, afterlife,...","[comedy, fantasy, adventure]","[columbiapictures, bronstudios, themontecitopi...",when a single mom and her two kids arrive in a...,smalltown ghostbuster nostalgia afterlife sequ...,smalltown ghostbuster nostalgia afterlife sequ...,https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7c...,ghostbusters: afterlife


## Function to Recommend Titles:

In [111]:
def get_similarity_scores(title, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the most similar movies
    sim_scores = sim_scores[1:11]
#     print(sim_scores)
    
    # Convert list to DataFrame
    sim_scores_df = pd.DataFrame(sim_scores, columns = ["index", "similarity_score"])
    
    # Return top 10 most similar scores
    return sim_scores_df

In [112]:
def get_recommendations(original_df, score_df):
    original_df = original_df.merge(score_df, on="index")
    original_df = original_df.sort_values("similarity_score", ascending=False)
    
    return original_df

In [113]:
# Test out function
movie_title = "Get Out"
similarity_scores_df = get_similarity_scores(movie_title, cosine_sim)

### Convert columns to list for SQL Database connection

In [114]:
movie_df_columns = list(movie_df.columns)
movie_df_columns

['adult',
 'backdrop_path',
 'belongs_to_collection',
 'original_budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'original_revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'year',
 'cpi_2021',
 'cpi_old',
 'adjusted_revenue',
 'adjusted_budget',
 'budget_bins',
 'keywords',
 'cast',
 'crew',
 'percent_fm',
 'certification',
 'director',
 'director_gender',
 'producers',
 'writers',
 'production_company_origin',
 'foreign_language',
 'cast_cleaned',
 'director_cleaned',
 'producers_cleaned',
 'writers_cleaned',
 'keywords_cleaned',
 'genres_cleaned',
 'production_companies_cleaned',
 'overview_cleaned',
 'soup',
 'soup_overview',
 'poster_url',
 'lowercase_title']

In [115]:
for column_name in list(movie_df.columns):
    movie_df[column_name] = movie_df[column_name].apply(json.dumps)

In [116]:
movie_dict = movie_df.applymap(lambda x: isinstance(x, (dict, list))).all()
print(movie_dict)

adult                           False
backdrop_path                   False
belongs_to_collection           False
original_budget                 False
genres                          False
homepage                        False
id                              False
imdb_id                         False
original_language               False
original_title                  False
overview                        False
popularity                      False
poster_path                     False
production_companies            False
production_countries            False
release_date                    False
original_revenue                False
runtime                         False
spoken_languages                False
status                          False
tagline                         False
title                           False
video                           False
vote_average                    False
vote_count                      False
year                            False
cpi_2021    

## Database:

In [117]:
from sqlalchemy import create_engine, inspect

In [118]:
# configure the connection string
rds_connection_string = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'
        
# connect to the database
engine = create_engine(rds_connection_string)
conn = engine.connect()

In [119]:
engine.execute("DROP TABLE IF EXISTS movies")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fb26e673e10>

In [120]:
# Append data to table
movie_df.to_sql(name='movies', con=conn, if_exists='append', index=False)

In [121]:
# Use inspector to find table names
Inspector = inspect(engine)
Inspector.get_table_names()

['duplicate_search',
 'no_filter',
 'female_filter',
 'international_filter',
 'low_budget_filter',
 'movies']

In [122]:
# Check movies table
pd.read_sql_query('select * from movies', con=conn)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,original_budget,genres,homepage,id,imdb_id,original_language,original_title,...,producers_cleaned,writers_cleaned,keywords_cleaned,genres_cleaned,production_companies_cleaned,overview_cleaned,soup,soup_overview,poster_url,lowercase_title
0,false,"""/c6H7Z4u73ir3cIoCteuhJh7UCAR.jpg""",,200000000,"[""Action"", ""Adventure"", ""Fantasy"", ""Science Fi...","""https://www.marvel.com/movies/the-eternals""",524434,"""tt9032400""","""en""","""Eternals""",...,"[""kevinfeige"", ""louisd'esposito"", ""victoriaalo...","[""jackkirby"", ""patrickburleigh"", ""ryanfirpo"", ...","[""boredom"", ""supernatural"", ""superhero"", ""base...","[""action"", ""adventure"", ""fantasy"", ""sciencefic...","[""marvelstudios""]","""the eternals are a team of ancient aliens who...","""boredom supernatural superhero basedoncomic s...","""boredom supernatural superhero basedoncomic s...","""https://image.tmdb.org/t/p/w780/b6qUu00iIIkXX...","""eternals"""
1,false,"""/1Rr5SrvHxMXHu5RjKpaMba8VTzi.jpg""","{""id"": 531241, ""name"": ""Spider-Man (Avengers) ...",200000000,"[""Action"", ""Adventure"", ""Science Fiction""]","""https://www.spidermannowayhome.movie""",634649,"""tt10872600""","""en""","""Spider-Man: No Way Home""",...,"[""aviarad"", ""kevinfeige"", ""louisd'esposito"", ""...","[""stanlee"", ""stanlee"", ""steveditko"", ""stevedit...","[""hero"", ""comicbook"", ""superhero"", ""basedoncom...","[""action"", ""adventure"", ""sciencefiction""]","[""marvelstudios"", ""pascalpictures"", ""columbiap...","""peter parker is unmasked and no longer able t...","""hero comicbook superhero basedoncomic marvelc...","""hero comicbook superhero basedoncomic marvelc...","""https://image.tmdb.org/t/p/w780/1g0dhYtq4irTY...","""spider-man: no way home"""
2,false,"""/tutaKitJJIaqZPyMz7rxrhb4Yxm.jpg""","{""id"": 544670, ""name"": ""Sing Collection"", ""pos...",85000000,"[""Animation"", ""Comedy"", ""Family"", ""Music""]","""https://www.illumination.com/movie/sing-2/""",438695,"""tt6467266""","""en""","""Sing 2""",...,"[""christophermeledandri"", ""janethealy"", ""danak...","[""garthjennings"", ""garthjennings""]","[""anthropomorphism"", ""singing""]","[""animation"", ""comedy"", ""family"", ""music""]","[""illuminationentertainment"", ""universalpictur...","""buster and his new cast now have their sights...","""anthropomorphism singing matthewmcconaughey r...","""anthropomorphism singing anthropomorphism sin...","""https://image.tmdb.org/t/p/w780/aWeKITRFbbwY8...","""sing 2"""
3,false,"""/3G1Q5xF40HkUBJXxt2DQgQzKTp5.jpg""",,50000000,"[""Animation"", ""Comedy"", ""Family"", ""Fantasy""]","""https://movies.disney.com/encanto""",568124,"""tt2953050""","""en""","""Encanto""",...,"[""clarkspencer"", ""jenniferlee"", ""yvettmerinofl...","[""jaredbush"", ""charisecastrosmith""]","[""magic"", ""musical"", ""forest"", ""familyrelation...","[""animation"", ""comedy"", ""family"", ""fantasy""]","[""waltdisneyanimationstudios"", ""waltdisneypict...","""the tale of an extraordinary family, the madr...","""magic musical forest familyrelationships fema...","""magic musical forest familyrelationships fema...","""https://image.tmdb.org/t/p/w780/4j0PNHkMr5ax3...","""encanto"""
4,false,"""/EnDlndEvw6Ptpp8HIwmRcSSNKQ.jpg""","{""id"": 2980, ""name"": ""Ghostbusters Collection""...",75000000,"[""Comedy"", ""Fantasy"", ""Adventure""]","""https://ghostbusters.com/""",425909,"""tt4513678""","""en""","""Ghostbusters: Afterlife""",...,"[""danaykroyd"", ""ivanreitman"", ""gilkenan"", ""mic...","[""danaykroyd"", ""haroldramis"", ""jasonreitman"", ...","[""smalltown"", ""ghostbuster"", ""nostalgia"", ""aft...","[""comedy"", ""fantasy"", ""adventure""]","[""columbiapictures"", ""bronstudios"", ""themontec...","""when a single mom and her two kids arrive in ...","""smalltown ghostbuster nostalgia afterlife seq...","""smalltown ghostbuster nostalgia afterlife seq...","""https://image.tmdb.org/t/p/w780/sg4xJaufDiQl7...","""ghostbusters: afterlife"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3303,false,"""/dApZi5U56RxvcFN6zeUJ3wSfONN.jpg""",,0,"[""Horror"", ""Science Fiction""]","""""",39020,"""tt0062433""","""es""","""La venganza del sexo""",...,"[""emiliovieyra"", ""orestestrucco""]","[""emiliovieyra"", ""emiliovieyra""]","[""madscientist""]","[""horror"", ""sciencefiction""]","[""productoresargentinosasociados""]","""a doctor kidnaps young couples and performs b...","""madscientist ricardobauleo gloriaprat aldobar...","""madscientist madscientist ricardobauleo glori...","""https://image.tmdb.org/t/p/w780/h10JuE2o3Rr7O...","""the curious dr. humpp"""
3304,false,"""/1kdyLiE2Mqt7cxnivdPX3qckvfW.jpg""","{""id"": 645, ""name"": ""James Bond Collection"", ""...",6500000,"[""Adventure"", ""Action"", ""Thriller""]","""https://www.mgm.com/movies/on-her-majestys-se...",668,"""tt0064757""","""en""","""On Her Majesty's Secret Service""",...,"[""albertr.broccoli"", ""harrysaltzman"", ""stanley...","[""ianfleming"", ""richardmaibaum""]","[""london,england"", ""suicide"", ""england"", ""base...","[""adventure"", ""action"", ""thriller""]","[""eonproductions"", ""unitedartists""]","""james bond tracks his archnemesis, ernst blof...","""london,england suicide england basedonnovelor...","""london,england suicide england basedonnovelor...","""https://image.tmdb.org/t/p/w780/iLAp1ODaZ8lbB...","""on her majesty's secret service"""
3305,false,"""/7mn0IFuByr9bsmlprotTdeO71Km.jpg""","{""id"": 645, ""name"": ""James Bond Collection"", ""...",5500000,"[""Adventure"", ""Action"", ""Thriller""]","""https://www.mgm.com/movies/thunderball""",660,"""tt0059800""","""en""","""Thunderball""",...,"[""kevinmcclory""]","[""ianfleming"", ""richardmaibaum"", ""kevinmcclory...","[""paris,france"", ""spy"", ""sea"", ""florida"", ""fig...","[""adventure"", ""action"", ""thriller""]","[""eonproductions"", ""danjaq"", ""unitedartists""]","""a criminal organization has obtained two nucl...","""paris,france spy sea florida fighterpilot san...","""paris,france spy sea florida fighterpilot san...","""https://image.tmdb.org/t/p/w780/oJ5ybB57eidPL...","""thunderball"""
3306,false,"""/lYfHa1AtkUMtplrKx7SRLHpwonW.jpg""","{""id"": 135495, ""name"": ""King Kong (1933) Colle...",672000,"[""Adventure"", ""Horror"", ""Fantasy""]","""""",244,"""tt0024216""","""en""","""King Kong""",...,"[""davido.selznick"", ""merianc.cooper"", ""ernestb...","[""merianc.cooper"", ""jamesashmorecreelman"", ""ru...","[""newyorkcity"", ""screenplay"", ""moviebusiness"",...","[""adventure"", ""horror"", ""fantasy""]","[""rkoradiopictures""]","""adventurous filmmaker, carl denham, sets out ...","""newyorkcity screenplay moviebusiness exoticis...","""newyorkcity screenplay moviebusiness exoticis...","""https://image.tmdb.org/t/p/w780/lHlnxKL5GbgRi...","""king kong"""
