In [1]:
import ast
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [3]:
# TMDB credentials and endpoint.
# The API key is read from the TMDB_API_KEY environment variable so the secret
# is not stored in (or shared with) the notebook itself.
api_key = os.environ.get("TMDB_API_KEY", "")
if not api_key:
    print("WARNING: TMDB_API_KEY is not set; TMDB requests will fail authentication.")
base_url = "https://api.themoviedb.org/3"

In [4]:
# TMDB movie IDs to fetch; 0 is a placeholder for an invalid ID and is skipped below
movie_ids = [0, 299534, 19995, 140607, 299536, 597, 135397, 
             420818, 24428, 168259, 99861, 284054, 12445, 
             181808, 330457, 351286, 109445, 321612, 260513]

# Raw JSON payloads, one dict per successfully fetched movie
movie_data = []

# Sub-resources to embed in each details response. `append_to_response` expects a
# comma-separated list of sub-endpoint names without spaces; plain fields such as
# budget, revenue or runtime are already part of the base payload.
append_to_response = "credits,keywords,release_dates"

# Seconds to wait for TMDB before giving up on a single request
request_timeout = 10

for movie_id in movie_ids:
    if movie_id == 0:  # Skip invalid ID
        continue

    # Endpoint for movie details
    movie_endpoint = f"{base_url}/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "append_to_response": append_to_response,
    }

    try:
        response = requests.get(movie_endpoint, params=params, timeout=request_timeout)
    except requests.RequestException as exc:
        # A connection problem or timeout on one movie should not abort the whole run
        print(f"Failed to fetch data for movie ID: {movie_id}. Error: {exc}")
    else:
        if response.status_code == 200:
            movie_data.append(response.json())
            print(f"Successfully fetched data for movie ID: {movie_id}")
        else:
            print(f"Failed to fetch data for movie ID: {movie_id}. Status code: {response.status_code}")

    # delay to avoid rate limiting
    time.sleep(0.25)

def _movie_to_record(movie):
    """Flatten one TMDB movie-details payload (a dict) into a single DataFrame row.

    Scalar fields are copied as-is (None when absent). `genres` and
    `production_companies` are reduced to lists of names, and `director` is the
    list of crew names whose job is 'Director'.
    """
    crew_members = movie.get('credits', {}).get('crew', [])
    return {
        'id': movie.get('id'),
        'adult': movie.get('adult'),
        'imdb_id': movie.get('imdb_id'),
        'original_title': movie.get('original_title'),
        'video': movie.get('video'),
        'homepage': movie.get('homepage'),
        'title': movie.get('title'),
        'tagline': movie.get('tagline'),
        'release_date': movie.get('release_date'),
        'genres': [genre['name'] for genre in movie.get('genres', [])],
        'belongs_to_collection': movie.get('belongs_to_collection'),
        'original_language': movie.get('original_language'),
        'budget': movie.get('budget'),
        'revenue': movie.get('revenue'),
        'runtime': movie.get('runtime'),
        'vote_average': movie.get('vote_average'),
        'vote_count': movie.get('vote_count'),
        'popularity': movie.get('popularity'),
        'production_countries': movie.get('production_countries'),
        'director': [member['name'] for member in crew_members
                     if member.get('job') == 'Director'],
        'production_companies': [company['name'] for company in movie.get('production_companies', [])],
    }


# Stop with a clear message when nothing was fetched, instead of failing later
# with a NameError on `movies_df` (or silently saving a stale frame).
if not movie_data:
    raise RuntimeError("No movie data was fetched; cannot build the movies DataFrame.")

# Convert to DataFrame: one row per fetched movie. Errors while flattening a
# payload are allowed to propagate so they are not mistaken for fetch failures.
movies_df = pd.DataFrame([_movie_to_record(movie) for movie in movie_data])

# Save raw data to CSV
movies_df.to_csv("raw_movie_data.csv", index=False)
print("Data saved to raw_movie_data.csv")

Successfully fetched data for movie ID: 299534
Successfully fetched data for movie ID: 19995
Successfully fetched data for movie ID: 140607
Successfully fetched data for movie ID: 299536
Successfully fetched data for movie ID: 597
Successfully fetched data for movie ID: 135397
Successfully fetched data for movie ID: 420818
Successfully fetched data for movie ID: 24428
Successfully fetched data for movie ID: 168259
Successfully fetched data for movie ID: 99861
Successfully fetched data for movie ID: 284054
Successfully fetched data for movie ID: 12445
Successfully fetched data for movie ID: 181808
Successfully fetched data for movie ID: 330457
Successfully fetched data for movie ID: 351286
Successfully fetched data for movie ID: 109445
Successfully fetched data for movie ID: 321612
Successfully fetched data for movie ID: 260513
Data saved to raw_movie_data.csv


In [5]:
# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

In [6]:
# Print every column name of the raw DataFrame as a plain Python list
print("\nColumns in DataFrame:")
print(list(movies_df.columns))


Columns in DataFrame:
['id', 'adult', 'imdb_id', 'original_title', 'video', 'homepage', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection', 'original_language', 'budget', 'revenue', 'runtime', 'vote_average', 'vote_count', 'popularity', 'production_countries', 'director', 'production_companies']


In [None]:
# Render the raw DataFrame (one row per fetched movie) as this cell's output
movies_df

In [8]:
# Working frame: the raw DataFrame without these five columns
movies_final = movies_df.drop(
    columns=['adult', 'imdb_id', 'original_title', 'video', 'homepage']
)

In [None]:
# Render the working DataFrame as this cell's output
movies_final

In [10]:
# Clean and format specified columns
def extract_names(column_data):
    """Join the ``name`` of every dict in a list into one pipe-separated string.

    Args:
        column_data: A list of dicts, or a string holding the Python literal of
            such a list (it is parsed with ``ast.literal_eval``).

    Returns:
        The names joined with "|". Non-dict entries are skipped and a dict
        without a ``name`` key contributes an empty string. Returns "" when the
        input is not a list, cannot be parsed, or contains a non-string name.
    """
    try:
        if isinstance(column_data, str):
            # If it's a string, try to evaluate it as literal
            column_data = ast.literal_eval(column_data)

        if isinstance(column_data, list):
            return "|".join(
                entry.get("name", "") for entry in column_data if isinstance(entry, dict)
            )
        return ""
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The errors ast.literal_eval documents for malformed input, plus the
        # TypeError str.join raises on a non-string name. Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit are not swallowed.
        return ""

def extract_collection_name(collection_data):
    """Return the ``name`` of a collection dict.

    Args:
        collection_data: A dict, or a string holding the Python literal of a
            dict (it is parsed with ``ast.literal_eval``).

    Returns:
        The value stored under ``name`` ("" when the key is absent). Returns ""
        when the input is not a dict or cannot be parsed.
    """
    try:
        if isinstance(collection_data, str):
            # If it's a string, try to evaluate it as literal
            collection_data = ast.literal_eval(collection_data)

        if isinstance(collection_data, dict):
            return collection_data.get("name", "")
        return ""
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The errors ast.literal_eval documents for malformed input. Unlike a
        # bare `except:`, KeyboardInterrupt and SystemExit are not swallowed.
        return ""

def _join_list(values):
    """Pipe-join a list of strings; any non-list value yields an empty string."""
    return "|".join(values) if isinstance(values, list) else ""


# Apply transformations.
# The flattened columns are derived from the raw `movies_df` rather than from
# `movies_final` itself, so re-running this cell gives the same result. Reading
# the already-flattened strings back in would turn every value into "".
movies_final = movies_final.assign(
    genres=movies_df["genres"].apply(_join_list),
    production_countries=movies_df["production_countries"].apply(extract_names),
    production_companies=movies_df["production_companies"].apply(_join_list),
    belongs_to_collection=movies_df["belongs_to_collection"].apply(extract_collection_name),
)

# Preview cleaned DataFrame
movies_final.head()


Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget,revenue,runtime,vote_average,vote_count,popularity,production_countries,director,production_companies
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356000000,2799439100,181,8.238,26214,26.9856,United States of America,"[Anthony Russo, Joe Russo]",Marvel Studios
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237000000,2923706026,162,7.588,32126,32.3911,United States of America|United Kingdom,[James Cameron],Dune Entertainment|Lightstorm Entertainment|20...
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245000000,2068223624,136,7.3,19667,13.0107,United States of America,[J.J. Abrams],Lucasfilm Ltd.|Bad Robot
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300000000,2052415039,149,8.235,30394,34.0939,United States of America,"[Joe Russo, Anthony Russo]",Marvel Studios
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama|Romance,,en,200000000,2264162353,194,7.905,25873,37.215,United States of America,[James Cameron],Paramount Pictures|20th Century Fox|Lightstorm...


In [11]:
# Frequency table for each categorical column, printed under its heading
for heading, column in [
    ("Original Language Distribution:", 'original_language'),
    ("Unique Genres:", 'genres'),
    ("Production Countries Distribution:", 'production_countries'),
    ("Collections Distribution:", 'belongs_to_collection'),
]:
    print(heading)
    print(movies_final[column].value_counts())
    print("\n")

# Number of null entries in every column
print("Missing Values Count:")
print(movies_final.isnull().sum())

Original Language Distribution:
original_language
en    18
Name: count, dtype: int64


Unique Genres:
genres
Adventure|Action|Science Fiction             3
Action|Adventure|Science Fiction|Thriller    2
Action|Adventure|Science Fiction             2
Adventure|Science Fiction|Action             1
Action|Adventure|Fantasy|Science Fiction     1
Drama|Romance                                1
Adventure|Drama|Family|Animation             1
Science Fiction|Action|Adventure             1
Action|Thriller|Crime                        1
Fantasy|Adventure                            1
Family|Animation|Adventure|Comedy|Fantasy    1
Animation|Family|Adventure|Fantasy           1
Family|Fantasy|Romance                       1
Action|Adventure|Animation|Family            1
Name: count, dtype: int64


Production Countries Distribution:
production_countries
United States of America                   16
United States of America|United Kingdom     1
United Kingdom|United States of America     1
Name: count

In [12]:
# Coerce these columns to numeric dtypes; values that cannot be parsed become NaN
for numeric_column in ('budget', 'id', 'popularity'):
    movies_final[numeric_column] = pd.to_numeric(movies_final[numeric_column], errors='coerce')

# Parse the release_date column into datetime values
movies_final['release_date'] = movies_final['release_date'].pipe(pd.to_datetime)

In [13]:
# Print the dtype of every column in the working DataFrame
print(movies_final.dtypes)


id                                int64
title                            object
tagline                          object
release_date             datetime64[ns]
genres                           object
belongs_to_collection            object
original_language                object
budget                            int64
revenue                           int64
runtime                           int64
vote_average                    float64
vote_count                        int64
popularity                      float64
production_countries             object
director                         object
production_companies             object
dtype: object


In [14]:
# Replace 0 values with NaN for budget and revenue.
# This has to happen before the derived columns are computed: otherwise the
# zeros carry over into the *_millions columns and a zero budget produces an
# infinite (or undefined) ROI.
movies_final['budget'] = movies_final['budget'].replace(0, np.nan)
movies_final['revenue'] = movies_final['revenue'].replace(0, np.nan)

# Convert budget and revenue to millions (for better readability)
movies_final['budget_millions'] = movies_final['budget'] / 1000000
movies_final['revenue_millions'] = movies_final['revenue'] / 1000000

# Calculate ROI (Return on Investment) as net return relative to budget.
# NOTE: a later cell recomputes `roi` as revenue_musd / budget_musd.
movies_final['roi'] = (movies_final['revenue'] - movies_final['budget']) / movies_final['budget']

# Boolean mask for low vote counts (threshold can be adjusted); not used further in this cell
low_votes_mask = movies_final['vote_count'] < 1000

# Weighted rating: blend each movie's vote_average with the overall mean, so
# that movies with few votes are pulled towards the mean.
C = movies_final['vote_average'].mean()  # mean rating across all movies
m = 1000  # weight given to the overall mean, expressed in votes (can be adjusted)

movies_final['adjusted_rating'] = (movies_final['vote_count'] * movies_final['vote_average'] + m * C) / (movies_final['vote_count'] + m)

# Replace empty or placeholder taglines
movies_final['tagline'] = movies_final['tagline'].replace(['', 'No Tagline', 'No Data'], np.nan)

# Display the first few rows with new columns
print("\nSummary of financial metrics:")
print(movies_final[['title', 'budget_millions', 'revenue_millions', 'roi', 'vote_average', 'adjusted_rating']].head())


Summary of financial metrics:
                          title  budget_millions  revenue_millions        roi  \
0             Avengers: Endgame            356.0       2799.439100   6.863593   
1                        Avatar            237.0       2923.706026  11.336312   
2  Star Wars: The Force Awakens            245.0       2068.223624   7.441729   
3        Avengers: Infinity War            300.0       2052.415039   5.841383   
4                       Titanic            200.0       2264.162353  10.320812   

   vote_average  adjusted_rating  
0         8.238         8.206601  
1         7.588         7.581827  
2         7.300         7.304040  
3         8.235         8.207877  
4         7.905         7.885594  


In [15]:
# Render the working DataFrame, now including the derived metric columns
movies_final

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget,revenue,runtime,vote_average,vote_count,popularity,production_countries,director,production_companies,budget_millions,revenue_millions,roi,adjusted_rating
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356000000,2799439100,181,8.238,26214,26.9856,United States of America,"[Anthony Russo, Joe Russo]",Marvel Studios,356.0,2799.4391,6.863593,8.206601
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237000000,2923706026,162,7.588,32126,32.3911,United States of America|United Kingdom,[James Cameron],Dune Entertainment|Lightstorm Entertainment|20...,237.0,2923.706026,11.336312,7.581827
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245000000,2068223624,136,7.3,19667,13.0107,United States of America,[J.J. Abrams],Lucasfilm Ltd.|Bad Robot,245.0,2068.223624,7.441729,7.30404
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300000000,2052415039,149,8.235,30394,34.0939,United States of America,"[Joe Russo, Anthony Russo]",Marvel Studios,300.0,2052.415039,5.841383,8.207877
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama|Romance,,en,200000000,2264162353,194,7.905,25873,37.215,United States of America,[James Cameron],Paramount Pictures|20th Century Fox|Lightstorm...,200.0,2264.162353,10.320812,7.885594
5,135397,Jurassic World,The park is open.,2015-06-06,Action|Adventure|Science Fiction|Thriller,Jurassic Park Collection,en,150000000,1671537444,124,6.693,20628,19.2426,United States of America,[Colin Trevorrow],Amblin Entertainment|Universal Pictures,150.0,1671.537444,10.143583,6.724926
6,420818,The Lion King,The king has returned.,2019-07-12,Adventure|Drama|Family|Animation,The Lion King (Reboot) Collection,en,260000000,1662020819,118,7.11,10303,22.4849,United States of America,[Jon Favreau],Walt Disney Pictures|Fairview Entertainment,260.0,1662.020819,5.392388,7.134197
7,24428,The Avengers,Some assembly required.,2012-04-25,Science Fiction|Action|Adventure,The Avengers Collection,en,220000000,1518815515,143,7.735,31521,35.1429,United States of America,[Joss Whedon],Marvel Studios,220.0,1518.815515,5.903707,7.724192
8,168259,Furious 7,Vengeance hits home.,2015-04-01,Action|Thriller|Crime,The Fast and the Furious Collection,en,190000000,1515400000,137,7.225,10764,14.7346,United States of America,[James Wan],Original Film|One Race|Universal Pictures,190.0,1515.4,6.975789,7.238473
9,99861,Avengers: Age of Ultron,A new age has come.,2015-04-22,Action|Adventure|Science Fiction,The Avengers Collection,en,365000000,1405403694,141,7.271,23350,17.9154,United States of America,[Joss Whedon],Marvel Studios,365.0,1405.403694,2.850421,7.27562


In [16]:
# Row clean-up in one chain:
#   1. keep the first row for each (id, title) pair
#   2. discard rows whose id or title is null
#   3. discard rows with fewer than 10 non-null values
movies_final = (
    movies_final
    .drop_duplicates(subset=['id', 'title'])
    .dropna(subset=['id', 'title'])
    .dropna(thresh=10)
)

In [18]:
# Give the two "*_millions" columns their "*_musd" names
movies_final = movies_final.rename(
    columns={f"{metric}_millions": f"{metric}_musd" for metric in ("budget", "revenue")}
)

In [19]:
# Define the desired column order
column_order = ['id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection', 
                'original_language', 'budget_musd', 'revenue_musd', 'production_companies', 
                'production_countries', 'vote_count', 'vote_average', 'popularity', 'director','runtime']

# Reorder columns, only including columns that exist in the DataFrame.
# Requested columns that are absent are reported and skipped instead of raising
# a KeyError; columns not listed in `column_order` are dropped.
missing_columns = [col for col in column_order if col not in movies_final.columns]
if missing_columns:
    print(f"Skipping columns not present in DataFrame: {missing_columns}")

# Explicit copy so later column assignments act on an independent frame
movies_final = movies_final[[col for col in column_order if col in movies_final.columns]].copy()

In [20]:
# Render the working DataFrame after the columns were renamed and reordered
movies_final

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,production_countries,vote_count,vote_average,popularity,director,runtime
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure|Science Fiction|Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,United States of America,26214,8.238,26.9856,"[Anthony Russo, Joe Russo]",181
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action|Adventure|Fantasy|Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment|Lightstorm Entertainment|20...,United States of America|United Kingdom,32126,7.588,32.3911,[James Cameron],162
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure|Action|Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd.|Bad Robot,United States of America,19667,7.3,13.0107,[J.J. Abrams],136
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure|Action|Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,United States of America,30394,8.235,34.0939,"[Joe Russo, Anthony Russo]",149
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama|Romance,,en,200.0,2264.162353,Paramount Pictures|20th Century Fox|Lightstorm...,United States of America,25873,7.905,37.215,[James Cameron],194
5,135397,Jurassic World,The park is open.,2015-06-06,Action|Adventure|Science Fiction|Thriller,Jurassic Park Collection,en,150.0,1671.537444,Amblin Entertainment|Universal Pictures,United States of America,20628,6.693,19.2426,[Colin Trevorrow],124
6,420818,The Lion King,The king has returned.,2019-07-12,Adventure|Drama|Family|Animation,The Lion King (Reboot) Collection,en,260.0,1662.020819,Walt Disney Pictures|Fairview Entertainment,United States of America,10303,7.11,22.4849,[Jon Favreau],118
7,24428,The Avengers,Some assembly required.,2012-04-25,Science Fiction|Action|Adventure,The Avengers Collection,en,220.0,1518.815515,Marvel Studios,United States of America,31521,7.735,35.1429,[Joss Whedon],143
8,168259,Furious 7,Vengeance hits home.,2015-04-01,Action|Thriller|Crime,The Fast and the Furious Collection,en,190.0,1515.4,Original Film|One Race|Universal Pictures,United States of America,10764,7.225,14.7346,[James Wan],137
9,99861,Avengers: Age of Ultron,A new age has come.,2015-04-22,Action|Adventure|Science Fiction,The Avengers Collection,en,365.0,1405.403694,Marvel Studios,United States of America,23350,7.271,17.9154,[Joss Whedon],141


In [21]:
# Derived financial columns: absolute profit and the revenue-to-budget ratio
movies_final['profit_musd'] = movies_final['revenue_musd'].sub(movies_final['budget_musd'])
movies_final['roi'] = movies_final['revenue_musd'].div(movies_final['budget_musd'])


def _top_titles(frame, metric, largest=True, count=5):
    """Return the `count` rows with the largest (or smallest) `metric`, as title + metric."""
    select = frame.nlargest if largest else frame.nsmallest
    return select(count, metric)[['title', metric]]


# Eligibility filters: ROI rankings need a budget of at least 10 (million USD),
# rating rankings need at least 10 votes
budget_filter = movies_final['budget_musd'] >= 10
votes_filter = movies_final['vote_count'] >= 10

# Top/bottom-5 tables keyed by category name, in display order
rankings = {
    'Highest Revenue': _top_titles(movies_final, 'revenue_musd'),
    'Highest Budget': _top_titles(movies_final, 'budget_musd'),
    'Highest Profit': _top_titles(movies_final, 'profit_musd'),
    'Lowest Profit': _top_titles(movies_final, 'profit_musd', largest=False),
    'Highest ROI': _top_titles(movies_final[budget_filter], 'roi'),
    'Lowest ROI': _top_titles(movies_final[budget_filter], 'roi', largest=False),
    'Most Voted': _top_titles(movies_final, 'vote_count'),
    'Highest Rated': _top_titles(movies_final[votes_filter], 'vote_average'),
    'Lowest Rated': _top_titles(movies_final[votes_filter], 'vote_average', largest=False),
    'Most Popular': _top_titles(movies_final, 'popularity'),
}

# Print every ranking table under its category heading
for heading, table in rankings.items():
    print(f"\n{heading}:")
    print(table)


Highest Revenue:
                          title  revenue_musd
1                        Avatar   2923.706026
0             Avengers: Endgame   2799.439100
4                       Titanic   2264.162353
2  Star Wars: The Force Awakens   2068.223624
3        Avengers: Infinity War   2052.415039

Highest Budget:
                          title  budget_musd
9       Avengers: Age of Ultron        365.0
0             Avengers: Endgame        356.0
3        Avengers: Infinity War        300.0
6                 The Lion King        260.0
2  Star Wars: The Force Awakens        245.0

Highest Profit:
                          title  profit_musd
1                        Avatar  2686.706026
0             Avengers: Endgame  2443.439100
4                       Titanic  2064.162353
2  Star Wars: The Force Awakens  1823.223624
3        Avengers: Infinity War  1752.415039

Lowest Profit:
                       title  profit_musd
9    Avengers: Age of Ultron  1040.403694
17             Incredibles 2  10

In [22]:
# Split the movies by whether `belongs_to_collection` holds a non-empty value
franchise_mask = (
    movies_final['belongs_to_collection'].notna()
    & movies_final['belongs_to_collection'].ne('')
)
franchise_movies = movies_final[franchise_mask]
standalone_movies = movies_final[~franchise_mask]

# (row label, source column, aggregation) for each line of the comparison table
metric_specs = [
    ('Mean Revenue ($M)', 'revenue_musd', 'mean'),
    ('Median ROI', 'roi', 'median'),
    ('Mean Budget ($M)', 'budget_musd', 'mean'),
    ('Mean Popularity', 'popularity', 'mean'),
    ('Mean Rating', 'vote_average', 'mean'),
]


def _summarise(group):
    """Evaluate every metric in `metric_specs` on `group`, in table order."""
    return [getattr(group[column], statistic)() for _, column, statistic in metric_specs]


comparison = pd.DataFrame(
    {
        'Franchise Movies': _summarise(franchise_movies),
        'Standalone Movies': _summarise(standalone_movies),
    },
    index=[label for label, _, _ in metric_specs],
)

# Positive values mean the franchise group scores higher
comparison['Difference'] = comparison['Franchise Movies'] - comparison['Standalone Movies']
print("\nFranchise vs Standalone Movie Comparison:")
print(comparison.round(2))


Franchise vs Standalone Movie Comparison:
                   Franchise Movies  Standalone Movies  Difference
Mean Revenue ($M)           1682.64            1765.14      -82.50
Median ROI                     7.79               9.62       -1.83
Mean Budget ($M)             219.88             180.00       39.88
Mean Popularity               20.05              27.47       -7.42
Mean Rating                    7.38               7.44       -0.06
