## DATA COLLECTION AND EXTRACTION FROM THE MOVIE DATABASE

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import requests
import json

# --- Configuration -----------------------------------------------------------
# TMDB API key; left empty here and must be filled in before running the cell.
api_key = ""
base_url = "https://api.themoviedb.org/3/movie/"
# Movie IDs to download. The leading 0 is a placeholder that the loop skips.
movie_ids = [0, 299534, 19995, 140607, 299536, 597, 135397,
             420818, 24428, 168259, 99861, 284054, 12445,
             181808, 330457, 351286, 109445, 321612, 260513]
# Seconds to wait for the server before giving up on a request. Without a
# timeout, requests may block indefinitely on a stalled connection.
request_timeout = 10

# One raw JSON payload (dict) per successfully fetched movie.
movie_data = []

# Fetch the details of every valid movie ID, one request per ID.
for movie_id in movie_ids:
    if movie_id == 0:  # Skip invalid ID
        continue

    url = f"{base_url}{movie_id}?api_key={api_key}"
    try:
        response = requests.get(url, timeout=request_timeout)
        # Raise an exception for bad status codes
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses.
        print(f"Error fetching data for movie ID {movie_id}: {e}")
    except ValueError:
        # response.json() signals an undecodable body with a ValueError subclass.
        print(f"Error decoding JSON for movie ID {movie_id}")
    else:
        # Only record the movie once the payload has been fetched and decoded.
        movie_data.append(data)
        print(f"Fetched data for movie ID: {movie_id} - {data.get('title')}")
    finally:
        # Pause after every attempt (successful or not) to respect API rate limits.
        time.sleep(1)

# Build a DataFrame with one row per fetched movie
df = pd.DataFrame(movie_data)

# Save the raw movie details to CSV (without the index column)
df.to_csv("../DE Project.csv", index=False)


Fetched data for movie ID: 299534 - Avengers: Endgame
Fetched data for movie ID: 19995 - Avatar
Fetched data for movie ID: 140607 - Star Wars: The Force Awakens
Fetched data for movie ID: 299536 - Avengers: Infinity War
Fetched data for movie ID: 597 - Titanic
Fetched data for movie ID: 135397 - Jurassic World
Fetched data for movie ID: 420818 - The Lion King
Fetched data for movie ID: 24428 - The Avengers
Fetched data for movie ID: 168259 - Furious 7
Fetched data for movie ID: 99861 - Avengers: Age of Ultron
Fetched data for movie ID: 284054 - Black Panther
Fetched data for movie ID: 12445 - Harry Potter and the Deathly Hallows: Part 2
Fetched data for movie ID: 181808 - Star Wars: The Last Jedi
Fetched data for movie ID: 330457 - Frozen II
Fetched data for movie ID: 351286 - Jurassic World: Fallen Kingdom
Fetched data for movie ID: 109445 - Frozen
Fetched data for movie ID: 321612 - Beauty and the Beast
Fetched data for movie ID: 260513 - Incredibles 2


## CREDIT INFORMATION EXTRACTION AND PREPROCESSING

In [None]:


# Function to fetch movie credits (cast and crew)
def fetch_credits(movie_id, timeout=10):
    """Fetch cast and crew information for one movie from the credits endpoint.

    Args:
        movie_id: Identifier of the movie, appended to ``base_url``.
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        A dict with the keys ``movie_id``, ``cast`` (list of cast member
        names), ``cast_size``, ``director`` (list of names of crew members
        whose job is ``'Director'``) and ``crew_size``. If the request fails,
        the response status is not 200, or the body cannot be decoded, the
        lists are empty and the sizes are 0, so every movie ID still yields
        one record.
    """
    # Record returned whenever the credits cannot be retrieved.
    empty_record = {
        'movie_id': movie_id,
        'cast': [],
        'cast_size': 0,
        'director': [],
        'crew_size': 0
    }

    url = f"{base_url}{movie_id}/credits?api_key={api_key}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # A network failure or timeout for one movie must not abort the batch.
        print(f"Error fetching credits for movie ID {movie_id}: {e}")
        return empty_record

    if response.status_code == 404:
        print(f"Movie ID {movie_id} not found.")
        return empty_record
    if response.status_code != 200:
        # Report the real status instead of claiming the movie does not exist
        # (e.g. an invalid API key or a rate-limit response).
        print(f"Error fetching credits for movie ID {movie_id}: "
              f"HTTP {response.status_code}")
        return empty_record

    try:
        data = response.json()
    except ValueError:
        print(f"Error decoding JSON for movie ID {movie_id}")
        return empty_record

    cast = [member['name'] for member in data.get('cast', [])]
    crew = data.get('crew', [])

    # Get Director(s); .get avoids a KeyError on crew entries without a job.
    directors = [member['name'] for member in crew
                 if member.get('job') == 'Director']

    return {
        'movie_id': movie_id,
        'cast': cast,
        'cast_size': len(cast),
        'director': directors,
        'crew_size': len(crew)
    }

# Collect one credits record per entry in movie_ids, in list order.
# NOTE(review): movie_ids still contains the placeholder 0 that the movie
# details loop skips; it is passed to fetch_credits here as well — confirm
# that the resulting row is wanted.
credits_data = []
for current_id in movie_ids:
    credits_data.append(fetch_credits(current_id))

# Tabulate the records: one row per movie ID
df_credits = pd.DataFrame(credits_data)

# Write the credits table to CSV without the index column
df_credits.to_csv("../data/extracted_df1.csv", index=False)



In [None]:
# Print a concise summary of df_credits: index range, column names,
# non-null counts per column, dtypes and memory usage.
df_credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie_id   19 non-null     int64 
 1   cast       19 non-null     object
 2   cast_size  19 non-null     int64 
 3   director   19 non-null     object
 4   crew_size  19 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 892.0+ bytes
