## Import Libraries

In [51]:
!pip install requests pandas python-dotenv



DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


#import library 
import requests
import pandas as pd
import csv
import os
from dotenv import load_dotenv

In [1]:
import requests
import pandas as pd
import csv
import os
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
load_dotenv()
tmdb_api_key = os.getenv('TMDB_API_KEY')

## Fetch movie year range & attributes

In [3]:
YEARS = range(1950, 2025)

In [4]:
#CSV header based on OMDb movie data fields
CSV_HEADER = [
    'title', 'release_date', 'runtime', 'genres', 'director',
    'writer', 'actors', 'overview', 'language', 'country', 'awards', 'poster_path',
    'ratings', 'imdb_id', 'budget', 'revenue', 'production_companies', 'homepage'
]

## Function to get movie ID, store movie data into csv file

In [5]:
#directory for CSV files
os.makedirs('./data', exist_ok=True)

In [6]:
#list of movie IDs for a given year
def get_id_list(year, page=1):
    url = f'https://api.themoviedb.org/3/discover/movie?api_key={tmdb_api_key}&primary_release_year={year}&page={page}'
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504],
        raise_on_status=True
    )
    session.mount('https://', HTTPAdapter(max_retries=retries))
    
    try:
        response = session.get(url, verify=False)
        response.raise_for_status()  # Will raise HTTPError for bad responses
        data = response.json()
        return data.get('results', [])
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except requests.exceptions.ConnectionError as conn_err:
        print(f'Connection error occurred: {conn_err}')
    except requests.exceptions.Timeout as timeout_err:
        print(f'Timeout error occurred: {timeout_err}')
    except requests.exceptions.RequestException as req_err:
        print(f'Request error occurred: {req_err}')
    return []

In [15]:
#fetch movie data by ID
def get_data(movie_id):
    url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}'
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    response = requests.get(url)
    try:
        response = session.get(url, verify=False)
        response.raise_for_status()  # Will raise HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
    return None

In [16]:
#movie data to a CSV file
def write_file(file_name, movie_dict):
    with open(file_name, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=movie_dict.keys())
        writer.writerow(movie_dict)

## Loop for iterate through the list of years

In [17]:

for year in YEARS:
    print(f"Fetching data for the year: {year}")
    movie_list = []
    page = 1
    while True:
        ids = get_id_list(year, page)
        if not ids:
            break
        movie_list.extend(ids)
        page += 1

     # Extract unique movie IDs from movie_list
    unique_movie_ids = {movie['id'] for movie in movie_list}

    print(unique_movie_ids)

    # Creating file and writing header
    FILE_NAME = f'./data/{year}_movie_collection_data.csv'

    with open(FILE_NAME, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADER)
        writer.writeheader()

    # Iterate through the list of IDs to get data
    for movie_id in unique_movie_ids:
        movie_data = get_data(movie_id)
        print(movie_data)
        if movie_data:
            write_file(FILE_NAME, movie_data)

    print(f"Data for the year {year} saved to {FILE_NAME}")

print("Data fetching and saving complete.")

Fetching data for the year: 1950




{745479, 466960, 401425, 221207, 843800, 344090, 704539, 843804, 950301, 843806, 843814, 778281, 540714, 401451, 843818, 843821, 65586, 843829, 843830, 843834, 843836, 81981, 843838, 1048647, 606279, 843849, 843856, 344145, 802898, 360532, 843860, 1106009, 876638, 295008, 295009, 557155, 295012, 139373, 385135, 327791, 614522, 41084, 950398, 344195, 368777, 147598, 1294490, 458908, 385185, 57522, 73908, 123070, 172223, 573638, 983241, 114893, 114896, 737489, 344283, 303325, 1237225, 303338, 1138925, 196862, 286978, 975124, 598295, 57628, 311586, 90404, 1163563, 483628, 737585, 426289, 319806, 393537, 852290, 188741, 999749, 393552, 737616, 295255, 33117, 147810, 1065321, 188783, 188786, 1073522, 999803, 393607, 754057, 328076, 1073548, 147862, 557472, 90533, 696744, 795052, 860590, 418232, 352706, 65991, 172500, 344544, 557542, 401895, 287210, 475651, 475674, 548, 16939, 737840, 983605, 434743, 393786, 16958, 746046, 729675, 41552, 483922, 599, 229976, 66140, 303709, 164447, 98912, 139



KeyboardInterrupt: 

In [None]:
# for year in YEARS:
#     # Grab list of ids for all films made in {YEAR}
#     movie_list = list(set(get_id_list(tmdb_api_key, year)))

#     FILE_NAME = f'./data/{year}_movie_collection_data.csv'

#     # Creating file
#     with open(FILE_NAME, 'w') as f:
#         writer = csv.writer(f)
#         writer.writerow(CSV_HEADER)

#     # Iterate through list of ids to get data
#     for id in movie_list:
#         dict = get_data(tmdb_api_key, id)
#         write_file(FILE_NAME, dict)