In [None]:
# Importing packages
import requests
import pandas as pd
import json
import time
from tqdm import tqdm

In [None]:
# Fetch the discover results for every (year, page) combination and collect them
# into a single all_movies data frame.

# Endpoint queried for every page of results
DISCOVER_URL = "https://api.themoviedb.org/3/discover/movie"
# Seconds to wait for a response before giving up, so a stalled connection cannot hang the cell
REQUEST_TIMEOUT = 30
# Seconds to pause after finishing each year, to not overwhelm the api
PAUSE_BETWEEN_YEARS = 10

# Creating list of years to grab (2020 and 2021 are not in the list)
years = [2016, 2017, 2018, 2019, 2022, 2023, 2024]
# Creating list of pages to grab (pages 1 through 10)
pages = list(range(1, 11))

# Calculating the total number of iterations for the progress bar
num_iter = len(years) * len(pages)

# Query parameters shared by every request; page and year are added per call.
# requests URL-encodes the values, so the genre list is written with a literal "|".
discover_params = {
    "certification.gte": "PG-13",
    "certification_country": "US",
    "include_adult": "false",
    "include_video": "false",
    "language": "en-US",
    "sort_by": "popularity.desc",
    "vote_count.gte": 100,
    "with_genres": "28|12|27|878|53",
    "with_origin_country": "US",
    "with_original_language": "en",
}

# Specifying my api key
# NOTE: left empty in this notebook; the credentials must be supplied here before running.
headers = {

}

# One data frame per non-empty page; concatenated once at the end instead of
# re-copying the growing frame on every iteration.
page_frames = []

# Setting up the progress bar
with tqdm(total=num_iter) as pbar:
    # Looping through all the years and pages
    for year in years:
        for page in pages:
            response = requests.get(
                DISCOVER_URL,
                params={**discover_params, "page": page, "primary_release_year": year},
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            # Fail with a clear HTTP error (bad key, rate limit, ...) rather than
            # a KeyError on 'results' further down.
            response.raise_for_status()
            data = response.json()

            # Keeping only pages that actually returned movies
            movies = data['results']
            if movies:
                page_frames.append(pd.DataFrame(movies))

            # Updating the progress bar
            pbar.update(1)
        # Setting up a break to not overwhelm the api
        time.sleep(PAUSE_BETWEEN_YEARS)

# Building the full data frame in one pass
all_movies = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# The same movie can be returned on more than one page; keep its first occurrence only.
# (subset='id' is required because genre_ids holds lists, which are not hashable.)
if 'id' in all_movies.columns:
    all_movies = all_movies.drop_duplicates(subset='id', ignore_index=True)

100%|██████████| 70/70 [01:20<00:00,  1.15s/it]


In [None]:
# Keeping only the genre_ids, id, release_date, and title columns.
# .copy() makes the result an independent frame, so the column assignments made on it
# later in the notebook cannot trigger pandas' SettingWithCopyWarning.
all_movies = all_movies[['genre_ids', 'id', 'release_date', 'title']].copy()

In [None]:
# Sanity check: display the frame to confirm only the four selected columns remain
all_movies

Unnamed: 0,genre_ids,id,release_date,title
0,"[80, 53, 18]",302946,2016-10-13,The Accountant
1,[27],259693,2016-06-08,The Conjuring 2
2,"[28, 12, 878]",330459,2016-12-14,Rogue One: A Star Wars Story
3,"[12, 28, 878]",271110,2016-04-27,Captain America: Civil War
4,"[28, 12, 35]",293660,2016-02-09,Deadpool
...,...,...,...,...
1131,"[878, 53]",790462,2024-01-19,I.S.S.
1132,"[9648, 53, 80, 27]",1059345,2024-01-19,Cult Killer
1133,"[18, 80, 53]",1090874,2024-02-22,Mea Culpa
1134,"[53, 18]",1248753,2024-09-27,Amber Alert


In [None]:
# Parsing the release_date column into pandas datetimes so the .dt accessor can be used on it
parsed_release_dates = pd.to_datetime(all_movies['release_date'])
all_movies['release_date'] = parsed_release_dates

# Creating function to get the season based on the month
def get_season(month):
    """Return the meteorological season name for a calendar month.

    Parameters
    ----------
    month : int or float
        Calendar month number, 1 (January) through 12 (December). Whole-number
        floats such as 6.0 are accepted, which is what a month column holds when
        some dates are missing.

    Returns
    -------
    str or None
        'Winter' (12, 1, 2), 'Spring' (3, 4, 5), 'Summer' (6, 7, 8) or
        'Fall' (9, 10, 11). None is returned for anything that is not a valid
        month, including NaN from a missing date, instead of silently
        labelling it 'Fall'.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    # dict.get returns None for NaN and for out-of-range values
    return season_by_month.get(month)

# Adding a season column: take the month of each release date and map it through get_season
release_months = all_movies['release_date'].dt.month
all_movies['season'] = release_months.apply(get_season)

In [None]:
# Building the budget_id dataframe, which holds the movie id, budget, and box office
# earnings for every movie in all_movies (one details request per movie).

# Seconds to wait for a response before giving up, so a stalled connection cannot hang the cell
DETAILS_TIMEOUT = 30
# Pause for PAUSE_SECONDS after every PAUSE_EVERY requests to not overwhelm the api
PAUSE_EVERY = 50
PAUSE_SECONDS = 10

# Creating list of the distinct id's from all_movies. Each movie is requested once,
# so a repeated id cannot produce repeated budget rows (which would multiply rows
# when budget_id is merged back onto all_movies).
movie_ids = all_movies['id'].drop_duplicates().tolist()

# Calling the api with our key
# NOTE: left empty in this notebook; the credentials must be supplied here before running.
headers = {

}

# Printing how many id's will be retrieved
print(f'Retrieving id\'s for {len(movie_ids)} movies')

# One record per movie; turned into a data frame once after the loop instead of
# enlarging the frame row by row.
budget_records = []

# Looping through all movie_ids, counting requests from 1
for request_count, movie_id in enumerate(movie_ids, start=1):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"
    response = requests.get(url, headers=headers, timeout=DETAILS_TIMEOUT)
    # Fail with a clear HTTP error (bad key, rate limit, unknown id, ...) rather than
    # a KeyError on the missing fields.
    response.raise_for_status()
    data = response.json()

    # Grabbing the id, budget, and revenue entries
    budget_records.append({
        'id': data['id'],
        'budget': data['budget'],
        'box office': data['revenue'],
    })

    # Take a break after every PAUSE_EVERY-th request (but not after the final one)
    if request_count % PAUSE_EVERY == 0 and request_count < len(movie_ids):
        time.sleep(PAUSE_SECONDS)

# Creating the budget_id data frame; columns is passed so it exists even with no records
budget_id = pd.DataFrame(budget_records, columns=['id', 'budget', 'box office'])

# Printing when all the id's are gotten
print('Done!')


Retrieving id's for 1136 movies
Done!


In [None]:
# Joining the budget and box office figures onto all_movies through the shared id column
# (default inner join: only ids present in both frames are kept)
merged_movies = all_movies.merge(budget_id, on='id')
merged_movies

Unnamed: 0,genre_ids,id,release_date,title,season,budget,box office
0,"[80, 53, 18]",302946,2016-10-13,The Accountant,Fall,44000000,155160045
1,[27],259693,2016-06-08,The Conjuring 2,Summer,40000000,322811702
2,"[28, 12, 878]",330459,2016-12-14,Rogue One: A Star Wars Story,Winter,200000000,1056057273
3,"[12, 28, 878]",271110,2016-04-27,Captain America: Civil War,Spring,250000000,1155046416
4,"[28, 12, 35]",293660,2016-02-09,Deadpool,Winter,58000000,782837347
...,...,...,...,...,...,...,...
1131,"[878, 53]",790462,2024-01-19,I.S.S.,Winter,13800000,6605079
1132,"[9648, 53, 80, 27]",1059345,2024-01-19,Cult Killer,Winter,0,0
1133,"[18, 80, 53]",1090874,2024-02-22,Mea Culpa,Winter,0,0
1134,"[53, 18]",1248753,2024-09-27,Amber Alert,Fall,0,0


In [None]:
# Keeping only the movies whose budget and box office earnings are both greater than 0
has_budget = merged_movies['budget'] > 0
has_box_office = merged_movies['box office'] > 0
merged_movies = merged_movies[has_budget & has_box_office]
merged_movies

Unnamed: 0,genre_ids,id,release_date,title,season,budget,box office
0,"[80, 53, 18]",302946,2016-10-13,The Accountant,Fall,44000000,155160045
1,[27],259693,2016-06-08,The Conjuring 2,Summer,40000000,322811702
2,"[28, 12, 878]",330459,2016-12-14,Rogue One: A Star Wars Story,Winter,200000000,1056057273
3,"[12, 28, 878]",271110,2016-04-27,Captain America: Civil War,Spring,250000000,1155046416
4,"[28, 12, 35]",293660,2016-02-09,Deadpool,Winter,58000000,782837347
...,...,...,...,...,...,...,...
1106,"[12, 18]",618588,2024-03-15,Arthur the King,Spring,19000000,40829138
1117,[27],1072342,2024-01-03,Night Swim,Winter,15000000,54771241
1124,"[35, 27, 10749]",993784,2024-02-07,Lisa Frankenstein,Winter,13400000,9927714
1129,"[12, 35, 18]",976584,2024-01-11,The Book of Clarence,Winter,40000000,6205230


In [None]:
# Splitting merged_movies into one dataframe per season
season_labels = merged_movies['season']
movies_fall = merged_movies.loc[season_labels.eq('Fall')]
movies_spring = merged_movies.loc[season_labels.eq('Spring')]
movies_winter = merged_movies.loc[season_labels.eq('Winter')]
movies_summer = merged_movies.loc[season_labels.eq('Summer')]

# Drawing 50 random movies from each season; the fixed random_state keeps the draw reproducible
sample_options = dict(n=50, random_state=23)
movies_fall_sample = movies_fall.sample(**sample_options)
movies_spring_sample = movies_spring.sample(**sample_options)
movies_winter_sample = movies_winter.sample(**sample_options)
movies_summer_sample = movies_summer.sample(**sample_options)

# Stacking the four samples in fall, spring, summer, winter order
season_samples = [
    movies_fall_sample,
    movies_spring_sample,
    movies_summer_sample,
    movies_winter_sample,
]
tot_movies = pd.concat(season_samples)

# Writing the result to movies_fifty.csv without the index column
tot_movies.to_csv('movies_fifty.csv', index=False)