# Author: Tao Shan

# Description: Data generation (metadata, posters, and movie descriptions) for the TMDB movie dataset via the TMDB API.

API documentation: https://developers.themoviedb.org/3/getting-started/introduction

API Setting: https://www.themoviedb.org/settings/api



### Large version: records of 10,000 users, 24,279 movies



In [1]:
import requests
import json
from tqdm.auto import tqdm
import time
import os
import sys
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive at /content/drive so files stored on the Drive can be
# read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that the CSV inputs (ratings.csv, links.csv)
# are joined onto when they are loaded.
DIR_PATH = "/content/drive/MyDrive/cs680 final project/MovieLen_25M_data"

In [4]:
# Load the user rating table and preview its first rows.
ratings_csv = os.path.join(DIR_PATH, "ratings.csv")
df_ratings = pd.read_csv(ratings_csv)
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# Restrict the ratings to the first 10000 distinct users to stay within the
# available RAM (the final data is potentially at least 10 GB).
unique_user_ids = df_ratings['userId'].unique()[:10000]
selected_rows = df_ratings['userId'].isin(unique_user_ids)
df_ratings = df_ratings.loc[selected_rows]
df_ratings.shape

(1496612, 4)

In [6]:
# Load the id mapping between MovieLens (movieId) and IMDb/TMDB ids.
links_csv = os.path.join(DIR_PATH, "links.csv")
df_links = pd.read_csv(links_csv)
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
# Attach the imdbId/tmdbId columns to every rating row; a left join keeps
# ratings whose movieId has no entry in the links table.
df = pd.merge(df_ratings, df_links, how='left', on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,296,5.0,1147880044,110912,680.0
1,1,306,3.5,1147868817,111495,110.0
2,1,307,5.0,1147868828,108394,108.0
3,1,665,5.0,1147878820,114787,11902.0
4,1,899,3.5,1147868510,45152,872.0


In [8]:
# Distinct TMDB ids to fetch. The merged tmdbId column is float-valued
# (e.g. 680.0), so drop missing ids and cast to integers; otherwise the ids
# would be formatted as "680.0" (or "nan") when placed in a request URL.
unique_tmdb_id = df['tmdbId'].dropna().astype('int64').unique()

In [9]:
# Number of distinct TMDB ids that will be requested.
len(unique_tmdb_id)

24279

In [10]:
# NOTE(security): a real API key is committed here as the fallback value.
# Prefer supplying your own key through the TMDB_API_KEY environment variable
# and consider rotating the committed one.
API_KEY = os.environ.get('TMDB_API_KEY') or 'cba8e5484754b57689000edac0bca0d7'
# Root of the TMDB v3 REST API.
BASE_URL = 'https://api.themoviedb.org/3'
# Prefix that a movie's poster_path is appended to (the "w500" image variant).
POSTER_BASE_URL = 'https://image.tmdb.org/t/p/w500'

def fetch_movie_data(movie_id):
    """Fetch one movie's metadata from the TMDB ``/movie/{id}`` endpoint.

    Args:
        movie_id: TMDB movie id. Float-valued ids such as ``680.0`` (as they
            come out of a pandas float column) are converted to ``680``
            before the URL is built.

    Returns:
        A dict with the selected metadata fields, where ``poster_url`` is the
        full image URL or ``None`` if the movie has no poster. Returns
        ``None`` when ``movie_id`` is not a usable id (``None``, NaN, inf,
        non-numeric) or when TMDB answers with status code 34.

    Raises:
        RuntimeError: if TMDB returns any other payload that is not a movie
            record (for example an authentication or rate-limit error).
    """
    try:
        tmdb_id = int(movie_id)
    except (TypeError, ValueError, OverflowError):
        # None, NaN and +/-inf cannot be TMDB ids; do not waste a request.
        return None

    response = requests.get(
        f"{BASE_URL}/movie/{tmdb_id}",
        params={'api_key': API_KEY, 'language': 'en-US'},
        timeout=30,  # a stalled connection must not hang the whole crawl
    )
    data = response.json()

    # Status code 34 is TMDB's "resource not found" answer.
    if data.get('status_code') == 34:
        return None

    if 'id' not in data:
        # Any other error payload: fail with the server's own message rather
        # than an opaque KeyError further down.
        raise RuntimeError(
            f"TMDB request for movie {tmdb_id} failed "
            f"(HTTP {response.status_code}): "
            f"{data.get('status_message', data)}"
        )

    poster_path = data['poster_path']
    movie_data = {
        'id': data['id'],
        'title': data['title'],
        'original_title': data['original_title'],
        'genres': [genre['name'] for genre in data['genres']],
        'overview': data['overview'],
        'tagline': data['tagline'],
        'release_date': data['release_date'],
        'runtime': data['runtime'],
        'popularity': data['popularity'],
        'vote_count': data['vote_count'],
        'vote_average': data['vote_average'],
        'poster_path': poster_path,
        'poster_url': POSTER_BASE_URL + poster_path if poster_path else None,
        'original_language': data['original_language'],
        'status': data['status'],
        'production_companies': [company['name'] for company in data['production_companies']],
        'production_countries': [country['name'] for country in data['production_countries']],
        'spoken_languages': [language['english_name'] for language in data['spoken_languages']],
    }

    return movie_data

# Leftover settings: the loop below iterates over unique_tmdb_id and rebinds
# movie_id, so start_movie_id / total_movies do not influence it.
start_movie_id = 1
total_movies = 500
movie_data_list = []
movie_id = start_movie_id

# The free account only allows 40 requests per 10 seconds.
requests_per_window = 40
window_duration = 10  # in seconds
request_interval = window_duration / requests_per_window

# Start time of the previous request. Pacing is measured from request start to
# request start, so time already spent waiting on the network counts towards
# the interval instead of being added on top of it.
last_request_time = None

for movie_id in tqdm(unique_tmdb_id):
    if pd.isna(movie_id):
        # A missing tmdbId cannot be looked up.
        continue

    if last_request_time is not None:
        elapsed_time = time.monotonic() - last_request_time
        if elapsed_time < request_interval:
            time.sleep(request_interval - elapsed_time)
    last_request_time = time.monotonic()

    movie_data = fetch_movie_data(movie_id)
    # Keep only movies that were found and that have a poster URL.
    if movie_data and movie_data['poster_url']:
        movie_data_list.append(movie_data)

# NOTE: this rebinds df (previously the merged ratings table) to the fetched
# movie metadata.
df = pd.DataFrame(movie_data_list)


  0%|          | 0/24279 [00:00<?, ?it/s]

In [11]:
# Preview the first rows of the fetched movie metadata table.
df.head()

Unnamed: 0,id,title,original_title,genres,overview,tagline,release_date,runtime,popularity,vote_count,vote_average,poster_path,poster_url,original_language,status,production_companies,production_countries,spoken_languages
0,680,Pulp Fiction,Pulp Fiction,"[Thriller, Crime]","A burger-loving hit man, his philosophical par...",Just because you are a character doesn't mean ...,1994-09-10,154,79.449,24942,8.49,/d5iIlFn5s0ImszYzBPb8JPIfbXD.jpg,https://image.tmdb.org/t/p/w500/d5iIlFn5s0Imsz...,en,Released,"[Miramax, A Band Apart, Jersey Films]",[United States of America],"[English, Spanish, French]"
1,110,Three Colors: Red,Trois couleurs : Rouge,"[Drama, Mystery, Romance]","Valentine, a student model in Geneva, struggle...",,1994-05-12,100,23.652,1149,7.956,/JHmsBiX1tjCKqAul1lzC20WcAW.jpg,https://image.tmdb.org/t/p/w500/JHmsBiX1tjCKqA...,fr,Released,"[Zespół Filmowy TOR, Le Studio Canal+, France ...","[France, Poland, Switzerland]",[French]
2,108,Three Colors: Blue,Trois couleurs : Bleu,[Drama],Julie is haunted by her grief after living thr...,,1993-08-01,98,18.841,1412,7.689,/33wsWxzsNstI8N7dvuwzFmj1qBd.jpg,https://image.tmdb.org/t/p/w500/33wsWxzsNstI8N...,fr,Released,"[France 3 Cinéma, CED Productions, Miramax]","[France, Poland, Switzerland]","[French, Polish]"
3,11902,Underground,Подземље,"[Comedy, Drama, War]",A group of Serbian socialists prepares for the...,ONCE UPON A TIME THERE WAS A COUNTRY,1995-04-11,170,12.863,554,7.741,/h8N6y13t4VusrDdH5PzTkwvBvgN.jpg,https://image.tmdb.org/t/p/w500/h8N6y13t4VusrD...,sr,Released,"[Komuna, Barrandov Studio]","[Bulgaria, Czech Republic, France, Germany, Hu...","[French, German, Serbian]"
4,872,Singin' in the Rain,Singin' in the Rain,"[Comedy, Music, Romance, Drama]","In 1927 Hollywood, a silent film production co...",What a Glorious Feeling!,1952-04-09,103,24.599,2710,8.165,/671EPwBsHGHBk0cdOeZqmOK0XB3.jpg,https://image.tmdb.org/t/p/w500/671EPwBsHGHBk0...,en,Released,[Metro-Goldwyn-Mayer],[United States of America],[English]


In [12]:
# Sanity check: list the distinct Python types found in the id column.
id_types = df['id'].apply(type)
unique_types = id_types.unique()
print(unique_types)

[<class 'int'>]


In [13]:
# (rows, columns) of the movie metadata table.
df.shape

(23875, 18)

In [14]:
import re
def parse_list(s):
    """Convert a list, or the string form of a list, into a Python list.

    Args:
        s: A list/tuple, a string such as ``"['a', 'b']"`` or ``"[a, b]"``,
            or any other object (which is converted with ``str`` first).

    Returns:
        A list of the items. Real lists/tuples are copied unchanged, so items
        that themselves contain commas are preserved. Strings that are valid
        Python list literals are parsed as such; anything else falls back to
        splitting on commas and stripping surrounding quotes. Empty input
        (``[]``, ``"[]"``, ``""``) yields ``[]``.
    """
    # Already a sequence: a str() round-trip would split items on their own
    # commas, so just copy it.
    if isinstance(s, (list, tuple)):
        return list(s)

    from ast import literal_eval

    text = str(s)
    try:
        parsed = literal_eval(text.strip())
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # Fallback for unquoted forms like "[Drama, Crime]": remove the
    # surrounding brackets and split on commas plus optional whitespace.
    inner = text.strip('[]')
    if not inner:
        return []
    items = re.split(r',\s*', inner)
    # Remove any quotes around the items.
    return [item.strip("'\"") for item in items]

In [15]:
from ast import literal_eval
# Renumber the rows from 0 and force the id column to integers.
df.reset_index(drop=True, inplace=True)
df['id'] = df['id'].astype('int')

# For every remaining column, take the Python type of its first value as the
# column's type: list columns go through parse_list, all others are cast.
for column_name in df.columns[1:]:
    column = df[column_name]
    first_type = column.apply(type).unique()[0]
    print(column_name, ': ', first_type)
    if first_type == list:
        df[column_name] = column.apply(parse_list)
    else:
        df[column_name] = column.astype(first_type)

title :  <class 'str'>
original_title :  <class 'str'>
genres :  <class 'list'>
overview :  <class 'str'>
tagline :  <class 'str'>
release_date :  <class 'str'>
runtime :  <class 'int'>
popularity :  <class 'float'>
vote_count :  <class 'int'>
vote_average :  <class 'float'>
poster_path :  <class 'str'>
poster_url :  <class 'str'>
original_language :  <class 'str'>
status :  <class 'str'>
production_companies :  <class 'list'>
production_countries :  <class 'list'>
spoken_languages :  <class 'list'>


In [16]:
# Summarise column dtypes and non-null counts after the type normalisation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23875 entries, 0 to 23874
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    23875 non-null  int64  
 1   title                 23875 non-null  object 
 2   original_title        23875 non-null  object 
 3   genres                23875 non-null  object 
 4   overview              23875 non-null  object 
 5   tagline               23875 non-null  object 
 6   release_date          23875 non-null  object 
 7   runtime               23875 non-null  int64  
 8   popularity            23875 non-null  float64
 9   vote_count            23875 non-null  int64  
 10  vote_average          23875 non-null  float64
 11  poster_path           23875 non-null  object 
 12  poster_url            23875 non-null  object 
 13  original_language     23875 non-null  object 
 14  status                23875 non-null  object 
 15  production_companie

In [17]:
# Write the movie metadata to movie_data.csv in the current working
# directory, without the row index.
df.to_csv('movie_data.csv', index=False)

In [18]:
from google.colab import files

# Hand the saved CSV to the browser as a download.
# NOTE(review): this absolute path only matches the to_csv call if the
# working directory is /content — confirm.
files.download("/content/movie_data.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
import os
import requests

def download_poster(movie_data):
    """Download one movie poster into ``posters/<id>.jpg``.

    Args:
        movie_data: Mapping with at least ``'poster_url'`` and ``'id'`` keys.

    Returns:
        ``1`` if the image was fetched and written, ``0`` if the movie has no
        poster URL, the request failed, or the server did not return a
        non-empty 200 response.
    """
    poster_url = movie_data['poster_url']
    if not poster_url:
        return 0

    try:
        # The timeout keeps one stalled download from hanging the whole run.
        response = requests.get(poster_url, timeout=30)
    except requests.RequestException:
        # Report the failure through the return value so the run continues.
        return 0

    # Never save an error page or an empty body under a .jpg name.
    if response.status_code != 200 or not response.content:
        return 0

    # Create the target folder on demand so the function works on its own.
    os.makedirs('posters', exist_ok=True)
    file_path = os.path.join('posters', f"{movie_data['id']}.jpg")

    with open(file_path, 'wb') as f:
        f.write(response.content)
    return 1

# Make sure the output folder is present before any poster is written.
if not os.path.exists('posters'):
    os.makedirs('posters')

# download_poster returns 1 or 0 per movie, so the sum is the poster count.
poster_count = sum(
    download_poster(movie_data) for movie_data in tqdm(movie_data_list)
)
print(f"Downloaded {poster_count} posters.")

  0%|          | 0/23875 [00:00<?, ?it/s]

Downloaded 23875 posters.


In [20]:
# Shell command: recursively pack /content/posters into /content/posters.zip.
!zip -r /content/posters.zip /content/posters

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/posters/916.jpg (deflated 1%)
  adding: content/posters/23196.jpg (deflated 1%)
  adding: content/posters/285549.jpg (deflated 0%)
  adding: content/posters/436274.jpg (deflated 0%)
  adding: content/posters/41391.jpg (deflated 0%)
  adding: content/posters/23637.jpg (deflated 0%)
  adding: content/posters/19157.jpg (deflated 0%)
  adding: content/posters/188509.jpg (deflated 0%)
  adding: content/posters/51277.jpg (deflated 1%)
  adding: content/posters/37307.jpg (deflated 0%)
  adding: content/posters/521777.jpg (deflated 1%)
  adding: content/posters/103953.jpg (deflated 0%)
  adding: content/posters/11031.jpg (deflated 1%)
  adding: content/posters/395883.jpg (deflated 1%)
  adding: content/posters/2395.jpg (deflated 0%)
  adding: content/posters/26505.jpg (deflated 0%)
  adding: content/posters/276907.jpg (deflated 3%)
  adding: content/posters/12592.jpg (deflated 0%)
  adding: content/posters/11646

In [21]:
from google.colab import files

# Hand the zipped posters archive to the browser as a download.
files.download("/content/posters.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>