### IMPORT LIBRARIES

In [1]:
import gzip
import os
import warnings
from datetime import datetime, timedelta
from getpass import getpass
from io import BytesIO
from io import StringIO

import pandas as pd
import requests

warnings.filterwarnings("ignore")

### DAILY EXPORT MOVIES

In [5]:
# Load the TMDB daily movie export, using a local CSV as a cache.
#
# Result: `data` is a DataFrame with the columns id / original_title /
# popularity, sorted by descending popularity.
file_path = "src/TMDB_daily_export_movies.csv"

# Number of rows written to the CSV cache when it has to be (re)built.
n = 20000
# Number of rows kept in `data`. It is applied on BOTH the cached and the
# freshly-downloaded path so that a re-run gives the same frame whether or not
# the cache already exists (previously the first run kept 20000 rows and every
# later run kept 10000).
working_rows = 10000


def _download_daily_export(n_rows):
    """Download yesterday's TMDB movie-id export and return its top rows.

    Parameters
    ----------
    n_rows : int
        Number of most popular movies to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``original_title`` and ``popularity``, sorted by
        descending popularity and truncated to ``n_rows`` rows.

    Raises
    ------
    RuntimeError
        If the server does not answer with HTTP 200.
    requests.RequestException
        On network failure or timeout.
    """
    # The export file name embeds the date as MM_DD_YYYY; yesterday's file is
    # requested.
    yesterday = datetime.now() - timedelta(days=1)
    formatted_date = yesterday.strftime("%m_%d_%Y")
    full_url = 'http://files.tmdb.org/p/exports/' + f'movie_ids_{formatted_date}.json.gz'

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(full_url, timeout=60)
    if response.status_code != 200:
        # Fail loudly here instead of continuing with an undefined DataFrame.
        raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")

    # The payload is gzip-compressed JSON Lines (one JSON object per line).
    with gzip.GzipFile(fileobj=BytesIO(response.content), mode='rb') as f:
        json_data = f.read().decode('utf-8')

    # Wrap the text in StringIO: passing a literal JSON string to read_json is
    # deprecated in recent pandas versions.
    export = pd.read_json(StringIO(json_data), lines=True)
    export = export.loc[:, ['id', 'original_title', 'popularity']]
    return export.sort_values(by=['popularity'], ascending=False).head(n_rows)


try:
    loaded_from_cache = os.path.exists(file_path)

    if loaded_from_cache:
        # Reuse the existing cache file.
        data = pd.read_csv(file_path)
    else:
        # No cache yet: download, then persist the top `n` rows.
        data = _download_daily_export(n)
        os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
        data.to_csv(file_path, index=False)
        print(f'Le fichier a été créé au répertoire src/TMDB_daily_export_movies.csv avec {n} lignes.')

    data = data.sort_values('popularity', ascending=False).head(working_rows)

    if loaded_from_cache:
        print("Fichier chargé et trié avec succès.")

except Exception as e:
    # Report the problem, then re-raise: later cells need `data`, so carrying
    # on silently would only move the failure further down the notebook.
    print(f"Une erreur est survenue : {e}")
    raise

  data = pd.read_json(json_data, lines = True)


Le fichier a été créé au répertoire src/TMDB_daily_export_movies.csv avec 20000 lignes.


### API CONNEXION

In [16]:
# Build the HTTP headers used for every TMDB API call.
#
# The bearer token is a credential and must not be stored in the notebook.
# It is read from the TMDB_API_TOKEN environment variable; when that variable
# is not set, the user is prompted for it (the input is not echoed).
# NOTE(review): a token was previously hardcoded in this cell — it should be
# treated as leaked and revoked/regenerated in the TMDB account settings.
tmdb_token = (os.environ.get("TMDB_API_TOKEN") or getpass("TMDB API read access token: ")).strip()

if not tmdb_token:
    # Fail fast: without a token every later API request would be rejected.
    raise RuntimeError(
        "No TMDB API token provided. Set the TMDB_API_TOKEN environment variable."
    )

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_token}",
}

### MOVIE VOTE (CSV)

Environ 2min30 pour 1000 lignes

In [17]:
# Fetch vote statistics for every movie in `data` from the TMDB movie-details
# endpoint and export them to src/TMDB_movie_vote.csv.
#
# Inputs (defined in earlier cells): `data` (DataFrame with an `id` column)
# and `headers` (dict carrying the API authorization).
# Output: `data_movie`, one row per successfully fetched movie with the
# columns tmdb_id / vote_average / vote_count.

# Request URL = url_start + <movie id> + url_end
url_start = "https://api.themoviedb.org/3/movie/"
url_end = '?language=en-US'

# One dict per movie; turned into a DataFrame once the loop is done.
movie_data_list = []

# A single Session reuses the underlying connection for all requests instead
# of opening a new one per movie.
with requests.Session() as session:
    session.headers.update(headers)

    # Select the id column by name rather than by position so a change in
    # column order cannot silently feed the wrong values into the URL.
    for movie_id in data["id"]:
        url = url_start + str(movie_id) + url_end

        try:
            # The timeout prevents one stalled request from blocking the run.
            response = session.get(url, timeout=30)
        except requests.RequestException as exc:
            # A transient network failure must not discard the rows already
            # collected: report it and move on to the next movie.
            print(f"Error fetching details for movie_id: {movie_id} ({exc})")
            continue

        if response.status_code == 200:
            movie_details = response.json()

            # Keep only the fields of interest; missing keys become None.
            movie_data_list.append({
                "tmdb_id": movie_details.get("id"),
                "vote_average": movie_details.get("vote_average"),
                "vote_count": movie_details.get("vote_count"),
            })
        else:
            print(f"Error fetching details for movie_id: {movie_id}")


# Explicit columns guarantee a proper header even when no movie was fetched.
data_movie = pd.DataFrame(movie_data_list, columns=["tmdb_id", "vote_average", "vote_count"])

# Make sure the output directory exists so the export cannot fail after the
# long fetch loop.
os.makedirs("src", exist_ok=True)
data_movie.to_csv('src/TMDB_movie_vote.csv', index = False)