In [1]:
import os
import time
import logging
import requests
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine, table, column, select, insert

In [2]:
# Security: the TMDB API read-access token is read from the TMDB_TOKEN
# environment variable instead of being hardcoded in the notebook. The token
# that was previously committed in this cell must be treated as leaked and
# revoked in the TMDB account settings.
tmdb_token = os.environ.get("TMDB_TOKEN", "")
if not tmdb_token:
    # The functions below send this value as a Bearer token; with an empty
    # value the TMDB requests are expected to be rejected, so say so up front.
    print("Variable d'environnement TMDB_TOKEN absente : les requêtes TMDB échoueront.")

In [3]:
def scrape_imdb_first_page(timeout=10):
    """Scrape the IMDb box-office chart and return the IMDb IDs it lists.

    Args:
        timeout: Seconds to wait for IMDb before giving up. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        list[str]: The numeric part of each IMDb title ID (``tt`` prefix
        removed), in page order. An empty list is returned when the HTTP
        request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Fetch the IMDb box-office chart page.
        page = requests.get(
            "https://www.imdb.com/chart/boxoffice",
            headers=headers,
            timeout=timeout,
        )
        page.raise_for_status()  # Turn HTTP error statuses into exceptions.
    except requests.RequestException as e:
        print(f"Erreur lors de la récupération de la page IMDb : {e}")
        return []

    soup = bs(page.content, 'lxml')

    cleaned_links = []
    # href=True skips anchors that carry the class but no href attribute,
    # which would otherwise raise KeyError.
    for anchor in soup.find_all('a', class_='ipc-title-link-wrapper', href=True):
        # hrefs are expected to look like "/title/tt<digits>/?ref_=...":
        # drop the query string, then take the third path segment.
        path_segments = anchor['href'].split('?')[0].split('/')
        if len(path_segments) <= 2:
            continue  # Not a title link; ignore it instead of raising IndexError.
        imdb_id = path_segments[2].replace('tt', '')
        if imdb_id.isdigit():
            cleaned_links.append(imdb_id)

    return cleaned_links

In [4]:
def genres_request(timeout=10):
    """Fetch the TMDB movie-genre list and return it as an id -> name mapping.

    Args:
        timeout: Seconds to wait for the TMDB API before giving up.

    Returns:
        dict[str, str]: Genre names keyed by the genre ID converted to a
        string. An empty dict is returned when the request fails or the
        response does not have the expected structure.
    """
    url = "https://api.themoviedb.org/3/genre/movie/list?language=en"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {tmdb_token}"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions.

        data = response.json()
        return {str(genre["id"]): genre["name"] for genre in data["genres"]}
    except (requests.RequestException, ValueError, KeyError) as e:
        # ValueError covers a body that is not valid JSON; KeyError covers a
        # payload without the "genres"/"id"/"name" keys. Both are reported the
        # same way as a transport failure so callers always get a dict back.
        print(f"Erreur lors de la récupération des genres : {e}")
        return {}

In [5]:
def api_tmdb_request(timeout=10):
    """Look up every scraped IMDb ID on TMDB and collect the film details.

    The IMDb IDs come from ``scrape_imdb_first_page()`` and the genre names
    from ``genres_request()``.

    Args:
        timeout: Seconds to wait for each TMDB lookup before giving up.

    Returns:
        dict[str, dict]: One entry per scraped film, keyed by its position in
        the scraped list as a string. A successful entry holds ``tmdb_id``,
        ``title``, ``genre_ids``, ``imbd_id``, ``date``, ``year`` and
        ``genres``; a failed lookup holds a single ``error`` message instead.
        An empty dict is returned when the scraping step yields nothing.
    """
    results = {}
    cleaned_links = scrape_imdb_first_page()

    if not cleaned_links:  # Scraping failed or found nothing.
        return results

    genres = genres_request()

    # The headers do not depend on the film, so build them once.
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {tmdb_token}"
    }

    for index, movie_id in enumerate(cleaned_links):
        url = f"https://api.themoviedb.org/3/find/tt{movie_id}?external_source=imdb_id"

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Turn HTTP error statuses into exceptions.
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            results[str(index)] = {"error": f"Erreur lors de la requête TMDB : {e}"}
            continue

        movie_results = data.get("movie_results")
        if not movie_results:
            results[str(index)] = {"error": f"Aucun résultat trouvé pour l'ID IMDb {movie_id}"}
            continue

        movie_info = movie_results[0]
        # A missing or null release date becomes "", which yields an empty year.
        release_date = movie_info.get("release_date") or ""
        genre_ids = movie_info.get("genre_ids") or []

        results[str(index)] = {
            "tmdb_id": movie_info["id"],
            "title": movie_info["title"],
            "genre_ids": genre_ids,
            # The key is spelled "imbd_id" (sic); the display loop in this
            # notebook reads it under that exact spelling.
            "imbd_id": movie_id,
            "date": release_date,
            "year": release_date.split("-")[0],  # Year part of the date.
            # IDs absent from the genre map (e.g. when the genre request
            # failed) are skipped instead of raising KeyError; the raw IDs
            # stay available under "genre_ids".
            "genres": [
                genres[str(genre_id)]
                for genre_id in genre_ids
                if str(genre_id) in genres
            ]
        }

    return results

In [6]:
# Run the scrape + TMDB lookup defined above and print the raw result dict
# (one entry per scraped film, keyed by its position as a string).
result = api_tmdb_request()
print(result)

{'0': {'tmdb_id': 1241982, 'title': 'Moana 2', 'genre_ids': [16, 12, 10751, 35], 'imbd_id': '13622970', 'date': '2024-11-21', 'year': '2024', 'genres': ['Animation', 'Adventure', 'Family', 'Comedy']}, '1': {'tmdb_id': 402431, 'title': 'Wicked', 'genre_ids': [18, 10749, 14], 'imbd_id': '1262426', 'date': '2024-11-20', 'year': '2024', 'genres': ['Drama', 'Romance', 'Fantasy']}, '2': {'tmdb_id': 558449, 'title': 'Gladiator II', 'genre_ids': [28, 12, 36], 'imbd_id': '9218128', 'date': '2024-11-05', 'year': '2024', 'genres': ['Action', 'Adventure', 'History']}, '3': {'tmdb_id': 857598, 'title': 'Pushpa 2 - The Rule', 'genre_ids': [80, 28, 53], 'imbd_id': '16539454', 'date': '2024-12-04', 'year': '2024', 'genres': ['Crime', 'Action', 'Thriller']}, '4': {'tmdb_id': 845781, 'title': 'Red One', 'genre_ids': [28, 35, 14], 'imbd_id': '14948432', 'date': '2024-10-31', 'year': '2024', 'genres': ['Action', 'Comedy', 'Fantasy']}, '5': {'tmdb_id': 157336, 'title': 'Interstellar', 'genre_ids': [12, 18,

In [None]:
# Print one summary line per film in `result`, skipping entries that carry an
# error message, lack a required field, or hold a non-numeric year/ID.

# Fields every successful entry must provide; built once, outside the loop.
required_fields = ['title', 'year', 'genres', 'imbd_id', 'tmdb_id']

for movie_key, movie_info in result.items():
    if 'error' in movie_info:
        print(f"Ignoré: {movie_info['error']}")
        continue

    if not all(field in movie_info for field in required_fields):
        print(f"Champs manquants pour l'entrée {movie_key}: {movie_info}")
        continue

    try:
        year = int(movie_info["year"])
        # NOTE(review): int() drops any leading zeros of the IMDb ID —
        # confirm that the integer form is what downstream storage expects.
        imdb_id = int(movie_info["imbd_id"])
        tmdb_id = int(movie_info["tmdb_id"])
    except (TypeError, ValueError):
        # E.g. an empty year when the film has no release date: skip the
        # entry instead of aborting the whole loop.
        print(f"Valeurs invalides pour l'entrée {movie_key}: {movie_info}")
        continue

    title = movie_info["title"]
    genres_str = ','.join(movie_info["genres"])

    print(f"{title} ({year}) - Genres: {genres_str} - IMDb ID: {imdb_id} - TMDB ID: {tmdb_id}")



Moana 2 (2024) - Genres: Animation,Adventure,Family,Comedy - IMDb ID: 13622970 - TMDB ID: 1241982
Wicked (2024) - Genres: Drama,Romance,Fantasy - IMDb ID: 1262426 - TMDB ID: 402431
Gladiator II (2024) - Genres: Action,Adventure,History - IMDb ID: 9218128 - TMDB ID: 558449
Pushpa 2 - The Rule (2024) - Genres: Crime,Action,Thriller - IMDb ID: 16539454 - TMDB ID: 857598
Red One (2024) - Genres: Action,Comedy,Fantasy - IMDb ID: 14948432 - TMDB ID: 845781
Interstellar (2014) - Genres: Adventure,Drama,Science Fiction - IMDb ID: 816692 - TMDB ID: 157336
Solo Leveling -ReAwakening- (2024) - Genres: Action,Adventure,Fantasy,Animation - IMDb ID: 33428606 - TMDB ID: 1357633
Ignoré: Aucun résultat trouvé pour l'ID IMDb 34422601
Y2K (2024) - Genres: Comedy,Horror,Science Fiction - IMDb ID: 27218960 - TMDB ID: 1094274
The Best Christmas Pageant Ever (2024) - Genres: Family,Comedy,Drama - IMDb ID: 2347285 - TMDB ID: 1206617
