In [1]:
import pandas as pd
import os
import requests
import psycopg2
from dotenv import load_dotenv
from contextlib import contextmanager

In [2]:
ratings = pd.read_csv("/home/antoine/jul24_cmlops_reco_film/ml/data/processed/processed_ratings.csv")

In [3]:
user_id = 50
df_user = ratings[ratings['userId'] == user_id]
df_user.head()


Unnamed: 0,userId,movieId,rating,timestamp,bayesian_mean
4973,50,32,4.0,1182349090,3.885546
4974,50,39,3.5,1182349203,3.405865
4975,50,47,4.0,1182348347,4.03785
4976,50,150,3.5,1182349064,3.857264
4977,50,215,5.0,1182347856,3.752335


In [4]:
df_user = df_user.sort_values(by='rating', ascending=False)
best_movies = df_user.head(3)
best_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,bayesian_mean
4978,50,296,5.0,1182348326,4.162792
5057,50,7438,5.0,1181644394,3.840765
5058,50,8638,5.0,1182348573,3.794196


In [6]:
movies = pd.read_csv('/home/antoine/jul24_cmlops_reco_film/ml/data/processed/processed_movies.csv')
links = pd.read_csv('/home/antoine/jul24_cmlops_reco_film/ml/data/processed/processed_links.csv')

In [7]:
movies_links_df = movies.merge(links, on = "movieId", how = 'left')

In [14]:
link_user = links[links['movieId'] == 296
                  ]
link_user.head()

Unnamed: 0,movieId,imdbId,tmdbId
293,296,110912,680


In [8]:
imdb_dict = dict(zip(movies_links_df['movieId'], movies_links_df['imdbId']))

In [9]:
movie_titles = dict(zip(movies['movieId'], movies['title']))

In [10]:
imdb_list = [imdb_dict[movie_id] for movie_id in best_movies['movieId'] if movie_id in imdb_dict]

In [11]:
imdb_list

[110912, 378194, 381681]

In [2]:
username = []
email = []
password = []

for i in range(1, 501):
    username.append('user'+str(i))
    email.append('user'+str(i)+'@example.com')
    password.append('password'+str(i))


In [16]:
print(username)

['user2', 'user3', 'user4', 'user5', 'user6', 'user7', 'user8', 'user9']


In [3]:
from passlib.context import CryptContext

bcrypt_context = CryptContext(schemes=['bcrypt'], deprecated='auto')

hached_password = [bcrypt_context.hash(i) for i in password]


In [8]:
@contextmanager
def get_db_connection():
    """
    Gestionnaire de contexte pour la connexion à la base de données.
    Ouvre une connexion et la ferme automatiquement après utilisation.

    Utilisation:
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute("SELECT * FROM table")
    """
    conn = None
    try:
        conn = psycopg2.connect(
            database='reco_movies',
            host='localhost',
            user='antoine',
            password='datascientest',
            port='5432'
        )
        print("Connection à la base de données OK")
        yield conn
    except psycopg2.Error as e:
        print(f"Erreur lors de la connexion à la base de données: {e}")
        raise
    finally:
        if conn is not None:
            conn.close()
            print("Connexion à la base de données fermée")


In [9]:
def fetch_latest_ratings() -> pd.DataFrame:
    """Récupère 25 % des derniers enregistrements de la table ratings et les transforme en DataFrame."""
    query = """
    SELECT userId, movieId, rating
    FROM ratings
    ORDER BY id DESC
    LIMIT (SELECT COUNT(*) FROM ratings) * 0.25
    """
    try:
        with get_db_connection() as conn:
            df = pd.read_sql_query(query, conn)
            print("Derniers enregistrements récupérés")
            return df
    except Exception as e:
        print(f"Erreur lors de la récupération des enregistrements: {e}")
        raise

In [10]:
ratings = fetch_latest_ratings()

ratings.head()

Connection à la base de données OK


  df = pd.read_sql_query(query, conn)


Derniers enregistrements récupérés
Connexion à la base de données fermée


Unnamed: 0,userid,movieid,rating
0,138493,71619,2.5
1,138493,70286,5.0
2,138493,69644,3.0
3,138493,69526,4.5
4,138493,68954,4.5


In [8]:
def read_ratings(ratings_csv: str, data_dir: str = "/app/raw") -> pd.DataFrame:
    """Reads the CSV file containing movie ratings."""
    data = pd.read_csv(os.path.join(data_dir, ratings_csv))
    print("Dataset ratings loaded")
    return data

def read_movies(movies_csv: str, data_dir: str = "/app/raw") -> pd.DataFrame:
    """Reads the CSV file containing movie information."""
    df = pd.read_csv(os.path.join(data_dir, movies_csv))
    print("Dataset movies loaded")
    return df

def read_links(links_csv: str, data_dir: str = "/app/raw") -> pd.DataFrame:
    """Reads the CSV file containing movie information."""
    df = pd.read_csv(os.path.join(data_dir, links_csv))
    print("Dataset links loaded")
    return df

In [38]:
recommandations = [858, 527, 1221, 912, 904, 750, 2019, 58559, 908, 6016]
# Chargement de nos dataframe depuis mongo_db
ratings = read_ratings('/home/antoine/jul24_cmlops_reco_film/ml/data/processed/processed_ratings.csv')
movies = read_movies('/home/antoine/jul24_cmlops_reco_film/ml/data/processed/processed_movies.csv')
links = read_links('/home/antoine/jul24_cmlops_reco_film/ml/data/processed/processed_links.csv')


Dataset ratings loaded
Dataset movies loaded
Dataset links loaded


In [39]:
movies_links_df = movies.merge(links, on = "movieId", how = 'left')
movies_links_df.head()

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1995,114709,862
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",1995,113497,8844
2,3,Grumpier Old Men,"['Comedy', 'Romance']",1995,113228,15602
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",1995,114885,31357
4,5,Father of the Bride Part II,['Comedy'],1995,113041,11862


In [44]:
# Créer un dictionnaire pour un accès rapide
imdb_dict = dict(zip(movies_links_df['movieId'], movies_links_df['imdbId']))

In [51]:
imdb_list = [imdb_dict[movie_id] for movie_id in recommandations if movie_id in imdb_dict]

# Afficher la liste des IMDb dans l'ordre d'origine
print(imdb_list)

[68646, 108052, 71562, 34583, 47396, 57012, 47478, 468569, 53125, 317248]


In [None]:

def format_movie_id(movie_id):
    """Formate l'ID du film pour qu'il ait 7 chiffres."""
    return str(movie_id).zfill(7)

def api_tmdb_request(movie_ids):
    """Effectue des requêtes à l'API TMDB pour récupérer les informations des films."""
    results = {}

    for index, movie_id in enumerate(movie_ids):
        formatted_id = format_movie_id(movie_id)
        url = f"https://api.themoviedb.org/3/find/tt{formatted_id}?external_source=imdb_id"

        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {tmdb_token}"
        }

        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            data = response.json()
            if data["movie_results"]:
                # On suppose que nous voulons le premier résultat
                movie_info = data["movie_results"][0]
                results[index] = {
                    "title": movie_info["title"],
                    "vote_average": movie_info["vote_average"],
                    "poster_path": movie_info["poster_path"]
                }
            else:
                results[index] = {"error": "No movie results found"}
        else:
            results[index] = {"error": f"Request failed with status code {response.status_code}"}

    return results


In [53]:
# Appel de la fonction
results = api_tmdb_request(imdbId_list)

print(results)

{0: {'title': "Schindler's List", 'vote_average': 8.565, 'poster_path': '/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg'}, 1: {'title': 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb', 'vote_average': 8.117, 'poster_path': '/7SixLzxcqezkZEYU8pcHZgbkmjp.jpg'}, 2: {'title': 'The Godfather', 'vote_average': 8.69, 'poster_path': '/3bhkrj58Vtu7enYsRolD1fZdja1.jpg'}, 3: {'title': 'Rear Window', 'vote_average': 8.352, 'poster_path': '/ILVF0eJxHMddjxeQhswFtpMtqx.jpg'}, 4: {'title': 'North by Northwest', 'vote_average': 8.0, 'poster_path': '/8gvfRlVpcKaTVqipXpYOGWBN1aO.jpg'}, 5: {'title': 'Casablanca', 'vote_average': 8.16, 'poster_path': '/5K7cOHoay2mZusSLezBOY0Qxh8a.jpg'}, 6: {'title': 'The Godfather Part II', 'vote_average': 8.6, 'poster_path': '/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg'}, 7: {'title': 'Seven Samurai', 'vote_average': 8.463, 'poster_path': '/iAq0sq42vKTLneVGqHn1D4GzgrM.jpg'}, 8: {'title': 'City of God', 'vote_average': 8.425, 'poster_path': '/k7eYdWvhYQyRQoU2TB2A2Xu2TfD.jpg'

In [46]:
tmdb_token = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJjZWY5MGUyOWMzZGQ0ZTg0Y2IzMjdhMWRiMzlhNWY1MSIsIm5iZiI6MTczMTY3MzkyNy4zNDkwNTg0LCJzdWIiOiI2NzI3Y2MwNTU5MTgxMzdjZmMzOTljMmQiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0.gExSQUL7byxDH4T4WaypusXMFcWOuR4YReVWJMy9wnY"


def api_tmdb_request(external_id, extenal_source = 'imdb_id'):
    url = f"https://api.themoviedb.org/3/find/tt0{external_id}?external_source={extenal_source}"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {tmdb_token}"
    }
    response = requests.get(url, headers=headers)
    response = response.text
    return response
    # poster_path = response["movie_results"][0]["poster_path"]
    # vote_average = response["movie_results"][0]["vote_average"]
    # original_title = response["movie_results"][0]['original_title']
    # cover_url = "http://image.tmdb.org/t/p/w185"
    # print({"cover_link" : f"{cover_url}{poster_path}" , "vote_average" : vote_average, "original_title": original_title})
    # return {"cover_link" : f"{cover_url}{poster_path}" , "vote_average" : vote_average, "original_title": original_title}

In [None]:
result_dict = {}

for index, imdb_num in enumerate(imdbId_list):
    result_dict[index] = api_tmdb_request(imdb_num)

print(result_dict)

{0: '{"movie_results":[{"backdrop_path":"/zb6fM1CX41D9rF9hdgclu0peUmy.jpg","id":424,"title":"Schindler\'s List","original_title":"Schindler\'s List","overview":"The true story of how businessman Oskar Schindler saved over a thousand Jewish lives from the Nazis while they worked as slaves in his factory during World War II.","poster_path":"/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg","media_type":"movie","adult":false,"original_language":"en","genre_ids":[18,36,10752],"popularity":106.822,"release_date":"1993-12-15","video":false,"vote_average":8.565,"vote_count":15829}],"person_results":[],"tv_results":[],"tv_episode_results":[],"tv_season_results":[]}', 1: '{"movie_results":[],"person_results":[],"tv_results":[],"tv_episode_results":[],"tv_season_results":[]}', 2: '{"movie_results":[],"person_results":[],"tv_results":[],"tv_episode_results":[],"tv_season_results":[]}', 3: '{"movie_results":[],"person_results":[],"tv_results":[],"tv_episode_results":[],"tv_season_results":[]}', 4: '{"movie_result

In [None]:
import requests

# Dictionnaire pour stocker les résultats
results_dict = {}

for i in range(4):
    for j in imdbId_list:
        results_dict[i]= api_tmdb_request(j)


print(results_dict)

{0: {'cover_link': 'http://image.tmdb.org/t/p/w185/qJU6rfil5xLVb5HpJsmmfeSK254.jpg', 'vote_average': 6.3}, 1: {'cover_link': 'http://image.tmdb.org/t/p/w185/qJU6rfil5xLVb5HpJsmmfeSK254.jpg', 'vote_average': 6.3}, 2: {'cover_link': 'http://image.tmdb.org/t/p/w185/qJU6rfil5xLVb5HpJsmmfeSK254.jpg', 'vote_average': 6.3}, 3: {'cover_link': 'http://image.tmdb.org/t/p/w185/qJU6rfil5xLVb5HpJsmmfeSK254.jpg', 'vote_average': 6.3}}


In [6]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from dotenv import load_dotenv
import psycopg2
import csv
import os

base_dir = '/home/antoine/jul24_cmlops_reco_film/ml/src/data'
# Construire le chemin vers le répertoire contenant les données traitées
env_dir = os.path.join(base_dir, '..', '..', '..', 'postgres')

# Charger les variables d'environnement depuis le fichier .env
load_dotenv(env_dir)

# Configuration de la base de données
DB_NAME = os.getenv('POSTGRES_DB')
DB_USER = os.getenv('POSTGRES_USER')
DB_PASSWORD = os.getenv('POSTGRES_PASSWORD')
DB_HOST = os.getenv('POSTGRES_HOST')
DB_PORT = os.getenv('POSTGRES_PORT')


In [8]:
print(DB_PASSWORD)

datascientest


In [None]:
recommendations = [211, 15560, 60, 50]

imdbId_list = [movies_links_df['imdbId'] == str(i)] for i in recommendations]


In [12]:
print(imdbId_list)

[Series([], Name: imdbId, dtype: int64), Series([], Name: imdbId, dtype: int64), Series([], Name: imdbId, dtype: int64), Series([], Name: imdbId, dtype: int64)]


In [20]:
jeton_tmdb = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJjZWY5MGUyOWMzZGQ0ZTg0Y2IzMjdhMWRiMzlhNWY1MSIsIm5iZiI6MTczMTY3Mzg0My44ODA3MjEzLCJzdWIiOiI2NzI3Y2MwNTU5MTgxMzdjZmMzOTljMmQiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0.mgm9DChhZ_5BTQyz9MnLnUo12mpM_bsGnVEqXVl-hkw"

In [None]:
import requests

url = "https://api.themoviedb.org/3/find/external_id?external_source="

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJjZWY5MGUyOWMzZGQ0ZTg0Y2IzMjdhMWRiMzlhNWY1MSIsIm5iZiI6MTczMTY3MzkyNy4zNDkwNTg0LCJzdWIiOiI2NzI3Y2MwNTU5MTgxMzdjZmMzOTljMmQiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0.gExSQUL7byxDH4T4WaypusXMFcWOuR4YReVWJMy9wnY"
}

response = requests.get(url, headers=headers)

print(response.text)