In [2]:
import pandas as pd
from pydantic import BaseModel
from surprise import Dataset, Reader, SVD

In [12]:
class RecommendationRequest(BaseModel):
    favorite_movies: list

### POST ###
def predict(recommendation_request: RecommendationRequest):
    global ratings_updated

    favorite_movies = recommendation_request.favorite_movies

    ### COLLABORATIVE FILTERING ###
    new_user_id = ratings_updated['userId'].max() + 1

    for i in favorite_movies:
        newdata = pd.DataFrame([[new_user_id, i, 5.0]], columns=['userId', 'tmdb_id', 'rating'])
        ratings_updated = pd.concat([ratings_updated, newdata], ignore_index=True)

    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings_updated[['userId', 'tmdb_id', 'rating']], reader)
    svd = SVD()
    train_set = data.build_full_trainset()
    svd.fit(train_set)

    all_pred = []
    movie_id = []
    for movie in ratings_updated['tmdb_id'].unique().tolist():
        pred = svd.predict(new_user_id, movie)
        all_pred.append(pred.est)
        movie_id.append(pred.iid)

    result_collaborative = pd.DataFrame(list(zip(movie_id, all_pred)), columns=['tmdb_id', 'predicted_rating'])
    result_collaborative.sort_values(by='predicted_rating', ascending=False, inplace=True)
    
    
    ### CONTENT BASED FILTERING ###
    filtered_content_based = content_based[content_based['tmdb_id'].isin(favorite_movies)]
    
    # Extract 'tmdb_id' and 'score' from the 'similarities' column
    result_list = []
    for _, row in filtered_content_based.iterrows():
        for entry in row['similarities']:
            result_list.append({
                'tmdb_id': entry['tmdb_id'],
                'score': entry['score']
            })

    # Create a new DataFrame from the extracted data
    result_content = pd.DataFrame(result_list)
    
    # Paramètres
    alpha = 0.1  # Remplacez cela par la valeur réelle d'alpha

    # Calcul de la somme des scores et du nombre de films en doublon pour chaque tmdb_id
    result_content = result_content.groupby('tmdb_id').agg({'score': 'sum', 'tmdb_id': 'count'})

    # Renommer la colonne 'tmdb_id' pour éviter la confusion avec l'index résultant de groupby
    result_content = result_content.rename(columns={'tmdb_id': 'count_duplicates'})

    # Appliquer la formule à chaque groupe
    result_content['score_content'] = result_content['tmdb_id'].map(result_content['score'] / result_content['count_duplicates'] + alpha * result_content['count_duplicates'])

    # Trier les résultats par score décroissant
    result_content.sort_values(by='score_content', ascending=False, inplace=True)
    
    
    ### REGROUPER LES FICHIERS ###
    # Merge left
    result = pd.merge(result_content, result_collaborative, on='tmdb_id', how='left')
    
    # Paramètres de poids
    weight_collaborative = 0.7  # Poids pour le modèle de filtrage collaboratif
    weight_content = 0.3  # Poids pour le modèle content-based

    # Calcul de la moyenne de predicted_rating en ignorant les valeurs NaN
    average_predicted_rating = result['predicted_rating'].mean()
    
    # Appliquer la formule
    result['final_score'] = (weight_collaborative * result['predicted_rating'].fillna(average_predicted_rating) + weight_content * result['score_content'].apply(lambda x: len(x)))
    
    # Trier les résultats par score décroissant
    result.sort_values(by='final_score', ascending=False, inplace=True)
    
    # Garder que les colonnes utiles
    result = result.loc[:,['tmdb_id', 'final_score']]
    
        
    return result.to_dict(orient='records')

In [11]:
## TEST COLLABORATIVE

ratings_updated = pd.read_csv('src/Movielens_ratings_updated.csv')
# content_based = pd.read_csv('src/content_based.csv')

favorite_movies = [13448, 402, 204922]

collaborative_filtering = predict(RecommendationRequest(favorite_movies=favorite_movies))

df_cf = pd.DataFrame(collaborative_filtering)
df_cf

Unnamed: 0,tmdb_id,predicted_rating
0,422,4.997204
1,274,4.978105
2,1480,4.954864
3,548,4.954569
4,769,4.930337
...,...,...
2456,11357,2.909688
2457,9349,2.853667
2458,11932,2.769435
2459,241,2.766791


In [13]:
## TEST FAST API

import requests

url = 'http://localhost:4000/predict'

headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}

data = {
    'favorite_movies': [13448, 402, 204922, 150020151526]
}

response = requests.post(url, headers=headers, json=data)

if response.status_code == 200:
    result = response.json()
    df = pd.DataFrame(result)
    display(df)
else:
    print(f"Error: {response.status_code}, {response.text}")

Unnamed: 0,tmdb_id,predicted_rating
0,582,4.909750
1,303,4.873357
2,38,4.863366
3,11104,4.860422
4,3782,4.842268
...,...,...
2457,11186,2.866117
2458,1370,2.863792
2459,9349,2.755760
2460,10157,2.680972
