In [3]:
import pandas as pd
from pydantic import BaseModel
from surprise import Dataset, Reader, SVD
import ast
import warnings
warnings.filterwarnings("ignore")

In [18]:
class RecommendationRequest(BaseModel):
    """Input payload of `predict`: the movies the user declares as favorites."""
    # Ids of the favorite movies. `predict` inserts them as `tmdb_id` values in
    # the ratings table and matches them against `content_based['tmdb_id']`.
    # NOTE(review): element type is not validated (bare `list`); presumably
    # integer TMDB ids are expected - confirm before tightening the annotation.
    favorite_movies: list

### POST ###
def predict(recommendation_request: RecommendationRequest):
    """Build hybrid (collaborative + content-based) recommendations for a new user.

    The favorite movies are treated as 5.0 ratings given by a brand-new user.
    An SVD model is trained on the existing ratings plus those rows and used to
    predict a rating for every movie the new user has not rated. Independently,
    the stored similarity lists of the favorite movies are aggregated into a
    content score. The two are blended as
    ``weight_collaborative * predicted_rating + weight_content * score_content``.

    Reads the module-level frames ``ratings_updated`` (columns ``userId``,
    ``tmdb_id``, ``rating``) and ``content_based`` (columns ``tmdb_id``,
    ``similarities``). Neither frame is modified: the synthetic user only
    exists in a local copy, so repeated calls do not accumulate fake users.

    Note: ``SVD()`` is created without a ``random_state``, so predicted ratings
    may differ slightly from one call to the next.

    Args:
        recommendation_request: request whose ``favorite_movies`` lists the ids
            of the movies the user likes.

    Returns:
        A list of dicts with keys ``tmdb_id``, ``predicted_rating``,
        ``score_content`` and ``final_score``, sorted by decreasing
        ``final_score``. ``predicted_rating`` is NaN for candidates absent from
        the ratings table. An empty list is returned when the favorites yield
        no content-based candidate (including an empty favorites list).
    """
    favorite_movies = list(recommendation_request.favorite_movies)

    ### CONTENT BASED FILTERING ###
    # Computed first: the final table is a left join on these candidates, so
    # without any candidate there is nothing to return and no model to train.
    result_content = _content_scores(content_based, favorite_movies)
    if result_content.empty:
        return []

    ### COLLABORATIVE FILTERING ###
    result_collaborative = _collaborative_scores(ratings_updated, favorite_movies)

    ### MERGE BOTH RESULTS ###
    # Left join keeps every content-based candidate (and their order), even the
    # ones that have no collaborative prediction.
    result = pd.merge(result_content, result_collaborative, on='tmdb_id', how='left')

    weight_collaborative = 1  # Weight of the collaborative-filtering model
    weight_content = 5  # Weight of the content-based model

    # Candidates without a predicted rating fall back to the mean prediction of
    # the other candidates, for the blend only (the column itself keeps NaN).
    average_predicted_rating = result['predicted_rating'].mean()
    result['final_score'] = (
        weight_collaborative * result['predicted_rating'].fillna(average_predicted_rating)
        + weight_content * result['score_content']
    )

    # Results
    result = result.sort_values(by='final_score', ascending=False)
    result = result.loc[:, ['tmdb_id', 'predicted_rating', 'score_content', 'final_score']]

    return result.to_dict(orient='records')


def _collaborative_scores(ratings, favorite_movies):
    """Train SVD on `ratings` plus a synthetic new user; predict unseen movies.

    Returns a frame with columns ``predicted_rating`` and ``tmdb_id``, one row
    per movie of the training table that is not in `favorite_movies`.
    `ratings` itself is left untouched.
    """
    rating_columns = ['userId', 'tmdb_id', 'rating']

    # Add the new user to a local copy of the ratings, in a single concat:
    new_user_id = ratings['userId'].max() + 1
    new_rows = pd.DataFrame({
        'userId': new_user_id,
        'tmdb_id': favorite_movies,
        'rating': 5.0,
    })
    training_ratings = pd.concat([ratings[rating_columns], new_rows], ignore_index=True)

    # Train the model with the new data:
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(training_ratings, reader)
    svd = SVD()
    svd.fit(data.build_full_trainset())

    # Isolate the movies the user never saw. The new user id is unused in
    # `ratings`, so the only movies it has rated are the favorites themselves.
    already_seen = set(favorite_movies)
    never_seen = [
        movie
        for movie in training_ratings['tmdb_id'].unique().tolist()
        if movie not in already_seen
    ]

    # Make predictions for the new user:
    return pd.DataFrame({
        'predicted_rating': [svd.predict(new_user_id, movie).est for movie in never_seen],
        'tmdb_id': never_seen,
    })


def _content_scores(content, favorite_movies, alpha=0.1):
    """Aggregate the similarity lists of the favorite movies into a content score.

    Each ``similarities`` cell of `content` is parsed with ``ast.literal_eval``
    and iterated as a sequence of mappings with ``tmdb_id`` and ``score`` keys.
    For a candidate listed by several favorites, the score is the mean of its
    scores plus ``alpha`` times the number of favorites listing it; a candidate
    listed once keeps its raw score.

    Returns a frame with columns ``tmdb_id``, ``score``, ``count_duplicates``
    and ``score_content`` sorted by decreasing ``score_content``; the frame is
    empty (same columns) when there is no candidate.
    """
    # Parse the stored lists without writing back into a slice of `content`.
    raw_similarities = content.loc[content['tmdb_id'].isin(favorite_movies), 'similarities']
    records = [
        {'tmdb_id': entry['tmdb_id'], 'score': entry['score']}
        for raw in raw_similarities
        for entry in ast.literal_eval(raw)
    ]

    neighbours = pd.DataFrame(records, columns=['tmdb_id', 'score'])
    # A favorite must never be recommended back to the user.
    neighbours = neighbours[~neighbours['tmdb_id'].isin(favorite_movies)]
    if neighbours.empty:
        return pd.DataFrame(columns=['tmdb_id', 'score', 'count_duplicates', 'score_content'])

    scores = (
        neighbours
        .groupby('tmdb_id')
        .agg(score=('score', 'sum'), count_duplicates=('score', 'size'))
        .reset_index()
    )

    # Calculate score_content (alpha rewards candidates shared by several favorites):
    boosted = scores['score'] / scores['count_duplicates'] + alpha * scores['count_duplicates']
    scores['score_content'] = boosted.where(scores['count_duplicates'] > 1, scores['score'])

    return scores.sort_values(by='score_content', ascending=False)

In [19]:
### FULL TEST ###
# End-to-end run of `predict` on three sample favorites, without the API layer.

# `predict` reads these two frames as module-level globals, so they must be
# loaded under exactly these names before it is called.
ratings_updated = pd.read_csv('src/Movielens_ratings_updated.csv')
content_based = pd.read_csv('src/TMDB_content_based.csv')

# Sample favorite movie ids used as the request payload.
favorite_movies = [671, 672, 673]

# Despite its name, this holds the blended records returned by `predict`
# (predicted_rating, score_content and final_score per tmdb_id).
collaborative_filtering = predict(RecommendationRequest(favorite_movies=favorite_movies))

# One row per recommended movie, in the order returned by `predict`.
df_result = pd.DataFrame(collaborative_filtering)
df_result

Unnamed: 0,tmdb_id,predicted_rating,score_content,final_score
0,674,4.69121,0.871121,9.046816
1,767,4.456324,0.878808,8.850365
2,675,4.448854,0.878808,8.842895
3,12445,4.510182,0.860418,8.812271
4,12444,4.663629,0.788561,8.606433
5,338952,4.04718,0.60327,7.063528
6,630,4.345959,0.53372,7.01456
7,899082,,0.586437,6.933988
8,34584,4.079947,0.566102,6.910458
9,338953,,0.579315,6.898376


In [12]:
# Summary statistics of the numeric columns of the recommendation table.
# NOTE(review): this cell's saved execution count (12) is lower than that of
# the cell building `df_result` (19), and the saved statistics (max
# final_score ~6.42) do not match the table displayed above (top final_score
# ~9.05) - the stored output looks stale; re-run after Restart & Run All.
df_result.describe()

Unnamed: 0,tmdb_id,predicted_rating,score_content,final_score
count,86.0,72.0,86.0,86.0
mean,199891.837209,4.11957,0.230238,5.270763
std,256896.786645,0.259412,0.055817,0.369961
min,77.0,3.311913,0.171499,4.221421
25%,1974.0,3.988856,0.2,5.045319
50%,38950.5,4.116033,0.214697,5.201147
75%,398735.5,4.301815,0.24605,5.462308
max,945729.0,4.861845,0.4846,6.423122


In [21]:
## TEST FAST API
# Smoke test: POST the sample favorites to the /predict endpoint on
# localhost:4000 and show the JSON answer as a table.
# Assumes the API server is already running locally - TODO confirm how it is started.

import requests

url = 'http://localhost:4000/predict'
headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
data = {'favorite_movies': [671, 672, 673]}

response = requests.post(url, headers=headers, json=data)

# Anything other than HTTP 200 is reported with its status code and raw body.
if response.status_code != 200:
    print(f"Error: {response.status_code}, {response.text}")
else:
    result = response.json()
    df = pd.DataFrame(result)
    display(df)

Unnamed: 0,tmdb_id,final_score
0,12445,8.822695
1,767,8.780641
2,675,8.711732
3,674,8.532117
4,12444,8.387245
5,673,8.037998
6,671,7.996707
7,672,7.891717
8,630,7.069815
9,899082,6.92547
