In [116]:
import pandas as pd
from pydantic import BaseModel
from surprise import Dataset, Reader, SVD
import ast
import warnings
warnings.filterwarnings("ignore")

In [117]:
class RecommendationRequest(BaseModel):
    """Request payload consumed by `predict`: the movies the caller marked as favourites."""
    # Identifiers that `predict` writes into / matches against the `tmdb_id` columns.
    # Declared as a bare `list`, so the element type is not constrained by this annotation.
    favorite_movies: list

# Tunable constants of the hybrid recommender.
FAVORITE_RATING = 5.0        # rating recorded for every favourite of the new user
DUPLICATE_BONUS_ALPHA = 0.1  # bonus per favourite whose similarity list contains the same candidate
WEIGHT_COLLABORATIVE = 1     # weight of the collaborative-filtering predicted rating
WEIGHT_CONTENT = 5           # weight of the content-based score


def _collaborative_scores(ratings: pd.DataFrame, user_id) -> pd.DataFrame:
    """Fit an SVD model on `ratings` and predict `user_id`'s rating for every movie they have not rated.

    Returns a DataFrame with columns `predicted_rating` and `tmdb_id`,
    sorted by descending predicted rating.
    """
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings[['userId', 'tmdb_id', 'rating']], reader)
    svd = SVD()
    svd.fit(data.build_full_trainset())

    # A set gives O(1) membership tests; the previous list scan was O(#movies * #favourites).
    already_seen = set(ratings.loc[ratings['userId'] == user_id, 'tmdb_id'].tolist())
    never_seen = [movie for movie in ratings['tmdb_id'].unique().tolist() if movie not in already_seen]

    predictions = [svd.predict(user_id, movie) for movie in never_seen]
    scores = pd.DataFrame(
        [(prediction.est, prediction.iid) for prediction in predictions],
        columns=['predicted_rating', 'tmdb_id'],
    )
    return scores.sort_values(by='predicted_rating', ascending=False)


def _content_scores(content: pd.DataFrame, favorite_movies, alpha: float) -> pd.DataFrame:
    """Aggregate the stored similarity lists of the favourite movies into one score per candidate.

    `content['similarities']` is parsed with `ast.literal_eval` and each parsed entry is
    read through its `tmdb_id` and `score` keys. A candidate listed by several favourites
    gets `mean(score) + alpha * count`; a candidate listed once keeps its raw score.

    Returns a DataFrame with columns `tmdb_id`, `score`, `count_duplicates`, `score_content`
    sorted by descending `score_content`; it is empty when no favourite is found in `content`.
    """
    # Parse straight from the selected column: no assignment on a filtered slice,
    # so the global table is never touched and no SettingWithCopy situation arises.
    raw_similarities = content.loc[content['tmdb_id'].isin(favorite_movies), 'similarities']
    candidates = [
        {'tmdb_id': entry['tmdb_id'], 'score': entry['score']}
        for raw in raw_similarities
        for entry in ast.literal_eval(raw)
    ]
    if not candidates:
        # pd.DataFrame([]) has no 'tmdb_id' column, so the groupby below would raise KeyError.
        return pd.DataFrame(columns=['tmdb_id', 'score', 'count_duplicates', 'score_content'])

    scores = (
        pd.DataFrame(candidates)
        .groupby('tmdb_id')
        .agg(score=('score', 'sum'), count_duplicates=('score', 'size'))
        .reset_index()
    )

    # Vectorised form of the former row-wise lambda.
    boosted = scores['score'] / scores['count_duplicates'] + alpha * scores['count_duplicates']
    scores['score_content'] = boosted.where(scores['count_duplicates'] > 1, scores['score'])
    return scores.sort_values(by='score_content', ascending=False)


### POST ###
def predict(recommendation_request: RecommendationRequest):
    """Recommend movies for a brand-new user described only by their favourite movies.

    Steps:
      1. Register the favourites as ratings of a new user id (max existing `userId` + 1).
      2. Score candidates from the content-based similarity table (`content_based` global).
      3. Train an SVD model on all ratings and predict the new user's rating for unseen movies.
      4. Left-merge collaborative predictions onto the content candidates and combine them as
         `WEIGHT_COLLABORATIVE * predicted_rating + WEIGHT_CONTENT * score_content`;
         candidates without a prediction use the mean predicted rating of the merged rows.

    Side effect: the module-level `ratings_updated` table is permanently extended with the
    new user's ratings on every call.

    Args:
        recommendation_request: payload whose `favorite_movies` holds the favourite tmdb ids.

    Returns:
        A list of dicts with keys `tmdb_id`, `predicted_rating`, `score_content` and
        `final_score`, sorted by descending `final_score`. `predicted_rating` is NaN for
        candidates the collaborative model did not score. An empty list is returned when none
        of the favourites yields a content-based candidate.
    """
    global ratings_updated
    favorite_movies = list(recommendation_request.favorite_movies)

    ### COLLABORATIVE FILTERING: register the new user ###
    new_user_id = ratings_updated['userId'].max() + 1
    if favorite_movies:
        # One concat for all favourites; concatenating inside a loop recopies the
        # whole ratings table once per movie.
        new_ratings = pd.DataFrame({
            'userId': new_user_id,
            'tmdb_id': favorite_movies,
            'rating': FAVORITE_RATING,
        })
        ratings_updated = pd.concat([ratings_updated, new_ratings], ignore_index=True)

    ### CONTENT BASED FILTERING ###
    result_content = _content_scores(content_based, favorite_movies, DUPLICATE_BONUS_ALPHA)
    if result_content.empty:
        # The final result is a left merge on the content candidates, so with no candidates
        # there is nothing to recommend; skip the expensive SVD training.
        return []

    ### COLLABORATIVE FILTERING: train and predict ###
    result_collaborative = _collaborative_scores(ratings_updated, new_user_id)

    ### COMBINE BOTH MODELS ###
    # NOTE(review): content candidates are not filtered against `favorite_movies`, so a
    # favourite that appears in another favourite's similarity list can be returned - confirm
    # whether that is intended.
    result = pd.merge(result_content, result_collaborative, on='tmdb_id', how='left')

    average_predicted_rating = result['predicted_rating'].mean()
    result['final_score'] = (
        WEIGHT_COLLABORATIVE * result['predicted_rating'].fillna(average_predicted_rating)
        + WEIGHT_CONTENT * result['score_content']
    )

    result = result.sort_values(by='final_score', ascending=False)
    result = result[['tmdb_id', 'predicted_rating', 'score_content', 'final_score']]
    return result.to_dict(orient='records')

In [118]:
### FULL TEST ###

# Load the two tables that `predict` reads as module-level globals:
# the ratings table and the content-based similarity table.
ratings_updated = pd.read_csv('src/Movielens_ratings_updated.csv')
content_based = pd.read_csv('src/TMDB_content_based.csv')

# Sample favourites, matched by `predict` against the `tmdb_id` columns.
favorite_movies = [13448, 402, 204922]

# Despite its name, this holds the merged (content-based + collaborative) records returned by `predict`.
collaborative_filtering = predict(RecommendationRequest(favorite_movies=favorite_movies))

# Render the returned records as a table.
df_cf = pd.DataFrame(collaborative_filtering)
df_cf

Unnamed: 0,tmdb_id,predicted_rating,score_content,final_score
0,77,4.539778,0.434735,6.713454
1,2043,4.303231,0.400740,6.306933
2,207932,3.776866,0.484600,6.199866
3,484247,3.893831,0.400740,5.897533
4,11963,4.570814,0.244949,5.795559
...,...,...,...,...
81,176,3.658513,0.216930,4.743166
82,254128,3.643182,0.207973,4.683045
83,335866,3.727882,0.180151,4.628636
84,345,3.449460,0.198762,4.443268


In [13]:
# ## TEST FAST API

# import requests

# url = 'http://localhost:4000/predict'

# headers = {
#     'accept': 'application/json',
#     'Content-Type': 'application/json',
# }

# data = {
#     'favorite_movies': [13448, 402, 204922, 150020151526]
# }

# response = requests.post(url, headers=headers, json=data)

# if response.status_code == 200:
#     result = response.json()
#     df = pd.DataFrame(result)
#     display(df)
# else:
#     print(f"Error: {response.status_code}, {response.text}")

Unnamed: 0,tmdb_id,predicted_rating
0,582,4.909750
1,303,4.873357
2,38,4.863366
3,11104,4.860422
4,3782,4.842268
...,...,...
2457,11186,2.866117
2458,1370,2.863792
2459,9349,2.755760
2460,10157,2.680972
