In [2]:
# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import randint, uniform

# Sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor  # For regression
from sklearn.ensemble import RandomForestClassifier  # For classification
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.neighbors import KNeighborsRegressor

pd.options.display.float_format = '{:.2f}'.format

## IMPORT & EXPLORE

#### MODELS

##### HGB

In [5]:
from joblib import load 

movielens_user_movies_rating_hist_gradient_boosting = load(r"C:\Users\User\Desktop\MovieLens Project\models\movielens_user_movies_rating_hist_gradient_boosting.joblib")


In [7]:
movielens_user_movies_rating_hist_gradient_boosting

#### FUNCTIONS

##### WATCH LIST CREATION

In [47]:
def create_user_watch_list(users_data, watch_history):
    """
    Creates a DataFrame containing the watch list for each user.
    
    Parameters:
    - users_data: DataFrame with at least a 'user_id' column.
    - watch_history: DataFrame with 'user_id' and 'movie_id' columns.
    
    Returns:
    - DataFrame with 'user_id' and 'watch_list' (set of watched movie_ids).
    """
    watch_list = watch_history.groupby("user_id")["movie_id"].apply(set).reset_index() # groupping 
    
    return watch_list

##### RECOMMENDATION PROCESS

In [71]:
def recommend_movies(users_data, 
                     movies_data, 
                     watch_list, 
                     model, 
                     recommendation_threshold, 
                     maximum_recommendation_list_size):
    """
    Generate personalized movie recommendations for each user.

    Parameters:
    users_data (pd.DataFrame): User dataset containing user features.
    movies_data (pd.DataFrame): Movie dataset containing movie features.
    watch_list (dict): Dictionary mapping user_id to watched movie_ids.
    model: Trained rating prediction model.
    recommendation_threshold (float): Minimum predicted rating required for recommendation.
    maximum_recommendation_list_size (int): Maximum number of recommendations per user.

    Returns:
    dict: {user_id: list of recommended movie_ids}
    """
    feature_order = [
        "year", "runtimeminutes", "genres_code", "imdb_avg_rating", "imdb_num_votes",
        "user_zip_code", "bucketized_user_age", "user_occupation_label", "user_gender_code"
    ]

    recommendations = {}

    for _, user in users_data.iterrows():
        user_id = user["user_id"]
        seen_movies = watch_list.get(user_id, set())

        recommended_movies = []
        user_features = user.drop("user_id").to_frame().T  # Retain feature names

        # Create input features dynamically for each user
        movie_features = movies_data.drop("movie_id", axis=1)
        user_repeated = pd.concat([user_features] * len(movies_data), ignore_index=True)

        input_features = pd.concat([movie_features.reset_index(drop=True), user_repeated], axis=1)
        input_features = input_features[feature_order]  # Ensure correct column order

        predicted_ratings = model.predict(input_features)

        # Rank movies by predicted ratings
        movie_scores = list(zip(movies_data["movie_id"], predicted_ratings))
        movie_scores = sorted(movie_scores, key=lambda x: x[1], reverse=True)

        for movie_id, rating in movie_scores:
            if len(recommended_movies) >= maximum_recommendation_list_size:
                break
            if movie_id in seen_movies:
                continue
            
            # Get IMDb votes for the current movie
            imdb_votes = movies_data.loc[movies_data["movie_id"] == movie_id, "imdb_num_votes"].values[0]

            if rating > recommendation_threshold and imdb_votes >= 1000:
                recommended_movies.append(movie_id)

        recommendations[user_id] = recommended_movies

    return recommendations

##### MOVIE ATTRIBUTES FETCHING

In [61]:
def get_movie_attributes( recommendation_dict ,
                          movie_attributes ):

    recommendation_df = pd.DataFrame(recommendation_dict).melt() 
    recommendation_df.columns = ['user_id','movie_id']


    recommended_movie_attributes = pd.merge( recommendation_df ,
                                             movie_attributes )


    return recommended_movie_attributes


#### DATA

In [4]:
movielens_users_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\movielens_users_data.csv.gz")

movielens_ratings_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\movielens_ratings_data.csv.gz")


In [46]:
movielens_movies_ext = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\movielens_movies_ext_filtered.csv.gz")

movielens_movies_ext_attributes = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\movielens_movies_ext_filtered_attributes.csv.gz")


## IMPLEMENTATION

#### RECOMMENDER SYSTEM TESTING

In [51]:
# First , let's create watch history 

movielens_watch_history = movielens_ratings_data[['user_id','movie_id']]


# Now , let's use function 

movielens_watch_list = create_user_watch_list(users_data = movielens_users_data,
                                              watch_history = movielens_watch_history)

movielens_watch_list.sample(5)

Unnamed: 0,user_id,movie_id
5275,5331,"{1, 260, 1288, 527, 2710, 3862, 2712, 1304, 13..."
5971,6038,"{920, 1354, 1419, 3396}"
2167,2192,"{2, 648, 3593, 2826, 1291, 2572, 2193, 3489, 3..."
5146,5201,"{3203, 260, 1674, 1419, 908, 909, 910, 912, 91..."
3543,3580,"{1527, 260, 2571, 2699, 1037, 2446, 2468, 2490..."


In [73]:
# Now , we can use recommender function 

movielens_user_ratings_hist_gradient_boosting_recommendations = recommend_movies( users_data = movielens_users_data.sample(5) ,
                                                                                  movies_data = movielens_movies_ext.sample(100000),
                                                                                  watch_list = movielens_watch_list,
                                                                                  model = movielens_user_movies_rating_hist_gradient_boosting,
                                                                                  recommendation_threshold = 3.75 ,
                                                                                  maximum_recommendation_list_size = 5 )


In [75]:
# now , we can use this dictionary object to obtain movie attributes , thanks to prepared function

movielens_user_ratings_hist_gradient_boosting_recommendations

{2681: [175830, 180291, 204627, 210832, 93038],
 2065: [318, 2571, 180291, 50, 175847],
 335: [213623, 220725, 215412, 188980, 180291],
 3625: [318, 50, 65609, 129788, 76413],
 4709: [252807, 180291, 151549, 190582, 230895]}

In [77]:
movielens_user_ratings_hist_gradient_boosting_recommendations_attributes = get_movie_attributes( recommendation_dict = movielens_user_ratings_hist_gradient_boosting_recommendations ,
                                                                                                 movie_attributes = movielens_movies_ext_attributes )

In [79]:
movielens_user_ratings_hist_gradient_boosting_recommendations_attributes

Unnamed: 0,user_id,movie_id,tconst,primaryTitle,year,genres
0,2681,175830,tt1579361,Chasing Ice,2012,"Biography,Documentary"
1,2681,180291,tt16747572,The Silence of Swastika,2021,"Documentary,History"
2,2681,204627,tt2321405,My Life as a Zucchini,2016,"Animation,Comedy,Drama"
3,2681,210832,tt2569398,And Here's What's Happening to Me,2012,"Comedy,Drama"
4,2681,93038,tt0245280,Michael Jordan to the Max,2000,"Documentary,Sport"
5,2065,318,tt0111161,The Shawshank Redemption,1994,Drama
6,2065,2571,tt0133093,The Matrix,1999,"Action,Sci-Fi"
7,2065,180291,tt16747572,The Silence of Swastika,2021,"Documentary,History"
8,2065,50,tt0114814,The Usual Suspects,1995,"Crime,Drama,Mystery"
9,2065,175847,tt1579592,Kick,2009,"Action,Comedy,Crime"
