In [2]:
# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import randint, uniform

# Sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor  # For regression
from sklearn.ensemble import RandomForestClassifier  # For classification
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.neighbors import KNeighborsRegressor

pd.options.display.float_format = '{:.2f}'.format

## IMPORT & EXPLORE

#### USER - MOVIES DATA

In [7]:
movielens_user_movies_sample_interaction_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\movielens_user_movies_sample_interaction_data.csv.gz")

In [9]:
movielens_user_movies_sample_interaction_data.sample(5)

Unnamed: 0,movie_id,user_id,user_rating,movie_title,year,clean_movie_title,tconst,isadult,runtimeminutes,genres,genres_code,imdb_avg_rating,imdb_num_votes,user_zip_code,user_gender,bucketized_user_age,user_occupation_label,user_occupation_text,user_gender_code
73186,2433,5355,4.0,"Civil Action, A (1998)",1998,A Civil Action,tt0120633,0,115.0,"Biography,Drama",131,6.6,31689,78232,True,56,11,other/not specified,1
305062,3751,677,2.0,Chicken Run (2000),2000,Chicken Run,tt0120630,0,84.0,"Adventure,Animation,Comedy",61,7.1,217587,94122,True,50,1,artist,1
186957,1704,1150,2.0,Good Will Hunting (1997),1997,Good Will Hunting,tt0119217,0,126.0,"Drama,Romance",280,8.3,1120048,75226,False,25,21,writer,0
266078,2906,2223,1.0,Random Hearts (1999),1999,Random Hearts,tt0156934,0,133.0,"Drama,Mystery,Romance",275,5.4,24517,60123,True,25,1,artist,1
93855,3608,1496,4.0,Pee-wee's Big Adventure (1985),1985,Pee-wee's Big Adventure,tt0089791,0,91.0,"Adventure,Comedy,Family",69,7.0,60696,94108,True,25,18,technician/engineer,1


In [11]:
movielens_user_movies_sample_interaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356834 entries, 0 to 356833
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   movie_id               356834 non-null  int64  
 1   user_id                356834 non-null  int64  
 2   user_rating            356834 non-null  float64
 3   movie_title            356834 non-null  object 
 4   year                   356834 non-null  int64  
 5   clean_movie_title      356834 non-null  object 
 6   tconst                 356834 non-null  object 
 7   isadult                356834 non-null  int64  
 8   runtimeminutes         356834 non-null  float64
 9   genres                 356834 non-null  object 
 10  genres_code            356834 non-null  int64  
 11  imdb_avg_rating        356834 non-null  float64
 12  imdb_num_votes         356834 non-null  int64  
 13  user_zip_code          356834 non-null  int64  
 14  user_gender            356834 non-nu

#### MODELS IMPORT

##### HGB 

In [38]:
from joblib import load

movielens_user_movies_rating_hist_gradient_boosting = load(r"C:\Users\User\Desktop\MovieLens Project\models\movielens_user_movies_rating_hist_gradient_boosting.joblib")

In [18]:
movielens_user_movies_rating_hist_gradient_boosting

#### Extra Trees

In [308]:
movielens_user_movies_rating_extra_trees = load(r"C:\Users\User\Desktop\MovieLens Project\models\movielens_user_movies_rating_extra_trees.joblib")

In [310]:
movielens_user_movies_rating_extra_trees

## System Building

Our recommender system will have 3 conditions to satisfy to recommend certain movie to certain user:

1) User has unpacked recommendation list - fastest condition , it must be to avoid scenarios where users has hundreds or even thousands of recommended items. Can be expressed as :  recommendation_list length < n
   
2) User have not seen certain item , obviously people will not be happy if we simply will recommend them to watch movies they already watched and liked , or at least , it will not give a chance to a new movies that users could like too. can be expressed as :
movie_id not in user_watch_list.

3) If 2 previous conditions met , then we use user features and movie features to pass them to the model and calculate the estimated rating , if calculated rating > threshold then we recommend this item , otherwise we keep searching. Can be expressed as e > t 


In order to run through step by step , let's first build function that will make for us this 'watch_list' for each user.

In [12]:
movielens_watch_history = movielens_user_movies_sample_interaction_data[['movie_id','user_id']] # we only need to know if user watched movie

In [28]:
movielens_watch_history = movielens_ratings_data[['movie_id','user_id']]

In [30]:
def create_user_watch_list(user_data, watch_history):
    """
    Creates a DataFrame containing the watch list for each user.
    
    Parameters:
    - user_data: DataFrame with at least a 'user_id' column.
    - watch_history: DataFrame with 'user_id' and 'movie_id' columns.
    
    Returns:
    - DataFrame with 'user_id' and 'watch_list' (set of watched movie_ids).
    """
    watch_list = watch_history.groupby("user_id")["movie_id"].apply(set).reset_index()
    
    return watch_list

In [52]:
# now , we need 2 more datasets , one for available movies and one for users 

movielens_users = movielens_user_movies_sample_interaction_data[['user_id','user_zip_code','user_gender_code',
                                                                 'bucketized_user_age','user_occupation_label']].drop_duplicates()

movielens_movies = movielens_user_movies_sample_interaction_data[['movie_id','year','runtimeminutes',
                                                                  'genres_code','imdb_avg_rating','imdb_num_votes']].drop_duplicates()


In [32]:
movielens_users_watch_list = create_user_watch_list(user_data = movielens_users ,
                                                    watch_history = movielens_watch_history)

In [34]:
movielens_users_watch_list.sample(5)

Unnamed: 0,user_id,movie_id
2908,2939,"{3072, 1921, 904, 2058, 2571, 1293, 3342, 912,..."
1629,1648,"{513, 898, 2051, 903, 908, 527, 2067, 1172, 19..."
5426,5486,"{1089, 555, 910, 592, 144, 1617, 2268, 2903, 1..."
5753,5817,"{2881, 2531, 2694, 1127, 2541, 3005, 2546, 235..."
476,481,"{2944, 2947, 2948, 357, 2950, 3591, 904, 3334,..."


In [312]:
def recommend_movies(users_data, 
                     movies_data, 
                     watch_list, 
                     model, 
                     recommendation_threshold, 
                     maximum_recommendation_list_size):
    """
    Generate personalized movie recommendations for each user.

    Parameters:
    users_data (pd.DataFrame): User dataset containing user features.
    movies_data (pd.DataFrame): Movie dataset containing movie features.
    watch_list (dict): Dictionary mapping user_id to watched movie_ids.
    model: Trained rating prediction model.
    recommendation_threshold (float): Minimum predicted rating required for recommendation.
    maximum_recommendation_list_size (int): Maximum number of recommendations per user.

    Returns:
    dict: {user_id: list of recommended movie_ids}
    """
    feature_order = [
        "year", "runtimeminutes", "genres_code", "imdb_avg_rating", "imdb_num_votes",
        "user_zip_code", "bucketized_user_age", "user_occupation_label", "user_gender_code"
    ]

    recommendations = {}

    for _, user in users_data.iterrows():
        user_id = user["user_id"]
        seen_movies = watch_list.get(user_id, set())

        recommended_movies = []
        user_features = user.drop("user_id").to_frame().T  # Retain feature names

        # Create input features dynamically for each user
        movie_features = movies_data.drop("movie_id", axis=1)
        user_repeated = pd.concat([user_features] * len(movies_data), ignore_index=True)

        input_features = pd.concat([movie_features.reset_index(drop=True), user_repeated], axis=1)
        input_features = input_features[feature_order]  # Ensure correct column order

        predicted_ratings = model.predict(input_features)

        # Rank movies by predicted ratings
        movie_scores = list(zip(movies_data["movie_id"], predicted_ratings))
        movie_scores = sorted(movie_scores, key=lambda x: x[1], reverse=True)

        for movie_id, rating in movie_scores:
            if len(recommended_movies) >= maximum_recommendation_list_size:
                break
            if movie_id in seen_movies:
                continue
            if rating > recommendation_threshold:
                recommended_movies.append(movie_id)

        recommendations[user_id] = recommended_movies

    return recommendations

#### Extended Movies dataset

In [254]:
movielens_movies.shape[0]

# our original subset has only 2886 movies , let's extend thi list using imdb data

2886

In [303]:
imdb_title_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\imdb_data\title.basics.tsv.gz",sep = '\t')

imdb_ratings_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\imdb_data\title.ratings.tsv.gz",sep = '\t')


  imdb_title_data = pd.read_csv(r"C:\Users\User\Desktop\MovieLens Project\data\imdb_data\title.basics.tsv.gz",sep = '\t')


In [435]:
imdb_title_data = imdb_title_data[(imdb_title_data['titleType'] == 'movie')]

In [437]:
movielens_user_movies_sample_interaction_data_tconst_list = list(movielens_user_movies_sample_interaction_data['tconst'].drop_duplicates())

In [439]:
imdb_title_data_not_added_movies = imdb_title_data[~imdb_title_data['tconst'].isin(movielens_user_movies_sample_interaction_data_tconst_list)]

In [441]:
imdb_title_data_not_added_movies = imdb_title_data_not_added_movies.drop(columns = ['endYear']).replace(r'\N',np.nan).dropna()

In [443]:
imdb_title_data_not_added_movies = pd.merge(imdb_title_data_not_added_movies,
                                            imdb_ratings_data).drop_duplicates()

In [447]:
imdb_title_data_not_added_movies = imdb_title_data_not_added_movies[['tconst','startYear','runtimeMinutes',
                                                                     'genres','averageRating','numVotes']]

In [451]:
max_movie_id = movielens_user_movies_sample_interaction_data["movie_id"].max()
imdb_title_data_not_added_movies["movie_id"] = range(max_movie_id + 1, max_movie_id + 1 + len(imdb_title_data_not_added_movies))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_title_data_not_added_movies["movie_id"] = range(max_movie_id + 1, max_movie_id + 1 + len(imdb_title_data_not_added_movies))


In [455]:
imdb_title_data_not_added_movies.columns = movielens_user_movies_sample_interaction_data[['tconst','year','runtimeminutes',
                                                                                          'genres','imdb_avg_rating','imdb_num_votes','movie_id']].columns

In [459]:
movielens_movies_ext = pd.concat( [movielens_user_movies_sample_interaction_data[imdb_title_data_not_added_movies.columns] ,
                                   imdb_title_data_not_added_movies] , axis = 0, 
                                 ).drop_duplicates()

In [463]:
print(movielens_user_movies_sample_interaction_data['genres'].nunique())

print(movielens_movies_ext['genres'].nunique())

# as we can see , our extended movie list has a lot more different genres , but model was trained only on original 319 ones
# so , i will keep only movies that has genres which model was trained on 
# but of course , model can be retrained on larger number of genres 

319
1279


In [465]:
movie_genres_trained = list(movielens_user_movies_sample_interaction_data['genres'].unique())

movielens_movies_ext_filtered = movielens_movies_ext[movielens_movies_ext['genres'].isin(movie_genres_trained)]

In [None]:
###################

In [478]:
# now we need to ensure properly categorical encoding

genre_mapping = dict(movielens_user_movies_sample_interaction_data[['genres', 'genres_code']].drop_duplicates().values)

movielens_movies_ext_filtered['genres_code'] = movielens_movies_ext_filtered['genres'].map(genre_mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movielens_movies_ext_filtered['genres_code'] = movielens_movies_ext_filtered['genres'].map(genre_mapping)


In [480]:
movielens_movies_ext_filtered['genres_code'].unique() == movielens_user_movies_sample_interaction_data['genres_code'].unique()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [692]:
movielens_movies_ext_filtered = movielens_movies_ext_filtered.sample(frac = 1 ,
                                                                     replace = False,
                                                                     random_state = 42) # let's also shuffle it 

movielens_movies_ext_filtered['year'] = movielens_movies_ext_filtered['year'].astype('int')
movielens_movies_ext_filtered['runtimeminutes'] = movielens_movies_ext_filtered['runtimeminutes'].astype('int')
movielens_movies_ext_filtered = movielens_movies_ext_filtered[(movielens_movies_ext_filtered['year'] >= 1990)]
# let's also kepp more new movies


Now, our extended version of movie dataset , can be used to make recommendations , and with a lot more of choice , users will recieve more interesting recommendations.

#### Making Recommendations

In [358]:
movielens_movies_hist_gradient_boosting_recommendations = recommend_movies( users_data = movielens_users.sample(30) ,
                                                                            movies_data = movielens_movies_ext_filtered.sample(100000),
                                                                            watch_list = movielens_users_watch_list ,
                                                                            model = movielens_user_movies_rating_hist_gradient_boosting ,
                                                                            recommendation_threshold = 3.9 ,
                                                                            maximum_recommendation_list_size = 5 )

In [360]:
movielens_movies_hist_gradient_boosting_recommendations

{5500: [110214, 62109, 116590, 1411, 238593],
 4370: [259833, 151549, 527, 228603, 208653],
 5624: [175376, 285892, 267743, 243794, 250173],
 1380: [65609, 98163, 195938, 112190, 253648],
 4775: [223792, 268416, 178993, 527, 245794],
 3869: [161533, 161598, 213095, 217374, 170470],
 5986: [272013, 116471, 179370, 218476, 64030],
 5389: [174724, 80256, 118890, 231900, 271582],
 4496: [189735, 2682, 260294, 230688, 268653],
 5799: [285113, 189172, 258435, 174177, 217751],
 2888: [527, 223792, 144096, 259188, 245794],
 2876: [86346, 65042, 109925, 98147, 93123],
 5049: [228603, 177986, 527, 58817, 296],
 805: [260294, 230688, 268653, 274131, 202594],
 6037: [222619, 186541, 205978, 210163, 252869],
 2980: [527, 140854, 189718, 279068, 86564],
 2032: [527, 2762, 154545, 118890, 216217],
 4811: [55958, 182754, 186541, 64314, 82782],
 2246: [228603, 238166, 207518, 223751, 233834],
 441: [274131, 202594, 170052, 245574, 189735],
 5080: [260824, 115427, 527, 145171, 213452],
 2555: [189735, 1

Built function works well and can work with different parameters we pass , when we pass larger number of users for which we need to recommend movies or larget available list of movies that can be recommended , or even both , it definitely slows down the procces , but it's reasonable cost , especially if we keep in mind that we can pack recommendation list for hundreds of users for about couple of minutes using only single Intel Core i5 11400 Machine and 16GB RAM.

Let's reviews some parameters to understand why function can work faster or slower better:

users_data - that is dataset with unique users , function has to make recommendations for each user if it's possible , so , when size of dataset (number of users) lower , function works faster. 

movies_data - that's also a dataset , it contains informaion about each unique movie , and the conditions the same.

model - that's the model that will be used , altough we cannot directly control this parameter to affect speed , we can build different models to improve speed performance.

recommendation_threshold - that's the threshold which allows function decide if certain movie can be recommended to user or not , when threshold is higher , model usually need to build more predictions before it will met this condition and will be able to recommend movie.

maximum_recommendation_list_size - that's the maximum number of recommended items for single user , when we enter large number , model will have to make more recommendations for each single user , and as a result will make more predictions overall.

_
-

**The total computation time for the function can be expressed as follows:**

**𝑇
≈
𝐶
1
⋅
𝑈
⋅
𝑀
+
𝐶
2
⋅
𝑈
⋅
𝑃
+
𝐶
3
⋅
𝑈
⋅
𝑅
T≈C 
1
​
 ⋅U⋅M+C 
2
​
 ⋅U⋅P+C 
3
​
 ⋅U⋅R**
Where:

𝑈
U = Number of users in users_data
𝑀
M = Number of movies in movies_data
𝑃
P = Number of predictions required per user
𝑅
R = maximum_recommendation_list_size
𝐶
1
,
𝐶
2
,
𝐶
3
C 
1
​
 ,C 
2
​
 ,C 
3
​
  = Constants representing computational complexity of respective operations
Explanation:
Cartesian Product (Users × Movies):

This step merges all users with all movies, leading to 
𝑈
⋅
𝑀
U⋅M operations.
The larger 
𝑈
U and 
𝑀
M, the more combinations to evaluate.
Model Predictions:

Each user-movie pair requires a prediction.
The threshold affects this: a higher recommendation_threshold means more predictions are needed before valid recommendations are found.
Let 
𝑃
P be the average number of predictions made per user before filtering out bad recommendations.
Sorting & Selecting Top Recommendations:

Once valid recommendations are found, sorting and filtering take additional time.
The higher maximum_recommendation_list_size, the more filtering and sorting required, adding complexity proportional to 
𝑈
⋅
𝑅
U⋅R.


When recommendation list packed , we can show basic movie info for each movie id obtained , like title , year , writer and so on.

In [663]:
movielens_movies_ext_filtered_attributes = pd.merge(imdb_title_data,
                                                    movielens_movies_ext_filtered).drop_duplicates()[['tconst','movie_id','primaryTitle',
                                                                                                      'year','genres']]



In [46]:
def get_movie_attributes( recommendation_dict ,
                          moive_attributes ):

    recommendation_df = pd.DataFrame(recommendation_dict).melt() 
    recommendation_df.columns = ['user_id','movie_id']


    recommended_movie_attributes = pd.merge( recommendation_df ,
                                             moive_attributes )


    return recommended_movie_attributes


In [362]:
get_movie_attributes(recommendation_dict = movielens_movies_hist_gradient_boosting_recommendations ,
                     moive_attributes = movielens_movies_ext_filtered_attributes)

Unnamed: 0,user_id,movie_id,tconst,primaryTitle,year,genres
0,5500,110214,tt0346336,The Best of Youth,2003,"Drama,Romance"
1,5500,62109,tt0111341,Satantango,1994,Drama
2,5500,116590,tt0395524,Eiffel... I'm in Love,2003,"Comedy,Romance"
3,5500,1411,tt0116477,Hamlet,1996,Drama
4,5500,238593,tt3854496,Ambiancé,2020,Documentary
...,...,...,...,...,...,...
146,3453,80256,tt0182944,Cry Havoc,1999,"Drama,War"
147,3453,259833,tt6019206,Kill Bill: The Whole Bloody Affair,2006,"Action,Crime,Thriller"
148,3453,151549,tt12361974,Zack Snyder's Justice League,2021,"Action,Adventure,Fantasy"
149,3453,149122,tt11963556,Joda Thath Ka,2020,"Comedy,Drama,Family"


Let's also try different model 

In [348]:
movielens_movies_extra_trees_recommendations = recommend_movies( users_data = movielens_users.sample(30) ,
                                                                 movies_data = movielens_movies_ext_filtered.sample(100000),
                                                                 watch_list = movielens_users_watch_list ,
                                                                 model = movielens_user_movies_rating_extra_trees ,
                                                                 recommendation_threshold = 3.9 ,
                                                                 maximum_recommendation_list_size = 5 )

In [350]:
movielens_movies_extra_trees_recommendations

{2155: [318, 296, 3578, 2571, 150],
 263: [318, 246, 2571, 593, 76412],
 224: [318, 50, 593, 2571, 296],
 2357: [593, 50, 296, 246, 1411],
 3334: [318, 593, 76412, 3114, 1],
 4733: [2028, 296, 50, 76412, 318],
 2906: [318, 2571, 1, 3949, 1361],
 3646: [318, 1, 2762, 593, 2571],
 2766: [1, 50, 3114, 2762, 3897],
 191: [50, 3949, 296, 593, 110],
 2204: [318, 2762, 608, 3897, 593],
 4216: [318, 593, 50, 2762, 2571],
 5991: [2571, 2028, 2762, 1732, 608],
 1972: [318, 2028, 1192, 593, 2762],
 4059: [318, 50, 2028, 76412, 124545],
 382: [2571, 318, 2762, 110, 2628],
 4574: [318, 296, 50, 124545, 593],
 1951: [2571, 3949, 306, 593, 1],
 1243: [318, 76412, 2571, 50, 124545],
 2004: [318, 246, 593, 608, 3114],
 5759: [3578, 318, 2571, 50, 246],
 2402: [318, 2028, 296, 593, 110],
 335: [318, 1, 2571, 2762, 3949],
 2142: [318, 2571, 50, 2762, 110],
 4537: [2762, 593, 3114, 110, 364],
 1176: [593, 50, 2028, 2762, 608],
 4540: [50, 246, 3578, 593, 1],
 5449: [2571, 296, 50, 2762, 318],
 1665: [318,

In [352]:
get_movie_attributes(recommendation_dict = movielens_movies_extra_trees_recommendations ,
                     moive_attributes = movielens_movies_ext_filtered_attributes)

Unnamed: 0,user_id,movie_id,tconst,primaryTitle,year,genres
0,2155,318,tt0111161,The Shawshank Redemption,1994,Drama
1,2155,296,tt0110912,Pulp Fiction,1994,"Crime,Drama"
2,2155,3578,tt0172495,Gladiator,2000,"Action,Adventure,Drama"
3,2155,2571,tt0133093,The Matrix,1999,"Action,Sci-Fi"
4,2155,150,tt0112384,Apollo 13,1995,"Adventure,Drama,History"
...,...,...,...,...,...,...
147,1461,446,tt0106332,Farewell My Concubine,1993,"Drama,Music,Romance"
148,1461,246,tt0110057,Hoop Dreams,1994,"Documentary,Drama,Sport"
149,1461,1280,tt0101640,Raise the Red Lantern,1991,"Drama,Romance"
150,1461,1147,tt0118147,When We Were Kings,1996,"Documentary,Sport"


Both model are able to make fairly accurate recommendations from users , howeever , the ExtraTrees model is significantly bigger (1.3 GB) comparing to HGB (4.4 MB) , and HGM also faster , given that , HGB is very attractive option for our task , the ExtraTrees model can be considered as side option.