In [None]:
import heapq
import os

import numpy as np
import pandas as pd
from surprise import AlgoBase, Dataset, KNNBaseline, PredictionImpossible, Reader
from surprise.model_selection import GridSearchCV
from surprise.trainset import Trainset

# Load the MovieLens-100k CSV tables used by the rest of the notebook.
data_dir = 'data_movie_lens_100k/'
user_info = pd.read_csv(os.path.join(data_dir, 'user_info.csv'))
item_info = pd.read_csv(os.path.join(data_dir, 'movie_info.csv'))
ratings = pd.read_csv(os.path.join(data_dir, 'ratings_all_development_set.csv'))
list_test = pd.read_csv(os.path.join(data_dir, 'ratings_masked_leaderboard_set.csv'))
# NOTE: list_train is read from the same file as `ratings`.
list_train = pd.read_csv(os.path.join(data_dir, 'ratings_all_development_set.csv'))

# Standardise user age (z-score: subtract the mean, divide by the std).
_age_column = user_info['age']
age_mean, age_std = _age_column.mean(), _age_column.std()
user_info['age_scaled'] = (_age_column - age_mean) / age_std

# Standardise movie release year the same way.
_year_column = item_info['release_year']
release_year_mean, release_year_std = _year_column.mean(), _year_column.std()
item_info['release_year_scaled'] = (_year_column - release_year_mean) / release_year_std

# Attach the scaled user and item features to every rating row.
_user_features = user_info[['user_id', 'age_scaled', 'is_male']]
_item_features = item_info[['item_id', 'release_year_scaled']]
merged = ratings.merge(_user_features, on='user_id').merge(_item_features, on='item_id')

def _pearson_or_zero(xs, ys):
    """Return the Pearson correlation of two equal-length rating lists.

    Returns 0.0 whenever the correlation is undefined (fewer than two
    points, or either series is constant) instead of letting NumPy emit a
    runtime warning and produce NaN.
    """
    if len(xs) < 2:
        return 0.0
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)
    if xs.std() == 0 or ys.std() == 0:
        return 0.0
    corr = np.corrcoef(xs, ys)[0, 1]
    return 0.0 if np.isnan(corr) else float(corr)


# Define a custom similarity function that incorporates user and item features
def custom_similarity(trainset, user_based=True):
    """
    Custom similarity function that includes user and item features.

    For every pair of users (or items) that share at least one co-rated
    item (or co-rating user), the Pearson correlation of the shared ratings
    is blended with a feature similarity:

    * user-based: ``0.5 * pearson + 0.2 * age_similarity + 0.3 * gender_similarity``
    * item-based: ``0.7 * pearson + 0.3 * year_similarity``

    Pairs with nothing in common, and the diagonal, are left at 0.0.
    Features are read from the module-level ``user_info`` / ``item_info``
    DataFrames (columns ``age_scaled``, ``is_male``, ``release_year_scaled``).

    Args:
        trainset (Trainset): The trainset object. ``ur`` / ``ir`` are read as
            mappings from inner id to a list of ``(inner_id, rating)`` pairs.
        user_based (bool): Whether to compute user-based or item-based similarity.

    Returns:
        numpy.ndarray: The symmetric similarity matrix, indexed by inner ids.

    Raises:
        KeyError: If a user/item that shares ratings with another one has no
            row in ``user_info`` / ``item_info``.
    """
    if user_based:
        n_entities = trainset.n_users
        ratings_by_entity = trainset.ur
        to_raw_id = trainset.to_raw_uid

        # Build the feature lookup tables once instead of filtering the
        # DataFrame four times for every pair of users.
        user_ids = user_info['user_id'].tolist()
        age_of = dict(zip(user_ids, user_info['age_scaled'].tolist()))
        gender_of = dict(zip(user_ids, user_info['is_male'].tolist()))

        def blend(rating_similarity, raw_a, raw_b):
            # Ages are z-scores; dividing by 3 is a heuristic scale, so the
            # age term can go negative when the gap exceeds 3 std devs.
            age_similarity = 1 - abs(age_of[raw_a] - age_of[raw_b]) / 3
            gender_similarity = 1 if gender_of[raw_a] == gender_of[raw_b] else 0
            # Combine similarities (adjust weights as needed)
            return 0.5 * rating_similarity + 0.2 * age_similarity + 0.3 * gender_similarity
    else:  # Item based
        n_entities = trainset.n_items
        ratings_by_entity = trainset.ir
        to_raw_id = trainset.to_raw_iid

        year_of = dict(zip(item_info['item_id'].tolist(),
                           item_info['release_year_scaled'].tolist()))

        def blend(rating_similarity, raw_a, raw_b):
            year_similarity = 1 - abs(year_of[raw_a] - year_of[raw_b]) / 3
            return 0.7 * rating_similarity + 0.3 * year_similarity

    # One {counterpart inner id: rating} dict per entity, so shared ratings
    # are found by id rather than by matching whole (id, rating) tuples or
    # by position in the rating list.
    rating_maps = [dict(ratings_by_entity[x]) for x in range(n_entities)]
    raw_ids = [int(to_raw_id(x)) for x in range(n_entities)]

    similarity_matrix = np.zeros((n_entities, n_entities))
    for a in range(n_entities):
        ratings_a = rating_maps[a]
        for b in range(a + 1, n_entities):
            ratings_b = rating_maps[b]
            common = ratings_a.keys() & ratings_b.keys()
            if not common:
                continue  # matrix entry stays at the 0.0 default
            shared = sorted(common)  # fixed order keeps the result deterministic
            rating_similarity = _pearson_or_zero(
                [ratings_a[key] for key in shared],
                [ratings_b[key] for key in shared],
            )
            similarity = blend(rating_similarity, raw_ids[a], raw_ids[b])
            similarity_matrix[a, b] = similarity
            similarity_matrix[b, a] = similarity  # Similarity matrix is symmetric
    return similarity_matrix
# Define a custom KNN algorithm that uses the custom similarity function
class CustomKNN(KNNBaseline):
    """
    Custom KNN algorithm that incorporates user and item features.

    The similarity matrix comes from ``custom_similarity`` rather than from
    the measure named in ``sim_options['name']``; only
    ``sim_options['user_based']`` is consulted by this class.
    """

    def __init__(self, k=40, min_support=1, sim_options=None, **kwargs):
        """
        Args:
            k (int): Maximum number of neighbours used for one estimate.
            min_support (int): Forwarded unchanged to ``KNNBaseline``.
            sim_options (dict | None): Similarity options; ``None`` means an
                empty dict.
            **kwargs: Forwarded unchanged to ``KNNBaseline``.
        """
        # Copy so neither a shared default nor the caller's dict is mutated
        # by the base class filling in default options.
        sim_options = {} if sim_options is None else dict(sim_options)
        super().__init__(k=k, min_support=min_support, sim_options=sim_options, **kwargs)

    def fit(self, trainset):
        """
        Fits the model to the training data.

        Args:
            trainset (Trainset): The training data.

        Returns:
            CustomKNN: ``self``.
        """
        # Base bookkeeping: stores the trainset and resets the cached
        # baselines that compute_baselines() consults.
        AlgoBase.fit(self, trainset)
        self.n_users = trainset.n_users
        self.n_items = trainset.n_items
        self.user_based = self.sim_options.get('user_based', True)

        # Call the custom similarity function
        self.sim = custom_similarity(trainset, user_based=self.user_based)

        # Compute baselines. estimate() below centres on the global mean
        # only; the call is kept so baseline state exists as before.
        self.compute_baselines()

        return self

    def estimate(self, u, i):
        """
        Estimates the rating for a given user-item pair.

        The estimate is the global mean plus the similarity-weighted average
        deviation from the global mean over the (at most) ``k`` most similar
        neighbours that actually rated the target, using positive
        similarities only. Falls back to the global mean when no such
        neighbour exists.

        Args:
            u: Inner user id, as handed over by ``AlgoBase.predict``.
            i: Inner item id, as handed over by ``AlgoBase.predict``.

        Returns:
            float: The estimated rating.

        Raises:
            PredictionImpossible: If the user or the item is not in the trainset.
        """
        trainset = self.trainset
        if not (trainset.knows_user(u) and trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        global_mean = trainset.global_mean

        # u and i are already inner ids, so they index self.sim directly.
        if self.user_based:
            # Neighbours are the users who rated item i.
            target, candidates = u, trainset.ir[i]
        else:  # Item based
            # Neighbours are the items rated by user u.
            target, candidates = i, trainset.ur[u]

        scored = (
            (self.sim[target, other], rating)
            for other, rating in candidates
            if other != target
        )
        k_nearest = heapq.nlargest(self.k, scored, key=lambda pair: pair[0])

        sum_sim = 0.0
        weighted_sum = 0.0
        for sim, rating in k_nearest:
            if sim > 0:  # non-positive similarities carry no usable weight
                sum_sim += sim
                weighted_sum += sim * (rating - global_mean)

        if sum_sim == 0:
            return global_mean
        return global_mean + weighted_sum / sum_sim
# Wrap the ratings table in a Surprise Dataset (ratings are on a 1-5 scale).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# Hyper-parameter grid explored by the search.
# NOTE(review): CustomKNN.fit only reads sim_options['user_based'], so the
# 'name' values presumably do not change the model -- confirm before trimming.
param_grid = dict(
    k=list(range(20, 61, 10)),
    sim_options=dict(
        name=['cosine', 'MSD'],
        user_based=[True, False],
        min_support=[1],
    ),
)

# 5-fold cross-validated grid search over CustomKNN, scored by MAE.
gs = GridSearchCV(CustomKNN, param_grid, measures=['mae'], cv=5, return_train_measures=True)

# Run the search.
gs.fit(data)



Training with parameters: {'batch_size': 64, 'embedding_size': 30, 'epochs': 10, 'learning_rate': 0.001}
Fold 1/3


InvalidArgumentError: Graph execution error:

Detected at node 'model/item_embedding/embedding_lookup' defined at (most recent call last):
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\IPython\core\interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\IPython\core\interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\IPython\core\interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\IPython\core\interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\ningn\AppData\Local\Temp\ipykernel_30328\292251384.py", line 186, in <module>
      results = run_grid_search_cv(user_ids, item_ids, user_features, item_features, ratings, grid, num_folds=3)
    File "C:\Users\ningn\AppData\Local\Temp\ipykernel_30328\292251384.py", line 161, in run_grid_search_cv
      history = model.fit(
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\functional.py", line 596, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ningn\anaconda3\envs\opencv\lib\site-packages\keras\layers\core\embedding.py", line 199, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'model/item_embedding/embedding_lookup'
indices[28,0] = 1665 is not in [0, 1663)
	 [[{{node model/item_embedding/embedding_lookup}}]] [Op:__inference_train_function_4144]