In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import scipy as sp
from concurrent.futures import ThreadPoolExecutor

In [4]:
books = pd.read_csv('/kaggle/input/books-csv/Books.csv', delimiter=';', low_memory=False)
ratings = pd.read_csv('/kaggle/input/books-csv/Ratings.csv', delimiter=';')
users = pd.read_csv('/kaggle/input/books-csv/Users.csv', delimiter=';')

  users = pd.read_csv('/kaggle/input/books-csv/Users.csv', delimiter=';')


In [6]:
df_prep_step_1 = pd.merge(books, ratings.query("`Rating` > 0"), on='ISBN', how='inner')

# join users data
df_prep_step_2 = pd.merge(df_prep_step_1, users, on='User-ID', how='inner')

In [7]:
df_prep = df_prep_step_2.drop(['Publisher'], axis=1)
df_isbn = df_prep.drop_duplicates()

In [8]:
df_isbn['Original_NaN'] = df_isbn['Age'].isna()

# Convert 'Age' to numeric, turning non-numeric values into NaN
df_isbn['Age'] = pd.to_numeric(df_isbn['Age'], errors='coerce')

# Drop rows where 'Age' is NaN and were not originally NaN
df_isbn = df_isbn[~(users['Age'].isna() & ~df_isbn['Original_NaN'])]

# Drop the 'Original_NaN' column as it's no longer needed
df_isbn.drop(columns=['Original_NaN'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isbn['Original_NaN'] = df_isbn['Age'].isna()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isbn['Age'] = pd.to_numeric(df_isbn['Age'], errors='coerce')
  df_isbn = df_isbn[~(users['Age'].isna() & ~df_isbn['Original_NaN'])]


In [9]:
df_isbn['User-ID'] = pd.to_numeric(df_isbn['User-ID'], errors='coerce')

# Drop rows where 'Age' is NaN and were not originally NaN
df_isbn = df_isbn[~(df_isbn['User-ID'].isna())]

In [10]:
age_outliers = df_isbn.query("Age > 100 or Age < 6")

user_outliers = age_outliers["User-ID"].to_list()

In [11]:
# exclude age outliers
df_isbn = df_isbn[~df_isbn["User-ID"].isin(user_outliers)]

In [12]:
df_isbn.drop_duplicates(inplace=True)

In [12]:
y = df_isbn['Rating']
X = df_isbn.drop('Rating', axis=1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [86]:
import heapq

In [78]:
user_id_mapping = {uid: i for i, uid in enumerate(df_isbn['User-ID'].unique())}

In [79]:
from sklearn.model_selection import train_test_split
df_isbn['User-ID'] = df_isbn['User-ID'].apply(user_id_mapping.get)
train_ratings, test_ratings = train_test_split(df_isbn[['User-ID', 'ISBN', 'Rating']], test_size=0.3, random_state=42)

In [81]:
from numpy.linalg import solve


class ALSRecommender:
    def __init__(self, num_users, num_items, num_factors=10, regularization=0.1, num_iterations=10):
        self.num_factors = num_factors
        self.regularization = regularization
        self.num_iterations = num_iterations
        self.user_factors = None
        self.item_factors = None
        self.isbn_to_index = None
        self.num_users = num_users
        self.num_items = num_items

    def fit(self, user_item_matrix, isbn_to_index):
        self.isbn_to_index = isbn_to_index
        self.user_factors = np.random.rand(self.num_users, self.num_factors)
        self.item_factors = np.random.rand(self.num_items, self.num_factors)
        
        for iteration in range(self.num_iterations):
            self.user_factors = self.als_step(user_item_matrix, self.item_factors, self.user_factors)
            self.item_factors = self.als_step(user_item_matrix.T, self.user_factors, self.item_factors)
            print(f"Iteration {iteration + 1} complete.")

    def als_step(self, R, fixed_factors, update_factors):
        num_factors = update_factors.shape[1]
        fixed_T_fixed = fixed_factors.T @ fixed_factors
        lambda_eye = self.regularization * np.eye(num_factors)
        
        for u in tqdm(range(update_factors.shape[0])):
            user_ratings = R[u].toarray().flatten()
            rated_indices = np.where(user_ratings != 0)[0]
            fixed_rated = fixed_factors[rated_indices]
            
            Ai = fixed_T_fixed + fixed_rated.T @ fixed_rated + lambda_eye
            Vi = fixed_rated.T @ user_ratings[rated_indices]
            
            update_factors[u] = solve(Ai, Vi)
        return update_factors

    def predict(self, user_id, isbn):
        item_idx = self.isbn_to_index.get(isbn)
        if item_idx is None:
            return None  
        return self.user_factors[user_id] @ self.item_factors[item_idx].T

    def recommend(self, user_id, num_recommendations=5, isbn_to_index):
        scores = self.user_factors[user_id] @ self.item_factors.T
        best_items_indices = np.argsort(scores)[-num_recommendations:][::-1]
        index_to_isbn = {v: k for k, v in self.isbn_to_index.items()}
        best_isbns = [index_to_isbn[idx] for idx in best_items_indices]
        return best_isbns

In [82]:
def create_isbn_mapping(data):
    isbn_list = data['ISBN'].unique()
    isbn_to_index = {isbn: idx for idx, isbn in enumerate(isbn_list)}
    return isbn_to_index

In [109]:
def user_item_normalized(books, ratings):
        books = books.reset_index() # add index as a column
        isbn_mapping = {category: idx for idx, category in enumerate(books['ISBN'].unique())}
        user_id_mapping = {uid: i for i, uid in enumerate(df_isbn['User-ID'].unique())}
        
        ratings = ratings.copy()
        ratings['ISBN_i'] = ratings['ISBN'].map(isbn_mapping) # map ISBN to index
        ratings['User-ID_i'] = ratings['User-ID'].map(user_id_mapping).astype(np.int32)
        ratings.dropna(subset=['ISBN_i'], inplace=True) # drop rows with NaN ISBN_i
        ratings['ISBN_i'] = ratings['ISBN_i'].astype(np.int32)
        
        # Create a sparse user-item matrix
        user_item_matrix = csr_matrix((ratings['Rating'], (ratings['User-ID_i'], ratings['ISBN_i'])), dtype=np.float64)

        # Normalize the user-item matrix
        normalized = user_item_matrix.copy()
        means = np.array([normalized[i].data.mean() for i in range(normalized.shape[0])])
        normalized.data -= np.repeat(means, np.diff(normalized.indptr))
        
        return means, normalized, books, ratings

In [110]:
from scipy.sparse import csr_matrix

means, normalized_matrix, updated_books, updated_ratings = user_item_normalized(df_isbn['ISBN'], df_isbn[['User-ID', 'ISBN', 'Rating']])

In [106]:
train_ratings, test_ratings = train_test_split(df_isbn[['User-ID', 'ISBN', 'Rating']], test_size=0.3, random_state=42)

In [111]:
normalized_matrix.shape

(56688, 132632)

In [108]:
df_isbn['User-ID'].max(), len(df_isbn['ISBN'].unique())

(56687, 132632)

In [112]:
isbn_to_index = create_isbn_mapping(train_ratings)

als1 = ALSRecommender(df_isbn['User-ID'].max(), len(df_isbn['ISBN'].unique()))

als1.fit(normalized_matrix, isbn_to_index)

100%|██████████| 56687/56687 [00:40<00:00, 1390.96it/s]
100%|██████████| 132632/132632 [08:01<00:00, 275.59it/s]


Iteration 1 complete.


  6%|▋         | 3625/56687 [00:02<00:40, 1302.27it/s]


KeyboardInterrupt: 

In [113]:
y_predicted = []
test_ratings = test_ratings[test_ratings['User-ID'].isin(train_ratings['User-ID'].unique())]
for i, test_rating in test_ratings.iterrows():
    y_predicted.append(als1.predict(test_rating['User-ID'], test_rating['ISBN']))

In [130]:
test_ratings['User-ID'].max()

56645

In [148]:
def recommend(als, user_id, num_recommendations, isbn_to_index):
        scores = als.user_factors[user_id] @ als.item_factors.T
        best_items_indices = np.argsort(scores)[-num_recommendations:][::-1]
        index_to_isbn = {v: k for k, v in als.isbn_to_index.items()}
        best_isbns = [index_to_isbn.get(idx) for idx in best_items_indices]
        return best_isbns
    
recommend(als1, 12000, 5, isbn_to_index)  

['0553225286', '0865471185', '0451408977', '0449005666', '0451199685']

In [155]:
def eval(gt_ratings, predictions, k=10):
     
        all_relevances = []
        all_recommended = []
        for user_id in tqdm(gt_ratings['User-ID'].unique()):
            user_ratings = gt_ratings[gt_ratings['User-ID'] == user_id]
            user_relevances = user_ratings.sort_values(by='Rating', ascending=False)['Rating'].tolist()
            all_relevances.append(user_relevances)

            user_recommended = recommend(als1, user_id, k, isbn_to_index)
            all_recommended.append(user_recommended)

        ndcg_val = np.mean([ndcg(relevances, k) for relevances in all_relevances])
        precision_at_k_val = np.mean([precision_at_k(recommended, relevances, k) for recommended, relevances in zip(all_recommended, all_relevances)])
        recall_at_k_val = np.mean([recall_at_k(recommended, relevances, k) for recommended, relevances in zip(all_recommended, all_relevances)])
        average_precision_val = np.mean([average_precision(recommended, relevances) for recommended, relevances in zip(all_recommended, all_relevances)])

        return {
            'ndcg': ndcg_val,
            'precision_at_k': precision_at_k_val,
            'recall_at_k': recall_at_k_val,
            'average_precision': average_precision_val
        }


In [156]:
results = eval(gt_ratings=test_ratings, predictions=y_predicted, k=5)

100%|██████████| 16600/16600 [13:15<00:00, 20.87it/s]


In [157]:
print(results)

{'ndcg': 1.0, 'precision_at_k': 0.0, 'recall_at_k': 0.0, 'average_precision': 0.0}
