### Data collection

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import random


import scipy
import math
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split 
from scipy.sparse.linalg import svds 
from sklearn import preprocessing 

import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Directory (relative to the notebook) that holds the CSV inputs.
path = './Data/'

# Read the three input tables: book metadata, user ratings, and user profiles.
df_books = pd.read_csv(path + 'filtered_books.csv')
df_ratings = pd.read_csv(path + 'Ratings.csv')
df_users = pd.read_csv(path + 'Users.csv')

  df_books=pd.read_csv(path +'filtered_books.csv')


In [3]:
# Quick size check of the raw ratings table: (rows, columns).
df_ratings.shape

(1149780, 3)

In [4]:
# Summary statistics for the users table. The recorded output shows Age
# populated for only 168,096 of 278,858 users and ranging from 0 to 244,
# which suggests the Age column has not been cleaned at this point.
df_users.describe()



Unnamed: 0,User-ID,Age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


### Preprocessing

In [5]:
# Join users to their ratings on 'User-ID', then attach book metadata on
# 'ISBN'. Both joins use the default merge strategy.
merged_df = (
    df_users
    .merge(df_ratings, on='User-ID')
    .merge(df_books, on='ISBN')
)



In [6]:
# Number of ratings each ISBN received.
book_review_counts = df_ratings['ISBN'].value_counts()

# Keep only ISBNs that were rated at least 20 times.
popular_books = book_review_counts.loc[book_review_counts.ge(20)].index

# Restrict the ratings table to those popular books.
filtered_ratings = df_ratings.loc[df_ratings['ISBN'].isin(popular_books)]

# Optional export of the filtered ratings (left disabled):
#filtered_ratings.to_csv('data/filtered_ratings.csv', index=False)


## Combined Recommendation

In [7]:

from content_based_file import get_content_based_recommendations_by_user
from user_based_file import get_user_based_recommendations_by_user
from nlp_based_file import get_nlp_recommendations_by_user_program



# Inputs for content-based filtering.
#
# Step 1: keep books with at least 20 ratings. This repeats the filter from the
# preprocessing cell, so this cell does not depend on it having been run.
book_review_counts = df_ratings['ISBN'].value_counts()
popular_books = book_review_counts.loc[book_review_counts.ge(20)].index
filtered_ratings = df_ratings.loc[df_ratings['ISBN'].isin(popular_books)]

# Step 2: among those ratings, keep users who rated at least 5 of the
# remaining books.
user_rating_counts = filtered_ratings['User-ID'].value_counts()
active_users = user_rating_counts.loc[user_rating_counts.ge(5)].index
filtered_ratings = filtered_ratings.loc[filtered_ratings['User-ID'].isin(active_users)]

# Step 3: user x book rating matrix; books a user did not rate are filled with 0.
user_item_matrix = (
    filtered_ratings
    .pivot(index='User-ID', columns='ISBN', values='Book-Rating')
    .fillna(0)
)

# Step 4: sparse float32 representation of the rating matrix.
sparse_matrix = csr_matrix(user_item_matrix.values).astype('float32')

# Step 5: item-item cosine similarity (transpose so that rows are books),
# labelled on both axes with the ISBNs from the rating matrix.
item_similarity = cosine_similarity(sparse_matrix.T)
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df_books=pd.read_csv(path +'filtered_books.csv')


ISBN
006109918X    1.772117
0060928336    1.716283
0553289063    1.666379
0590471570    1.640632
0140314202    1.602477
0451410742    1.593712
0061000043    1.564670
0312988249    1.557247
880781000X    1.547350
1550544683    1.547350
8804342838    1.547350
3492225926    1.547350
2070362388    1.547350
3250600555    1.547350
1844262553    1.547350
8497593588    1.547350
207038165X    1.547350
0385333404    1.452531
0743474325    1.447388
0425154637    1.431060
dtype: float64
Cosine similarity matrix loaded from 'cosine_similarity_matrix.npy'
6803              Sense of Evil
1824                     Chains
6988                   Ricochet
9839             One for Sorrow
5568     The Portrait of a Lady
1300                   The Heir
9635           Keeping the Moon
4905                       Lies
11209                Wide Awake
11135       The Collected Poems
Name: Book-Title, dtype: object


In [8]:
# Relative weight of each recommender in the hybrid score
# (user-based, content-based, NLP-based).
user_weight, content_weight, nlp_weight = 0.3, 0.35, 0.35

# Number of candidates requested from the user-based and content-based recommenders.
top_n_recommendations = 50

# User for whom the example recommendations below are generated.
example_user_id = 165



def scale_scores(scores, scale_to=10):
    """Rescale scores so that the largest one equals ``scale_to``.

    Args:
        scores: Object supporting ``.max()``, division and multiplication
            (the callers in this notebook pass a DataFrame column).
        scale_to: Value the maximum score is mapped to. Defaults to 10.

    Returns:
        ``scores / max * scale_to`` when the maximum is strictly positive;
        otherwise ``scores`` is returned unchanged, which avoids dividing
        by zero.
    """
    peak = scores.max()
    # Guard: a non-positive (or NaN) maximum cannot be used as a divisor.
    if not peak > 0:
        return scores
    normalised = scores / peak
    return normalised * scale_to



# User-based recommendations
def get_user_based_recommendations(user_id, n=top_n_recommendations):
    """Wrap the user-based recommender's result in a two-column DataFrame.

    Args:
        user_id: User to recommend for.
        n: Number of recommendations requested from
            ``get_user_based_recommendations_by_user``.

    Returns:
        DataFrame with columns ``ISBN`` (taken from the result's ``.index``)
        and ``user_score`` (taken from its ``.values``). The helper is
        presumably returning a Series of scores indexed by ISBN; confirm
        against ``user_based_file``.
    """
    scored_items = get_user_based_recommendations_by_user(user_id, n)
    frame_columns = {
        'ISBN': scored_items.index,
        'user_score': scored_items.values,
    }
    return pd.DataFrame(frame_columns)

# Content-based recommendations
def get_content_based_recommendations(user_id, n=top_n_recommendations):
    """Wrap the content-based recommender's result in a two-column DataFrame.

    Args:
        user_id: User to recommend for.
        n: Number of recommendations requested from
            ``get_content_based_recommendations_by_user``.

    Returns:
        DataFrame with columns ``ISBN`` (taken from the result's ``.index``)
        and ``content_score`` (taken from its ``.values``). The helper is
        presumably returning a Series of scores indexed by ISBN; confirm
        against ``content_based_file``.
    """
    scored_items = get_content_based_recommendations_by_user(user_id, n)
    frame_columns = {
        'ISBN': scored_items.index,
        'content_score': scored_items.values,
    }
    return pd.DataFrame(frame_columns)
    
    

    
def get_nlp_recommendations(user_id):
    """Build a rank-scored DataFrame from the NLP recommender's output.

    The number of recommendations requested is ten times the number of
    distinct books counted by ``books_read_by_user_with_description``.

    Args:
        user_id: User to recommend for.

    Returns:
        DataFrame with columns ``ISBN`` and ``nlp_score``. The NLP helper's
        result carries no scores here, so positions are converted to
        descending integers: the first item gets ``len(result)`` and the
        last item gets 1.
    """
    read_count = books_read_by_user_with_description(user_id)
    ranked_isbns = get_nlp_recommendations_by_user_program(user_id, read_count * 10)
    # Position-based score: earlier entries receive higher values.
    rank_scores = range(len(ranked_isbns), 0, -1)
    return pd.DataFrame({'ISBN': ranked_isbns, 'nlp_score': rank_scores})
    

    
def books_read_by_user_with_description(user_id):
    """Count the distinct ISBNs the user has rated in ``filtered_ratings``.

    Reads the notebook-level ``filtered_ratings`` DataFrame.

    Args:
        user_id: Value matched against the ``User-ID`` column.

    Returns:
        Number of unique ISBN values among that user's rows (0 if none).
    """
    belongs_to_user = filtered_ratings['User-ID'] == user_id
    distinct_isbns = filtered_ratings[belongs_to_user]['ISBN'].unique()
    return len(distinct_isbns)

    
    
   

# Combine the three methods
def combine_recommendations(user_id, user_weight, content_weight, nlp_weight,
                            n=top_n_recommendations, max_results=40):
    """Blend user-based, content-based and NLP recommendations into one ranking.

    Each recommender's scores are rescaled with ``scale_scores`` so that its
    best item scores 10. The three result sets are outer-joined on ``ISBN``,
    and a weighted sum of the three scores gives the hybrid score.

    Args:
        user_id: User to recommend for.
        user_weight: Weight applied to the scaled user-based score.
        content_weight: Weight applied to the scaled content-based score.
        nlp_weight: Weight applied to the scaled NLP score.
        n: Number of candidates requested from the user-based and
            content-based recommenders. The NLP recommender chooses its own
            candidate count.
        max_results: Maximum number of rows returned. Defaults to 40, the
            value that was previously hard-coded.

    Returns:
        DataFrame with columns ``ISBN``, ``user_score``, ``content_score``,
        ``nlp_score`` and ``hybrid_score``, sorted by ``hybrid_score`` in
        descending order and truncated to ``max_results`` rows.
    """
    # Candidates from each of the three recommenders.
    user_based_recs = get_user_based_recommendations(user_id, n)
    content_based_recs = get_content_based_recommendations(user_id, n)
    nlp_based_recs = get_nlp_recommendations(user_id)

    # Put every recommender on the same 0-10 scale before weighting.
    user_based_recs['user_score'] = scale_scores(user_based_recs['user_score'])
    content_based_recs['content_score'] = scale_scores(content_based_recs['content_score'])
    nlp_based_recs['nlp_score'] = scale_scores(nlp_based_recs['nlp_score'])

    # Outer joins keep a book even if only one recommender proposed it.
    combined = pd.merge(user_based_recs, content_based_recs, on='ISBN', how='outer')
    combined = pd.merge(combined, nlp_based_recs, on='ISBN', how='outer')

    # A book missing from a recommender's list contributes 0 for that recommender.
    score_columns = ['user_score', 'content_score', 'nlp_score']
    combined[score_columns] = combined[score_columns].fillna(0)

    # Weighted sum of the three scaled scores.
    combined['hybrid_score'] = (user_weight * combined['user_score'] +
                                content_weight * combined['content_score'] +
                                nlp_weight * combined['nlp_score'])

    # Best hybrid score first.
    combined = combined.sort_values(by='hybrid_score', ascending=False)

    return combined.head(max_results)

# Example usage


# Run the hybrid recommender for the example user with the configured weights.
final_recommendations = combine_recommendations(
    user_id=example_user_id,
    user_weight=user_weight,
    content_weight=content_weight,
    nlp_weight=nlp_weight,
    n=top_n_recommendations,
)

print(final_recommendations)


similar users:  [165, 136733, 50711, 275610, 96354, 122235, 218836, 207727, 130793, 212009, 33036, 144348, 101299, 78545, 222941, 113618, 37790, 21870, 243607, 191913, 42759, 81854, 486, 28709, 257804, 112818, 96589, 216336, 248850, 112598, 194735, 66591, 250196, 93421, 110493, 189891, 6501, 147687, 82901, 65653, 231694, 179591, 242878, 110165, 80036, 143807, 136104, 266697, 46197, 185468, 55421]


  df_books=pd.read_csv(path +'filtered_books.csv')


dict_items([('0768322413', 50), ('0345362721', 49), ('0679408835', 48), ('0446517909', 47), ('0590257889', 46), ('0553802542', 45), ('0802132898', 44), ('0385491050', 43), ('0140042393', 42), ('015610685X', 41), ('0786003677', 50), ('0671870602', 49), ('0671685112', 48), ('0380792923', 47), ('1572460733', 46), ('0750925493', 45), ('0714530387', 44), ('0743467175', 43), ('1582430438', 42), ('0743418700', 41)])
           ISBN  user_score  content_score  nlp_score  hybrid_score
0    0060928336   10.000000       9.684926        0.0      6.389724
7    006109918X    4.021708      10.000000        0.0      4.706512
6    0061000043    4.021708       8.829383        0.0      4.296796
96   0768322413    0.000000       0.000000       10.0      3.500000
10   0061097853    3.518994       6.978607        0.0      3.498211
97   0786003677    0.000000       0.000000        9.5      3.325000
50   0553289063    0.000000       9.403322        0.0      3.291163
51   0590471570    0.000000       9.258033 