In [None]:
# CP421 Group Project (Group 3)
# -----------------------------------------------------------------
# File: cp421_group_project.ipynb
# Author: Yvonne Itangishaka, Mariam Lom, Hoi Hin Ng, Melissa Pinto
# Due Date: Dec 6th, 2023
# -----------------------------------------------------------------

## Imports


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import seaborn as sns
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
# Fetch only the NLTK resources that back the names imported above
# (word_tokenize, stopwords, WordNetLemmatizer) instead of the whole 'all'
# collection, so a fresh-kernel run does not download every NLTK corpus.
# 'punkt_tab' and 'omw-1.4' are only needed by some NLTK releases; resource
# ids unknown to the installed release are expected to be reported and
# skipped by nltk.download rather than raising -- TODO confirm.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)


## Load Dataset

In [None]:
# Load the Book Recommendation dataset from Kaggle
# Year-Of-Publication is read as text: the visualization cell later filters
# out non-numeric years, so the column cannot be assumed to be all integers.
books_data = pd.read_csv('Books.csv', dtype={'Year-Of-Publication': str})
ratings_data = pd.read_csv('Ratings.csv')
users_data = pd.read_csv('Users.csv')

print("Books Data:")
display(books_data.head())
print("\nRatings Data:")
display(ratings_data.head())
print("\nUsers Data:")
display(users_data.head())

# Concatenate relevant columns into 'Books-Data'.
# Missing values are replaced by '' first: with plain `+`, a single NaN in any
# component column would turn the whole concatenated value into NaN.
# Fields are joined with a space so neighbouring values do not fuse together.
books_text_columns = [
    "ISBN",
    "Book-Title",
    "Book-Author",
    "Year-Of-Publication",
    "Publisher",
    "Image-URL-S",
    "Image-URL-M",
    "Image-URL-L",
]
books_text = books_data[books_text_columns].fillna("").astype(str)
books_data["Books-Data"] = books_text.iloc[:, 0].str.cat(books_text.iloc[:, 1:], sep=" ")

In [None]:
# Load the three CSV files from the /content directory.
# Year-Of-Publication is forced to text so the column has one consistent dtype.
books_data = pd.read_csv('/content/Books.csv', dtype={'Year-Of-Publication': str})
ratings_data = pd.read_csv('/content/Ratings.csv')
users_data = pd.read_csv('/content/Users.csv')

# Concatenate relevant columns into 'Books-Data' column.
# Missing values are replaced by '' first: with plain `+`, a single NaN in any
# component column would turn the whole concatenated value into NaN.
# Fields are joined with a space so neighbouring values do not fuse together.
books_text_columns = [
    "ISBN",
    "Book-Title",
    "Book-Author",
    "Year-Of-Publication",
    "Publisher",
    "Image-URL-S",
    "Image-URL-M",
    "Image-URL-L",
]
books_text = books_data[books_text_columns].fillna("").astype(str)
books_data["Books-Data"] = books_text.iloc[:, 0].str.cat(books_text.iloc[:, 1:], sep=" ")

print("Books Data:")
display(books_data.head())
print("\nRatings Data:")
display(ratings_data.head())
print("\nUsers Data:")
display(users_data.head())


## Understanding Data

In [None]:
# Drop the cover-image URL columns because they are not needed.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
books_data = books_data.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')

# Check for missing values (per-column count of nulls in each table)
missing_books = books_data.isnull().sum()
print("missing values in books:\n",missing_books)
print("\n")
missing_ratings = ratings_data.isnull().sum()
print("missing values in ratings:\n",missing_ratings)
print("\n")
missing_users = users_data.isnull().sum()
print("missing values in users:\n",missing_users)
print("\n")

# Check for duplicates in data set (count of fully identical rows in each table)
print("duplicates in books:\n",books_data.duplicated().sum())
print("\n")
ratings_books = ratings_data.duplicated().sum()
print("duplicates in ratings:\n",ratings_books)
print("\n")
duplicate_users = users_data.duplicated().sum()
print("duplicates in users:\n",duplicate_users)

## Merge All Tables

In [None]:
# Merge tables: attach ratings to books via ISBN, then attach the rating
# user's profile via User-ID. Both joins are inner joins, so only rows with a
# match on both sides survive each step.
first_merged_data = books_data.merge(ratings_data, how='inner', on='ISBN')
merged_data = first_merged_data.merge(users_data, how='inner', on='User-ID')

display(merged_data.head())

## Data Visualization

In [None]:
# Visualizations, check for any outliers!

# Publication Year: keep only rows whose year is purely numeric so that the
# column can be cast to int for plotting.
valid_years = merged_data['Year-Of-Publication'].astype(str).str.isnumeric()
filtered_data = merged_data[valid_years]
plt.figure(figsize=(10, 6))
sns.kdeplot(filtered_data['Year-Of-Publication'].astype(int), color='skyblue', fill=True)
plt.title('Density Plot of Books Based on Publication Year')
plt.xlabel('Publication Year')
plt.ylabel('Density')
plt.show()

# Count Plot of Books for Top 20 Publication Years
# (reuses filtered_data from the density plot instead of recomputing it)
top_years = filtered_data['Year-Of-Publication'].value_counts().nlargest(20).index
filtered_data_top_years = filtered_data[filtered_data['Year-Of-Publication'].isin(top_years)]
plt.figure(figsize=(12, 6))
sns.countplot(x='Year-Of-Publication', data=filtered_data_top_years, palette='viridis')
plt.title('Count Plot of Books for Top 20 Publication Years')
plt.xlabel('Publication Year')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()

# Density plot of Average book rating (one mean per ISBN)
average_ratings = merged_data.groupby('ISBN')['Book-Rating'].mean()
plt.figure(figsize=(10, 6))
sns.kdeplot(average_ratings, color='skyblue', fill=True)
plt.title('Density Plot of Average Book Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Density')
plt.show()

# Distribution of Book Ratings, with the overall mean marked as a dashed line
plt.figure(figsize=(10, 6))
plt.hist(merged_data['Book-Rating'], bins=5, edgecolor='black', color='skyblue', alpha=0.7)
overall_average_rating = merged_data['Book-Rating'].mean()
plt.axvline(x=overall_average_rating, color='red', linestyle='dashed', linewidth=2, label=f'Overall Avg: {overall_average_rating:.2f}')
plt.title('Distribution of Book Ratings')
plt.xlabel('Book Rating')
plt.ylabel('Count')
plt.legend()
plt.show()

# Distribution of User rating counts (number of merged rows per User-ID)
user_ratings_count = merged_data['User-ID'].value_counts()
plt.figure(figsize=(10, 6))
plt.hist(user_ratings_count, bins=50, edgecolor='black')
plt.title('Distribution of User Ratings')
plt.xlabel('Number of Ratings')
plt.ylabel('Number of Users')
plt.xlim(0, 2000)
plt.show()

# Most popular locations: keep only the last comma-separated component of
# Location. The vectorized .str accessor leaves missing values as NaN instead
# of raising, and .str.strip() removes the blank that follows the comma so the
# same place is not counted under two different spellings.
merged_data['Location'] = merged_data['Location'].str.split(',').str[-1].str.strip()
location_counts = merged_data['Location'].value_counts().head(10)
plt.figure(figsize=(10, 6))
location_counts.plot(kind='bar')
plt.title('Top 10 Locations')
plt.xlabel('Location')
plt.ylabel('Count')
plt.show()

## Setting Thresholds

In [None]:
# Keep only books that received at least MIN_RATINGS_PER_BOOK ratings, then
# only users who (within what is left) rated at least MIN_RATINGS_PER_USER
# books. This removes sparsely-rated outliers.
MIN_RATINGS_PER_BOOK = 50
MIN_RATINGS_PER_USER = 50

# transform('count') yields, for every row, the number of non-null ratings in
# that row's group; comparing it to the threshold selects the same rows as a
# per-group filter but without calling a Python lambda once per group.
ratings_per_book = merged_data.groupby('ISBN')['Book-Rating'].transform('count')
merged_data = merged_data[ratings_per_book >= MIN_RATINGS_PER_BOOK]

# The user threshold is applied to the already book-filtered rows.
ratings_per_user = merged_data.groupby('User-ID')['Book-Rating'].transform('count')
merged_data = merged_data[ratings_per_user >= MIN_RATINGS_PER_USER].copy()

# Display the merged_data after applying the threshold
print("Merged data after applying thresholds:")
display(merged_data.head())

print(merged_data.shape)


## KNN Implementation 

In [None]:
# Build an ISBN x User-ID rating table (unrated cells become 0) and fit a
# brute-force cosine nearest-neighbour index on its sparse form.
book_user_mat = merged_data.pivot(index='ISBN', columns='User-ID', values='Book-Rating').fillna(0)
book_user_mat_sparse = csr_matrix(book_user_mat.values)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(book_user_mat_sparse)


target_book_isbn = '0452264464'  # Replace with the actual ISBN
n_similar_books = 5  # how many neighbouring books to list
target_book_index = book_user_mat.index.get_loc(target_book_isbn)

# The target book is part of the fitted data, so it comes back as one of its
# own neighbours. Ask for one extra result and drop the target by row position
# so that n_similar_books *other* books are listed.
distances, indices = model_knn.kneighbors(
    [book_user_mat.iloc[target_book_index].values],
    n_neighbors=n_similar_books + 1,
)
neighbor_pairs = [
    (distance, index)
    for distance, index in zip(distances.flatten(), indices.flatten())
    if index != target_book_index
][:n_similar_books]

print("Nearest Neighbors for Book with ISBN:", target_book_isbn)
for i, (distance, index) in enumerate(neighbor_pairs):
    neighbor_book_isbn = book_user_mat.index[index]
    print(f"{i + 1}. ISBN: {neighbor_book_isbn}, Distance: {distance}")

## Calculating Global Mean, User Mean, Item Mean

In [None]:


# Global mean: the average of every Book-Rating value in merged_data.
# NOTE(review): this is computed on the full table, before the train/test
# split made further down, and over every row including any 0 ratings --
# confirm that both are intended.
global_mean = merged_data.loc[:, 'Book-Rating'].mean()

# Test
print("Global Mean:", global_mean)



In [None]:
# Per-user mean rating: a Series of average Book-Rating indexed by User-ID.
user_means = merged_data['Book-Rating'].groupby(merged_data['User-ID']).mean()

# Test
user_id = 11676  # Replace with the desired user ID
# Use global mean if user ID is not found
if user_id in user_means.index:
    user_mean = user_means[user_id]
else:
    user_mean = global_mean
print(f"User {user_id} Mean Rating:", user_mean)


In [None]:
# Per-book mean rating: a Series of average Book-Rating indexed by ISBN.
item_means = merged_data['Book-Rating'].groupby(merged_data['ISBN']).mean()

# Test
isbn = '0440234743'  # Replace with the desired ISBN
# Use global mean if ISBN is not found
if isbn in item_means.index:
    item_mean = item_means[isbn]
else:
    item_mean = global_mean
print(f"Item {isbn} Mean Rating:", item_mean)


## Splitting data into Training/Testing sets

In [None]:
# Hold out 20% of the merged rows as a test set (fixed seed for repeatability).
trainset, testset = train_test_split(merged_data, random_state=42, test_size=0.2)
# Create user-item matrices for the training and testing sets:
# rows are User-ID, columns are ISBN, and cells without a rating are filled with 0.
trainset_matrix = pd.pivot_table(trainset, values='Book-Rating', index='User-ID', columns='ISBN', fill_value=0)
testset_matrix = pd.pivot_table(testset, values='Book-Rating', index='User-ID', columns='ISBN', fill_value=0)


## Collaborative Filtering


In [None]:
# Collaborative based filtering

class CollaborativeFilteringRecommendationSystem:
    """Memory-based collaborative filtering on a user x item rating table.

    Parameters
    ----------
    user_Item : pandas.DataFrame
        Rating table whose index holds user ids and whose columns hold item ids.
    global_mean : float
        Fallback rating used when a user or item has no entry in the mean tables.
    user_means, item_means : mapping with ``.get(key, default)``
        Mean rating per user id / per item id.

    On construction, two similarity matrices are computed as
    ``1 - correlation distance``: one between the rows (users) and one between
    the columns (items) of ``user_Item``. Their rows and columns follow the
    positional order of ``user_Item.index`` and ``user_Item.columns``.
    """

    def __init__(self, user_Item, global_mean, user_means, item_means):
        self.user_Item = user_Item
        self.global_mean = global_mean
        self.user_means = user_means
        self.item_means = item_means
        # Row i / column j of each matrix refers to position i / j of the
        # corresponding axis of user_Item, not to the id itself.
        self.userPearsonSim = 1 - pairwise_distances(user_Item, metric='correlation')
        self.itemPearsonSim = 1 - pairwise_distances(user_Item.T, metric='correlation')

    def userBased_predict(self, user_id, item_id):
        """Predict ``user_id``'s rating of ``item_id`` from item-item similarities.

        Despite the name, the prediction weights the user's own ratings of all
        items by each item's similarity to ``item_id``, adds the user's mean
        rating, and divides by (sum of absolute similarities + 1).

        Raises KeyError if ``user_id`` is not in the index or ``item_id`` is not
        a column of ``user_Item``.
        """
        user_ratings = self.user_Item.loc[user_id]
        item_pos = self.user_Item.columns.get_loc(item_id)
        ratingsSimValue = pd.Series(self.itemPearsonSim[item_pos], index=self.user_Item.columns)
        filter_ratingsSimValue = np.multiply(ratingsSimValue, user_ratings)
        user_mean = self.user_means.get(user_id, self.global_mean)
        itemUserRating_prediction = (filter_ratingsSimValue.sum() + user_mean) / (ratingsSimValue.abs().sum() + 1)
        return itemUserRating_prediction

    def itemBased_predict(self, user_id, item_id):
        """Predict ``user_id``'s rating of ``item_id`` from user-user similarities.

        Despite the name, the prediction weights every user's rating of
        ``item_id`` by that user's similarity to ``user_id``, adds the item's
        mean rating, and divides by (sum of absolute similarities + 1).

        Raises KeyError if ``user_id`` is not in the index or ``item_id`` is not
        a column of ``user_Item``.
        """
        # The similarity matrix is positional, so translate the user id into its
        # row position (an id such as 177458 is not a valid row number).
        user_pos = self.user_Item.index.get_loc(user_id)
        ratingsSimValue = pd.Series(self.userPearsonSim[user_pos], index=self.user_Item.index)
        # Ratings given to this item by every user: indexed by user id, so it
        # lines up element-by-element with the similarity row above.
        item_ratings = self.user_Item[item_id]
        filter_ratingsSimValue = np.multiply(ratingsSimValue, item_ratings)
        item_mean = self.item_means.get(item_id, self.global_mean)
        itemUserRating_prediction = (filter_ratingsSimValue.sum() + item_mean) / (ratingsSimValue.abs().sum() + 1)
        return itemUserRating_prediction

    def bookUser_Recommender(self, user_id, top_n=10):
        """Return the ``top_n`` highest-scoring ``(item_id, predicted_rating)`` pairs.

        Every column of ``user_Item`` is scored with ``userBased_predict`` and
        the result is sorted by predicted rating, highest first. Items the user
        has already rated are scored like any other and are not excluded.
        """
        itemRatings_prediction = {
            item_id: self.userBased_predict(user_id, item_id)
            for item_id in self.user_Item.columns
        }
        bestBook_recommendations = sorted(itemRatings_prediction.items(), key=lambda x: x[1], reverse=True)[:top_n]
        return bestBook_recommendations


# Sparse copy of the training user-item matrix.
user_item_matrix_sparse = csr_matrix(trainset_matrix.values)

# Build the recommendation system from the training matrix plus the
# precomputed global / per-user / per-item mean ratings.
recSystrain = CollaborativeFilteringRecommendationSystem(
    trainset_matrix,
    global_mean,
    user_means,
    item_means,
)

# Ask for the top recommendations of one user; each entry is an
# (ISBN, predicted rating) pair.
user_id = 177458
bestBook_recommendations = recSystrain.bookUser_Recommender(user_id)

# Keep only the ISBN of each pair
isbn_Recommend = [recommended_isbn for recommended_isbn, _score in bestBook_recommendations]

# Look the recommended ISBNs up in the book catalogue
book_info = books_data.set_index('ISBN')
recommended_books_info = book_info.loc[isbn_Recommend]

# Show title and publication year of the recommended books
display(recommended_books_info.loc[:, ['Book-Title', 'Year-Of-Publication']])

## CF-Metrics

In [None]:
# Get Recommendations: each entry is an (ISBN, predicted rating) pair
user_id = 177458
bestBook_recommendations = recSystrain.bookUser_Recommender(user_id)

# Split the pairs into ISBNs and predicted ratings. The recommender already
# returns the score produced by userBased_predict, so it is reused here rather
# than running the same prediction a second time.
isbn_Recommend = [isbn for isbn, _ in bestBook_recommendations]
itemRatings_prediction = [score for _, score in bestBook_recommendations]

# Actual ratings from the held-out matrix, in the same order as the
# predictions. A user or ISBN that does not occur in the test matrix is
# treated as "not rated" (0), matching the fill value used when the matrix was
# built, instead of raising KeyError.
if user_id in testset_matrix.index:
    user_test_ratings = testset_matrix.loc[user_id]
else:
    user_test_ratings = pd.Series(dtype=float)
actual_ratings = user_test_ratings.reindex(isbn_Recommend, fill_value=0).tolist()

# Calculate Evaluation Metrics.
# RMSE is derived from MSE with np.sqrt so the cell does not depend on the
# `squared` keyword, which newer scikit-learn releases no longer accept.
mse = mean_squared_error(actual_ratings, itemRatings_prediction)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actual_ratings, itemRatings_prediction)

print("Evaluation Metrics:")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


## Content-Based Filtering

In [None]:
# Keep a single row per distinct Book-Title (the first occurrence wins).
# NOTE(review): inplace=True mutates merged_data itself, so every cell executed
# after this one -- including a re-run of an earlier cell -- sees only one
# rating row per title. Confirm this is intended before re-running cells above.
merged_data.drop_duplicates(subset='Book-Title', keep='first', inplace=True)

class ContentBasedFiltering:
    """Content-based recommender using TF-IDF over book title and author text.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Must provide the columns 'ISBN', 'Book-Title', 'Book-Author' and
        'Year-Of-Publication'. Row i of the TF-IDF matrix corresponds to the
        i-th row (by position) of this frame.
    """

    def __init__(self, trainset):
        self.trainset = trainset
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self._create_tfidf_matrix()

    def _create_tfidf_matrix(self):
        """Fit the vectorizer on "<title> <author>" per row and return the TF-IDF matrix."""
        # Concatenate 'Book-Title' and 'Book-Author' for TF-IDF
        book_text_data = self.trainset['Book-Title'].astype(str) + ' ' + self.trainset['Book-Author'].astype(str)
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(book_text_data)
        return tfidf_matrix

    def recommend_similar_books(self, book_title, top_n=10):
        """Return up to ``top_n`` rows of the training frame most similar to ``book_title``.

        Similarity is the cosine similarity between the TF-IDF row of the first
        training row whose 'Book-Title' equals ``book_title`` and every other
        row. The queried row itself is never returned.

        Returns a DataFrame with the columns 'ISBN', 'Book-Title',
        'Book-Author' and 'Year-Of-Publication', most similar first.

        Raises IndexError if no training row has the requested title.
        """
        # The TF-IDF matrix is positional, so locate the title by row position.
        # The frame's index labels cannot be used for this: they need not run
        # 0..n-1 in row order (for example after shuffling or filtering).
        title_positions = np.flatnonzero((self.trainset['Book-Title'] == book_title).to_numpy())
        if title_positions.size == 0:
            raise IndexError(f"Book title {book_title!r} not found in the training data")
        book_pos = int(title_positions[0])

        similarity_scores = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix[book_pos]).ravel()
        # Highest similarity first; remove the queried row explicitly rather than
        # assuming it is ranked first (other rows can tie with it).
        ranked_positions = np.argsort(-similarity_scores, kind='stable')
        similar_books_indices = ranked_positions[ranked_positions != book_pos][:top_n]

        similar_books_info = self.trainset.iloc[similar_books_indices][["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]]
        return similar_books_info

# Split merged_data again (80/20, fixed seed); trainset and testset are rebound
# to the new split here.
trainset, testset = train_test_split(merged_data, random_state=42, test_size=0.2)

# Fit the content-based recommender on the training rows
content_based_system = ContentBasedFiltering(trainset)

# Ask for the books most similar to one specific title
book_title = "Wild Justice"
bestBook_recommendations = content_based_system.recommend_similar_books(book_title)

# Pull the ISBN column of the recommendations out as a plain list
isbn_Recommend = bestBook_recommendations['ISBN'].to_list()

# Look the recommended ISBNs up in the book catalogue
book_info = books_data.set_index('ISBN')
recommended_books_info = book_info.loc[isbn_Recommend]

# Show title and publication year of the recommended books
display(recommended_books_info.loc[:, ['Book-Title', 'Year-Of-Publication']])

## Hybrid Recommendation System and Metrics

In [None]:
class HybridRecommender:
    """Blend collaborative-filtering and content-based recommendations.

    Parameters
    ----------
    collaborative_filter
        Object exposing ``bookUser_Recommender(user_id)`` (returning
        ``(isbn, score)`` pairs) and a ``user_Item`` training rating table.
    content_filter
        Object exposing ``recommend_similar_books(book_title)`` returning a
        DataFrame with an 'ISBN' column ordered from most to least similar.
    popular_items : list
        ISBNs recommended when neither filter yields a candidate.
    collab_weight, content_weight : float
        Multipliers applied to the normalized scores of each filter.
    """

    def __init__(self, collaborative_filter, content_filter, popular_items, collab_weight=8.5, content_weight=8.5):
        self.collaborative_filter = collaborative_filter
        self.content_filter = content_filter
        self.popular_items = popular_items  # List of popular items as a fallback
        self.collab_weight = collab_weight  # Weight for collaborative filtering scores
        self.content_weight = content_weight  # Weight for content-based filtering scores

    @staticmethod
    def _normalize_by_max(scores):
        """Divide every score in an ``{isbn: score}`` dict by the largest score.

        An empty dict is returned unchanged, and scores are left unscaled when
        the maximum is 0 or NaN, so no division by zero takes place.
        """
        if not scores:
            return {}
        top_score = max(scores.values())
        if top_score == 0 or top_score != top_score:  # 0 or NaN
            return dict(scores)
        return {isbn: score / top_score for isbn, score in scores.items()}

    @staticmethod
    def _relevance_metrics(actual_ratings, predicted_ratings, threshold):
        """Return ``(precision, recall, f1, predicted_labels)``.

        A prediction counts as relevant when it is >= ``threshold``; an actual
        rating counts as relevant when it is > 0.
        """
        predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
        actual_labels = [1 if rating > 0 else 0 for rating in actual_ratings]
        precision = precision_score(actual_labels, predicted_labels)
        recall = recall_score(actual_labels, predicted_labels)
        f1 = f1_score(actual_labels, predicted_labels)
        return precision, recall, f1, predicted_labels

    def hybrid_recommendations(self, user_id, book_title, testset_matrix, top_n=10, relevance_threshold=0.1):
        """Recommend books for ``user_id`` / ``book_title`` and score the result.

        Parameters
        ----------
        user_id
            User passed to the collaborative filter and looked up in both
            rating tables.
        book_title
            Title passed to the content filter.
        testset_matrix : pandas.DataFrame
            User x ISBN rating table used for the evaluation metrics.
        top_n : int
            Number of combined recommendations to keep.
        relevance_threshold : float
            Combined score at or above which a recommendation is labelled
            relevant for precision / recall / F1.

        Returns
        -------
        tuple
            ``(books, rmse, precision, recall, f1, rmse_train, precision_train,
            recall_train, f1_train)`` where ``books`` holds the 'Book-Title' and
            'Year-Of-Publication' of the recommended ISBNs.

        Notes
        -----
        Book details are read from the notebook-level ``books_data`` frame.
        Raises KeyError if ``user_id`` is missing from either rating table.
        """
        # Generate collaborative filtering recommendations
        collab_recs = self.collaborative_filter.bookUser_Recommender(user_id)

        # Generate content-based recommendations
        content_recs = self.content_filter.recommend_similar_books(book_title)

        # Normalize collaborative scores by their maximum
        normalized_collab_scores = self._normalize_by_max({isbn: rating for isbn, rating in collab_recs})

        # Content recommendations carry no score, so use 1 / rank and normalize
        normalized_content_scores = self._normalize_by_max(
            {isbn: 1 / (index + 1) for index, isbn in enumerate(content_recs['ISBN'])}
        )

        # Combine normalized scores with the two weights. Candidates are visited
        # in a fixed order (collaborative first, then content) so that equal
        # scores are ranked the same way on every run.
        candidate_isbns = list(dict.fromkeys([*normalized_collab_scores, *normalized_content_scores]))
        combined_scores = {}
        for isbn in candidate_isbns:
            collab_score = normalized_collab_scores.get(isbn, 0) * self.collab_weight
            content_score = normalized_content_scores.get(isbn, 0) * self.content_weight
            combined_scores[isbn] = collab_score + content_score

        # Sort the combined recommendations by score (stable sort keeps the
        # candidate order for ties)
        sorted_combined_recs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:int(top_n)]

        # Extract ISBNs from the combined recommendations
        isbn_recommendations = [isbn for isbn, _ in sorted_combined_recs]

        # Fallback for new users or items - Recommend popular items if no recommendations
        if not isbn_recommendations:
            isbn_recommendations = list(self.popular_items)

        # Extract book details for the recommended ISBNs from books_data
        book_info = books_data.set_index('ISBN')
        recommended_books_info = book_info.loc[isbn_recommendations]

        # Evaluate performance using testset_matrix
        user_test_ratings = testset_matrix.loc[user_id]
        print("User Test Ratings:")
        print(user_test_ratings)

        # Check if testset_matrix only contains zeros
        print("Testset Matrix contains only zeros:", user_test_ratings.eq(0).all().all())

        # Print non-zero entries in user_test_ratings
        non_zero_ratings = user_test_ratings[user_test_ratings != 0]
        print("Non-Zero Ratings:")
        print(non_zero_ratings)

        # Align the actual ratings with the recommendation order. reindex keeps
        # one value per recommended ISBN (0 when the ISBN is not a column of the
        # evaluation table), so actual and predicted always have equal length
        # and matching positions.
        actual_ratings = user_test_ratings.reindex(isbn_recommendations, fill_value=0)
        print("Filtered Actual Ratings:")
        print(actual_ratings)

        predicted_ratings = [combined_scores.get(isbn, 0) for isbn in isbn_recommendations]

        # RMSE via np.sqrt(MSE): avoids the `squared` keyword, which newer
        # scikit-learn releases no longer accept.
        rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

        # Calculate Precision, Recall, and F1-score
        precision, recall, f1, predicted_labels = self._relevance_metrics(
            actual_ratings, predicted_ratings, relevance_threshold
        )

        print("Actual Ratings:", actual_ratings)
        print("Predicted Ratings:", predicted_ratings)
        print("Predicted Labels:", predicted_labels)

        # Same metrics on the training data. The training table is taken from
        # the collaborative filter it was fitted on rather than from a
        # notebook-level global.
        train_matrix = self.collaborative_filter.user_Item
        predicted_ratings_train = [combined_scores.get(isbn, 0) for isbn in train_matrix.columns]
        actual_ratings_train = train_matrix.loc[user_id]
        rmse_train = np.sqrt(mean_squared_error(actual_ratings_train, predicted_ratings_train))

        # Calculate Precision, Recall, and F1-score for the training set
        precision_train, recall_train, f1_train, _ = self._relevance_metrics(
            actual_ratings_train, predicted_ratings_train, relevance_threshold
        )

        print("\nTraining Set Metrics:")
        print("RMSE:", rmse_train)
        print("Precision:", precision_train)
        print("Recall:", recall_train)
        print("F1-score:", f1_train)

        return recommended_books_info[['Book-Title', 'Year-Of-Publication']], rmse, precision, recall, f1,rmse_train,precision_train,recall_train,f1_train

# Fit both component recommenders on the training data
collaborative_filter = CollaborativeFilteringRecommendationSystem(trainset_matrix, global_mean, user_means, item_means)

content_filter = ContentBasedFiltering(trainset)

# List of popular items as a fallback (Books with ratings of 10).
# ISBN-10 values are ten characters long; five entries had lost their leading
# zeros (e.g. '60096195') and could never match a catalogue ISBN, so they are
# written zero-padded here.
popular_items = [
    '3596151465',
    '055310666X',
    '0060096195',
    '0142302198',
    '038076041X',
    '0699854289',
    '0786817070',
    '0805057706',
    '1573248533',
    '3423071516'
]

# Initialize the hybrid recommender system with both filters and popular items
hybrid_recommender = HybridRecommender(collaborative_filter, content_filter, popular_items)

# Get hybrid recommendations for a specific user and book title
user_id = 177458
book_title = "Wild Justice"
# NOTE(review): trainset_matrix is passed as the evaluation table, so the
# figures printed under "test set" below are computed on training ratings.
# Pass testset_matrix instead to evaluate on held-out data.
recommended_books, rmse, precision, recall, f1,rmse_train,precision_train,recall_train,f1_train = hybrid_recommender.hybrid_recommendations(user_id, book_title, trainset_matrix)

# Display the hybrid recommendations and performance metrics
#test set
print("Recommended Books:")
display(recommended_books)
print("RMSE:", rmse)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

#overfitting
print("\nTraining Set Metrics:")
print("RMSE:", rmse_train)
print("Precision:", precision_train)
print("Recall:", recall_train)
print("F1-score:", f1_train)