## Load Data

In [2]:
import pandas as pd
import numpy as np

# Read both semicolon-delimited, latin-1 encoded CSV exports.
# on_bad_lines='skip' silently discards rows the parser cannot split, so the
# loaded row counts may be lower than the line counts of the raw files.
books_df = pd.read_csv(
    'BX_Books.csv',
    sep=';',
    encoding='latin-1',
    header=0,  # the file's own header row is consumed and replaced by `names`
    names=[
        'Book-ID',
        'Title',
        'Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Image-URL-M',
        'Image-URL-L',
    ],
    on_bad_lines='skip',
)
ratings = pd.read_csv(
    'BX-Book-Ratings.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
)

# Preview the books table (the ratings table is previewed after it is cleaned)
print("Books DataFrame:")
print(books_df.head())


Books DataFrame:
      Book-ID                                              Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 Author  Year-Of-Publication                Publisher  \
0    Mark P. O. Morford                 2002  Oxford University Press   
1  Richard Bruce Wright                 2001    HarperFlamingo Canada   
2          Carlo D'Este                 1991          HarperPerennial   
3      Gina Bari Kolata                 1999     Farrar Straus Giroux   
4       E. J. W. Barber                 1999   W. W. Norton & Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...   
1  http://images.amazon

## Pre-processing of Both Datasets

In [3]:
# Drop the publisher and the three image-URL columns, then discard every row
# that still contains a null (NaN) value in the remaining columns.
unused_columns = ['Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
books_df = books_df.drop(columns=unused_columns).dropna()

# Display the cleaned-up dataframe
# NOTE(review): the message mentions "corrected years", but this cell does not
# modify Year-Of-Publication -- confirm whether a correction step is missing.
print("Books DataFrame without image columns and with corrected years:")
print(books_df.head())


Books DataFrame without image columns and with corrected years:
      Book-ID                                              Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 Author  Year-Of-Publication  
0    Mark P. O. Morford                 2002  
1  Richard Bruce Wright                 2001  
2          Carlo D'Este                 1991  
3      Gina Bari Kolata                 1999  
4       E. J. W. Barber                 1999  


In [4]:
# Capture the (rows, columns) tuple of books_df as left by the cleaning cell
books_shape = books_df.shape

# Display the shape
print("Shape of the books DataFrame:", books_shape)


Shape of the books DataFrame: (271377, 4)


In [5]:
# Clean the ratings table in one chain:
#   1. keep a single row per (User-ID, ISBN) pair,
#   2. keep rows whose rating is >= 0 -- zero-valued ratings are retained,
#   3. drop rows with a missing rating (rows with NaN already fail the >= 0
#      comparison in step 2, so this step is a safety net).
# NOTE(review): the previous comment said "filter out ratings <= 0", which the
# comparison below does not do; confirm whether 0 ratings should be kept.
ratings = (
    ratings
    .drop_duplicates(subset=['User-ID', 'ISBN'])
    .loc[lambda frame: frame['Book-Rating'] >= 0]
    .dropna(subset=['Book-Rating'])
)

print("\nRatings DataFrame:")
print(ratings.head())



Ratings DataFrame:
   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [6]:
# Activity thresholds for the density filter. Named here so the comments and
# the code cannot drift apart (both limits are inclusive).
MIN_RATINGS_PER_BOOK = 50
MIN_RATINGS_PER_USER = 20

# Count the number of ratings each book (ISBN) received
rating_counts = ratings.groupby('ISBN').size()

# Keep only books rated at least MIN_RATINGS_PER_BOOK times
books_to_keep = rating_counts[rating_counts >= MIN_RATINGS_PER_BOOK].index
ratings_filtered = ratings[ratings['ISBN'].isin(books_to_keep)]

# Count ratings per user *within the retained books*, so a user's activity is
# measured only on books that survived the first filter
user_counts = ratings_filtered.groupby('User-ID').size()

# Keep only users with at least MIN_RATINGS_PER_USER ratings on those books
users_to_keep = user_counts[user_counts >= MIN_RATINGS_PER_USER].index
ratings_filtered = ratings_filtered[ratings_filtered['User-ID'].isin(users_to_keep)]

# Display the result
print(f"Original dataset size: {ratings.shape[0]} rows")
print(f"Filtered dataset size: {ratings_filtered.shape[0]} rows")
print(ratings_filtered.head())


Original dataset size: 1149780 rows
Filtered dataset size: 133749 rows
      User-ID        ISBN  Book-Rating
1083   277195  0060391626           10
1084   277195  0060502258            0
1089   277195  0060987561            0
1098   277195  0316666343            0
1099   277195  0316734837            0


In [7]:
# Filter books that have a corresponding rating
books_in_ratings = books_df[books_df['Book-ID'].isin(ratings_filtered['ISBN'])]

# Display the filtered books DataFrame
print("\nFiltered Books DataFrame:")
print(books_in_ratings.head())



Filtered Books DataFrame:
       Book-ID                                 Title            Author  \
18  0440234743                         The Testament      John Grisham   
19  0452264464  Beloved (Plume Contemporary Fiction)     Toni Morrison   
26  0971880107                           Wild Animus      Rich Shapero   
27  0345402871                              Airframe  Michael Crichton   
28  0345417623                              Timeline  MICHAEL CRICHTON   

    Year-Of-Publication  
18                 1999  
19                 1994  
26                 2004  
27                 1997  
28                 2000  


In [8]:
# Shape of books_in_ratings, the subset of books that appear in ratings_filtered.
# This rebinds books_shape, which an earlier cell set from the full books_df.
books_shape = books_in_ratings.shape

# Display the shape
print("Shape of the books DataFrame:", books_shape)

Shape of the books DataFrame: (2161, 4)


In [9]:
# Capture the (rows, columns) tuple of the filtered ratings DataFrame
rating_shape = ratings_filtered.shape

# Display the shape
print("Shape of the Rating DataFrame:", rating_shape)

Shape of the Rating DataFrame: (133749, 3)


In [10]:
# Keep only ratings inside the inclusive 0-10 range
at_least_min = ratings_filtered['Book-Rating'] >= 0
at_most_max = ratings_filtered['Book-Rating'] <= 10
ratings_filtered = ratings_filtered[at_least_min & at_most_max]

# Tally how many ratings each user contributed (sorted most active first)
user_ratings_count = ratings_filtered['User-ID'].value_counts()

# Report the per-user tallies
print("\nNumber of books rated by each user:")
print(user_ratings_count)



Number of books rated by each user:
User-ID
11676     1583
35859      746
76352      680
16795      608
153662     562
          ... 
187266      20
101620      20
72990       20
253629      20
58612       20
Name: count, Length: 2027, dtype: int64


## Collaborative Filtering

In [11]:
# Mean-center the ratings: subtract each user's own average rating from every
# rating that user gave, so values express "above/below this user's average".
per_user_ratings = ratings_filtered.groupby('User-ID')['Book-Rating']
ratings_filtered['Mean-Adjusted-Rating'] = per_user_ratings.transform(
    lambda user_ratings: user_ratings - user_ratings.mean()
)

preview_columns = ['User-ID', 'ISBN', 'Mean-Adjusted-Rating']
print("\nMean-Adjusted Ratings Head:")
print(ratings_filtered[preview_columns].head())



Mean-Adjusted Ratings Head:
      User-ID        ISBN  Mean-Adjusted-Rating
1083   277195  0060391626                  8.75
1084   277195  0060502258                 -1.25
1089   277195  0060987561                 -1.25
1098   277195  0316666343                 -1.25
1099   277195  0316734837                 -1.25


In [12]:
# Convert IDs to categories
ratings_filtered['User-ID'] = ratings_filtered['User-ID'].astype('category')
ratings_filtered['ISBN'] = ratings_filtered['ISBN'].astype('category')

# Create the User-Item Interaction Matrix
user_item_matrix = ratings_filtered.pivot_table(index='User-ID', columns='ISBN', values='Mean-Adjusted-Rating', fill_value=0)

# Show the resulting matrix of rated items
print("\nUser-Item Interaction Matrix with Rated Items:")
print(user_item_matrix.head())


  user_item_matrix = ratings_filtered.pivot_table(index='User-ID', columns='ISBN', values='Mean-Adjusted-Rating', fill_value=0)



User-Item Interaction Matrix with Rated Items:
ISBN     000649840X  0007110928  002026478X  0020442203  002542730X  \
User-ID                                                               
243             0.0         0.0         0.0         0.0         0.0   
254             0.0         0.0         0.0         0.0         0.0   
507             0.0         0.0         0.0         0.0         0.0   
638             0.0         0.0         0.0         0.0         0.0   
882             0.0         0.0         0.0         0.0         0.0   

ISBN     0028604199  006000438X  0060008032  0060008776  006001203X  ...  \
User-ID                                                              ...   
243             0.0    0.000000         0.0         0.0         0.0  ...   
254             0.0   -1.987179         0.0         0.0         0.0  ...   
507             0.0    0.000000         0.0         0.0         0.0  ...   
638             0.0    0.000000         0.0         0.0         0.0  ...  

In [14]:
# Compute item-based cosine similarity
item_base_cosine_similarity = compute_item_similarity_manual(user_item_matrix)

# Convert similarity matrix to a DataFrame for easier viewing
item_similarity_df = pd.DataFrame(
    item_base_cosine_similarity,
    index=user_item_matrix.columns,  # Use ISBNs as row/column labels
    columns=user_item_matrix.columns
)

# Display the first few rows of the similarity DataFrame
print("\nItem-Based Cosine Similarity Matrix (First Few Rows):")
print(item_similarity_df.head())


Starting item similarity computation for 2185 items...
Processing item 1/2185...
Processing item 101/2185...
Processing item 201/2185...
Processing item 301/2185...
Processing item 401/2185...
Processing item 501/2185...
Processing item 601/2185...
Processing item 701/2185...
Processing item 801/2185...
Processing item 901/2185...
Processing item 1001/2185...
Processing item 1101/2185...
Processing item 1201/2185...
Processing item 1301/2185...
Processing item 1401/2185...
Processing item 1501/2185...
Processing item 1601/2185...
Processing item 1701/2185...
Processing item 1801/2185...
Processing item 1901/2185...
Processing item 2001/2185...
Processing item 2101/2185...
Item similarity computation complete.

Item-Based Cosine Similarity Matrix (First Few Rows):
ISBN        000649840X  0007110928  002026478X  0020442203  002542730X  \
ISBN                                                                     
000649840X    1.000000    0.021106    0.005648    0.027808    0.021345   
0007

# Case-Based

In [None]:
# Case-Based Similarity Computation (based on user-provided attributes like Author and Year)
def calculate_case_similarity(book, user_prefs, weights, year_scale=100):
    """Score how closely one book matches the user's stated preferences.

    Each attribute listed in ``weights`` that is also present in
    ``user_prefs`` contributes a per-attribute similarity in [0, 1],
    multiplied by its weight. The weighted sum is divided by the sum of all
    weights, including weights of attributes the user gave no preference for.

    Supported attributes:
      * ``"Year-Of-Publication"``: ``1 - |book_year - preferred_year| / year_scale``,
        clamped at 0 so that a book further away than ``year_scale`` years
        scores 0 instead of a negative value that would swamp the other
        attributes.
      * ``"Author"``: 1 when the names match after trimming whitespace and
        ignoring case (the catalogue contains the same author in several
        capitalisations), otherwise 0.
    Any other attribute name contributes nothing.

    Args:
        book: Mapping-like record (dict or DataFrame row) indexed by attribute name.
        user_prefs: Mapping of attribute name to the user's preferred value.
        weights: Mapping of attribute name to its numeric weight.
        year_scale: Year gap at which the year similarity reaches 0. Must be
            positive. Defaults to 100.

    Returns:
        The weighted similarity, or 0.0 when ``weights`` is empty or its
        values sum to zero.
    """
    total_weight = sum(weights.values())
    if not total_weight:
        # Nothing to normalise by; avoid a ZeroDivisionError.
        return 0.0

    similarity = 0
    for attr, weight in weights.items():
        if attr not in user_prefs:
            continue
        if attr == "Year-Of-Publication":
            year_gap = abs(book[attr] - user_prefs[attr])
            # Clamp so very old or mis-dated books cannot score below zero.
            similarity += weight * max(0.0, 1 - year_gap / year_scale)
        elif attr == "Author":
            book_author = str(book[attr]).strip().casefold()
            wanted_author = str(user_prefs[attr]).strip().casefold()
            similarity += weight * (1 if book_author == wanted_author else 0)
    return similarity / total_weight

In [17]:
# Count how many books are available by each author
author_counts = books_df['Author'].value_counts()

# Get the top 10 most frequent authors
top_authors = author_counts.head(10)

# Display the top authors
print("Top Authors:")
print(top_authors)

# Count the distribution of books by year of publication
year_counts = books_df['Year-Of-Publication'].value_counts()

# Get the top 10 most frequent publication years
top_years = year_counts.head(10)

# Display the top years
print("\nTop Years of Publication:")
print(top_years)


Top Authors:
Author
Agatha Christie        632
William Shakespeare    567
Stephen King           524
Ann M. Martin          423
Carolyn Keene          373
Francine Pascal        373
Isaac Asimov           330
Nora Roberts           315
Barbara Cartland       307
Charles Dickens        302
Name: count, dtype: int64

Top Years of Publication:
Year-Of-Publication
2002    17628
1999    17431
2001    17360
2000    17235
1998    15767
1997    14892
2003    14359
1996    14031
1995    13547
1994    11796
Name: count, dtype: int64


In [19]:
# Apply case-based similarity to all books in the filtered books dataset
print("\nCalculating Case-Based Similarity for each book...")
books_in_ratings['Case-Based-Similarity'] = books_in_ratings.apply(
    lambda x: calculate_case_similarity(x, user_preferences, attribute_weights), axis=1
)


Calculating Case-Based Similarity for each book...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_in_ratings['Case-Based-Similarity'] = books_in_ratings.apply(


In [22]:
# Output the top 5 recommended books
print("\nTop 5 Recommended Books:")
for isbn, title, score in recommendations:
    print(f"Title: {title}, Final Score: {score:.2f}")


Top 5 Recommended Books:
Title: A Game of Thrones (A Song of Ice and Fire, Book 1), Final Score: 0.69
Title: Fortune's Hand, Final Score: 0.28
Title: Night Moves (Tom Clancy's Net Force, No. 3), Final Score: 0.28
Title: Bittersweet, Final Score: 0.28
Title: Lake News, Final Score: 0.28
