In [21]:
# Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [32]:
# Load the datasets
# Folder holding the three CSV exports (presumably the Colab sample_data
# directory; adjust here when running elsewhere).
DATA_DIR = '/content/sample_data'

users = pd.read_csv(f'{DATA_DIR}/Users.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# accompanying DtypeWarning) when reading Books.csv.
books = pd.read_csv(f'{DATA_DIR}/Books.csv', low_memory=False)
ratings = pd.read_csv(f'{DATA_DIR}/Ratings.csv')

# Display information about the users table. info() prints its report itself
# and returns None, so it is called directly rather than wrapped in print(),
# which would append a stray "None" line.
users.info()




  books = pd.read_csv('/content/sample_data/Books.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB
None


In [33]:
# Show the (rows, columns) dimensions of the users table.
print(users.shape)


(278858, 3)


In [34]:
# Summarise the books table (columns, non-null counts, dtypes). info() writes
# its report to stdout and returns None, so it is called directly; wrapping it
# in print() would append a stray "None" line.
books.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB
None


In [35]:
# Show the (rows, columns) dimensions of the books table.
print(books.shape)


(271360, 8)


In [36]:
# Summarise the ratings table (columns, non-null counts, dtypes). info() writes
# its report to stdout and returns None, so it is called directly; wrapping it
# in print() would append a stray "None" line.
ratings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB
None


In [37]:
# Show the (rows, columns) dimensions of the ratings table.
print(ratings.shape)



(1149780, 3)


In [38]:
# Preview the datasets: first five rows of the users table
# (User-ID, Location, Age; Age contains missing values).
print(users.head())


   User-ID                            Location   Age
0        1                  nyc, new york, usa   NaN
1        2           stockton, california, usa  18.0
2        3     moscow, yukon territory, russia   NaN
3        4           porto, v.n.gaia, portugal  17.0
4        5  farnborough, hants, united kingdom   NaN


In [39]:
# Preview the first five rows of the books table. The frame is wide, so the
# plain-text print wraps its columns across several blocks.
print(books.head())


         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...   
1  http://images.amazon.com/

In [40]:
# Preview the first five rows of the ratings table (User-ID, ISBN, Book-Rating).
print(ratings.head())

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [41]:
# Print each table's column index so the join keys used later
# ('ISBN', 'User-ID') can be checked by eye.
for heading, frame in (
    ("Users columns:", users),
    ("Books columns:", books),
    ("Ratings columns:", ratings),
):
    print(heading, frame.columns)

Users columns: Index(['User-ID', 'Location', 'Age'], dtype='object')
Books columns: Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')
Ratings columns: Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')


In [42]:
# Split 'Location' into 'City', 'State', 'Country'.
# The split pattern swallows whitespace around each comma (and the ends are
# stripped first), so values come out as 'new york' rather than ' new york'.
# Only the first three comma-separated parts are kept.
users[['City', 'State', 'Country']] = (
    users['Location'].str.strip().str.split(r'\s*,\s*', expand=True)[[0, 1, 2]]
)

# Remove duplicate books: keep the first row seen for each title.
new_books = books.drop_duplicates('Book-Title')
print(new_books.shape)

# Merge ratings with book titles and drop the columns not needed for modelling.
# NOTE(review): because new_books keeps a single ISBN per title, this inner
# join on 'ISBN' discards ratings recorded against any other ISBN of the same
# title. Merging against `books` instead would retain them, but would change
# every downstream number, so it is left for a deliberate follow-up.
ratings_with_name = ratings.merge(new_books, on='ISBN').drop(
    columns=['ISBN', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
)
print(ratings_with_name.head())

# Merge with user data. All user attributes are dropped again straight away,
# so the inner join only restricts ratings to User-IDs present in `users`.
users_ratings_matrix = ratings_with_name.merge(users, on='User-ID').drop(
    columns=['Location', 'Age', 'City', 'State', 'Country']
)
print(users_ratings_matrix.head())

# Drop rows with any missing value in the remaining columns.
users_ratings_matrix = users_ratings_matrix.dropna()
print(users_ratings_matrix.shape)

# Filter users and books based on the number of ratings.
MIN_RATINGS_PER_USER = 100  # a user must have strictly MORE than this many ratings
MIN_RATINGS_PER_BOOK = 50   # a book must have AT LEAST this many ratings

ratings_per_user = users_ratings_matrix.groupby('User-ID')['Book-Rating'].count()
knowledgeable_users = ratings_per_user[ratings_per_user > MIN_RATINGS_PER_USER].index
filtered_users_ratings = users_ratings_matrix[
    users_ratings_matrix['User-ID'].isin(knowledgeable_users)
]

# Book popularity is counted AFTER the user filter, i.e. only ratings from
# knowledgeable users count towards the threshold.
ratings_per_book = filtered_users_ratings.groupby('Book-Title')['Book-Rating'].count()
famous_books = ratings_per_book[ratings_per_book >= MIN_RATINGS_PER_BOOK].index
final_users_ratings = filtered_users_ratings[
    filtered_users_ratings['Book-Title'].isin(famous_books)
]
print(final_users_ratings.head())

# Create pivot table: one row per book title, one column per user. Repeated
# (title, user) pairs are averaged (pivot_table's default aggregation) and
# books a user has not rated are filled with 0.
pivot_table = final_users_ratings.pivot_table(
    index='Book-Title', columns='User-ID', values='Book-Rating'
).fillna(0)
print(pivot_table.head())

# Normalize the pivot table. StandardScaler works column-wise, so each user's
# ratings are centred and scaled across books. The result is a plain NumPy
# array whose row order matches pivot_table.index.
scaler = StandardScaler(with_mean=True, with_std=True)
pivot_table_normalized = scaler.fit_transform(pivot_table)
print(pivot_table_normalized)


(242135, 8)
   User-ID  Book-Rating            Book-Title Book-Author Year-Of-Publication  \
0   276725            0  Flesh Tones: A Novel  M. J. Rose                2002   
1     2313            5  Flesh Tones: A Novel  M. J. Rose                2002   
2     6543            0  Flesh Tones: A Novel  M. J. Rose                2002   
3     8680            5  Flesh Tones: A Novel  M. J. Rose                2002   
4    10314            9  Flesh Tones: A Novel  M. J. Rose                2002   

          Publisher  
0  Ballantine Books  
1  Ballantine Books  
2  Ballantine Books  
3  Ballantine Books  
4  Ballantine Books  
   User-ID  Book-Rating                                        Book-Title  \
0   276725            0                              Flesh Tones: A Novel   
1     2313            5                              Flesh Tones: A Novel   
2     2313            8             In Cold Blood (Vintage International)   
3     2313            9  Divine Secrets of the Ya-Ya Sisterho

In [43]:
# Model Building
# Pairwise cosine similarity between the rows of the normalized matrix. Rows
# follow pivot_table's index (one per Book-Title), so entry [i, j] compares
# book i with book j.
similarity_matrix = cosine_similarity(pivot_table_normalized)
# The result is square: one row and one column per book.
print(similarity_matrix.shape)


(620, 620)


In [47]:
from IPython.display import display, Image

def recommend(book_name, top_n=9):
    """Display the books most similar to ``book_name``.

    Looks the title up in ``pivot_table.index``, ranks every other book by its
    score in ``similarity_matrix`` and shows the cover image, title and author
    of the best matches (details are read from ``new_books``).

    Parameters
    ----------
    book_name : str
        Title to look up; it must match an entry of ``pivot_table.index``
        exactly.
    top_n : int, default 9
        Maximum number of similar books to show.

    Returns
    -------
    str or None
        A "not found" message when the title is not in the rating matrix,
        otherwise ``None`` (the recommendations are displayed, not returned).

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the index of the book
    matches = np.flatnonzero(pivot_table.index == book_name)
    if matches.size == 0:
        return f"Book '{book_name}' not found in the dataset."
    index = matches[0]

    # Rank all books by descending similarity. The stable sort keeps tied
    # scores in their original row order.
    scores = similarity_matrix[index]
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried book by position rather than assuming it sorts first:
    # a tie or floating-point rounding could otherwise remove a different book
    # and leave the queried title among its own recommendations.
    ranked = ranked[ranked != index][:top_n]

    # Collect the details of each recommended book
    recommendations = []
    for i in ranked:
        book_data = new_books[new_books['Book-Title'] == pivot_table.index[i]]
        for title, author, image_url in zip(
            book_data['Book-Title'], book_data['Book-Author'], book_data['Image-URL-M']
        ):
            recommendations.append({
                "title": title,
                "author": author,
                "similarity": scores[i],
                "image_url": image_url
            })

    # Display images alongside book details
    for rec in recommendations:
        display(Image(url=rec['image_url'], width=100))
        print(f"Title: {rec['title']}, Author: {rec['author']}")

# Test the recommendation function. The title is compared for exact equality
# against pivot_table.index, so it must be spelled exactly as in the data.
recommend('1984')


Title: Foucault's Pendulum, Author: Umberto Eco


Title: Tis : A Memoir, Author: Frank McCourt


Title: Animal Farm, Author: George Orwell


Title: The Glass Lake, Author: Maeve Binchy


Title: Summer Pleasures, Author: Nora Roberts


Title: Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death, Author: Kurt Vonnegut


Title: 2010: Odyssey Two, Author: Arthur C. Clarke


Title: The Bonesetter's Daughter, Author: Amy Tan


Title: Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players, Author: Stefan Fatsis
