In [81]:
import itertools
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')

# Adjusting for Data Frame Output
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

In [82]:
# Load the user/book rating table (semicolon-separated, Latin-1 encoded)
# and print its column summary.
origin_rating = pd.read_csv(
    filepath_or_buffer='dataset/Book reviews/Book reviews/BX-Book-Ratings.csv',
    sep=';',
    encoding='latin-1',
)
origin_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [83]:
# Load BX_Books.csv and discard the publication-year, publisher and
# image-URL columns.
unused_book_columns = ['Year-Of-Publication', 'Publisher',
                       'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
origin_books = (
    pd.read_csv('dataset/Book reviews/Book reviews/BX_Books.csv',
                sep=';', encoding="latin-1")
    .drop(columns=unused_book_columns)
)
origin_books.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   ISBN         271379 non-null  object
 1   Book-Title   271379 non-null  object
 2   Book-Author  271378 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [84]:
# Load Preprocessed_data.csv, keep only the ISBN and category columns
# (renaming 'isbn' to 'ISBN'), and drop repeated (ISBN, Category) rows.
origin_preprocessed = (
    pd.read_csv(
        'dataset/Books Data with Category Language and Summary/Preprocessed_data.csv',
        sep=',', encoding="latin-1")
    .loc[:, ['isbn', 'Category']]
    .rename(columns={'isbn': 'ISBN'})
    .drop_duplicates()
)
origin_preprocessed.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 270170 entries, 0 to 1031174
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ISBN      270170 non-null  object
 1   Category  270170 non-null  object
dtypes: object(2)
memory usage: 6.2+ MB


In [85]:
# Inner-join the ratings with the book metadata on ISBN; ratings whose ISBN
# has no entry in 'origin_books' do not appear in the result.
user_book_rating = pd.merge(
    left=origin_rating,
    right=origin_books,
    how='inner',
    on="ISBN",
)
user_book_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031175 entries, 0 to 1031174
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1031175 non-null  int64 
 1   ISBN         1031175 non-null  object
 2   Book-Rating  1031175 non-null  int64 
 3   Book-Title   1031175 non-null  object
 4   Book-Author  1031174 non-null  object
dtypes: int64(2), object(3)
memory usage: 47.2+ MB


In [86]:
# Count the missing values in each column
user_book_rating.isnull().sum(axis=0)

User-ID        0
ISBN           0
Book-Rating    0
Book-Title     0
Book-Author    1
dtype: int64

In [87]:
# Remove every row that contains at least one missing value
user_book_rating = user_book_rating.dropna()
user_book_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031174 entries, 0 to 1031174
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1031174 non-null  int64 
 1   ISBN         1031174 non-null  object
 2   Book-Rating  1031174 non-null  int64 
 3   Book-Title   1031174 non-null  object
 4   Book-Author  1031174 non-null  object
dtypes: int64(2), object(3)
memory usage: 47.2+ MB


In [88]:
# Number of ratings recorded for each ISBN.
book_ratingCount = (
    user_book_rating
    .groupby(by=['ISBN'])['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'TotalRatingCount'})
)

# Minimum number of ratings a book needs in order to be kept.
ratingTreshold = 50

# Attach the per-ISBN count, keep only the books that reach the threshold,
# then remove the helper column again. The steps are chained without
# `inplace=True`: dropping a column in place on the frame returned by
# `.query()` is the pattern that raises pandas' SettingWithCopyWarning.
user_book_rating = (
    pd.merge(user_book_rating, book_ratingCount, on="ISBN")
    .query('TotalRatingCount >= @ratingTreshold')
    .drop(columns='TotalRatingCount')
)
user_book_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234876 entries, 0 to 710818
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      234876 non-null  int64 
 1   ISBN         234876 non-null  object
 2   Book-Rating  234876 non-null  int64 
 3   Book-Title   234876 non-null  object
 4   Book-Author  234876 non-null  object
dtypes: int64(2), object(3)
memory usage: 10.8+ MB


In [89]:
# Preprocessing 'Category' column
# Ordered (pattern, replacement) pairs. Every pattern is a literal substring,
# so `regex=False` is passed explicitly: the default of `regex` differs
# between pandas versions and '[' is not a valid regular expression.
# Order matters: ', ', ' & ' and ' : ' must be handled before the final
# blanket ' ' -> '_' replacement.
category_replacements = [
    ('[', ''),
    (']', ''),
    ('\'', ''),
    ('\"', ''),
    ('-', '_'),
    (', ', ','),
    (' & ', '_'),
    (' : ', '_'),
    (' ', '_'),
]
cleaned_category = origin_preprocessed["Category"]
for pattern, replacement in category_replacements:
    cleaned_category = cleaned_category.str.replace(
        pattern, replacement, regex=False)
origin_preprocessed["Category"] = cleaned_category


# Remove the rows whose category is the stray value '9'
delte = origin_preprocessed[origin_preprocessed['Category'] == '9'].index
origin_preprocessed = origin_preprocessed.drop(index=delte)

# # Extract Category data
# category = list(set(origin_preprocessed['Category'].to_list()))

# # Extract User-ID data
# isbn = list(set(user_book_rating['ISBN'].to_list()))

In [90]:
# Build a 2 x n object array: row 0 holds the ISBNs, row 1 the cleaned
# category strings. The array is transposed after the conversion to NumPy,
# which avoids materialising a DataFrame with one column per book.
# (No index is set on the frame: the code below addresses ISBNs as df[0]
# and categories as df[1].)
df = origin_preprocessed[['ISBN', 'Category']].to_numpy().T
df

array([['195153448', '2005018', '60973129', ..., '743203763',
        '767907566', '912333022'],
       ['Social_Science', 'Actresses', '1940_1949', ..., 'Humor',
        'Nature', 'Fiction']], dtype=object)

In [91]:
# Fit a TF-IDF model on the category strings (row 1 of `df`), ignoring terms
# that occur in fewer than 10 of them, and keep the resulting weight matrix.
tvec = TfidfVectorizer(min_df=10)
transformed_weights = tvec.fit_transform(df[1].tolist())

In [93]:
# Preview the TF-IDF weights: one row per book (labelled by ISBN), one
# column per vocabulary term. Only the previewed rows are densified;
# calling .toarray() on the whole matrix allocates n_books x n_terms floats
# just to render a table.
preview_rows = 10
display(pd.DataFrame(transformed_weights[:preview_rows].toarray(),
                     columns=tvec.get_feature_names_out(),
                     index=df[0, :preview_rows]))

In [105]:
# Cosine similarity between the first book's TF-IDF vector (the query) and
# the vectors of all remaining books (the documents). The result is a
# one-row frame labelled 'Similarity' with one column per remaining ISBN.
similarity = pd.DataFrame(
    data=cosine_similarity(X=transformed_weights[0], Y=transformed_weights[1:]),
    index=['Similarity'],
    columns=df[0, 1:],
)


In [106]:
# Show the books ordered from most to least similar to the query book
# (columns are reordered by the values in the 'Similarity' row).
display(similarity.sort_values(by='Similarity', axis=1, ascending=False))

Unnamed: 0,517884976,691017778,452283779,679450378,155001965,743452933,671505998,773509534,60924594,967943302,...,688167519,688159311,688080731,684842319,671708317,055380121X,525484612,517556715,517539586,912333022
Similarity,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
