In [2]:
import pandas as pd
import numpy as np

# Disable pandas' display truncation: show every column and every row
# whenever a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# assigning into a slice of another DataFrame) -- confirm that is intended.
import warnings
warnings.filterwarnings('ignore')



In [3]:
# Load the raw book catalogue. The listed columns are read as plain text so
# they can be concatenated into a single feature string later on.
df1 = pd.read_csv(
    "book_data/Books_Recommendation.csv",
    dtype={
        "Comments": "str",
        "Book_Price": "str",
        "Publication_year": "str",
        "Book_Star": "str",
        "num_Page": "str",
        "Total_Rating_Score": "str",
    },
)

# Work on the first 10000 rows only. An explicit copy is taken because later
# cells overwrite and add columns on df2; assigning into a bare head() slice
# of df1 is a chained-assignment hazard (SettingWithCopyWarning).
df2 = df1.head(10000).copy()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          10000 non-null  int64 
 1   Asin                10000 non-null  object
 2   Title               10000 non-null  object
 3   Comments            10000 non-null  object
 4   Book_Price          10000 non-null  object
 5   Book_Star           10000 non-null  object
 6   Category            10000 non-null  object
 7   Author              10000 non-null  object
 8   Publication_year    10000 non-null  object
 9   Publisher           10000 non-null  object
 10  num_Page            10000 non-null  object
 11  Total_Rating_Score  10000 non-null  object
dtypes: int64(1), object(11)
memory usage: 937.6+ KB


In [9]:
# Columns whose text is merged into one "combined_features" string per book.
features = [
    'Category',
    'Author',
    'Publisher',
    'Comments',
    "Book_Price",
    "Book_Star",
    "Publication_year",
    "num_Page",
    "Total_Rating_Score",
]

def combine_features(row):
    """Merge the descriptive columns of one book into a single text string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys Category, Author, Publisher, Comments,
        Book_Price, Book_Star, Publication_year, num_Page and
        Total_Rating_Score.

    Returns
    -------
    str
        The nine field values, in the order above, separated by single spaces.

    Notes
    -----
    Values are converted with ``str`` and missing values (NaN/None) become an
    empty string, so a stray numeric or missing cell no longer raises
    ``TypeError`` the way raw ``+`` concatenation does.
    """
    columns = (
        "Category",
        "Author",
        "Publisher",
        "Comments",
        "Book_Price",
        "Book_Star",
        "Publication_year",
        "num_Page",
        "Total_Rating_Score",
    )
    return " ".join("" if pd.isna(row[col]) else str(row[col]) for col in columns)

# Blank out missing values in every feature column in one step, so the
# string concatenation in combine_features never meets a NaN.
df2[features] = df2[features].fillna("")

# One merged text string per book, built row by row.
df2["combined_features"] = df2.apply(combine_features, axis=1)



**CountVectorizer from scikit-learn**

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

# Learn the vocabulary from the combined feature strings and turn each book
# into a row of token counts.
count_matrix = cv.fit_transform(df2["combined_features"])


# Compute the cosine similarity matrix: with a single argument, every row of
# count_matrix is compared against every other row.
# NOTE(review): the result presumably is a dense (n_books x n_books) array --
# confirm memory use is acceptable for 10000 rows.
cosine_sim = cosine_similarity(count_matrix)

def get_title_from_index(ind):
    """Return the title of the book whose ``df2`` index label equals ``ind``.

    Raises ``IndexError`` when no row of ``df2`` carries that label.
    """
    row_mask = df2.index == ind
    matching_titles = df2.loc[row_mask, "Title"]
    return matching_titles.values[0]

def get_index_from_title(Title):
    """Return the ``df2`` index label of the first book titled ``Title``.

    The label is taken from the DataFrame index itself rather than from the
    CSV's leftover "Unnamed: 0" column. This keeps the function the exact
    inverse of ``get_title_from_index`` (which looks rows up by ``df2.index``)
    and keeps the result usable as a row number of the similarity matrix,
    which has one row per row of ``df2``.
    NOTE(review): using the label as a matrix row assumes df2 has a default
    0..n-1 index -- confirm if df2 is ever filtered or re-indexed.

    Parameters
    ----------
    Title : str
        Exact title to look up (compared verbatim, case-sensitive).

    Returns
    -------
    The index label of the first matching row.

    Raises
    ------
    IndexError
        If no book in ``df2`` has that title.
    """
    matches = df2.index[df2["Title"] == Title]
    if len(matches) == 0:
        raise IndexError("No book titled %r found in df2" % (Title,))
    return matches[0]



In [31]:
# Ask the user for the title of a book they like. The text is used as-is
# for the title lookup in the following cell, so it must match a value of
# the Title column exactly.
book_user_likes = input("Enter the Book Title")
# Example titles that can be entered:
# Warhammer Age of Sigmar - Im Reich der Unbegrabenen: Gotrek
# Warhammer 40.000 - Mephiston: Kreuzug der Verdammnis

Enter the Book Title Warhammer Age of Sigmar - Im Reich der Unbegrabenen: Gotrek


In [30]:
# Row of the similarity matrix that belongs to the book the user entered.
book_index = get_index_from_title(book_user_likes)

# (row position, similarity score) pairs for every book in the matrix.
similar_books = list(enumerate(cosine_sim[book_index]))

# Rank by similarity, best first. The queried book is excluded explicitly by
# its index: dropping "the first element" after sorting is only correct when
# the book itself sorts first, which fails when another row ties with it
# (e.g. a duplicate entry with identical features).
sorted_similar_books = sorted(
    (pair for pair in similar_books if pair[0] != book_index),
    key=lambda x: x[1],
    reverse=True,
)

print("Top 5 similar books, similar to : "+book_user_likes+" are:\n")
for element in sorted_similar_books[:5]:
    print(get_title_from_index(element[0]))

Top 5 similar books, similar to : Warhammer Age of Sigmar - Im Reich der Unbegrabenen: Gotrek are:

Warhammer 40.000 - Mephiston: Kreuzug der Verdammnis
Warhammer 40.000 - Malleus: Eisenhorn
Warhammer 40.000 - Die Schwarze Festung
Weihnachten mit Susan Mallery
Grace: Die Biographie


**TfidfVectorizer from scikit-learn**

In [20]:
# Import TfidfVectorizer and the pairwise similarity helpers from scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Define a TF-IDF vectorizer with scikit-learn's default settings.
# No stop-word removal is applied here; the built-in English list could be
# enabled with the alternative below:
# tfidf = TfidfVectorizer(stop_words='english')

tfidf = TfidfVectorizer()

# Construct the TF-IDF matrix by fitting on, and transforming, the combined
# feature strings (one row per book).
tfidf_matrix = tfidf.fit_transform(df2["combined_features"])

# Uncomment to inspect the shape of tfidf_matrix:
# tfidf_matrix.shape

In [22]:
# Compute the cosine similarity matrix. TfidfVectorizer L2-normalises each
# row by default, so the plain dot product from linear_kernel equals the
# cosine similarity.
cosine_sim2 = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from book title to row index.
# Series.drop_duplicates() would compare the *values* (the row indices, which
# are all distinct) and therefore never removes a repeated title. Duplicates
# are dropped on the title index instead, keeping the first occurrence, so
# that indices[title] always yields a single scalar.
indices = pd.Series(df2.index, index=df2['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [33]:
# Function that takes a book title as input and outputs the most similar books
def get_recommendations(Title, cosine_sim2=cosine_sim2, top_n=5):
    """Return the titles of the books most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Exact title of the reference book; looked up in the module-level
        ``indices`` map.
    cosine_sim2 : 2-D array-like, optional
        Pairwise similarity matrix with one row/column per row of ``df2``.
        Defaults to the TF-IDF based matrix computed above.
    top_n : int, optional
        How many recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``df2``, most similar first.

    Raises
    ------
    KeyError
        If ``Title`` is not present in ``indices``.
    """
    # Get the row index of the book that matches the title
    idx = indices[Title]
    # A title that occurs more than once yields a Series of indices; use the
    # first occurrence so that exactly one row of the matrix is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other book with its similarity to the reference book. The
    # reference book is excluded by index rather than by "skip the first
    # element after sorting", which breaks when another row ties with it.
    sim_scores = [pair for pair in enumerate(cosine_sim2[idx]) if pair[0] != idx]

    # Sort the books by similarity score, best first (stable for ties)
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    # Row positions of the top_n most similar books
    book_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the top_n most similar books
    return df2['Title'].iloc[book_indices]

In [34]:
# Prompt for a title and display the TF-IDF based recommendations for it
# (the returned Series is rendered as the cell output).
get_recommendations(input("Enter the Book Title"))

# Example title to enter:
# Warhammer Age of Sigmar - Im Reich der Unbegrabenen: Gotrek

Enter the Book Title Warhammer Age of Sigmar - Im Reich der Unbegrabenen: Gotrek


638                 Warhammer 40.000 - Malleus: Eisenhorn
8       Warhammer 40.000 - Mephiston: Kreuzug der Verd...
1326              Warhammer 40.000 - Die Schwarze Festung
1314                 Warhammer 40.000 - Requiem Infernale
9                 Warhammer 40.000 - Hereticus: Eisenhorn
Name: Title, dtype: object

In [36]:
# from surprise import Reader, Dataset, SVD, evaluate
# reader = Reader()
# ratings = pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
# ratings.head()