In [None]:
import pandas as pd
import numpy as np

import nltk
import gensim
from gensim import corpora, models
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

import warnings
warnings.filterwarnings('ignore')

from IPython.display import clear_output
import timeit

import pickle

In [None]:
# Load the cleaned books table (fiction books only)
books = pd.read_csv('books_cleaned_v4.csv')
# Load the cleaned reviews table
reviews = pd.read_csv('reviews_cleaned_short.csv')
# Load the document-topic table saved with the LDA model
# (presumably one row of topic weights per book -- confirm against the training notebook)
df_document_topic = pd.read_csv('LDA50kfictionnewclean/df_document_topic.csv')

In [None]:
# Preview the first rows of the books table
books.head()

In [None]:
# (rows, columns) of the books table as loaded
books.shape

In [None]:
# Keep only the first 50,000 books.
# NOTE(review): presumably this matches the documents the model in
# 'LDA50kfictionnewclean' was fitted on -- confirm the row order is the same.
# .copy() makes this an independent frame, so the columns added to `books`
# later are not written into a slice of the originally loaded table.
books = books.iloc[:50000].copy()

In [None]:
# Shape of the books table after truncation
books.shape

In [None]:
# Document-topic matrix: drop the 'Unnamed: 0' column read in from the CSV
# (presumably the index that was written when the file was saved).
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df_document_topic = df_document_topic.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [None]:
# Preview the document-topic table
df_document_topic.head()

In [None]:
# (rows, columns) of the document-topic table
df_document_topic.shape

### LDA Model 

In [None]:
#load vectorizer
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
# The context manager closes the file handle as soon as the object is loaded.
with open("LDA50kfictionnewclean/vectorizer.pk", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [None]:
#load trained LDA model
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
# The context manager closes the file handle as soon as the object is loaded.
with open("LDA50kfictionnewclean/lda_model.pk", "rb") as lda_model_file:
    lda_model = pickle.load(lda_model_file)

### Scale Number of Ratings and Average Rating for books 

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Min-max scale each rating column into its own new column.
# The one scaler is re-fitted per column, so afterwards it holds the fit of
# the last column processed ('rating-avg').
for source_col, scaled_col in (('rating-count', 'rating_count_scaled'),
                               ('rating-avg', 'avg_rating_scaled')):
    books[scaled_col] = scaler.fit_transform(books[[source_col]])

### Search - Cosine Similarity

In [None]:
import re
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer 
# Extra stop words on top of gensim's STOPWORDS (presumably blurb boilerplate
# such as "New York Times bestselling author").
# 'putlizer' is a misspelling; it is kept so the list stays a superset of the
# original, and the correctly spelled 'pulitzer' is added so the real word is
# filtered too.
# NOTE(review): confirm the training-time cleaning used the same stop-word list.
add_sw = ['new','york','times','bestseller','bestselling','author','prize','putlizer','pulitzer']
sw = STOPWORDS.union(set(add_sw))
# NOTE(review): eng_words is not referenced by any code visible in this notebook.
# Building it requires the nltk 'words' corpus to be available locally.
eng_words = set(nltk.corpus.words.words())

def clean_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    Steps: lower-case, replace HTML-like tags and every non-letter character
    with a space, drop stop words (the module-level ``sw`` set), lemmatize
    with WordNetLemmatizer, and drop tokens of length 1.

    Parameters
    ----------
    text : str
        Raw input. Any non-string value (e.g. the NaN/None pandas uses for a
        missing review) is treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' if nothing remains.
    """
    # Missing values are not strings and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub('<.*?>', ' ', text) #remove tags
    text = re.sub(r'[^a-zA-Z]',' ',text) #remove anything that is not a letter

    # Only letters and spaces remain, so a whitespace split yields the words.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(w) for w in text.split() if w not in sw]

    # Remove short words (single letters) and re-join.
    return ' '.join(w for w in lemmas if len(w) > 1)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import heapq
# NOTE(review): num_topics is not referenced by any code visible in this notebook --
# confirm whether it should reflect the loaded LDA model's topic count or can be removed.
num_topics = 2000
def search_books(text, number_books_recommend, popular = 1, min_rating_count = 100000):
    """Print the books whose topic mix is most similar to a free-text query.

    The query is cleaned with ``clean_text``, vectorized with the loaded
    ``vectorizer``, projected onto topics with ``lda_model`` and compared by
    cosine similarity against every row of ``df_document_topic``. Rows of
    ``df_document_topic`` are matched to rows of ``books`` by position, so
    both must have the same length and order.

    Parameters
    ----------
    text : str
        Free-text search query.
    number_books_recommend : int
        Maximum number of books to print.
    popular : int, default 1
        Any value other than 0 restricts the results to books with more than
        ``min_rating_count`` ratings.
    min_rating_count : int, default 100000
        Rating-count threshold used when ``popular`` is non-zero.

    Returns
    -------
    None
        Details of each recommended book and the run time are printed.
    """
    start = timeit.default_timer()

    #clean and vectorize the query, then infer its topic distribution
    cleaned_query = clean_text(text)
    query_vector = vectorizer.transform([cleaned_query])
    topic_probability = lda_model.transform(query_vector)

    #compute similarity; the result is one column per query, flattened to 1-D
    similarity_array = cosine_similarity(df_document_topic, topic_probability, dense_output=True)
    similarity = pd.Series(np.asarray(similarity_array).ravel(), index=books.index)

    #popular books - keep only books with more than min_rating_count ratings
    if popular != 0:
        similarity = similarity[books['rating-count'] > min_rating_count]

    #top-k selection avoids copying and fully sorting the whole books table
    top_matches = similarity.nlargest(number_books_recommend)

    #print recommended books
    for book_index, score in top_matches.items():
        print(books.loc[book_index,'title'])
        print(books.loc[book_index,'description_original'])
        print('ISBN:',books.loc[book_index,'isbn13'])
        print('Rating:',books.loc[book_index,'rating-avg'])
        print('Number of ratings:',books.loc[book_index,'rating-count'])
        print('Cosine Similarity:', score)
        print('Genre:', books.loc[book_index,'genres'])
        print('\n')

    stop = timeit.default_timer()
    print('Run time:', np.round((stop-start)/60, 2), "minutes")

In [None]:
# Example query: fantasy -- top 10 matches among popular books only
search_books('wizarding magic',10,popular = 1)

In [None]:
# Example query: thriller -- top 10 matches among popular books only
search_books('serial killer murder detective',10, popular = 1)

In [None]:
# Example query: dystopian science fiction -- top 10 matches among popular books only
search_books('dystopian end of world',10, popular = 1)

### Model Testing

In [None]:
# Books that have at least one review: keep rows of `books` whose isbn13
# also appears in the reviews table
books_reviews = books[books['isbn13'].isin(reviews['isbn13'])]
print('Number of books with reviews:', len(books_reviews))

In [None]:
def search_books_score(text, number_books_recommend):
    """Return the ISBNs of the books most similar to ``text``.

    Same retrieval as ``search_books`` but without the popularity filter and
    without printing: the text is cleaned, vectorized, projected onto topics
    and compared by cosine similarity with every row of
    ``df_document_topic``. Rows of ``df_document_topic`` are matched to rows
    of ``books`` by position, so both must have the same length and order.

    Parameters
    ----------
    text : str
        Text to search with (e.g. a review).
    number_books_recommend : int
        Maximum number of ISBNs to return.

    Returns
    -------
    list
        ``isbn13`` values of the best matches, most similar first.
    """
    #clean and vectorize the text, then infer its topic distribution
    cleaned_text = clean_text(text)
    text_vectorized = vectorizer.transform([cleaned_text])
    topic_probability = lda_model.transform(text_vectorized)

    #compute similarity; the result is one column per query, flattened to 1-D
    similarity_array = cosine_similarity(df_document_topic, topic_probability, dense_output=True)
    similarity = pd.Series(np.asarray(similarity_array).ravel(), index=books.index)

    #top-k selection avoids copying and fully sorting the whole books table,
    #which matters because this runs once per review during evaluation
    top_matches = similarity.nlargest(number_books_recommend)

    return books.loc[top_matches.index, 'isbn13'].tolist()

In [None]:
def test_df(df,number_books_recommend):
    start = timeit.default_timer()
    for x in range(0,len(df)):
        clear_output(wait=True)

        isbn = df.loc[x,'isbn13']
        text = df.loc[x,'review']
        predicted_isbn = search_books_score(text,number_books_recommend)
        df.at[x,'predicted_isbn'] = predicted_isbn

        stop = timeit.default_timer()

        print('Current progress: {} out of {} rows'.format(x+1,len(df)))
        print('Current run time:', np.round((stop-start)/60, 2), "minutes")
    
    print('Computing Score') 
    
    df['intersection'] = df.apply(lambda x: x['isbn13'] in x['predicted_isbn'] ,axis = 1)
    df['score'] = np.where(df['intersection'] == False,0,1)
    
    accuracy = df['score'].sum() / len(df) * 100
    print('Accuracy: {} %'.format(accuracy))
    
    return df

In [None]:
def show_books(df,correct = 1, n_examples = 3):
    """Print example reviews together with the book each was written for.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``test_df``; must contain ``isbn13``, ``review`` and
        ``score`` columns.
    correct : int, default 1
        1 shows rows with ``score == 1`` (correctly predicted); any other
        value shows rows with ``score == 0`` (wrongly predicted).
    n_examples : int, default 3
        Maximum number of rows to show. Fewer are printed when fewer rows
        match, instead of raising IndexError.

    Returns
    -------
    None
        The review and the title/description looked up in the module-level
        ``books`` table are printed.
    """
    if correct == 1:
        wanted_score = 1
        target_header = '\n-----Target book------'
    else:
        wanted_score = 0
        target_header = '\n-----Target Book------'

    # Slicing the index caps the count without failing on short results.
    selected_rows = df[df['score'] == wanted_score].index[:n_examples]

    for row_label in selected_rows:
        review_isbn = df.loc[row_label,'isbn13']
        review = df.loc[row_label,'review']

        # Look the reviewed book up once; .values gives arrays because
        # several rows of `books` could share the ISBN.
        matching_books = books[books['isbn13'] == review_isbn]
        description = matching_books['description_original'].values
        title = matching_books['title'].values

        print('\n-----Review-----')
        print('Book title for review:', title)
        print('Review:')
        print(review)

        print(target_header)
        print('Title:',title)
        print(description)
        

### Model Testing - Fantasy Books

In [None]:
# Reviewed books whose genres string contains 'fantasy'
fantasy = books_reviews[books_reviews['genres'].str.contains('fantasy',na = False)]
print('Number of fantasy books with at least 1 review:', len(fantasy))

# All reviews written for those books
fantasy_reviews = reviews[reviews['isbn13'].isin(fantasy['isbn13'])]
print('Number of reviews for fantasy books:', len(fantasy_reviews))
# Fixed-seed sample of up to 200 reviews; capping at the number available
# avoids a ValueError when fewer than 200 reviews exist.
fantasy_reviews_sample = fantasy_reviews.sample(min(200, len(fantasy_reviews)), random_state = 20).reset_index(drop = True)
fantasy_reviews_sample.shape

In [None]:
# Keep only the columns the evaluation needs and add an empty object-typed
# column that will hold each review's list of predicted ISBNs.
fantasy_reviews_sample = (
    fantasy_reviews_sample
    .loc[:, ['isbn13', 'review']]
    .assign(predicted_isbn = None)
    .astype({'predicted_isbn': object})
)

In [None]:
# Run the evaluation on the fantasy sample with the top 10 recommendations per review,
# then display the scored frame
test_fantasy = test_df(fantasy_reviews_sample,10)
test_fantasy

In [None]:
# Examples of fantasy reviews whose own book was among the predictions
show_books(test_fantasy,correct = 1)

In [None]:
# Examples of fantasy reviews whose own book was not among the predictions
show_books(test_fantasy,correct = 0)

### Model Testing - Thriller Books

In [None]:
# Reviewed books whose genres string contains 'thriller'
thriller = books_reviews[books_reviews['genres'].str.contains('thriller',na = False)]
print('Number of thriller books with at least 1 review:', len(thriller))

# All reviews written for those books
thriller_reviews = reviews[reviews['isbn13'].isin(thriller['isbn13'])]
print('Number of reviews for thriller:', len(thriller_reviews))
# Fixed-seed sample of up to 200 reviews; capping at the number available
# avoids a ValueError when fewer than 200 reviews exist.
thriller_reviews_sample = thriller_reviews.sample(min(200, len(thriller_reviews)), random_state = 20).reset_index(drop = True)
thriller_reviews_sample.shape

In [None]:
# Keep only the columns the evaluation needs and add an empty object-typed
# column that will hold each review's list of predicted ISBNs.
thriller_reviews_sample = (
    thriller_reviews_sample
    .loc[:, ['isbn13', 'review']]
    .assign(predicted_isbn = None)
    .astype({'predicted_isbn': object})
)

In [None]:
# Run the evaluation on the thriller sample with the top 10 recommendations per review,
# then display the scored frame
test_thriller = test_df(thriller_reviews_sample,10)
test_thriller 

In [None]:
# Examples of thriller reviews whose own book was among the predictions
show_books(test_thriller ,correct = 1)

In [None]:
# Examples of thriller reviews whose own book was not among the predictions
show_books(test_thriller ,correct = 0)

### Model Testing - Science Fiction Books

In [None]:
# Reviewed books whose genres string contains 'science-fiction'
science_fiction = books_reviews[books_reviews['genres'].str.contains('science-fiction',na = False)]
print('Number of science fiction books with at least 1 review:', len(science_fiction))

# All reviews written for those books
sf_reviews = reviews[reviews['isbn13'].isin(science_fiction['isbn13'])]
print('Number of reviews for science fiction books:', len(sf_reviews))
# Fixed-seed sample of up to 200 reviews; capping at the number available
# avoids a ValueError when fewer than 200 reviews exist.
sf_reviews_sample = sf_reviews.sample(min(200, len(sf_reviews)), random_state = 20).reset_index(drop = True)
sf_reviews_sample.shape

In [None]:
# Keep only the columns the evaluation needs and add an empty object-typed
# column that will hold each review's list of predicted ISBNs.
sf_reviews_sample = (
    sf_reviews_sample
    .loc[:, ['isbn13', 'review']]
    .assign(predicted_isbn = None)
    .astype({'predicted_isbn': object})
)

In [None]:
# Run the evaluation on the science fiction sample with the top 10 recommendations per review,
# then display the scored frame
test_sf = test_df(sf_reviews_sample,10)
test_sf

In [None]:
# Examples of science fiction reviews whose own book was among the predictions
show_books(test_sf ,correct = 1)

In [None]:
# Examples of science fiction reviews whose own book was not among the predictions
show_books(test_sf ,correct = 0)

### Model Testing - Romance Books

In [None]:
# Reviewed books whose genres string contains 'romance'
romance = books_reviews[books_reviews['genres'].str.contains('romance',na = False)]
print('Number of romance books with at least 1 review:', len(romance))

# All reviews written for those books
romance_reviews = reviews[reviews['isbn13'].isin(romance['isbn13'])]
print('Number of reviews for romance books:', len(romance_reviews))
# Fixed-seed sample of up to 200 reviews; capping at the number available
# avoids a ValueError when fewer than 200 reviews exist.
romance_reviews_sample = romance_reviews.sample(min(200, len(romance_reviews)), random_state = 20).reset_index(drop = True)
romance_reviews_sample.shape

In [None]:
# Keep only the columns the evaluation needs and add an empty object-typed
# column that will hold each review's list of predicted ISBNs.
romance_reviews_sample = (
    romance_reviews_sample
    .loc[:, ['isbn13', 'review']]
    .assign(predicted_isbn = None)
    .astype({'predicted_isbn': object})
)

In [None]:
# Run the evaluation on the romance sample with the top 10 recommendations per review,
# then display the scored frame
test_romance = test_df(romance_reviews_sample,10)
test_romance

In [None]:
# Examples of romance reviews whose own book was among the predictions
show_books(test_romance ,correct = 1)

In [None]:
# Examples of romance reviews whose own book was not among the predictions
show_books(test_romance ,correct = 0)