# TF-IDF for Book Tags
Term Frequency - Inverse Document Frequency (TF-IDF) is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

### import requirements

In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')
from io import BytesIO

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import re
import string
import random
import requests
from collections import Counter
from PIL import Image

In [2]:
import sklearn
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.utils import plot_model
from keras.layers import Flatten, Embedding
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers.recurrent import LSTM
from keras.layers import Concatenate, Dot
from keras.optimizers import Adam

### load dataset

my original dataset

In [3]:
df_ratings = pd.read_csv('books/Ratings.csv')
df_ratings.columns = ['uid', 'isbn', 'rating']
df_ratings.head(3)

Unnamed: 0,uid,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [4]:
# df_books = pd.read_csv('books/Books.csv')
# df_books.head(3)

In [5]:
# df_books.shape

the dataset with book tags i want to merge with

source: [Good Reads dataset](https://www.kaggle.com/zygmunt/goodbooks-10k?select=book_tags.csv) obtained from Kaggle

In [6]:
# books_df = pd.read_csv('books/books_10k.csv')
# books_df.head(3)

In [7]:
# booktags_df = pd.read_csv('books/book_tags.csv')
# booktags_df.head(3)

In [8]:
# tags_df = pd.read_csv('books/tags.csv')
# tags_df.head(3)

In [9]:
# booktags_df = booktags_df.merge(tags_df, how='outer', on='tag_id')
# booktags_df.head(3)

In [10]:
# booktags_df = booktags_df.merge(books_df, how='outer', 
#                                 left_on='goodreads_book_id',
#                                 right_on='book_id')
# booktags_df.head(3)

In [11]:
# booktags_df = booktags_df[['tag_id', 'tag_name', 'isbn']]

In [12]:
# booktags_df.shape

merge the two datasets!

In [13]:
# df_books = df_books.merge(booktags_df, left_on='ISBN', right_on='isbn')
# df_books.head(3)

In [14]:
# df_books.shape

In [15]:
# df_books = df_books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'tag_name']]
# df_books.columns = ['isbn', 'title', 'author', 'year', 'publisher', 'tag_name']
# df_books.head(3)

In [16]:
#df_books.to_csv('books/Books_tags.csv')

In [17]:
booktags_df = pd.read_csv('books/Books_tags.csv', index_col=0)
booktags_df.shape

(39800, 6)

In [18]:
df_books = booktags_df.drop_duplicates(subset=['isbn'])
df_books.shape

(398, 6)

In [19]:
num_tags = len(set(booktags_df['tag_name']))
num_books = len(set(booktags_df['isbn']))
num_auths = len(set(booktags_df['author']))
print(f'There are {num_books} books from {num_auths} authors \
labelled with {num_tags} tags in the dataset.')

There are 398 books from 346 authors labelled with 6096 tags in the dataset.


## calculate similarity

calculate similarities between authors

In [20]:
tf_auth = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),
                     min_df=0, stop_words='english')

tfidf_matrix_auth = tf_auth.fit_transform(df_books['author'])

cosine_sim_auth = linear_kernel(tfidf_matrix_auth, tfidf_matrix_auth)

In [21]:
#cosine_sim_auth

In [22]:
titles = df_books['title']
indices = pd.Series(df_books.index, index=df_books['title'])


def authors_recom(title, n):
    '''
    get book recommendations based on author similarities
    '''
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_auth[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]
    book_indices = [i[0] for i in sim_scores]
    
    return titles.iloc[book_indices]

In [23]:
# authors_recom('Howl's Moving Castle', 10)
isbn1 = df_books.loc[df_books['isbn']=='006441034X']
print(isbn1['title'])

15700    Howl's Moving Castle
Name: title, dtype: object


calculate similarities between book tags

In [24]:
tf_tag = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),
                         min_df=0, stop_words='english')

tfidf_matrix_tag = tf_tag.fit_transform(booktags_df['tag_name'])

cosine_sim_tag = linear_kernel(tfidf_matrix_tag, tfidf_matrix_tag)

In [25]:
#cosine_sim_tag

In [26]:
def tags_recom(title, n):
    '''
    get book recommendations based on books tags similarities
    '''
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_tag[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    book_indices = [i[0] for i in sim_scores]
    
    return booktags_df.loc[book_indices]['title'].drop_duplicates()[:n]

In [27]:
tags_recom('Howl\'s Moving Castle', 10)

0                           Corelli's Mandolin : A Novel
100                                  Atonement : A Novel
200             Preludes and Nocturnes (Sandman, Book 1)
300        The League of Extraordinary Gentlemen, Vol. 1
400                                             Affinity
500    Prozac Nation: Young and Depressed in America ...
600                                             The Firm
700                                 The Tortilla Curtain
800                                         Four Blondes
900                                     Chasing the Dime
Name: title, dtype: object

try author and tags together similarity

In [33]:
booktags_df['combined'] = pd.Series(booktags_df[['author','tag_name']]
                                .fillna('').values.tolist()).str.join(' ')

In [34]:
tf_combined = TfidfVectorizer(analyzer='word',ngram_range=(1, 2)
                              ,min_df=0, stop_words='english')

tfidf_matrix_combined = tf_combined.fit_transform(booktags_df['combined'])

cosine_sim_combined = linear_kernel(tfidf_matrix_combined,
                                    tfidf_matrix_combined)

In [35]:
def combined_recom(title, n):
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_combined[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]
    book_indices = [i[0] for i in sim_scores]
    
    return titles.iloc[book_indices]

In [36]:
combined_recom('Howl\'s Moving Castle', 10)

0                            Corelli's Mandolin : A Novel
500     Prozac Nation: Young and Depressed in America ...
2000                               Snow Falling on Cedars
4000                                             The Reef
4300             What Looks Like Crazy On An Ordinary Day
5100                                          I Am Legend
7000                                  TENDER IS THE NIGHT
1000    Demian. Die Geschichte von Emil Sinclairs Jugend.
2900    Chicken Soup for the Teenage Soul (Chicken Sou...
9400            Endurance: Shackleton's Incredible Voyage
Name: title, dtype: object

## check recommended results with dataset

In [37]:
df_ratings = df_ratings.merge(df_books, on='isbn')[['uid', 'isbn', 'rating', 'title']]

In [38]:
df_ratings[df_ratings['isbn']=='006441034X']

Unnamed: 0,uid,isbn,rating,title
9842,15670,006441034X,8,Howl's Moving Castle
9843,26621,006441034X,8,Howl's Moving Castle
9844,49889,006441034X,9,Howl's Moving Castle
9845,87938,006441034X,8,Howl's Moving Castle
9846,102642,006441034X,4,Howl's Moving Castle
9847,111174,006441034X,0,Howl's Moving Castle
9848,203910,006441034X,10,Howl's Moving Castle
9849,252953,006441034X,9,Howl's Moving Castle
9850,264312,006441034X,10,Howl's Moving Castle


In [39]:
df_ratings[df_ratings['uid']==111174]

Unnamed: 0,uid,isbn,rating,title
753,111174,014023313X,0,The Stone Diaries
2663,111174,067976402X,0,Snow Falling on Cedars
4876,111174,1576737330,0,The Prayer of Jabez: Breaking Through to the B...
6123,111174,038531258X,0,Cold Sassy Tree
6246,111174,037570504X,0,"Breath, Eyes, Memory"
7803,111174,031242227X,0,Running with Scissors: A Memoir
8666,111174,038070952X,0,Ramona and Her Mother (Ramona Quimby (Paperback))
9246,111174,038000321X,0,Alive : The Story of the Andes Survivors (Avon...
9677,111174,1878424114,0,The Seven Spiritual Laws of Success: A Practic...
9847,111174,006441034X,0,Howl's Moving Castle


In [40]:
bought = list(df_ratings[df_ratings['uid']==111174]['title'])

In [41]:
combined = list(combined_recom('Howl\'s Moving Castle', 10))

In [43]:
tags = list(tags_recom('Howl\'s Moving Castle', 10))

In [44]:
set(bought).intersection(set(combined))

{'Snow Falling on Cedars'}

In [46]:
set(bought).intersection(set(tags))

{'Breath, Eyes, Memory'}

In [47]:
set(combined).intersection(set(tags))

{"Corelli's Mandolin : A Novel",
 'Prozac Nation: Young and Depressed in America : A Memoir'}

Becuase the dataset with book tags didn't have that much overlap with our dataset, calculating recommendation accuracy is difficult.

However, from a quick glance checking with a user who bought a lot of books, we had a 1 out of 10 (`10%`) overlap for both using book tags similarity and using book tags and authors combined similarity.

It is also interesting to note that combined similarity recommends quite different results than using tags alone. 