In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the read-only Kaggle input directory.
# The unused sub-directory list gets an explicit name: binding `_` at the top
# level of a notebook replaces IPython's "last output" variable for the session.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/books-movies-reviews/books_films_reviews.sqlite3


## Document similarity with gensim
This notebook uses gensim's tf-idf model on a dataset of book and movie reviews to measure document similarity. Each "document" is the combined set of reviews for a single book or movie.

In [6]:
import sqlite3

# Load every cleaned book review, ordered by book_id, into a pandas DataFrame.
con = sqlite3.connect("/kaggle/input/books-movies-reviews/books_films_reviews.sqlite3")
df = pd.read_sql_query(sql="SELECT * from book_reviews_cleaned order by book_id", con=con)
# Important: some of the types in the dataset do not match, so coerce book_id
# to a numeric dtype to keep the ids comparable.
df = df.assign(book_id=pd.to_numeric(df["book_id"]))

In [13]:
import nltk
import gensim
import numpy as np
from nltk.tokenize import word_tokenize

In [7]:
# Preview the first five review rows.
df.head(5)

Unnamed: 0,index,book_id,review_id,review_text,id,rating_numeric,BookTitle,cleaned
0,273576,1,829021593,read harry potter they said itll be fun they ...,177953,5.0,harry potter and the half-blood prince,read harry potter said itll fun said childhood...
1,273577,1,737648743,i dont want to talk to anyonedont even look at...,177954,5.0,harry potter and the half-blood prince,dont want talk anyonedont even look mei expect...
2,273578,1,683662307,a 86 extraordinarynotes it dwells on the deli...,177955,5.0,harry potter and the half-blood prince,86 extraordinarynotes dwells delightfully mund...
3,273579,1,137941096,this remains one of my top favorite hp books t...,177956,5.0,harry potter and the half-blood prince,remains one top favorite hp book voldemort ori...
4,273580,1,2856577,i was first introduced to harry potter in a ch...,177957,5.0,harry potter and the half-blood prince,first introduced harry potter childrens lit cl...


This data frame has every book review in the dataset, one review per row. We want to join all of the reviews for each book into a single text document, so that there is one document per book.

In [10]:
# Build one document per book by joining all of its cleaned reviews with spaces.
# Select the "cleaned" column before aggregating so that each group is a plain
# Series of strings; applying a function to whole grouped sub-frames (including
# the grouping column) is deprecated in recent pandas versions.
# The result is a Series indexed by book_id.
book_reviews = df.groupby("book_id")["cleaned"].agg(" ".join)

In [12]:
# Show the combined review text of the first ten books.
book_reviews.iloc[:10]

book_id
1      read harry potter said itll fun said childhood...
2      interview jk rowling stephen fry settle import...
3      hear screaming expected illustration brought b...
5      weasley appreciation post one definitely favou...
6      dont mind cry eye way better remember remember...
34     never sad give revered book im sorry cant tell...
93     heidi swiss book originally published german 1...
218    book richard hudson used car dealer try write ...
228    absolutely idea book movie came back surface m...
295    someone recently asked review enjoyed writing ...
dtype: object

In [14]:
# Tokenize each book's combined review text into a list of word tokens.
docs = [word_tokenize(text) for text in book_reviews]

In [17]:
# Number of documents, then the number of tokens in the first document.
print(len(docs), len(docs[0]), sep="\n")

1290
7910


In [19]:
# Map every distinct token in the tokenized documents to an integer id.
dictionary = gensim.corpora.Dictionary(documents=docs)

In [20]:
# Convert each tokenized document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

The corpus object is a list of documents, each represented as a list of (token id, count) tuples.

In [21]:
# First five (token id, count) pairs of the first document.
corpus[0][:5]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]

In [22]:
# Fit a tf-idf model on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(corpus=corpus)

In [23]:
# Sanity check: the number of documents the tf-idf model was fitted on.
tfidf.num_docs

1290

An example of what the first doc looks like:

In [27]:
# Re-weight every bag-of-words document with the tf-idf model, then show the
# first ten (token, weight) pairs of the first document.
doc_vectors = tfidf[corpus]
# The loop variable is deliberately not called `id`: variables assigned in a
# cell are notebook globals, so that name would shadow the built-in id() in
# every later cell. The second value is a tf-idf weight, not a raw frequency.
for token_id, weight in doc_vectors[0][0:10]:
    print(dictionary[token_id], weight)

-all 0.00806463838675613
-at 0.008494604895732187
-dumbledore 0.012769799231622867
-dumbledores 0.01153399092344341
-fleur 0.012769799231622867
-ginny 0.01153399092344341
-harry 0.009575281096899315
-im 0.009062374307084495
-level 0.010811089405078772
-like 0.00806463838675613


In [42]:
# gensim's Similarity class would normally be the choice here, but on Kaggle it
# can't seem to write the index to the current directory:
# https://radimrehurek.com/gensim/similarities/docsim.html#gensim.similarities.docsim.Similarity
# Per the gensim docs, if the entire index fits in memory (~one million documents
# per 1GB of RAM), the MatrixSimilarity or SparseMatrixSimilarity classes can be
# used directly. They are simpler but do not scale as well: the entire index is
# kept in RAM, with no sharding.
sim = gensim.similarities.MatrixSimilarity(
    doc_vectors,
    num_features=len(dictionary),
    num_best=20,  # each query returns the 20 best-scoring documents
)

We now have a similarity index built from the tf-idf vectors. Here's an example of finding the documents most similar to a given book.

In [34]:
# Load the per-book metadata, ordered by book_id, then display the row for book id 34.
df_bookinfo = pd.read_sql_query(sql="SELECT * from books order by book_id", con=con)
df_bookinfo.loc[df_bookinfo["book_id"] == 34]

Unnamed: 0,book_id,title,year,avg_rating,rating_count,review_count,series,series_num,author,description,length,five_stars,four_stars,three_stars,two_stars,one_star,cover_image,standardized_rating,normalized_rating
5,34.0,The Fellowship of the Ring,1955,4.35,2068325.0,18185.0,The Lord of the Rings,1,J.R.R. Tolkien,Alternate Cover Edition ISBN 0618260269 (copyr...,398.0,1194095.0,549990.0,221452.0,60903.0,41885.0,https://images.gr-assets.com/books/1298411339l...,1.753561,0.783333


In [49]:
# Title of book id 34, selected with a single .loc call (row mask + column).
df_bookinfo.loc[df_bookinfo["book_id"] == 34, "title"]

5    The Fellowship of the Ring
Name: title, dtype: object

In [54]:
# book id 34 is index 5
test_doc = doc_vectors[5]
#print(book_reviews.loc[34])
print(sim[test_doc])

[(5, 0.9999998807907104), (129, 0.8213246464729309), (112, 0.744496762752533), (63, 0.5794501900672913), (142, 0.07914487272500992), (283, 0.061417847871780396), (226, 0.060370441526174545), (516, 0.05916726216673851), (491, 0.057614460587501526), (2, 0.05752812325954437), (932, 0.04596899822354317), (1181, 0.045834239572286606), (154, 0.045234933495521545), (320, 0.04481838643550873), (364, 0.044282007962465286), (1273, 0.04383333399891853), (65, 0.042100727558135986), (60, 0.04152218624949455), (115, 0.040946848690509796), (1183, 0.04084629938006401)]


In [55]:
# Translate each matched document position back to its book_id through
# book_reviews (the documents were built in that order), then look the title up
# by book_id. This avoids assuming that the rows of df_bookinfo line up
# one-to-one with the documents, which breaks if the books table contains a
# book that has no reviews.
titles_by_book_id = df_bookinfo.drop_duplicates("book_id").set_index("book_id")["title"]
for doc_index, score in sim[test_doc]:
    book_id = book_reviews.index[doc_index]
    print(titles_by_book_id.get(book_id, f"<no title for book_id {book_id}>"), score)

The Fellowship of the Ring 0.9999998807907104
The Return of the King 0.8213246464729309
The Two Towers 0.744496762752533
The Hobbit or There and Back Again 0.5794501900672913
The Book of Three 0.07914487272500992
In Country 0.061417847871780396
Beowulf 0.060370441526174545
The Notebook 0.05916726216673851
The Cement Garden 0.057614460587501526
Harry Potter and the Sorcerer's Stone 0.05752812325954437
Burton and Speke 0.04596899822354317
Poor Little Bitch Girl 0.045834239572286606
The Maltese Falcon 0.045234933495521545
The Dragon and the George 0.04481838643550873
Harry Potter and the Deathly Hallows 0.044282007962465286
Rebecca 0.04383333399891853
The Road 0.042100727558135986
The Wind in the Willows 0.04152218624949455
Harry Potter and the Chamber of Secrets 0.040946848690509796
Eine Liebe in Deutschland 0.04084629938006401


The similarity search found that the other Tolkien books were very similar. It also returned some other, much lower-scoring matches, including a few surprises such as The Notebook.