In [16]:
# To store the data
import pandas as pd

# To do linear algebra
import numpy as np

# To create plots
import matplotlib.pyplot as plt

# To create interactive plots
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# To shift lists
from collections import deque

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# To use recommender systems
import surprise as sp
from surprise.model_selection import cross_validate

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
from scipy.sparse import coo_matrix

# To stack sparse matrices
from scipy.sparse import vstack

In [4]:
# Load the de-duplicated reviews, then discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier export — confirm).
ratings = pd.read_csv('/media/einhard/Seagate Expansion Drive/3380_data/data/reviews_dedup.csv')
ratings = ratings.drop(columns=['Unnamed: 0'])

In [5]:
# Load the de-duplicated book metadata, then discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier export — confirm).
books = pd.read_csv('/media/einhard/Seagate Expansion Drive/3380_data/data/books_dedup.csv')
books = books.drop(columns=['Unnamed: 0'])

# Using cosine similarity on TF-IDF vectors

In [8]:
# Display the books DataFrame as the cell output (pandas truncates the
# rendering to the first and last rows for large frames).
books

Unnamed: 0,title,author,average_rating,ratings_count,text_reviews_count,description,shelf_1,book_id,weighted_score,name
0,Poet Of The Wrong Generation,6572997.0,4.91,425.0,348.0,"""It's not that I don't love you, and my tears ...",music,31675691,4.805233,Lonnie Ostrow
1,"Words of Radiance, Part 2 (The Stormlight Arch...",38550.0,4.82,1661.0,123.0,Brandon Sanderson's Stormlight Archive moves i...,fantasy,21100112,4.793545,Brandon Sanderson
2,"Driven Collection (Driven, #1-3.5)",7047863.0,4.87,537.0,67.0,If you are new to this NYT Best Selling series...,favorites,26252816,4.788629,K. Bromberg
3,Take Me Home Yearbook,6422617.0,4.82,1365.0,34.0,,one-direction,17169204,4.788011,Liam Payne
4,Evolve: 2 Minute Wisdom,2998860.0,4.93,304.0,21.0,"""This 2 minute wisdom of 'Evolve' is like a in...",currently-reading,20492657,4.786597,Radhanath Swami
...,...,...,...,...,...,...,...,...,...,...
401985,The Spiders,448306.0,2.02,1100.0,180.0,,novels,6668764,2.102379,D'Arcy Adrian-Vallance
401986,Melanie's Marvelous Measles,6857453.0,1.25,110.0,58.0,Melanie's Marvelous Measles takes children on ...,never,17152891,2.082722,Stephanie Messenger
401987,Coming Out Straight: Understanding and Healing...,15306662.0,1.73,295.0,97.0,"Richard Cohen, a former homosexual, now marrie...",will-never-read,2574850,2.046625,Richard Cohen
401988,The Girl with the Solar Eyes (The Tale of Onor...,8146489.0,1.39,148.0,13.0,"WARNING: This book contains sex, violence, gri...",fantasy,23511895,2.027553,Dylan Saccoccio


In [9]:
# Replace missing shelf labels with an empty string so that the following
# dropna() does not discard rows merely because `shelf_1` is missing.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# attribute-accessed column: that chained in-place form operates on an
# intermediate Series and does not reliably update `books` (it warns in
# pandas 2.x and is a no-op under Copy-on-Write).
books['shelf_1'] = books['shelf_1'].fillna("")

In [11]:
# Keep only the rows that have no missing value in any column.
books = books.dropna(axis=0, how='any')

In [12]:
# Peek at the first five book descriptions.
books['description'].head(n=5)

0    "It's not that I don't love you, and my tears ...
1    Brandon Sanderson's Stormlight Archive moves i...
2    If you are new to this NYT Best Selling series...
4    "This 2 minute wisdom of 'Evolve' is like a in...
5    Join Calvin and Hobbeson all their adventures ...
Name: description, dtype: object

In [22]:
# Fetch the NLTK resources used by the text preprocessing in this notebook.
import nltk
# Tokenizer models; preprocess_sentences calls nltk.word_tokenize.
nltk.download('punkt')
# Part-of-speech tagger model; preprocess_sentences calls nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# WordNet corpus; a WordNetLemmatizer is created in a later cell.
# This call is the cell's last expression, so its return value is displayed.
nltk.download('wordnet') 

[nltk_data] Downloading package punkt to /home/einhard/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/einhard/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/einhard/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [25]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Shared objects used by the text preprocessing function.
lemmatizer = WordNetLemmatizer()
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/einhard/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [30]:
def preprocess_sentences(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is lower-cased, tokenised with ``nltk.word_tokenize`` and
    part-of-speech tagged with ``nltk.pos_tag``. Tokens whose tag is one of
    the verb tags are lemmatised as verbs; every other token is lemmatised
    with the lemmatiser's default part of speech. A lemma is kept only if it
    is purely alphabetic and is not in ``stop_words``.

    Relies on the notebook-level ``lemmatizer`` and ``stop_words`` objects.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (an empty string if no
        token survives the filtering).
    """
    VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

    # pos_tag returns (token, tag) pairs in token order, so the pairs can be
    # iterated directly instead of indexing the tag list by position.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text.lower()))

    kept_lemmas = []
    for word, tag in tagged_tokens:
        if tag in VERB_CODES:
            lemma = lemmatizer.lemmatize(word, 'v')
        else:
            lemma = lemmatizer.lemmatize(word)
        if lemma.isalpha() and lemma not in stop_words:
            kept_lemmas.append(lemma)

    # No contraction expansion ("n't" -> " not", "'s" -> " is", ...) is done
    # on the joined string: every kept lemma passed str.isalpha(), so the
    # result cannot contain an apostrophe and such replacements never match.
    return ' '.join(kept_lemmas)

In [35]:
# Show only the rows that already have a processed description.
# NOTE(review): the cell that creates `description_processed` is not visible
# in this part of the notebook — confirm it runs before this one.
books[books['description_processed'].notna()]

Unnamed: 0,title,author,average_rating,ratings_count,text_reviews_count,description,shelf_1,book_id,weighted_score,name,description_processed
2911,How to Seize a Dragon's Jewel (How to Train Yo...,23894.0,4.46,2387.0,133.0,"The Dragon Rebellion has begun, bringing the V...",fantasy,15841027,4.448812,Cressida Cowell,dragon rebellion begin bring viking darkest ho...
7953,Because of the Rain,59414.0,4.40,575.0,45.0,ANNA'S LIFE CHANGES FOREVER ON ONE HORRIFIC NI...,to-read,20981415,4.361177,Deborah Raney,anna life change forever one horrific night as...
14793,"Liberated Parents, Liberated Children: Your Gu...",53578.0,4.35,379.0,49.0,The Companion Volume to How to Talk So Kids Wi...,parenting,164276,4.299267,Adele Faber,companion volume talk kid listen listen kid ta...
17657,"This Side of the Grave (Night Huntress, #5)",669810.0,4.28,46590.0,1978.0,Danger waits on both sides of the grave.\nWith...,vampires,6871617,4.279609,Jeaniene Frost,danger wait side grave mysterious disappearanc...
26204,Rusty Hinges (Metal Boxes #3),1016187.0,4.26,630.0,24.0,"Stone's fist, secure in the combat suit and hi...",to-read,30056458,4.234611,Alan Black,stone fist secure combat suit hide camouflage ...
...,...,...,...,...,...,...,...,...,...,...,...
386453,The Vampire's Beautiful Daughter,81037.0,3.34,96.0,18.0,"Life isn't easy for Johnny Shapiro, despite hi...",vampires,793618,3.536819,S.P. Somtow,life easy johnny shapiro despite mother succes...
390128,Robin: Days of Fire and Madness,12444.0,3.23,77.0,7.0,Written by Bill Willingham Art and cover by Sc...,comics,707856,3.499571,Bill Willingham,write bill willingham art cover scott mcdaniel...
393371,A British Bride by Agreement,1307916.0,3.12,68.0,10.0,"Widowed and far from her native England, Emma ...",kindle,18044026,3.456742,Therese Stenzel,widow far native england emma bank devastate r...
400169,Kick Your Fat in the Nuts,6420217.0,3.19,551.0,42.0,Not only will Tony have you laughing out loud ...,currently-reading,17303400,3.250292,T.C. Hale,tony laugh loud reveal secret behind weight lo...


In [19]:
# Build the TF-IDF representation of the raw `description` column, removing
# English stop words and stripping accents during vectorisation.
tfidf = TfidfVectorizer(strip_accents='unicode', stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['description'])