In [3]:
#Import all the dependencies
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import numpy as np
import re

from IPython.display import clear_output
import timeit

In [11]:
# Load the book catalogue; later cells read its 'description', 'title',
# 'description_original' and 'genres' columns.
books_csv_path = 'books_cleaned_v4.csv'
books = pd.read_csv(books_csv_path)

In [12]:
# Positional split of the catalogue: rows [0, 50000) are used for training,
# rows [50000, 60000) are held out for testing.
train_end, test_end = 50000, 60000

train = books[:train_end]
test = books[train_end:test_end]

# Report the size of each split.
for split_name, split_frame in (('Train', train), ('Test', test)):
    print('{} size:'.format(split_name), len(split_frame))

# Optional exports of the two splits (disabled).
#train.to_csv('doc2vec_v1/books_train.csv',index = False)
#test.to_csv('doc2vec_v1/books_test.csv',index = False)

Train size: 50000
Test size: 10000


In [13]:
def clean_text(text):
    """Lower-case ``text`` and replace every non-letter character with a space.

    Parameters
    ----------
    text : str or missing value
        Raw text. Anything that is not a ``str`` (for example the ``NaN`` /
        ``None`` pandas uses for a missing description) is treated as empty
        text instead of raising ``AttributeError``.

    Returns
    -------
    str
        The lower-cased text with each character outside ``a-z`` replaced by
        a single space. Runs of spaces are not collapsed.
    """
    if not isinstance(text, str):
        return ''
    # After lower-casing no 'A-Z' can remain, so matching on a-z alone is enough.
    return re.sub(r'[^a-z]', ' ', text.lower())
    
# Normalise every description in the full catalogue.
books['description'] = books['description'].apply(clean_text)

# train/test were sliced from `books` before this cleaning step ran, so
# re-derive them here (same row ranges) to guarantee both splits carry the
# cleaned descriptions.
train = books[:50000]
test = books[50000:60000]

In [14]:
# Pull out the description column of each split; these Series feed the
# Doc2Vec training and inference cells below.
train_description, test_description = train['description'], test['description']

In [15]:
# Display the training descriptions (pandas abbreviates the long Series).
train_description

0        major motion picture starring amanda stenberg ...
1        deeply moving original dealing material encoun...
2        acclaimed eagerly anticipated fourth thriller ...
3        epicene widely studied johnson play brilliantl...
4        little critter class going critterville museum...
                               ...                        
49995    change die option available planet jeep centur...
49996    captain mackenzie calhoun faced incredible odd...
49997    world divided flier non flier far able fly sac...
49998    doubt ll bigger insect gabby nichols putting s...
49999    mysterious murder dystopian future lead novice...
Name: description, Length: 50000, dtype: object

In [16]:
# Display the held-out test descriptions (pandas abbreviates the long Series).
test_description

50000    conversation shift debate existence global war...
50001    born agrarian ghetto dickens southern outskirt...
50002    length novel novella collected time volume sev...
50003    elegance wealth privilege politics extravaganc...
50004    kit kenyon rate hostage negotiator noah lamber...
                               ...                        
59995    year old winnie willis way horse gentle wildes...
59996    winner      obie playwriting critic pick invas...
59997    installment dci daley series packed accurate p...
59998    walk tombstone star liam neeson unlicensed pri...
59999    sensational thriller richard amp judy lie save...
Name: description, Length: 10000, dtype: object

In [13]:
# Wrap each training description as a TaggedDocument: the tokenised text is
# the word list, and the tag is the description's position in
# train_description, stored as a string.
tagged_data = [
    TaggedDocument(words=word_tokenize(description), tags=[str(doc_position)])
    for doc_position, description in enumerate(train_description)
]

In [None]:
features = 2000
epochs = 5

model = Doc2Vec(size=features,
                min_count=1,
                dm =1, #distributed memory, preserves word order 
                epochs = epochs) 
#Build vocab
model.build_vocab(tagged_data)

#Train model 
print('Training Model...')
%time model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

#model.save("doc2vec_v1/d2v_v1.model")
print("Model Saved")

In [4]:
#Load model
#model= Doc2Vec.load("doc2vec_v1/d2v_v1.model")

In [22]:
#document vector for TEST set
# Infer one vector per held-out description with the trained model.
#
# The vectors are collected in a list and turned into a DataFrame once at the
# end. Growing a DataFrame with `.append` inside the loop copies the whole
# frame on every iteration (quadratic time) and `DataFrame.append` no longer
# exists in pandas 2.x.
PROGRESS_EVERY = 500                    # refresh the progress display every N documents
total_test = len(test_description)

inferred_vectors = []
start = timeit.default_timer()
for done, description_text in enumerate(test_description, start=1):
    inferred_vectors.append(model.infer_vector(word_tokenize(description_text)))

    # Redrawing the output on every document is wasted work; refresh
    # periodically and once more on the final document.
    if done % PROGRESS_EVERY == 0 or done == total_test:
        clear_output(wait=True)
        stop = timeit.default_timer()
        print('Computing for: {} out of {} books'.format(done, total_test))
        print('Current run time:', np.round((stop-start)/60, 2), "minutes")

# One row per test description, one column per vector dimension. The width
# comes from the model rather than a hard-coded 2000, and the reshape also
# yields a well-formed empty frame when there are no test descriptions.
# Rows are indexed 0..n-1, and the arrays are stacked directly instead of
# being converted through Python lists.
document_vector_test = pd.DataFrame(
    np.asarray(inferred_vectors).reshape(-1, model.vector_size)
)

#document_vector_test.to_csv('doc2vec_v1/vector_test.csv',index = False)

Computing for: 10000 out of 10000 books
Current run time: 32.2 minutes


In [5]:
## doc_vectors
# Load the document-vector array from disk (presumably written next to the
# saved Doc2Vec model -- verify the file exists before running) and show it.
doc_vectors_path = 'doc2vec_v1/d2v_v1.model.docvecs.vectors_docs.npy'
doc_vectors = np.load(doc_vectors_path)
doc_vectors

array([[-0.01326105, -0.00677363, -0.00065993, ...,  0.00704421,
         0.00545914,  0.00709073],
       [-0.01312576, -0.00297036,  0.00069041, ..., -0.0133282 ,
        -0.01843835, -0.01881793],
       [ 0.0069616 ,  0.00108923, -0.00911423, ...,  0.00695539,
         0.01784323,  0.01078568],
       ...,
       [-0.02609154,  0.00399439,  0.00015313, ..., -0.00907762,
        -0.0138525 , -0.01453141],
       [-0.01182391,  0.00527959,  0.00335627, ...,  0.00764601,
         0.00775823,  0.00235519],
       [-0.01195882, -0.01418764, -0.01553512, ..., -0.01371097,
         0.0035904 , -0.00606263]], dtype=float32)

In [80]:
# Wrap the loaded vector array in a DataFrame: one row per document, one
# column per vector dimension.
df_doc_vectors = pd.DataFrame(data=doc_vectors)

In [81]:
# Display the document-vector frame (pandas abbreviates rows and columns).
df_doc_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,-0.013261,-0.006774,-0.000660,-0.000293,0.009832,0.010535,0.012721,-0.021488,0.020089,-0.007288,...,0.000458,-0.010699,-0.011727,-0.002300,-0.000770,-0.017063,-0.007267,0.007044,0.005459,0.007091
1,-0.013126,-0.002970,0.000690,0.012485,0.025898,0.012997,-0.005661,-0.018428,0.002324,-0.004618,...,0.001467,-0.002209,0.008576,0.007704,-0.008213,0.007392,-0.008348,-0.013328,-0.018438,-0.018818
2,0.006962,0.001089,-0.009114,-0.016885,0.010773,-0.002402,-0.005502,-0.002807,-0.006670,-0.003398,...,-0.000081,-0.004995,-0.011408,-0.018035,-0.018796,-0.023806,0.002141,0.006955,0.017843,0.010786
3,-0.025949,0.011242,0.003029,0.011299,0.057221,0.019470,-0.006770,-0.018691,0.012505,0.003356,...,-0.020105,-0.015469,-0.004803,0.003606,-0.011103,-0.030019,-0.017571,0.013098,-0.016627,-0.012313
4,0.006467,-0.003168,0.008910,0.001557,-0.007594,0.001969,0.000561,0.006724,0.002906,-0.001929,...,-0.002134,0.004945,0.002511,-0.001814,0.005236,0.002875,0.005892,-0.001138,-0.005302,0.007656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-0.008076,-0.003420,-0.018366,0.008141,-0.011269,-0.003109,-0.006503,-0.028337,0.003290,-0.009394,...,0.002384,-0.001133,-0.021003,-0.003154,-0.015112,0.006887,-0.027732,0.005829,-0.007524,-0.008869
49996,-0.009236,0.007370,-0.010313,-0.004612,-0.006230,0.006149,0.001094,-0.006059,0.011299,-0.024790,...,-0.000328,-0.002762,0.000538,0.000148,-0.010283,0.000737,0.001922,0.010913,-0.004503,-0.011972
49997,-0.026092,0.003994,0.000153,-0.000263,0.017731,0.022106,-0.006670,-0.028385,0.008884,-0.021834,...,0.003421,-0.012852,-0.003111,0.007272,-0.010175,-0.000518,-0.026980,-0.009078,-0.013853,-0.014531
49998,-0.011824,0.005280,0.003356,-0.004782,0.008378,0.004838,-0.010237,-0.014871,0.015584,-0.009218,...,-0.007070,-0.006861,0.000704,-0.010008,-0.006082,-0.009305,-0.011328,0.007646,0.007758,0.002355


In [82]:
#df_doc_vectors.to_csv('doc2vec_v1/vector_train.csv',index = False)

### Find similar documents 

In [9]:
def _print_book_details(row_label):
    """Print the title, original description and genres of one `train` row."""
    for column in ('title', 'description_original', 'genres'):
        print(train.loc[row_label, column])


def similar_documents(book_index, topn=3):
    """Print a training book followed by its most similar training books.

    Similarity comes from the trained Doc2Vec model's document vectors
    (``model.docvecs.most_similar``).

    Parameters
    ----------
    book_index : int
        Row label of the query book in ``train``; also passed to the model
        to identify the query document.
    topn : int, optional
        Number of neighbours to print. Defaults to 3, which reproduces the
        original fixed "Rank 1..3" output.

    Returns
    -------
    None
        Everything is written to standard output. If the model returns fewer
        than ``topn`` neighbours, only those are printed.
    """
    similar_doc = model.docvecs.most_similar([book_index], topn=topn)

    print('Book Searched')
    _print_book_details(book_index)

    print('\nMost Similar Books:')

    for position, (tag, _similarity) in enumerate(similar_doc, start=1):
        # Tags were created as str(row position) when the corpus was built,
        # so converting back to int gives the row label in `train`.
        rank = int(tag)
        print('\n----------Rank {}------------'.format(position))
        print(rank)
        _print_book_details(rank)

In [1]:
# # thriller 
# similar_documents(108)

In [2]:
# # romance 
# similar_documents(31000)