In [2]:
import gensim

LabeledSentence = gensim.models.doc2vec.LabeledSentence

import numpy as np
import pandas as pd
from ast import literal_eval
from spacy.en import English
from spacy.parts_of_speech import NOUN
from spacy.parts_of_speech import VERB
# Instantiate the spaCy English pipeline used by this notebook.
nlp = English()
# `prob` attribute of every lexeme in the vocabulary, in ascending order.
probs = sorted(lex.prob for lex in nlp.vocab)

In [3]:
import pandas as pd

In [4]:
# Load the three sentence tables from CSV files in the current working directory:
# the train split, the test split, and the Amazon table whose `sentence` column
# is used further down as the `unsup_sentences` corpus.
trainDF = pd.read_csv('sentenceTrainDF.csv')
testDF = pd.read_csv('sentenceTestDF.csv')
unsup_amazonDF = pd.read_csv('amazon_sentenceDF_spacy.csv')

In [5]:
# Display (rows, columns) of the Amazon sentence table.
unsup_amazonDF.shape

(1045212, 5)

In [6]:
# Training split: raw text plus its labels.
x_train = trainDF.text.values
# Each `categorys` cell is a string holding a Python literal; literal_eval parses it back.
y_train_text = list(map(literal_eval, trainDF.categorys.tolist()))
# NOTE(review): `mlb` is not defined in any cell shown before this one, so this line
# fails on a fresh kernel (Restart & Run All). Presumably it is a fitted multi-label
# binarizer left over from a deleted cell - confirm and define it above.
y_train = mlb.transform(y_train_text)

# Test split: same treatment as the training split (same `mlb` caveat applies).
x_test = testDF.text.values
y_test_text = list(map(literal_eval, testDF.categorys.tolist()))
y_test = mlb.transform(y_test_text)

# Text of the Amazon sentences; no labels are extracted for them.
unsup_sentences = unsup_amazonDF.sentence.values

In [7]:
#Do some very minor text preprocessing
def cleanText(corpus):
    """Lower-case and tokenize every document of `corpus`.

    For each string: convert to lower case, delete newline characters,
    turn '<br />' markers into a space, surround each punctuation mark
    with spaces so it becomes its own token, and split on whitespace.

    Parameters
    ----------
    corpus : iterable of str
        The documents to tokenize.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    punctuation = """.,?!:;(){}[]"""

    def tokenize(text):
        # Normalise case and strip line/HTML breaks before touching punctuation.
        text = text.lower().replace('\n', '').replace('<br />', ' ')
        #treat punctuation as individual words
        for mark in punctuation:
            text = text.replace(mark, ' %s ' % mark)
        return text.split()

    return [tokenize(document) for document in corpus]

# Tokenize all three corpora. Each name is rebound from raw strings to token
# lists, so this step is not idempotent: running it a second time would call
# .lower() on lists inside cleanText and fail.
x_train = cleanText(x_train)
x_test = cleanText(x_test)
unsup_sentences = cleanText(unsup_sentences)

#Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
#We do this by wrapping each one in a LabeledSentence. The labels have the form "TRAIN_i", "TEST_i"
#or "UNSUP_i", where "i" is the position of the sentence within its own list.
def labelizeSentences(sentences, label_type):
    """Wrap every item of `sentences` in a LabeledSentence with a unique tag.

    The tag of the i-th item is '<label_type>_<i>', with i counted from 0.

    Parameters
    ----------
    sentences : iterable
        The items to wrap (token lists in this notebook).
    label_type : str
        Prefix placed in front of the running index.

    Returns
    -------
    list
        One LabeledSentence per input item, in input order.
    """
    return [
        LabeledSentence(tokens, ['%s_%s' % (label_type, position)])
        for position, tokens in enumerate(sentences)
    ]

# Tag every tokenized sentence with its split-specific prefix. The names are
# rebound once more (token lists -> LabeledSentence objects), so re-running
# this step on its own would wrap the already-labelled objects a second time.
x_train = labelizeSentences(x_train, 'TRAIN')
x_test = labelizeSentences(x_test, 'TEST')
unsup_sentences = labelizeSentences(unsup_sentences, 'UNSUP')

In [8]:
# Sanity check: after labelizing, x_train is a plain Python list.
type(x_train)

list

In [9]:
# Number of labelled training sentences.
len(x_train)

1739

In [None]:
import random

size = 400             # dimensionality of each document vector
initial_alpha = 0.025  # starting learning rate (gensim's documented default), made explicit
alpha_step = 0.002     # amount the learning rate is lowered after every epoch
n_epochs = 10          # number of manual passes over the training sentences

#instantiate our DM and DBOW models
#min_alpha is set equal to alpha so that the FIRST train() call below also runs at a
#fixed rate; otherwise the first epoch would decay from alpha down to gensim's default
#min_alpha while every later epoch stays constant.
model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3,
                                 alpha=initial_alpha, min_alpha=initial_alpha)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3,
                                   alpha=initial_alpha, min_alpha=initial_alpha)

#build vocab over all sentences (the combined list is built once and shared by both models)
all_sentences = x_train + x_test + unsup_sentences
model_dm.build_vocab(all_sentences)
model_dbow.build_vocab(all_sentences)

#We pass through the data set multiple times, controlling the learning rate by hand
all_train_sentences = x_train + unsup_sentences
for epoch in range(n_epochs):
    #present the sentences in a new order on every pass, as the test-set cell does;
    #only this combined list is shuffled, x_train and unsup_sentences keep their order
    random.shuffle(all_train_sentences)
    model_dm.train(all_train_sentences)
    model_dbow.train(all_train_sentences)
    model_dm.alpha -= alpha_step  # decrease the learning rate
    model_dbow.alpha -= alpha_step  # decrease the learning rate
    model_dm.min_alpha = model_dm.alpha  # fix the learning rate, no decay
    model_dbow.min_alpha = model_dbow.alpha  # fix the learning rate, no decay
    
#Get training set vectors from our models
def getVecs(model, corpus, size):
    """Collect the document vectors of `corpus` into one 2-D array.

    Parameters
    ----------
    model : object
        Anything exposing `docvecs`, indexable by tag.
    corpus : iterable
        Documents exposing `tags`; the first tag of each is looked up.
    size : int
        Length of a single document vector.

    Returns
    -------
    numpy.ndarray
        Array with one row of length `size` per document, in corpus order.
    """
    rows = []
    for document in corpus:
        first_tag = document.tags[0]
        vector = np.array(model.docvecs[first_tag])
        # Make each vector a 1 x size row so the rows stack along axis 0.
        rows.append(vector.reshape((1, size)))
    return np.concatenate(rows)

# Look up the learned vector of every training sentence in each model, then
# place the DM and DBOW vectors side by side: one row per sentence, 2*size columns.
train_vecs_dm = getVecs(model_dm, x_train, size)
train_vecs_dbow = getVecs(model_dbow, x_train, size)
train_vecs = np.hstack((train_vecs_dm, train_vecs_dbow))

In [None]:
#Continue training both models on the test sentences. They were part of the
#vocabulary pass but not of the training list used in the previous loop.
#Relies on `random` having been imported by the preceding cell, and runs at
#whatever alpha/min_alpha the models were left with after that loop.

Idx=list(range(len(x_test)))

for epoch in range(10):
    # Visit the test sentences in a fresh random order on every pass.
    random.shuffle(Idx)
    perm_sentences = [x_test[i] for i in Idx]
    model_dm.train(perm_sentences)
    model_dbow.train(perm_sentences)

#Construct vectors for test sentences: DM and DBOW vectors side by side,
#one row per test sentence, in the original (unshuffled) x_test order
test_vecs_dm = getVecs(model_dm, x_test, size)
test_vecs_dbow = getVecs(model_dbow, x_test, size)
test_vecs = np.hstack((test_vecs_dm, test_vecs_dbow))

In [13]:
# Persist both trained models to files in the current working directory.
model_dm.save('nmodel_dm.d2v')
model_dbow.save('nmodel_dbow.d2v')

In [151]:
# Similarity score between the DM document vectors tagged 'TRAIN_1' and 'TRAIN_2'.
model_dm.docvecs.similarity('TRAIN_1', 'TRAIN_2')

0.25602992090955001

In [None]:
# Documents the DM model ranks as most similar to 'TRAIN_1'; element [0] of each
# entry is read as the document tag when the results are printed.
result = model_dm.docvecs.most_similar('TRAIN_1')

In [148]:
# Print the sentence behind every tag in `result`.
# Tags have the form '<PREFIX>_<index>'. The prefix selects the list the index
# refers to, so TRAIN_/TEST_ hits are resolved correctly instead of raising
# ValueError in int() (the previous code assumed every hit was an UNSUP_ tag).
sentences_by_label = {'TRAIN': x_train, 'TEST': x_test, 'UNSUP': unsup_sentences}
for s in result:
    # s[0] is the tag, e.g. 'UNSUP_716972'; split at the last underscore.
    label_type, sep, position = s[0].rpartition('_')
    index = int(position)
    print(sentences_by_label[label_type][index])

LabeledSentence(['this', 'is', 'absolutely', 'amazing', 'laptop', '!', '!', '!'], ['UNSUP_716972'])
LabeledSentence(['i', 'absolutely', 'love', 'this', 'computer', '!', '!', '!'], ['UNSUP_775798'])
LabeledSentence(['this', 'computer', 'is', 'amazing', '!', '!'], ['UNSUP_606812'])
LabeledSentence(['this', 'laptop', 'is', 'absolutely', 'amazing', '!'], ['UNSUP_745655'])
LabeledSentence(['i', 'absolutely', 'love', 'this', 'computer', '!', '!', '!', '!'], ['UNSUP_401717'])
LabeledSentence(['i', 'absolutely', 'love', 'this', 'computer', '.'], ['UNSUP_904161'])
LabeledSentence(['absolutely', 'love', 'this', 'computer', '!', '!', '!'], ['UNSUP_102374'])
LabeledSentence(['this', 'computer', 'is', 'amazing', '!'], ['UNSUP_558751'])
LabeledSentence(['i', 'absolutely', 'love', 'this', 'computer', '!'], ['UNSUP_616560'])
LabeledSentence(['this', 'computer', 'is', 'amazing', '!'], ['UNSUP_100267'])
