In [51]:
import gensim
from gensim import utils
from gensim.models.doc2vec import Doc2Vec
TaggededDocument = gensim.models.doc2vec.TaggedDocument
from nltk import word_tokenize
import pandas as pd
import numpy as np
from random import shuffle
from sklearn.linear_model import LogisticRegression
import pickle
import sys
sys.path.append('..')
from helpers import *

## Generating the Corpus

In [14]:
def get_cut_dataset(data):
    '''
    Generate corpus(using tokenized word list)
    '''
    corpus = []
    documents = []
    for idx, item in enumerate(data):
        text = list(item.split(' ') if type (item) == str else str (item))
        document = TaggededDocument(text, tags=[idx])
        corpus.append(document)
    print('len of corpus：', len(corpus))

    return corpus

In [27]:
DataPath = "../part.csv"
df = pd.read_csv(DataPath)
df1 = pd.read_csv("../test.csv")

In [28]:
df['tweet'].astype(str)
df1['tweet'].astype(str)

train_corpus = get_cut_dataset(pd.concat([df['tweet'],df1['tweet']],axis=0))

len of corpus： 191292


## Train model and save

In [45]:
def train(x_train, model_path , size=100, epoch_num=20, dm=1):
    print('start train')
    model_dm = Doc2Vec(x_train, min_count=10, window=10, vector_size=size, sample=1e-3, negative=5, workers=4, dm=dm)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save(model_path)
    print('end train')
    return model_dm

model_path = './model_doc2vec'
model_doc2vec = train(train_corpus, model_path=model_path)

start train
end train


## Inspecting the Model
### Check the close synonyms

In [46]:
model_doc2vec = Doc2Vec.load('./model_doc2vec')
model_doc2vec.wv.most_similar(positive=['good','nice'],negative=['bad','shit'])

[('fantastic', 0.5610108971595764),
 ('lovely', 0.5350937843322754),
 ('great', 0.48865360021591187),
 ('fab', 0.4556836187839508),
 ('wonderful', 0.4513000249862671),
 ('successful', 0.41335007548332214),
 ('enjoying', 0.4095670282840729),
 ('fabulous', 0.40703025460243225),
 ('sweetie', 0.40674567222595215),
 ('chilled', 0.3806166648864746)]

### Check whether the closest tweet is itself

In [47]:
str_ = "i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15"
test_text = str_.split(' ')
inferred_vector_dm = model_doc2vec.infer_vector(test_text)
# print('inferred_vector_dm:', inferred_vector_dm)
sims = model_doc2vec.dv.most_similar([inferred_vector_dm], topn=1)
for count,sim in sims:
    sentence = train_corpus[count]
    words = ''
    for word in sentence[0]:
        words = words + word + ' '
    print(words, sim)


i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15  0.8694869875907898


### Generate tweets presentation using Doc2Vec embedding

In [48]:
data = np.zeros((len(df),model_doc2vec.vector_size))
for i,item in df.iterrows():
    text = list(item['tweet'].split(' ') if type (item['tweet']) == str else str (item['tweet']))
    data[i] = model_doc2vec.infer_vector(text)

np.save("./Doc2VecArray", data)

print("save to ./Doc2VecArray")

save to ./Doc2VecArray


### Generate Word2Vec embeddings

In [None]:
vocab_dim = model_doc2vec.vector_size

index2word_set = set(model_doc2vec.wv.index_to_key)

In [67]:
f = open("vocab.pkl", 'rb') 
index_dict = pickle.load(f)    # index dictionary {'word': idx}

data = np.zeros((len(index_dict),model_doc2vec.vector_size))

for key, value in index_dict.items():
    if key in index2word_set:
        data[value,:] = model_doc2vec[key]

np.save("./Word2VecArray", data)

print("save to ./Word2VecArray")

save to ./Word2VecArray


In [68]:
len(data)

101298

In [69]:
len(index_dict)

101298