In [12]:
# gensim modules
import gensim
from gensim import utils
from gensim.models.doc2vec import Doc2Vec
TaggededDocument = gensim.models.doc2vec.TaggedDocument
from nltk import word_tokenize
import pandas as pd
# numpy
import numpy as np
# random
from random import shuffle
# classifier
from sklearn.linear_model import LogisticRegression

## Generating the Corpus

In [13]:
def get_cut_dataset(data):
    '''
    Generate corpus(using tokenized word list)
    '''
    corpus = []
    documents = []
    for idx, item in enumerate(data):
        text = list(item.split(' ') if type (item) == str else str (item))
        document = TaggededDocument(text, tags=[idx])
        corpus.append(document)
    print('len of corpus：', len(corpus))

    return corpus

In [14]:
DataPath = "part.csv"
df = pd.read_csv(DataPath)

In [16]:
df['tweet'].astype(str)
train_corpus = get_cut_dataset(df['tweet'])

len of corpus： 200000


## Train model and save

In [138]:
def train(x_train, model_path , size=300, epoch_num=20, dm=1):
    print('start train')
    model_dm = Doc2Vec(x_train, min_count=20, window=5, vector_size=size, sample=1e-3, negative=5, workers=4, dm=dm)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save(model_path)
    print('end train')
    return model_dm

model_path = './model_doc2vec'
model_doc2vec = train(train_corpus, model_path=model_path)

start train
end train


## Inspecting the Model
### Check the close synonyms

In [10]:
model_doc2vec = Doc2Vec.load('./model_doc2vec')
model_doc2vec.wv.most_similar(positive=['good','nice'],negative=['bad','shit'])

[('awesome', 0.3016740083694458),
 ('great', 0.29568761587142944),
 ('fantastic', 0.2867523729801178),
 ('fab', 0.2615671157836914),
 ('interesting', 0.25876688957214355),
 ('goood', 0.2518990933895111),
 ('lovely', 0.24352069199085236),
 ('wonderful', 0.24207670986652374),
 ('perfect', 0.24012957513332367),
 ('sweet', 0.23417885601520538)]

### Check whether the closest tweet is itself

In [17]:
str_ = "i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15"
test_text = str_.split(' ')
inferred_vector_dm = model_doc2vec.infer_vector(test_text)
# print('inferred_vector_dm:', inferred_vector_dm)
sims = model_doc2vec.dv.most_similar([inferred_vector_dm], topn=1)
for count,sim in sims:
    sentence = train_corpus[count]
    words = ''
    for word in sentence[0]:
        words = words + word + ' '
    print(words, sim)


dunno justin read mention not . only justin and god knows about that , but hope you will follow # believe 15  0.8183117508888245


In [29]:
data = np.zeros((len(df),model.vector_size))
for i,item in df.iterrows():
    text = list(item['tweet'].split(' ') if type (item['tweet']) == str else str (item['tweet']))
    data[i] = model.infer_vector(text)

np.save("./Doc2VecArray", data)

print("save to ./Doc2VecArray")

save to ./Doc2VecArray


In [30]:
df

Unnamed: 0,tweet,label
0,dunno justin read mention not . only justin an...,1
1,"because your logic dumb , n't even crop out yo...",1
2,`` just put casper box ! `` looved the battle ...,1
3,thanks sir > > n't trip lil mama ... just keep...,1
4,visiting brother tmr the bestest birthday gift...,1
...,...,...
199995,n't wait fake tan tonight ! hate being pale,0
199996,darling lost internet connection .. and 's see...,0
199997,kanguru defender basic 4 usb 2.0 flash drive k...,0
199998,rizan sad now,0
