In [1]:
"""
Train a doc2vec (d2v) model on the six-class mood data.
"""
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from utils import load_reviews
import time

In [2]:
# Load the parallel label / review sequences of the six-mood training set.
filePath = '../../corpus/6moods/train/usual_trainTrimed.csv'
labels, reviews = load_reviews(filePath)

# Break every review into a token list on whitespace
# (presumably the corpus is already word-segmented -- verify against load_reviews).
reviews = list(map(lambda text: text.split(), reviews))

# Peek at the first three (label, tokens) pairs.
tuple(zip(labels, reviews))[:3]

(('angry',
  ['气死', '姐姐', '快二是', '阵亡', '吗', '尼玛', '一个半', '小时', '过去', '也', '没', '上车']),
 ('happy',
  ['妞妞',
   '今天',
   '又',
   '承办',
   '一个',
   '发文',
   '登记',
   '文号',
   '是',
   '嘻',
   '么',
   '么',
   '哒',
   '晚安',
   '哟']),
 ('neutral',
  ['还',
   '值得注意',
   '另',
   '一个',
   '事实',
   '张鞠存',
   '原有',
   '一个',
   '东溪',
   '草堂',
   '为',
   '其',
   '读书处']))

In [3]:
# Mood dictionaries.
# m2i: mood name -> integer class id.
m2i = {
    'sad': 0,
    'angry': 1,
    'fear': 2,
    'neutral': 3,
    'surprise': 4,
    'happy': 5,
}
# i2m: integer class id -> mood name.
# Built by inverting m2i's (name, id) pairs so the mapping stays correct even
# if the ids above are reordered or become non-contiguous; enumerating the keys
# would only be right while insertion order happens to equal the id values.
i2m = {idx: mood for mood, idx in m2i.items()}

In [4]:
# Replace every mood name with its integer class id from m2i.
# NOTE: this overwrites `labels`; re-running the cell without reloading the
# data would look up the already-encoded ints and fail.
labels = list(map(lambda mood: m2i[str(mood)], labels))
labels[:5]

[1, 5, 3, 3, 4]

In [5]:
# Wrap each tokenized review in a TaggedDocument whose single tag is the
# review's class id, so all reviews of one mood share the same tag.
doc = [
    TaggedDocument(words=tokens, tags=[labels[pos]])
    for pos, tokens in enumerate(reviews)
]
doc[:5]

[TaggedDocument(words=['气死', '姐姐', '快二是', '阵亡', '吗', '尼玛', '一个半', '小时', '过去', '也', '没', '上车'], tags=[1]),
 TaggedDocument(words=['妞妞', '今天', '又', '承办', '一个', '发文', '登记', '文号', '是', '嘻', '么', '么', '哒', '晚安', '哟'], tags=[5]),
 TaggedDocument(words=['还', '值得注意', '另', '一个', '事实', '张鞠存', '原有', '一个', '东溪', '草堂', '为', '其', '读书处'], tags=[3]),
 TaggedDocument(words=['这', '在', '前', '华约', '国家', '尤其', '是', '东德', '使用', '的', '首次', '联合演习', '期间', '被', '一些', '北约组织', '的', '飞行员', '所', '证实'], tags=[3]),
 TaggedDocument(words=['上'], tags=[4])]

In [6]:
# Train the Doc2Vec model on the tagged corpus and report how long training took.
# Only the training call is timed, so the reported figure matches the
# "Training model cost" message and is not inflated by the disk writes below.
# perf_counter() is used because it is a monotonic clock meant for measuring
# elapsed intervals, unlike the adjustable wall clock behind time.time().
start = time.perf_counter()
model = Doc2Vec(doc, vector_size=20, window=2, min_count=5, epochs=10)
cost = time.perf_counter() - start
print(f'Training model cost: {cost:.4f} Sec')

# Persist the result: a word2vec-format vector export plus the full gensim
# model (the latter can be reloaded for further use).
model.save_word2vec_format('./data/d2v.txt')
model.save('./data/d2v.model')

Training model cost: 5.5787 Sec


In [7]:
# Sanity-check the learned word vectors by listing the nearest neighbours
# of two probe words.
wv = model.wv
for probe in ('卧槽', '我操'):
    print(wv.most_similar(probe))

[('[吃惊]', 0.8837623000144958), ('感慨', 0.8774041533470154), ('[衰]', 0.8455262780189514), ('神经病', 0.8394677639007568), ('纸', 0.8341495394706726), ('行', 0.8264917731285095), ('那位', 0.8174652457237244), ('商家', 0.8133604526519775), ('远光灯', 0.8080230355262756), ('堪称', 0.8076061606407166)]
[('婊', 0.8698984384536743), ('大神', 0.8337650299072266), ('轰炸', 0.8331423997879028), ('马勒', 0.8261599540710449), ('华为', 0.810968816280365), ('韵达', 0.8008748292922974), ('电钻', 0.8008080124855042), ('郑恺', 0.8006690740585327), ('朱亚文', 0.7992111444473267), ('大叫', 0.7965862154960632)]


In [8]:
# Show, one per line, the stored word vector for the probe word and the vector
# inferred for a one-word document containing only that word.
print(wv['卧槽'], model.infer_vector(['卧槽']), sep='\n')

[-1.3084278  -0.45839208 -0.5775986  -0.30819038  1.4204996  -0.35753238
 -0.23664102 -1.229145    1.7807796  -1.118629    0.41416073  0.78953296
 -1.3608664  -0.9842466  -0.49807453  1.9896345   0.29180244 -0.6641466
 -0.8558963   1.435378  ]
[-0.05444296  0.0080732  -0.02475881  0.0242496   0.06802876 -0.00584508
 -0.02550354 -0.02837566  0.06826746 -0.01272654  0.0067676   0.05555837
 -0.00609071 -0.06379095 -0.03808757  0.05879767  0.01863134 -0.0119704
 -0.05210801  0.02342739]


In [9]:
# Infer one document vector per tokenized review, in corpus order.
vectors = [model.infer_vector(tokens) for tokens in reviews]
vectors[:5]

[array([-0.04757316,  0.04367489, -0.09455883,  0.09047905,  0.0787765 ,
        -0.18718566, -0.07665525, -0.0230841 ,  0.18516561, -0.24436016,
         0.01454751,  0.20735273, -0.30142966, -0.1536281 , -0.03942646,
         0.03265611,  0.07468861, -0.27426484, -0.05937847,  0.00431646],
       dtype=float32),
 array([ 0.08900227,  0.15790507, -0.09202043, -0.05244378,  0.09003896,
        -0.22828124,  0.10798083,  0.13961828,  0.0611835 , -0.23666959,
        -0.2105221 ,  0.06099629, -0.30025116, -0.31915152, -0.0917276 ,
         0.0315614 , -0.14340168, -0.15528348,  0.01314865,  0.10393614],
       dtype=float32),
 array([-0.08163825,  0.01461045, -0.08778184,  0.05579885,  0.15973966,
        -0.2454241 ,  0.05913501,  0.01513157,  0.02292997, -0.17410545,
         0.02910296,  0.07980833, -0.19867751, -0.25065857, -0.08286671,
        -0.05207695, -0.17845787, -0.20596807,  0.00301739,  0.04077628],
       dtype=float32),
 array([ 0.01886431,  0.19281362, -0.09836899, -0.09

In [10]:
import pandas as pd

# Pair every class id with its inferred vector and write the table to CSV
# without the index column.
# NOTE(review): each `vector` cell holds a whole array, so the CSV presumably
# stores it as its string form -- confirm the downstream reader parses that.
df = pd.DataFrame(data={'label': labels, 'vector': vectors})
df.to_csv('./data/vec.csv', index=False)

In [11]:
# Preview the first five rows of the label/vector table.
df.head(5)

Unnamed: 0,label,vector
0,1,"[-0.047573164, 0.043674894, -0.09455883, 0.090..."
1,5,"[0.089002274, 0.15790507, -0.09202043, -0.0524..."
2,3,"[-0.081638254, 0.014610445, -0.08778184, 0.055..."
3,3,"[0.018864311, 0.19281362, -0.09836899, -0.0936..."
4,4,"[0.00013942734, 0.054771286, -0.06153464, -0.0..."
