In [1]:
from gensim.models import word2vec
from gensim import downloader
import logging

# Surface gensim's INFO-level progress messages, prefixed with a timestamp
# and the severity level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Two-sentence toy corpus used to train the demonstration model.
raw_sentences = [
    "the quick brown fox jumps over the lazy dogs",
    "yoyoyo you go home now to sleep",
]

In [3]:
# Tokenise every raw sentence on whitespace; Word2Vec expects a list of
# token lists.
sentences = list(map(str.split, raw_sentences))
sentences

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dogs'],
 ['yoyoyo', 'you', 'go', 'home', 'now', 'to', 'sleep']]

In [4]:
# Train a toy model straight from the tokenised sentences. min_count=1 keeps
# every word, including those that occur only once.
model = word2vec.Word2Vec(sentences=sentences, min_count=1)

2023-01-18 02:23:41,197 : INFO : collecting all words and their counts
2023-01-18 02:23:41,197 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-01-18 02:23:41,198 : INFO : collected 15 word types from a corpus of 16 raw words and 2 sentences
2023-01-18 02:23:41,198 : INFO : Creating a fresh vocabulary
2023-01-18 02:23:41,198 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 15 unique words (100.0%% of original 15, drops 0)', 'datetime': '2023-01-18T02:23:41.198660', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2023-01-18 02:23:41,199 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 16 word corpus (100.0%% of original 16, drops 0)', 'datetime': '2023-01-18T02:23:41.199656', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-1

min_count：忽略出现次数小于 min_count 的词语。
vector_size（gensim 4 之前的版本中叫 size）：词向量的维度（即隐藏层的神经元个数，而不是神经网络的层数），默认是 100；维度越大，需要的训练语料也越多。

In [5]:
# Cosine similarity between two words of the toy model.
# NOTE(review): the model was trained on a 16-word corpus, so this score is
# presumably little more than initialisation noise.
model.wv.similarity("dogs", "you")

0.009126566

In [6]:
# Load the pre-trained Google News vectors through gensim's downloader.
# The log below reports a (3000000, 300) float32 matrix.
w2v_vectors = downloader.load("word2vec-google-news-300")

2023-01-18 02:23:41,326 : INFO : loading projection weights from C:\Users\AnkieF/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2023-01-18 02:24:07,153 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\AnkieF/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-01-18T02:24:07.153456', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'load_word2vec_format'}


In [7]:
# Similarity of two drug names according to the pre-trained vectors.
w2v_vectors.similarity("indomethacin", "etoricoxib")

0.48998526

In [8]:
from gensim.models import Word2Vec
import numpy as np
 
# Small domain corpus used to fine-tune on top of the pre-trained vectors.
sentences = [
    ["bad", "robots"],
    ["good", "human"],
    ["yes", "this", "is", "the", "word2vec", "model"],
]

# vector_size must be 300 so this model's vectors have the same
# dimensionality as Google's pre-trained vectors merged in below.
word2vec_model = Word2Vec(window=5, min_count=1, workers=2, vector_size=300)
word2vec_model.build_vocab(sentences)

# One lock factor per word of *this* model's vocabulary. The original sized
# the array from the unrelated toy `model`, which only worked because that
# vocabulary happened to be larger.
# The log above shows gensim storing its vectors as float32, so the lock
# factors use the same dtype.
# NOTE(review): presumably gensim's compiled training code reads this buffer
# as float32 -- confirm against the installed gensim version.
word2vec_model.wv.vectors_lockf = np.ones(len(word2vec_model.wv), dtype=np.float32)

# Overwrite the vectors of every vocabulary word that also appears in the
# pre-trained file; lockf=1.0 leaves those vectors trainable.
# The .bin file is read from the current working directory.
word2vec_model.wv.intersect_word2vec_format(
    'GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True
)

# Continue training with your own data. corpus_count was recorded by
# build_vocab(), so it stays correct if `sentences` is edited.
word2vec_model.train(
    sentences,
    total_examples=word2vec_model.corpus_count,
    epochs=5,
)

2023-01-18 02:24:07,217 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2023-01-18T02:24:07.217479', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2023-01-18 02:24:07,217 : INFO : collecting all words and their counts
2023-01-18 02:24:07,217 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-01-18 02:24:07,218 : INFO : collected 10 word types from a corpus of 10 raw words and 3 sentences
2023-01-18 02:24:07,218 : INFO : Creating a fresh vocabulary
2023-01-18 02:24:07,219 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 10 unique words (100.0%% of original 10, drops 0)', 'datetime': '2023-01-18T02:24:07.219473', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare

(4, 50)

In [9]:
# Raw pre-trained embedding for a single word (a 300-dimensional vector).
w2v_vectors["robots"]

array([ 0.16308594,  0.07324219, -0.07177734,  0.28125   , -0.2265625 ,
       -0.05859375,  0.10742188, -0.0050354 , -0.04638672, -0.14160156,
        0.0291748 , -0.09716797,  0.18554688, -0.07470703, -0.00262451,
        0.04736328, -0.140625  ,  0.14550781,  0.06640625, -0.09375   ,
        0.11035156, -0.06542969,  0.19628906,  0.08251953, -0.16601562,
        0.0098877 , -0.06738281, -0.15332031, -0.03710938, -0.18066406,
       -0.30664062, -0.20019531, -0.18945312,  0.05029297,  0.12792969,
       -0.05688477, -0.05004883,  0.26953125,  0.12792969,  0.10058594,
        0.09521484, -0.00500488,  0.13378906,  0.44726562, -0.04223633,
       -0.07666016,  0.06079102,  0.06445312,  0.15917969, -0.00561523,
       -0.23242188,  0.14257812, -0.07470703, -0.06689453, -0.14746094,
       -0.12597656,  0.44726562, -0.14257812,  0.14257812,  0.02929688,
       -0.01879883, -0.02380371, -0.01342773,  0.04736328, -0.3515625 ,
       -0.13574219, -0.18359375,  0.30078125, -0.24511719,  0.08

In [10]:
# Similarity after merging the pre-trained vectors and fine-tuning.
word2vec_model.wv.similarity("robots", "human")

0.26561615

In [11]:
# Same toy corpus as the fine-tuning cell, kept under its own name so this
# baseline experiment does not depend on variables from other cells.
old_sentences = [
    ["bad", "robots"],
    ["good", "human"],
    ["yes", "this", "is", "the", "word2vec", "model"],
]

# vector_size must be 300 to match the dimensionality of Google's
# pre-trained vectors merged in by the next cell.
old_model = Word2Vec(window=5, min_count=1, workers=2, vector_size=300)
old_model.build_vocab(old_sentences)

# Baseline: similarity between the freshly initialised vectors, before any
# pre-trained vectors are merged in and before any training.
# The discarded `old_model.wv['robots']` lookup was removed; its result was
# never displayed or used.
old_model.wv.similarity('robots', 'human')

2023-01-18 02:24:19,137 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2023-01-18T02:24:19.137179', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2023-01-18 02:24:19,138 : INFO : collecting all words and their counts
2023-01-18 02:24:19,138 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-01-18 02:24:19,138 : INFO : collected 10 word types from a corpus of 10 raw words and 3 sentences
2023-01-18 02:24:19,139 : INFO : Creating a fresh vocabulary
2023-01-18 02:24:19,139 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 10 unique words (100.0%% of original 10, drops 0)', 'datetime': '2023-01-18T02:24:19.139173', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare

0.022898583

In [12]:
# One lock factor per word of old_model's own vocabulary. The original sized
# the array from the unrelated toy `model`.
# float32 matches the dtype gensim reports for its vectors.
# NOTE(review): presumably gensim's compiled training code reads this buffer
# as float32 -- confirm against the installed gensim version.
old_model.wv.vectors_lockf = np.ones(len(old_model.wv), dtype=np.float32)

# Merge pre-trained vectors for the words found in the file. lockf=0.0 sets
# the lock factor of every merged word to zero, freezing it during training.
# Words missing from the file keep their initial factor of 1.0 (the log below
# reports 9 of 10 vectors merged).
old_model.wv.intersect_word2vec_format(
    'GoogleNews-vectors-negative300.bin', lockf=0.0, binary=True
)

# Continue training with your own data. This uses old_model's own corpus and
# the example count recorded by build_vocab(), rather than the `sentences`
# variable left over from an earlier cell.
old_model.train(
    old_sentences,
    total_examples=old_model.corpus_count,
    epochs=5,
)
old_model.wv.similarity('robots', 'human')

2023-01-18 02:24:19,165 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin
2023-01-18 02:24:30,448 : INFO : KeyedVectors lifecycle event {'msg': 'merged 9 vectors into (10, 300) matrix from GoogleNews-vectors-negative300.bin', 'datetime': '2023-01-18T02:24:30.448671', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'intersect_word2vec_format'}
2023-01-18 02:24:30,449 : INFO : Word2Vec lifecycle event {'msg': 'training model with 2 workers on 10 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-01-18T02:24:30.449668', 'gensim': '4.1.2', 'python': '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'train'}
2023-01-18 02:24:30,451 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-01-18 02:24:30,451 : INFO : worke

0.26561615