In [30]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [46]:
# Step 1: Preprocess the sentence
sentence = "The cat sat on the mat last night. Dog was barking. We love elephant"
# Tokenize the sentence into words
tokens = word_tokenize(sentence.lower())  # Lowercasing for consistency
print("Tokenized Sentence:", tokens)

Tokenized Sentence: ['the', 'cat', 'sat', 'on', 'the', 'mat', 'last', 'night', '.', 'dog', 'was', 'barking', '.', 'we', 'love', 'elephant']


In [47]:
# Step 2: Prepare data for Word2Vec
# Word2Vec expects a list of tokenized sentences
data = [tokens]  # Wrapping the tokenized sentence in a list to make it a list of sentences
data

[['the',
  'cat',
  'sat',
  'on',
  'the',
  'mat',
  'last',
  'night',
  '.',
  'dog',
  'was',
  'barking',
  '.',
  'we',
  'love',
  'elephant']]

In [48]:
model = Word2Vec(
    sentences=data,
    vector_size=100,  # Size of the word vectors
    window=3,         # Context window size
    min_count=1,      # Minimum frequency of words to be included
    sg=0,             # Use Skip-gram model (1: Skip-gram; 0: CBOW)
    workers=4,        # Number of threads for training
    epochs=10         # Number of training epochs
)

In [49]:
# Step 4: Analyze the trained Word2Vec model
# Get the vector for a specific word
word_vector = model.wv["cat"]  # Get the vector representation for "cat"
print("\nVector for 'cat':\n", word_vector)


Vector for 'cat':
 [-0.00950045  0.00956245 -0.0077711  -0.00264685 -0.00490687 -0.00496653
 -0.00802392 -0.00778332 -0.00455381 -0.00127546 -0.00510294  0.00614129
 -0.00951657 -0.00530794  0.00943732  0.0069913   0.00767612  0.0042349
  0.00050782 -0.00598228  0.00601872  0.00263416  0.00770108  0.00639523
  0.0079412   0.00865835 -0.00989559 -0.00675552  0.00133724  0.00644095
  0.00737411  0.00551761  0.0076623  -0.00512609  0.006586   -0.00410858
 -0.00905581  0.00914177  0.0013301  -0.00275933 -0.00247751 -0.00422111
  0.00481141  0.00440193 -0.00265271 -0.00734366 -0.00356605 -0.00033697
  0.00609597 -0.00283844 -0.00012112  0.00088037 -0.00709547  0.00206405
 -0.0014327   0.00280145  0.00484033 -0.00135295 -0.00277946  0.00773816
  0.005046    0.00671273  0.00451655  0.00866808  0.00747419 -0.00108195
  0.00874769  0.00460273  0.00544039 -0.00138615 -0.0020413  -0.00442316
 -0.00851616  0.0030391   0.00888367  0.00892157 -0.00194196  0.00608665
  0.00378018 -0.00429642  0.0020

In [50]:
# Find most similar words to a given word
similar_words = model.wv.most_similar("cat", topn=2)
print("\nMost similar words to 'cat':", similar_words)


Most similar words to 'cat': [('we', 0.2529977262020111), ('the', 0.1370527595281601)]


In [51]:
model.wv.doesnt_match(["cat", "dog", "elephant"])

'cat'

In [None]:
# Save the model
model.save("word2vec_model.model")

# Load the model
loaded_model = Word2Vec.load("word2vec_model.model")
print("\nLoaded model similar words for 'cat':", loaded_model.wv.most_similar("cat", topn=5))
