In [5]:
import gensim
from gensim.models import Word2Vec
import logging # Optional: To see Gensim's progress logs

# --- 1. Sample Data (Tokenized) ---
# Gensim expects input as a list of lists of tokens (like our previous example)
corpus = [
    "natural language processing and machine learning are related fields",
    "language models help understand text structure",
    "vector representations capture word meanings",
    "skip gram models learn embeddings from text",
    "negative sampling improves training efficiency",
    "word embeddings are useful for many nlp tasks",
    "learning representations requires large amounts of text data",
    "understanding context is key in language processing",
    "machine learning algorithms power modern nlp",
    "embeddings capture semantic relationships between words"
]

# Simple tokenization (same as before)
import re
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()

tokenized_corpus = [tokenize(sentence) for sentence in corpus]
print("Sample tokenized data for Gensim:", tokenized_corpus[0])

# --- 2. Configure Logging (Optional) ---
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- 3. Train the Skip-gram Model ---
print("\n--- Training Gensim Word2Vec (Skip-gram) Model ---")

# Instantiate and train the model
# Key parameters for Skip-gram:
#   sentences: Your tokenized corpus
#   vector_size: Dimensionality of the embeddings (e.g., 100)
#   window: Context window size (e.g., 5)
#   min_count: Minimum word frequency to include in vocab (e.g., 1 or 2)
#   sg=1: THIS ENABLES SKIP-GRAM (sg=0 is for CBOW)
#   workers: Number of CPU cores to use (e.g., 4)
#   negative: Number of negative samples (e.g., 5). If > 0, enables Negative Sampling.
#   hs=0: Disable Hierarchical Softmax (to use Negative Sampling). hs=1 enables HS.
#   sample: Threshold for configuring subsampling of frequent words (e.g., 1e-3 or 1e-5)
#   epochs: Number of iterations (epochs) over the corpus (Word2Vec default is 5)

model = Word2Vec(sentences=tokenized_corpus,
                 vector_size=50,    # Same as EMBEDDING_DIM before
                 window=2,          # Same as WINDOW_SIZE before
                 min_count=2,       # Same as MIN_WORD_FREQ before
                 sg=1,              # Use Skip-gram
                 workers=4,         # Use 4 CPU cores
                 negative=5,        # Use Negative Sampling with 5 samples
                 hs=0,              # Don't use Hierarchical Softmax
                 sample=1e-5,       # Use subsampling with threshold 1e-5
                 epochs=50)         # Train for 50 epochs

print("\n--- Training Complete ---")

# --- 4. Using the Model ---

# Get the embedding vector for a word
try:
    vector_language = model.wv['language']
    print(f"\nVector for 'language' (shape {vector_language.shape}):\n", vector_language[:10]) # Print first 10 dims
except KeyError:
    print("\n'language' not in vocabulary (check min_count).")

# Find words most similar to a given word
try:
    similar_words = model.wv.most_similar('language', topn=5)
    print("\nWords most similar to 'language':", similar_words)
except KeyError:
     print("\n'language' not in vocabulary.")

try:
    similar_words_emb = model.wv.most_similar('embeddings', topn=5)
    print("Words most similar to 'embeddings':", similar_words_emb)
except KeyError:
     print("\n'embeddings' not in vocabulary.")

# Analogy task: king - man + woman = queen
# Example: learning - machine + language = ? (might not work well on tiny corpus)
# try:
#     analogy = model.wv.most_similar(positive=['language', 'learning'], negative=['machine'], topn=1)
#     print("\nAnalogy 'learning' - 'machine' + 'language' = ?", analogy)
# except KeyError as e:
#     print(f"\nAnalogy failed: Word '{e.args[0]}' not in vocabulary.")


# Get vocabulary details
vocab_keys = list(model.wv.key_to_index.keys())
print(f"\nVocabulary size in Gensim model: {len(vocab_keys)}")
# print("Sample vocab keys:", vocab_keys[:10])

# --- 5. Saving and Loading ---
# model.save("skipgram_model.gensim")
# loaded_model = Word2Vec.load("skipgram_model.gensim")
# print("\nModel saved and loaded successfully.")

2025-04-13 17:22:58,459 : INFO : collecting all words and their counts
2025-04-13 17:22:58,459 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-13 17:22:58,460 : INFO : collected 51 word types from a corpus of 67 raw words and 10 sentences
2025-04-13 17:22:58,460 : INFO : Creating a fresh vocabulary
2025-04-13 17:22:58,462 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 12 unique words (23.53% of original 51, drops 39)', 'datetime': '2025-04-13T17:22:58.462697', 'gensim': '4.3.3', 'python': '3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-04-13 17:22:58,462 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 28 word corpus (41.79% of original 67, drops 39)', 'datetime': '2025-04-13T17:22:58.462697', 'gensim': '4.3.3', 'python': '3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]', 'platform': 'Windows-10-

Sample tokenized data for Gensim: ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'are', 'related', 'fields']

--- Training Gensim Word2Vec (Skip-gram) Model ---

--- Training Complete ---

Vector for 'language' (shape (50,)):
 [ 0.01563514 -0.01902037 -0.00041106  0.00693839 -0.00187794  0.01676354
  0.01802157  0.01307301 -0.00142324  0.01542081]

Words most similar to 'language': [('representations', 0.0449172779917717), ('models', -0.010146022774279118), ('embeddings', -0.014475265517830849), ('machine', -0.023209011182188988), ('capture', -0.04407211393117905)]
Words most similar to 'embeddings': [('processing', 0.16704076528549194), ('representations', 0.13204392790794373), ('learning', 0.1267007291316986), ('are', 0.0998455360531807), ('text', 0.042373016476631165)]

Vocabulary size in Gensim model: 12
