In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
import logging
import os


In [3]:
# 1. Configure Logging
# This helps to see the training progress and any warnings/errors from gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [5]:
text8_filename = '~/Documents/MLX/text8'


In [7]:
try:
    # Read each line as a separate row in a DataFrame/Series
    # header=None means no header row
    # names=['text_content'] assigns a column name
    # squeeze=True attempts to return a Series if only one column results
    df_temp = pd.read_csv(text8_filename, header=None, names=['text_content'], encoding='utf-8')
    
    # Check if the DataFrame has exactly one column. If so, convert it to a Series.
    # This replaces the deprecated 'squeeze=True' functionality.
    if df_temp.shape[1] == 1:
        df_text = df_temp['text_content']
    else:
        df_text = df_temp # Keep as DataFrame if multiple columns, though unlikely for text8

    print(f"\nText loaded into pandas object. Type: {type(df_text)}. Shape: {df_text.shape}")
    print("First 3 lines from pandas object:")
    print(df_text.head(3))
except FileNotFoundError:
    logging.error(f"Error: The file '{text8_filename}' was not found. Please ensure it's in the correct directory.")
    exit()
except Exception as e:
    logging.error(f"An error occurred while loading the file with pandas: {e}")
    exit()



Text loaded into pandas object. Type: <class 'pandas.core.series.Series'>. Shape: (1,)
First 3 lines from pandas object:
0     anarchism originated as a term of abuse first...
Name: text_content, dtype: object


In [9]:
sentences_for_gensim = df_text.apply(lambda x: x.lower().split()).tolist()

In [11]:
print(f"\nProcessed {len(sentences_for_gensim)} sentences for gensim.")
print("Example of processed sentence (first one):")
print(sentences_for_gensim[0][:20])



Processed 1 sentences for gensim.
Example of processed sentence (first one):
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english']


In [13]:
print("\nStarting CBOW model training...")
cbow_model = Word2Vec(
    sentences=sentences_for_gensim,
    vector_size=100,      # Example: 100-dimensional word vectors
    window=5,             # Consider 5 words before and 5 words after the target word
    min_count=1,          # Include all words that appear at least once (for small demo)
    sg=0,                 # CRUCIAL: 0 for CBOW, 1 for Skip-gram
    workers=4,            # Use 4 CPU cores (adjust based on your system)
    epochs=20             # Train for 20 iterations over the dataset
)

2025-06-10 14:50:27,651 : INFO : collecting all words and their counts
2025-06-10 14:50:27,653 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



Starting CBOW model training...


2025-06-10 14:50:28,533 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1 sentences
2025-06-10 14:50:28,533 : INFO : Creating a fresh vocabulary
2025-06-10 14:50:28,762 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 253854 unique words (100.00% of original 253854, drops 0)', 'datetime': '2025-06-10T14:50:28.762928', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'prepare_vocab'}
2025-06-10 14:50:28,763 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 17005207 word corpus (100.00% of original 17005207, drops 0)', 'datetime': '2025-06-10T14:50:28.763221', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'prepare_vocab'}
2025-06-10 14:50:29,066 : INFO : deleting the raw counts d

In [15]:
skipgram_model = Word2Vec(
    sentences=sentences_for_gensim,
    vector_size=100,      # Dimensionality of the word vectors
    window=5,             # Context window size
    min_count=1,          # Minimum frequency for words to be included
    sg=1,                 # CRUCIAL: 1 for Skip-gram
    workers=4,            # Number of worker threads
    epochs=20             # Number of iterations over the dataset
)
print("Skip-gram model training complete.\n")

2025-06-10 14:50:41,340 : INFO : collecting all words and their counts
2025-06-10 14:50:41,341 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-06-10 14:50:42,232 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1 sentences
2025-06-10 14:50:42,232 : INFO : Creating a fresh vocabulary
2025-06-10 14:50:42,447 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 253854 unique words (100.00% of original 253854, drops 0)', 'datetime': '2025-06-10T14:50:42.447085', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'prepare_vocab'}
2025-06-10 14:50:42,447 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 17005207 word corpus (100.00% of original 17005207, drops 0)', 'datetime': '2025-06-10T14:50:42.447396', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (mai

Skip-gram model training complete.



In [21]:
# --- 7. Using the Trained Models (CBOW and Skip-gram) ---

# --- CBOW Model Queries ---
print("\n--- CBOW Model Queries ---")
try:
    print("\nQuerying Word Vectors (CBOW):")
    word_vector_fox_cbow = cbow_model.wv['fox']
    print(f"Vector for 'fox' (CBOW, first 5 elements):\n{word_vector_fox_cbow[:5]}...")
except KeyError:
    print("Word 'fox' not found in CBOW vocabulary.")

try:
    print("\nFinding Similar Words (CBOW):")
    similar_to_learning_cbow = cbow_model.wv.most_similar('learning', topn=3)
    print(f"Words similar to 'learning' (CBOW): {similar_to_learning_cbow}")
except KeyError:
    print("Word 'learning' not found in CBOW vocabulary.")

try:
    print("\nPerforming Analogies (CBOW):")
    # Example: 'cat' - 'animal' + 'fruit' = ? (might not work well with small data)
    analogy_result_cbow = cbow_model.wv.most_similar(positive=['quick', 'fox'], negative=['brown'], topn=1)
    print(f"Analogy (CBOW): 'quick' + 'fox' - 'brown' = {analogy_result_cbow}")
except KeyError:
    print("One or more words for analogy not found in CBOW vocabulary.")
except Exception as e:
    print(f"Could not perform CBOW analogy due to: {e}. (Common with very small training data).")


--- CBOW Model Queries ---

Querying Word Vectors (CBOW):
Vector for 'fox' (CBOW, first 5 elements):
[ 0.0079074   0.00659146  0.0093744  -0.00740751  0.00027037]...

Finding Similar Words (CBOW):
Words similar to 'learning' (CBOW): [('at', 0.9988629221916199), ('or', 0.9987952709197998), ('movements', 0.9987720847129822)]

Performing Analogies (CBOW):
Analogy (CBOW): 'quick' + 'fox' - 'brown' = [('mozarteum', 0.41852787137031555)]


In [23]:
# --- Skip-gram Model Queries ---
print("\n--- Skip-gram Model Queries ---")
try:
    print("\nQuerying Word Vectors (Skip-gram):")
    word_vector_fox_skipgram = skipgram_model.wv['fox']
    print(f"Vector for 'fox' (Skip-gram, first 5 elements):\n{word_vector_fox_skipgram[:5]}...")
except KeyError:
    print("Word 'fox' not found in Skip-gram vocabulary.")

try:
    print("\nFinding Similar Words (Skip-gram):")
    similar_to_learning_skipgram = skipgram_model.wv.most_similar('learning', topn=3)
    print(f"Words similar to 'learning' (Skip-gram): {similar_to_learning_skipgram}")
except KeyError:
    print("Word 'learning' not found in Skip-gram vocabulary.")

try:
    print("\nPerforming Analogies (Skip-gram):")
    analogy_result_skipgram = skipgram_model.wv.most_similar(positive=['quick', 'fox'], negative=['brown'], topn=1)
    print(f"Analogy (Skip-gram): 'quick' + 'fox' - 'brown' = {analogy_result_skipgram}")
except KeyError:
    print("One or more words for analogy not found in Skip-gram vocabulary.")
except Exception as e:
    print(f"Could not perform Skip-gram analogy due to: {e}. (Common with very small training data).")



--- Skip-gram Model Queries ---

Querying Word Vectors (Skip-gram):
Vector for 'fox' (Skip-gram, first 5 elements):
[ 0.0079074   0.00659146  0.0093744  -0.00740751  0.00027037]...

Finding Similar Words (Skip-gram):
Words similar to 'learning' (Skip-gram): [('childhood', 0.9991598725318909), ('typical', 0.9990558624267578), ('force', 0.9990442395210266)]

Performing Analogies (Skip-gram):
Analogy (Skip-gram): 'quick' + 'fox' - 'brown' = [('mozarteum', 0.41852787137031555)]


In [25]:
# --- 8. Save and Load the Models (Optional) ---
# Saving the models allows you to reuse them later without retraining.
cbow_model_path = 'cbow_word2vec_model_pandas.bin'
skipgram_model_path = 'skipgram_word2vec_model_pandas.bin'

cbow_model.save(cbow_model_path)
print(f"\nCBOW model saved to {cbow_model_path}")

skipgram_model.save(skipgram_model_path)
print(f"Skip-gram model saved to {skipgram_model_path}")

# Load the models back
loaded_cbow_model = Word2Vec.load(cbow_model_path)
print(f"CBOW model loaded from {cbow_model_path}")

loaded_skipgram_model = Word2Vec.load(skipgram_model_path)
print(f"Skip-gram model loaded from {skipgram_model_path}")

# Verify by checking a vector from the loaded models
try:
    loaded_word_vector_ai_cbow = loaded_cbow_model.wv['ai']
    print(f"Vector for 'ai' from loaded CBOW model (first 5 elements):\n{loaded_word_vector_ai_cbow[:5]}...")
except KeyError:
    print("Word 'ai' not found in CBOW vocabulary (after loading).")

try:
    loaded_word_vector_ai_skipgram = loaded_skipgram_model.wv['ai']
    print(f"Vector for 'ai' from loaded Skip-gram model (first 5 elements):\n{loaded_word_vector_ai_skipgram[:5]}...")
except KeyError:
    print("Word 'ai' not found in Skip-gram vocabulary (after loading).")

2025-06-10 14:56:55,272 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'cbow_word2vec_model_pandas.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-06-10T14:56:55.271947', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'saving'}
2025-06-10 14:56:55,273 : INFO : storing np array 'vectors' to cbow_word2vec_model_pandas.bin.wv.vectors.npy
2025-06-10 14:56:55,299 : INFO : storing np array 'syn1neg' to cbow_word2vec_model_pandas.bin.syn1neg.npy
2025-06-10 14:56:55,328 : INFO : not storing attribute cum_table
2025-06-10 14:56:55,485 : INFO : saved cbow_word2vec_model_pandas.bin
2025-06-10 14:56:55,485 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'skipgram_word2vec_model_pandas.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-06-10T14:56:55.485579', 'gensim': '4.3.3', 'pytho


CBOW model saved to cbow_word2vec_model_pandas.bin
Skip-gram model saved to skipgram_word2vec_model_pandas.bin


2025-06-10 14:56:56,082 : INFO : Word2Vec lifecycle event {'fname': 'cbow_word2vec_model_pandas.bin', 'datetime': '2025-06-10T14:56:56.082428', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'loaded'}
2025-06-10 14:56:56,082 : INFO : loading Word2Vec object from skipgram_word2vec_model_pandas.bin
2025-06-10 14:56:56,105 : INFO : loading wv recursively from skipgram_word2vec_model_pandas.bin.wv.* with mmap=None
2025-06-10 14:56:56,105 : INFO : loading vectors from skipgram_word2vec_model_pandas.bin.wv.vectors.npy with mmap=None
2025-06-10 14:56:56,116 : INFO : loading syn1neg from skipgram_word2vec_model_pandas.bin.syn1neg.npy with mmap=None
2025-06-10 14:56:56,127 : INFO : setting ignored attribute cum_table to None


CBOW model loaded from cbow_word2vec_model_pandas.bin


2025-06-10 14:56:56,594 : INFO : Word2Vec lifecycle event {'fname': 'skipgram_word2vec_model_pandas.bin', 'datetime': '2025-06-10T14:56:56.594725', 'gensim': '4.3.3', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'loaded'}


Skip-gram model loaded from skipgram_word2vec_model_pandas.bin
Vector for 'ai' from loaded CBOW model (first 5 elements):
[-0.0085533   0.00892655  0.00210113 -0.00927936 -0.00496012]...
Vector for 'ai' from loaded Skip-gram model (first 5 elements):
[-0.0085533   0.00892655  0.00210113 -0.00927936 -0.00496012]...
