# Practical 2
## Section 1: Connecting Google Drive to read the dataset

In [None]:
import os

In [None]:
os.listdir()

['.config', 'drive', 'sample_data']

In [None]:
os.chdir('/content/drive/MyDrive/practical/practical_2')

In [None]:
os.listdir()

['practical_2.ipynb']

In [None]:
# Folder on the mounted Google Drive that holds the dataset (text.txt).
# Changing into it lets the following cells use relative file names.
ROOT = "/content/drive/MyDrive/practical_data"
import os
os.chdir(ROOT)

In [None]:
#import os
os.listdir()

['text.txt', 'my_word2vec_model.model']

In [None]:
# Read the corpus as a list of lines (each keeps its trailing newline).
# The relative name works because the working directory is ROOT; pass a full
# path instead if the dataset lives somewhere else.
with open('text.txt', 'r', encoding='utf-8') as f:
    texts = f.readlines()

In [None]:
texts[:30]

["                ALICE'S ADVENTURES IN WONDERLAND\n",
 '\n',
 '                          Lewis Carroll\n',
 '\n',
 '               THE MILLENNIUM FULCRUM EDITION 3.0\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '                            CHAPTER I\n',
 '\n',
 '                      Down the Rabbit-Hole\n',
 '\n',
 '\n',
 '  Alice was beginning to get very tired of sitting by her sister\n',
 'on the bank, and of having nothing to do:  once or twice she had\n',
 'peeped into the book her sister was reading, but it had no\n',
 "pictures or conversations in it, `and what is the use of a book,'\n",
 "thought Alice `without pictures or conversation?'\n",
 '\n',
 '  So she was considering in her own mind (as well as she could,\n',
 'for the hot day made her feel very sleepy and stupid), whether\n',
 'the pleasure of making a daisy-chain would be worth the trouble\n',
 'of getting up and picking the daisies, when suddenly a White\n',
 'Rabbit with pink eyes ran close by her.\n',
 '\n',
 '  There was no



The `assess_data_quality` function below calculates key metrics such as:
- Vocabulary size
- Average sentence length
- Word frequencies
- Vocabulary diversity and the ratio of rare words


In [None]:
def assess_data_quality(texts):
    """Analyze text data quality for Word2Vec training.

    Every element of ``texts`` is lower-cased and split on whitespace; no
    punctuation stripping is done, so ``"book,"`` and ``"book"`` count as
    different words. Blank elements contribute a length of 0 and therefore
    lower the average sentence length.

    Args:
        texts: Sized collection of strings (e.g. the lines of a file).

    Returns:
        dict with the keys
            ``total_documents``      number of elements in ``texts``
            ``total_words``          total whitespace-separated tokens
            ``unique_words``         set of distinct tokens
            ``sentence_lengths``     token count per element
            ``word_frequencies``     token -> occurrence count
            ``vocabulary_size``      number of distinct tokens
            ``avg_sentence_length``  mean of ``sentence_lengths``
            ``top_words``            up to 20 ``(token, count)`` pairs, most
                                     frequent first
            ``quality_score``        dict with ``vocabulary_diversity``,
                                     ``avg_word_frequency`` and
                                     ``rare_words_ratio``
        Every ratio is ``0.0`` when its denominator would be zero (empty
        corpus or corpus with only blank lines).
    """

    def _ratio(numerator, denominator):
        # Empty or all-blank corpora would otherwise raise ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    stats = {
        'total_documents': len(texts),
        'total_words': 0,
        'unique_words': set(),
        'sentence_lengths': [],
        'word_frequencies': {}
    }
    frequencies = stats['word_frequencies']

    for text in texts:
        words = text.lower().split()
        stats['total_words'] += len(words)
        stats['sentence_lengths'].append(len(words))
        stats['unique_words'].update(words)

        for word in words:
            frequencies[word] = frequencies.get(word, 0) + 1

    stats['vocabulary_size'] = len(stats['unique_words'])
    stats['avg_sentence_length'] = _ratio(
        sum(stats['sentence_lengths']), len(stats['sentence_lengths'])
    )

    # Most common words; the sort is stable, so ties keep first-seen order.
    sorted_words = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    stats['top_words'] = sorted_words[:20]

    # Quality indicators
    singletons = sum(1 for count in frequencies.values() if count == 1)
    stats['quality_score'] = {
        'vocabulary_diversity': _ratio(stats['vocabulary_size'], stats['total_words']),
        'avg_word_frequency': _ratio(stats['total_words'], stats['vocabulary_size']),
        'rare_words_ratio': _ratio(singletons, stats['vocabulary_size'])
    }

    return stats

# Example usage: summarize the raw lines read from text.txt.
# `texts` holds one element per line of the file, so "documents" and
# "sentence length" below are per line, not per grammatical sentence.
quality_report = assess_data_quality(texts)
print(f"Total documents: {quality_report['total_documents']:,}")
print(f"Vocabulary size: {quality_report['vocabulary_size']:,}")
# print(f"Unique Words: {quality_report['unique_words']}")
print(f"Average sentence length: {quality_report['avg_sentence_length']:.1f}")
print(f"Vocabulary diversity: {quality_report['quality_score']['vocabulary_diversity']:.4f}")

Total documents: 3,598
Vocabulary size: 4,950
Average sentence length: 7.4
Vocabulary diversity: 0.1870


Setting up the **Python environment** for **natural language processing** (NLP) tasks

In [None]:
#Import Packages
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import nltk

Download required **NLTK data**

In [None]:
# Data packages needed by the NLTK helpers imported above.
nltk.download('punkt')                       # Punkt tokenizer models (sent_tokenize / word_tokenize)
nltk.download('stopwords')                   # stopword lists (stopwords.words)
nltk.download('wordnet')                     # WordNet data (WordNetLemmatizer)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (pos_tag)
nltk.download('punkt_tab')                   # tabular Punkt data required by newer NLTK releases

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Section 2: Data preprocessing

Performing basic preprocessing such as:
- Lowercasing the text
- Removing punctuation and numbers
- Removing stopwords
- Lemmatizing, and removing URLs and email addresses

In [None]:
class AdvancedTextPreprocessor:
    """Comprehensive text preprocessing for Word2Vec training.

    Turns raw strings into lists of tokens. Each string is cleaned (URLs,
    e-mail addresses and repeated whitespace removed), optionally split into
    sentences, and each token is filtered according to the constructor flags.

    Args:
        lowercase: Lower-case text before tokenizing.
        remove_punctuation: Strip every ``string.punctuation`` character from
            each token (``"rabbit-hole"`` becomes ``"rabbithole"``).
        remove_numbers: Drop tokens made up entirely of digits.
        remove_stopwords: Drop NLTK English stopwords.
        min_word_length: Shortest token length that is kept.
        max_word_length: Longest token length that is kept.
        lemmatize: Lemmatize tokens with ``WordNetLemmatizer``.
        remove_urls: Remove ``http(s)://...`` and ``www....`` spans.
        remove_emails: Remove whitespace-delimited spans containing ``@``.
        keep_sentences: If true, output one token list per sentence;
            otherwise one token list per input text.
        min_sentence_length: With ``keep_sentences``, sentences that have
            fewer tokens than this after filtering are discarded.

    Note:
        The stopword set and the lemmatizer are only created in ``__init__``
        when their flag is true, so do not switch ``remove_stopwords`` or
        ``lemmatize`` on after construction.
    """

    # Built once at class-definition time instead of once per call/token.
    # The URL pattern requires "://" or "www." so that ordinary words that
    # merely start with "http" or "www" are not deleted.
    _URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
    _EMAIL_PATTERN = re.compile(r'\S+@\S+')
    _WHITESPACE_PATTERN = re.compile(r'\s+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self,
                 lowercase=True,
                 remove_punctuation=True,
                 remove_numbers=False,
                 remove_stopwords=False,
                 min_word_length=2,
                 max_word_length=50,
                 lemmatize=False,
                 remove_urls=True,
                 remove_emails=True,
                 keep_sentences=True,
                 min_sentence_length=3):

        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.remove_stopwords = remove_stopwords
        self.min_word_length = min_word_length
        self.max_word_length = max_word_length
        self.lemmatize = lemmatize
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.keep_sentences = keep_sentences
        self.min_sentence_length = min_sentence_length

        if remove_stopwords:
            self.stop_words = set(stopwords.words('english'))

        if lemmatize:
            self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Remove URLs/e-mails (if enabled) and collapse whitespace.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string with runs of whitespace replaced by a single
            space and no leading or trailing whitespace.
        """

        if self.remove_urls:
            text = self._URL_PATTERN.sub('', text)

        if self.remove_emails:
            text = self._EMAIL_PATTERN.sub('', text)

        # Collapse whitespace last so gaps left by the removals disappear too.
        return self._WHITESPACE_PATTERN.sub(' ', text).strip()

    def tokenize_text(self, text):
        """Tokenize text into sentences or words.

        Args:
            text: Cleaned input string.

        Returns:
            With ``keep_sentences``: a list of token lists, one per sentence
            that still has at least ``min_sentence_length`` tokens.
            Otherwise: a single token list for the whole text.
        """

        if not self.keep_sentences:
            return self.process_sentence(text)

        processed_sentences = []
        for sentence in sent_tokenize(text):
            words = self.process_sentence(sentence)
            if len(words) >= self.min_sentence_length:
                processed_sentences.append(words)

        return processed_sentences

    def process_sentence(self, sentence):
        """Tokenize one sentence and apply the per-token filters.

        Filters run in this order for every token: punctuation stripping,
        empty-token check, all-digit check, length bounds, stopword check,
        lemmatization.

        Args:
            sentence: Sentence (or arbitrary text) to tokenize.

        Returns:
            List of tokens that survived all filters.
        """

        if self.lowercase:
            sentence = sentence.lower()

        processed_words = []
        for word in word_tokenize(sentence):

            if self.remove_punctuation:
                word = word.translate(self._PUNCTUATION_TABLE)

            # Pure-punctuation tokens become empty after stripping.
            if not word:
                continue

            # Only all-digit tokens are dropped; "2nd" or "3d" are kept.
            if self.remove_numbers and word.isdigit():
                continue

            if len(word) < self.min_word_length or len(word) > self.max_word_length:
                continue

            if self.remove_stopwords and word in self.stop_words:
                continue

            if self.lemmatize:
                word = self.lemmatizer.lemmatize(word)

            processed_words.append(word)

        return processed_words

    def preprocess_corpus(self, texts):
        """Preprocess an entire corpus.

        Args:
            texts: Iterable of raw strings; non-string items are skipped.

        Returns:
            With ``keep_sentences``: a flat list of token lists, one per kept
            sentence across all texts. Otherwise: one token list per text.
        """

        all_sentences = []

        for text in texts:
            if not isinstance(text, str):
                continue

            processed = self.tokenize_text(self.clean_text(text))

            if self.keep_sentences:
                all_sentences.extend(processed)
            else:
                all_sentences.append(processed)

        return all_sentences

**Cross check the preprocessed data**

In [None]:
# Example usage: build a preprocessor tuned for Word2Vec and run it on the
# raw lines. Each line is sentence-tokenized on its own, so a sentence that
# spans several lines of the file is processed as separate fragments.
preprocessor = AdvancedTextPreprocessor(
    lowercase=True,
    remove_punctuation = True,
    remove_numbers=True,
    remove_stopwords=False,  # Keep stopwords for Word2Vec
    lemmatize=False,  # Usually not needed for Word2Vec
    keep_sentences=True
)

# Processing corpus: the result is a list of token lists, one per kept sentence
processed_sentences = preprocessor.preprocess_corpus(texts)
print(f"Processed {len(processed_sentences)} sentences")
print(f"Sample sentence: {processed_sentences[0]}")

Processed 2941 sentences
Sample sentence: ['alice', 'adventures', 'in', 'wonderland']


In [None]:
processed_sentences[:3]

[['alice', 'adventures', 'in', 'wonderland'],
 ['the', 'millennium', 'fulcrum', 'edition'],
 ['down', 'the', 'rabbithole']]

This Python function, `recommend_parameters`, provides a set of **recommended parameters** for training a **Word2Vec** model. It takes four inputs: `corpus_size`, `vocab_size`, `domain_type`, and `computing_resources`. The function uses a series of conditional statements to determine optimal values for common Word2Vec parameters like:

* **`vector_size`**: The dimensionality of the word vectors, which increases for larger corpora to capture more nuance.
* **`window`**: The maximum distance between the current and predicted word, adjusted based on the `domain_type` (e.g., smaller for technical domains to focus on syntax, larger for academic domains to capture broader semantic context).
* **`min_count`**: The minimum frequency of a word to be included in the vocabulary, scaled with the `corpus_size`.
* **`sg`**: The training algorithm, with Skip-gram (`sg=1`) being recommended for technical and academic domains to better handle rare words, while CBOW (`sg=0`) is used for general text.
* **`epochs`**: The number of training iterations, which is influenced by both corpus size and `computing_resources`.
* **`hs` vs. `negative`**: The training optimization, where hierarchical softmax (`hs=1`) is recommended for very large vocabularies to improve efficiency, while negative sampling (`negative=10`) is preferred for smaller vocabularies.

The final output is a dictionary containing the recommended parameter values tailored to the specific characteristics of the input data and available resources.

In [None]:
def recommend_parameters(corpus_size, vocab_size, domain_type, computing_resources):
    """
    Suggest Word2Vec hyperparameters from simple corpus heuristics.

    Args:
        corpus_size: Number of sentences/documents. Drives ``vector_size``,
            ``min_count`` and (unless resources are limited) ``epochs``.
        vocab_size: Unique words in vocabulary. Only decides between
            hierarchical softmax and negative sampling.
        domain_type: 'general', 'technical', 'social_media' or 'academic'.
            Any other value gets the 'general' window and CBOW.
        computing_resources: 'limited', 'moderate' or 'high'. Only 'limited'
            is treated specially (fewer epochs).

    Returns:
        dict with the keys ``vector_size``, ``window``, ``min_count``, ``sg``,
        ``epochs``, ``hs`` and ``negative``, in that order.
    """

    # (exclusive upper bound on corpus_size, vector_size, min_count);
    # corpora at or above the last bound use the fallback values below.
    size_tiers = (
        (10000, 50, 1),
        (100000, 100, 2),
        (1000000, 200, 5),
    )
    vector_size, min_count = 300, 10
    for upper_bound, tier_vector_size, tier_min_count in size_tiers:
        if corpus_size < upper_bound:
            vector_size, min_count = tier_vector_size, tier_min_count
            break

    # Narrow windows lean syntactic, wide windows lean semantic.
    window_by_domain = {
        'general': 5,
        'technical': 3,
        'social_media': 4,
        'academic': 6,
    }
    window = window_by_domain.get(domain_type, 5)

    # Skip-gram (1) for domains with rare specialist terms, CBOW (0) otherwise.
    training_algorithm = 1 if domain_type in ('technical', 'academic') else 0

    if computing_resources == 'limited':
        epochs = 5
    else:
        epochs = 15 if corpus_size < 100000 else 10

    # Very large vocabularies use hierarchical softmax instead of negative sampling.
    use_hierarchical_softmax = vocab_size > 100000

    return {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'sg': training_algorithm,
        'epochs': epochs,
        'hs': 1 if use_hierarchical_softmax else 0,
        'negative': 0 if use_hierarchical_softmax else 10,
    }

In [None]:
# Number of tokenized sentences that will be passed to Word2Vec.
corpus_size = len(processed_sentences)
print(f"Corpus Size: {corpus_size}")

# Distinct tokens across all processed sentences (before any min_count filtering).
vocab = set().union(*processed_sentences)
vocab_size = len(vocab)
print(f"Vocabulary Size: {vocab_size}")

Corpus Size: 2941
Vocabulary Size: 2519


In [None]:
# For this task: a small, general-domain corpus on a moderately sized machine.
# The sizes come from the processed sentences computed in the previous cell.
params = recommend_parameters(
    corpus_size=corpus_size,
    vocab_size=vocab_size,
    domain_type='general',
    computing_resources='moderate'
)
print("Recommended parameters:", params)

Recommended parameters: {'vector_size': 50, 'window': 5, 'min_count': 1, 'sg': 0, 'epochs': 15, 'hs': 0, 'negative': 10}


## Section 3: Training the model

Installing **Gensim** to easily implement and train powerful topic modeling and word embedding models like **Word2Vec**.

In [None]:
pip install gensim



We use Gensim's `Word2Vec` class, which handles all the complex mathematical and computational aspects of the training process, instead of manually coding the neural network architecture, the optimizations (like negative sampling and hierarchical softmax), and the multithreading from scratch.

The library also provides utility classes, such as `CallbackAny2Vec`, which the code uses to log training progress, making it easier to monitor your model's performance.

In [None]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import time
import multiprocessing

class EpochLogger(CallbackAny2Vec):
    """Callback to log information about training progress.

    Prints a start line and an end line for each logged epoch. The elapsed
    time is measured from the moment the logger was created, so it is
    cumulative across epochs rather than per epoch.

    Args:
        log_every: Log only every ``log_every``-th epoch (epoch 0, then
            ``log_every``, ``2 * log_every``, ...). The default of 1 logs
            every epoch; raise it for long runs to avoid flooding the output.
    """

    def __init__(self, log_every=1):
        self.epoch = 0
        self.start_time = time.time()
        self.log_every = max(1, int(log_every))
        # Last cumulative loss reading, used to derive a per-epoch figure.
        self._previous_loss = 0.0

    def _should_log(self):
        """Return True when the current epoch falls on the logging interval."""
        return self.epoch % self.log_every == 0

    def on_epoch_begin(self, model):
        """Announce the start of the current epoch."""
        if self._should_log():
            print(f"Epoch #{self.epoch} start")

    def on_epoch_end(self, model):
        """Report elapsed time (and loss, if tracked), then advance the counter."""
        elapsed = time.time() - self.start_time
        message = f"Epoch #{self.epoch} end - Time elapsed: {elapsed:.2f}s"

        # Only available when the model was trained with compute_loss=True.
        # gensim reports a running total for the whole training call, so the
        # per-epoch value is the difference from the previous reading; treat
        # it as approximate.
        if getattr(model, 'compute_loss', False):
            cumulative_loss = model.get_latest_training_loss()
            message += f" - Loss: {cumulative_loss - self._previous_loss:.2f}"
            self._previous_loss = cumulative_loss

        if self._should_log():
            print(message)
        self.epoch += 1

def train_word2vec_model(sentences, save_path=None, **params):
    """
    Train Word2Vec model with given parameters

    Args:
        sentences: List of tokenized sentences
        save_path: Path to save the model; nothing is saved when falsy
        **params: Word2Vec keyword arguments that override the defaults
            below. A ``callbacks`` list may be supplied; it is appended to
            the built-in progress logger.

    Returns:
        The trained ``Word2Vec`` model.
    """

    # Set default parameters
    default_params = {
        'vector_size': 100,
        'window': 5,
        'min_count': 5,
        # cpu_count() - 1 would be 0 on a single-core machine; always keep
        # at least one worker thread.
        'workers': max(1, multiprocessing.cpu_count() - 1),
        'sg': 0,  # CBOW
        'epochs': 10,
        'alpha': 0.025,
        'min_alpha': 0.0001,
        'hs': 0,
        'negative': 10
    }

    # Update with provided parameters
    default_params.update(params)

    # Pull out caller callbacks so they do not collide with the explicit
    # callbacks= argument passed to Word2Vec below.
    extra_callbacks = list(default_params.pop('callbacks', None) or [])

    print("Training Word2Vec model with parameters:")
    for key, value in default_params.items():
        print(f"  {key}: {value}")

    # Add callback for progress monitoring
    epoch_logger = EpochLogger()

    # Train the model (the constructor both builds the vocabulary and trains)
    print(f"\nTraining on {len(sentences)} sentences...")
    start_time = time.time()

    model = Word2Vec(
        sentences=sentences,
        callbacks=[epoch_logger, *extra_callbacks],
        **default_params
    )

    training_time = time.time() - start_time
    print(f"\nTraining completed in {training_time:.2f} seconds")
    print(f"Vocabulary size: {len(model.wv)} words")

    # Save model if path provided
    if save_path:
        model.save(save_path)
        print(f"Model saved to {save_path}")

    return model

In [None]:
# Example usage
# NOTE(review): window, min_count and epochs here override the values printed
# by recommend_parameters above (window=5, min_count=1, epochs=15). With
# epochs=10000 the per-epoch log is long enough that Colab truncates it;
# confirm that this many passes over such a small corpus is intended.
model = train_word2vec_model(
    sentences=processed_sentences,
    save_path='my_word2vec_model.model',
    vector_size=50,
    window=2,
    min_count=2,
    epochs=10000,
    compute_loss = True
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch #7502 start
Epoch #7502 end - Time elapsed: 410.81s
Epoch #7503 start
Epoch #7503 end - Time elapsed: 410.86s
Epoch #7504 start
Epoch #7504 end - Time elapsed: 410.91s
Epoch #7505 start
Epoch #7505 end - Time elapsed: 410.96s
Epoch #7506 start
Epoch #7506 end - Time elapsed: 411.01s
Epoch #7507 start
Epoch #7507 end - Time elapsed: 411.05s
Epoch #7508 start
Epoch #7508 end - Time elapsed: 411.10s
Epoch #7509 start
Epoch #7509 end - Time elapsed: 411.15s
Epoch #7510 start
Epoch #7510 end - Time elapsed: 411.20s
Epoch #7511 start
Epoch #7511 end - Time elapsed: 411.25s
Epoch #7512 start
Epoch #7512 end - Time elapsed: 411.29s
Epoch #7513 start
Epoch #7513 end - Time elapsed: 411.35s
Epoch #7514 start
Epoch #7514 end - Time elapsed: 411.39s
Epoch #7515 start
Epoch #7515 end - Time elapsed: 411.44s
Epoch #7516 start
Epoch #7516 end - Time elapsed: 411.49s
Epoch #7517 start
Epoch #7517 end - Time elapsed: 411.54s
Epoch #

In [None]:
# Vocabulary actually kept by the trained model.
# NOTE: this reassigns `vocab_size`, replacing the value computed earlier
# from `processed_sentences`; re-running earlier cells afterwards will see
# whichever assignment ran last.
vocab_size = len(model.wv.index_to_key)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 1396


In [None]:
# `index_to_key` is the model's full vocabulary list; only the first 10
# entries are printed, despite the "All Words" label.
all_words = model.wv.index_to_key
print("All Words in Vocabulary:", all_words[:10])

All Words in Vocabulary: ['the', 'and', 'to', 'it', 'she', 'of', 'said', 'you', 'in', 'was']


### Word2VecEvaluator

Defined a **`Word2VecEvaluator` class** that provides a suite of evaluation methods for Word2Vec models.  
It helps assess how well a trained model captures semantic relationships between words.

#### Features

- **`evaluate_word_similarity`**  
  Compares the model’s similarity scores for word pairs against human-annotated scores using *Spearman correlation*.

- **`evaluate_analogies`**  
  Tests the model on analogy tasks (e.g., *king : queen :: man : woman*) and calculates accuracy.

- **`evaluate_odd_one_out`**  
  Evaluates the model’s ability to detect the odd word in a group  
  (e.g., `["apple", "banana", "car", "orange"]`).

- **`analyze_vocabulary_coverage`**  
  Checks how much of the vocabulary in given test texts is covered by the model and reports unknown words.

- **`compare_with_baseline`**  
  Compares the model against a baseline Word2Vec model by analyzing similarity correlations on common words.


In [None]:
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

class Word2VecEvaluator:
    """Comprehensive evaluation suite for Word2Vec models.

    Wraps a trained model and runs intrinsic evaluations against its keyed
    vectors (``model.wv``): word similarity, analogies, odd-one-out,
    vocabulary coverage and comparison with a baseline model. Every method
    prints a short report and returns the headline figure.
    """

    def __init__(self, model):
        """Store the model and a shortcut to its keyed vectors.

        Args:
            model: Trained model exposing a ``wv`` attribute.
        """
        self.model = model
        self.wv = model.wv

    def evaluate_word_similarity(self, word_pairs_with_scores, min_pairs=10):
        """
        Evaluate model on word similarity datasets

        Args:
            word_pairs_with_scores: List of tuples (word1, word2, human_score)
            min_pairs: Minimum number of in-vocabulary pairs required before
                a correlation is reported.

        Returns:
            Spearman correlation with human judgments, or None when fewer
            than ``min_pairs`` pairs are fully in the vocabulary.
        """

        model_similarities = []
        human_similarities = []

        for word1, word2, human_score in word_pairs_with_scores:
            try:
                model_sim = self.wv.similarity(word1, word2)
            except KeyError:
                # Skip if words not in vocabulary
                continue
            model_similarities.append(model_sim)
            human_similarities.append(human_score)

        if len(model_similarities) < min_pairs:
            print("Warning: Too few valid word pairs for reliable evaluation")
            return None

        correlation, p_value = spearmanr(human_similarities, model_similarities)

        print(f"Word Similarity Evaluation:")
        print(f"Valid pairs: {len(model_similarities)}")
        print(f"Spearman correlation: {correlation:.4f}")
        print(f"P-value: {p_value:.4f}")

        return correlation

    def evaluate_analogies(self, analogy_dataset):
        """
        Evaluate model on word analogy tasks

        Args:
            analogy_dataset: List of tuples (word_a, word_b, word_c, word_d)
                           representing "word_a is to word_b as word_c is to word_d"

        Returns:
            Accuracy on analogy task (0 when no analogy could be evaluated)
        """

        correct = 0
        total = 0

        for word_a, word_b, word_c, expected_d in analogy_dataset:
            try:
                # "a is to b as c is to d"  =>  d ~ b - a + c,
                # e.g. queen - king + man ~ woman.
                result = self.wv.most_similar(
                    positive=[word_b, word_c],
                    negative=[word_a],
                    topn=1
                )
                predicted_d = result[0][0]
            except (KeyError, IndexError):
                # Skip if words not in vocabulary
                continue

            if predicted_d.lower() == expected_d.lower():
                correct += 1
            total += 1

        if total == 0:
            print("Warning: No valid analogies found")
            return 0

        accuracy = correct / total

        print(f"Analogy Evaluation:")
        print(f"Valid analogies: {total}")
        print(f"Correct predictions: {correct}")
        print(f"Accuracy: {accuracy:.4f}")

        return accuracy

    def evaluate_odd_one_out(self, word_groups, expected_odd_words=None):
        """
        Evaluate model's ability to identify odd words in groups

        Args:
            word_groups: List of lists, each containing words where one doesn't belong
            expected_odd_words: Optional list with the true odd word for each
                group, in the same order as ``word_groups``. Without it no
                ground truth exists, so every group the model can process is
                counted as correct and the result is not a real accuracy.

        Returns:
            Accuracy on odd-one-out task (0 when no group could be evaluated)

        Raises:
            ValueError: If ``expected_odd_words`` is given but its length
                differs from ``word_groups``.
        """

        has_labels = expected_odd_words is not None
        if has_labels:
            labels = list(expected_odd_words)
            if len(labels) != len(word_groups):
                raise ValueError(
                    "expected_odd_words must have one entry per word group"
                )
        else:
            labels = [None] * len(word_groups)

        correct = 0
        total = 0

        for group, expected in zip(word_groups, labels):
            if len(group) < 3:
                continue

            try:
                # Find the word that doesn't match others
                odd_word = self.wv.doesnt_match(group)
            except (KeyError, ValueError):
                # Group could not be scored (e.g. no word in the vocabulary)
                continue

            if not has_labels or odd_word == expected:
                correct += 1
            total += 1

        if total == 0:
            return 0

        accuracy = correct / total

        print(f"Odd-One-Out Evaluation:")
        print(f"  Valid groups: {total}")
        print(f"  Accuracy: {accuracy:.4f}")
        if not has_labels:
            print("  Note: no expected_odd_words given; accuracy only counts "
                  "groups the model could process")

        return accuracy

    def analyze_vocabulary_coverage(self, test_texts):
        """
        Analyze how well model vocabulary covers test texts

        Texts are lower-cased and split on whitespace only, so tokens with
        attached punctuation count as unknown unless the model contains them
        in that exact form.

        Args:
            test_texts: List of text strings

        Returns:
            dict with ``coverage_ratio`` (covered tokens / all tokens, 0 for
            empty input), ``unknown_words`` (up to 20 unknown words in
            alphabetical order) and ``total_unknown`` (distinct unknown words)
        """

        vocab = set(self.wv.index_to_key)

        total_words = 0
        covered_words = 0
        unknown_words = set()

        for text in test_texts:
            words = text.lower().split()
            total_words += len(words)

            for word in words:
                if word in vocab:
                    covered_words += 1
                else:
                    unknown_words.add(word)

        coverage_ratio = covered_words / total_words if total_words > 0 else 0

        print(f"Vocabulary Coverage Analysis:")
        print(f"  Total words in test: {total_words}")
        print(f"  Covered words: {covered_words}")
        print(f"  Coverage ratio: {coverage_ratio:.4f}")
        print(f"  Unknown words: {len(unknown_words)}")

        return {
            'coverage_ratio': coverage_ratio,
            # Sorted so the sample is the same on every run.
            'unknown_words': sorted(unknown_words)[:20],
            'total_unknown': len(unknown_words)
        }

    def compare_with_baseline(self, baseline_model, test_words):
        """
        Compare model performance with baseline model

        Computes pairwise similarities among the first 20 words known to both
        models and correlates the two models' similarity values.

        Args:
            baseline_model: Another Word2Vec model to compare against
            test_words: List of words to test

        Returns:
            Spearman correlation between the two models' pairwise
            similarities, or None when fewer than 10 words are shared.
        """

        common_words = [
            word for word in test_words
            if word in self.wv and word in baseline_model.wv
        ]

        if len(common_words) < 10:
            print("Warning: Too few common words for reliable comparison")
            return None

        # Limit to a fixed subset to avoid too many comparisons.
        subset = common_words[:20]

        model_sims = []
        baseline_sims = []
        for i, word1 in enumerate(subset):
            for word2 in subset[i + 1:]:
                model_sims.append(self.wv.similarity(word1, word2))
                baseline_sims.append(baseline_model.wv.similarity(word1, word2))

        correlation, _ = spearmanr(model_sims, baseline_sims)

        print(f"Model Comparison:")
        print(f"  Common vocabulary: {len(common_words)}")
        print(f"  Similarity correlation: {correlation:.4f}")

        return correlation

In [None]:
# Example evaluation datasets
# Each entry is (word1, word2, human_score).
# NOTE: only 6 pairs are listed, while evaluate_word_similarity needs at least
# 10 in-vocabulary pairs by default, so `sim_score` below will be None.
word_similarity_pairs = [
    ('king', 'queen', 8.5),
    ('man', 'woman', 8.3),
    ('car', 'automobile', 9.2),
    ('computer', 'laptop', 7.8),
    ('cat', 'dog', 6.1),
    ('happy', 'sad', 2.1),
]

# Each entry is (a, b, c, d), read as "a is to b as c is to d".
analogy_examples = [
    ('king', 'queen', 'man', 'woman'),
    ('paris', 'france', 'london', 'england'),
    ('walking', 'walked', 'running', 'ran'),
    ('good', 'better', 'bad', 'worse'),
]

# Usage example
evaluator = Word2VecEvaluator(model)
sim_score = evaluator.evaluate_word_similarity(word_similarity_pairs)
analogy_score = evaluator.evaluate_analogies(analogy_examples)

Analogy Evaluation:
Valid analogies: 3
Correct predictions: 0
Accuracy: 0.0000


In [None]:
# Show the ten nearest neighbours of a query word, if the model knows it.
word = "alice"
if word not in model.wv:
    print("Word is not in the vocabulary.")
else:
    similar_words = model.wv.most_similar(word, topn=10)
    print(f"Most similar words to '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity}")

Most similar words to 'alice':
she: 0.6182680130004883
king: 0.4920527935028076
dormouse: 0.4626016914844513
pig: 0.4381174147129059
it: 0.43195730447769165
he: 0.40746310353279114
mouse: 0.3474424183368683
snail: 0.34479963779449463
sulkily: 0.3345183730125427
thoughtfully: 0.3293803632259369


In [None]:
model.wv.similarity('king', 'man')

0.41290423