# Word Mover Distance using NLTK

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim.downloader as api
from gensim.models import Word2Vec

In [7]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalnum() and token not in stop_words]

In [9]:
def word_mover_distance(text1, text2, model):
    # Preprocess the texts
    tokens1 = preprocess_text(text1)
    tokens2 = preprocess_text(text2)

    # Compute Word Mover's Distance
    distance = model.wmdistance(tokens1, tokens2)
    return distance

In [10]:
pip install POT #Python Optimal Transport library

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#Python'


In [14]:
if __name__ == "__main__":
    # Load the text8 corpus
    corpus = api.load('text8')  # download the corpus and return it opened as an iterable

    # Train the Word2Vec model on the text8 corpus
    word2vec_model = Word2Vec(corpus)

    # Save the model for later use
    word2vec_model.save("text8_word2vec_model")

    # Load the saved Word2Vec model
    word2vec_model = Word2Vec.load("text8_word2vec_model")

    # Sample texts
    text1 = "I like cats"
    text2 = "I love dogs"

    # Compute Word Mover's Distance
    wmd = word_mover_distance(text1, text2, word2vec_model.wv)

    print(f"Word Mover's Distance between the texts: {wmd}")

Word Mover's Distance between the texts: 0.9422008546723


# Word Mover Distance using Spacy

In [None]:
import spacy

!python -m spacy download en_core_web_lg

# Load the spaCy model with word vectors
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
def preprocess_text(text):
    # Tokenize and preprocess the text
    doc = nlp(text)
    # Remove stopwords and non-alphanumeric tokens
    tokens = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]
    return tokens

In [None]:
def word_mover_distance(text1, text2):
    # Preprocess the texts
    tokens1 = preprocess_text(text1)
    tokens2 = preprocess_text(text2)

    # Compute Word Mover's Distance
    distance = nlp(' '.join(tokens1)).similarity(nlp(' '.join(tokens2)))
    return distance

In [None]:
if __name__ == "__main__":
    # Sample texts
    text1 = "I like cats"
    text2 = "I love dogs"

    # Compute Word Mover's Distance
    wmd = word_mover_distance(text1, text2)

    print(f"Word Mover's Distance between the texts: {wmd}")

Word Mover's Distance between the texts: 0.7573659752299831
