In [None]:
# Computes document similarity from averaged Word2Vec (CBOW) word embeddings

In [1]:
import numpy as np
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.data import find
import string

In [2]:
# Download NLTK data if not already available.
# Each resource is probed on its own: checking only the stop-word corpus would
# skip the tokenizer download on machines that already have stopwords, and
# word_tokenize() would then fail with LookupError.
# 'punkt_tab' is the tokenizer data used by newer NLTK releases; 'punkt' is
# kept for older ones.
_NLTK_RESOURCES = (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
)
for _resource_path, _package in _NLTK_RESOURCES:
    try:
        find(_resource_path)
    except LookupError:
        nltk.download(_package)

In [3]:
# Example sentences
# Each string is treated as one "document"; the rest of the notebook tokenizes
# these, trains Word2Vec on them and compares them pairwise.
sentences = [
    "I love natural language processing",
    "Language processing is fascinating"
]

In [4]:
# Tokenize and preprocess sentences
def preprocess(text, stop_words=None):
    """Lower-case and tokenize ``text``, keeping only alphabetic non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw sentence or document.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, NLTK's English stop-word list is
        loaded. Callers processing many documents can pass a prebuilt set to
        avoid reloading the list on every call.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order, excluding stop words and
        any token for which ``str.isalpha()`` is false (punctuation, numbers,
        mixed tokens).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks in the filter below.
    excluded = frozenset(stop_words)
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in excluded
    ]

In [5]:
# Run every example sentence through preprocess(), preserving sentence order.
tokenized_sentences = list(map(preprocess, sentences))

In [6]:
# Train Word2Vec model
# Use sg=0 for CBOW or sg=1 for Skip-gram
# Reproducibility: the seed is pinned explicitly (1 is gensim's default) and
# training is restricted to a single worker thread, because multi-threaded
# training introduces ordering jitter that changes the learned vectors (and
# therefore the similarity printed at the end) from run to run.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=50,   # dimensionality of each word embedding
    window=5,         # max distance between the target and a context word
    sg=0,             # CBOW
    min_count=1,      # keep every word; the toy corpus has no repeated rare-word noise to prune
    seed=1,
    workers=1,
)

In [7]:
# Save the model (optional)
# Writes the trained model to "word2vec.model"; the path is relative, so the
# file lands in the kernel's current working directory. It can be restored
# with Word2Vec.load("word2vec.model").
model.save("word2vec.model")

In [None]:
# Load the model (optional)
# model = Word2Vec.load("word2vec.model")

In [8]:
# Keep a handle on the trained word vectors and take a list copy of the
# vocabulary, in the model's own index order.
word_vectors = model.wv
vocab = [*word_vectors.index_to_key]

In [9]:
# Show the learned embedding of every vocabulary word, one word per entry.
print("\nWord Vectors:")
for word in vocab:
    print(f"{word}: {word_vectors.get_vector(word)}")


Word Vectors:
processing: [-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]
language: [-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.0

In [10]:
# Create document vectors by averaging word vectors
def get_document_vector(tokens, keyed_vectors=None):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    tokens : iterable of str
        Preprocessed tokens of a single document.
    keyed_vectors : optional
        Embedding lookup supporting ``token in kv``, ``kv[token]``,
        ``kv.vector_size`` and ``kv.vectors``. Defaults to the notebook-level
        ``word_vectors`` so existing one-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``. Tokens missing from the
        vocabulary are ignored; if no token is known, a zero vector is
        returned.
    """
    kv = word_vectors if keyed_vectors is None else keyed_vectors
    known = [kv[token] for token in tokens if token in kv]
    if not known:
        # Match the embedding dtype: np.zeros defaults to float64, which would
        # silently upcast the whole matrix when documents are stacked together.
        return np.zeros(kv.vector_size, dtype=kv.vectors.dtype)
    return np.mean(known, axis=0)

# One averaged embedding per tokenized sentence, stacked into a single array.
doc_vectors = np.array(list(map(get_document_vector, tokenized_sentences)))

In [11]:
# Show each document's averaged embedding, numbering documents from 1.
print("\nDocument Vectors:")
for doc_number, vector in enumerate(doc_vectors, start=1):
    print(f"Document {doc_number}: {vector}")


Document Vectors:
Document 1: [-3.9100490e-04 -8.5031858e-04 -3.0259513e-03  5.9637697e-03
  2.9626396e-03  2.0745976e-03  8.1569292e-03  5.6826184e-03
 -8.7963343e-03  9.7755129e-03 -3.6844027e-03 -5.8454985e-04
 -2.0019419e-03 -2.6721025e-03  2.3560759e-03 -2.4973007e-05
  7.4592866e-03  5.3409259e-03 -6.0261115e-03 -2.8494739e-03
  9.9646137e-04 -9.6039148e-05  3.4814281e-03 -5.2358601e-03
  8.9447489e-03 -7.1936878e-03  3.9165975e-03  3.8604005e-03
 -5.7834121e-03 -6.1982395e-03 -6.1340248e-03 -8.5163780e-04
  6.2091537e-03 -4.5851525e-03 -7.7270283e-03 -1.8676852e-03
  6.8936357e-03 -8.6563863e-03  5.3589337e-04 -1.7938162e-03
 -6.2392433e-03  2.6756758e-03 -1.9933234e-03 -2.6309665e-03
  5.4200664e-03  5.9785466e-03 -4.8988215e-03 -1.2590936e-03
 -3.9966968e-03  9.5497407e-03]
Document 2: [-1.1542555e-02  5.5986452e-03  4.1041048e-03  1.0383832e-02
  4.4417260e-03 -1.1831108e-02  8.0546578e-03  5.4888739e-03
 -7.6026451e-03 -3.5892427e-04  3.5950860e-03 -6.3873655e-03
 -9.518169

In [12]:
# Compute cosine similarity
# Called with a single matrix, this compares every document vector with every
# other one: entry [i, j] is the similarity between document i and document j,
# and the diagonal is each document compared with itself (about 1, up to
# floating-point rounding).
similarity = cosine_similarity(doc_vectors)

In [13]:
# Header line followed by the pairwise similarity matrix on the next line.
print("\nCosine Similarity:", similarity, sep="\n")


Cosine Similarity:
[[0.99999994 0.60899895]
 [0.60899895 1.        ]]
