<a href="https://colab.research.google.com/github/Aravind8281/Natural_language_Processing/blob/main/Word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embedding

Word Embedding refers to the representation of words in a continuous vector space where the semantic meaning of words is captured based on their context. Word2Vec is a popular algorithm for creating word embeddings, and it provides a way to represent words as vectors in a high-dimensional space

In [18]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
sentence=[
    "Word embeddings are powerful tools in natural language processing.",
    "Word2Vec is a popular algorithm for generating word embeddings.",
    "It captures semantic relationships between words.",
    "Sentiment analysis benefits from using word embeddings."
]
token=[word_tokenize(word.lower()) for word in sentence]
model=Word2Vec(sentences=token,vector_size=100,window=5,min_count=1,workers=4)
model.save("model")
loaded=Word2Vec.load("model")
vector=loaded.wv['word']
print(vector)

[-8.6312117e-03  3.6726298e-03  5.1969956e-03  5.7580448e-03
  7.4746129e-03 -6.1871242e-03  1.1071811e-03  6.0690669e-03
 -2.8463460e-03 -6.1893342e-03 -4.0650315e-04 -8.3839893e-03
 -5.6131086e-03  7.1156803e-03  3.3478425e-03  7.2476044e-03
  6.8202848e-03  7.5378688e-03 -3.7954499e-03 -5.7778065e-04
  2.3588755e-03 -4.5184004e-03  8.3983373e-03 -9.8824166e-03
  6.7796544e-03  2.9165917e-03 -4.9493043e-03  4.4116327e-03
 -1.7421589e-03  6.7185964e-03  9.9779824e-03 -4.3671504e-03
 -5.9382798e-04 -5.7030660e-03  3.8471050e-03  2.7969566e-03
  6.9126366e-03  6.1113168e-03  9.5536355e-03  9.2885876e-03
  7.9138782e-03 -6.9957683e-03 -9.1701644e-03 -3.6092810e-04
 -3.1049980e-03  7.9076663e-03  5.9406627e-03 -1.5445747e-03
  1.5183457e-03  1.8017795e-03  7.8277048e-03 -9.5158573e-03
 -2.1121567e-04  3.4792917e-03 -9.3503715e-04  8.3848005e-03
  9.0257991e-03  6.5426594e-03 -7.1442639e-04  7.7268728e-03
 -8.5508060e-03  3.2148394e-03 -4.6400535e-03 -5.0977739e-03
  3.5963100e-03  5.38139

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# CBOW
Continuous Bag of Words (CBOW) is a type of Word2Vec model used in natural language processing to learn distributed representations of words. CBOW focuses on predicting a target word based on its context, meaning the words surrounding it

In [20]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentence = "Continuous Bag of Words (CBOW) is a type of Word2Vec model used in natural language processing."
tokenized_sentence = word_tokenize(sentence.lower())
tokenized_sentences = [tokenized_sentence]

model = Word2Vec(sentences=tokenized_sentences, vector_size=100, min_count=1, workers=4)
model.save("CBOW")

cbow_model = Word2Vec.load("CBOW")


In [31]:
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
loaded_cbow_model = Word2Vec.load("CBOW_model")
missing_word_sentence = f"I love __ language processing with {chosen_word}."
tokenized_missing_word_sentence = word_tokenize(missing_word_sentence.lower())
missing_word_index = tokenized_missing_word_sentence.index("__")
context_words = tokenized_missing_word_sentence[
    max(0, missing_word_index - 2) : missing_word_index
] + tokenized_missing_word_sentence[missing_word_index + 1 :]
predicted_word = loaded_cbow_model.predict_output_word(context_words)
print(f"Predicted sentence: I love {predicted_word[0]} language processing with {chosen_word}.")


Predicted sentence: I love ('processing', 0.04000341) language processing with natural.


# Skip Grams
Skip gram helps to find the surrounding words

In [34]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
corpus = "Skip-gram is a type of Word2Vec model used for natural language processing tasks."
tokenized_corpus = word_tokenize(corpus.lower())
model = Word2Vec(sentences=[tokenized_corpus], vector_size=100, window=5, sg=1, min_count=1)
vector_representation = model.wv['skip-gram']
similar_words = model.wv.most_similar('skip-gram', topn=5)
print("Vector representation of 'skip-gram':", vector_representation)
print("Similar words to 'skip-gram':", similar_words)

Vector representation of 'skip-gram': [-0.00950053  0.00956329 -0.00777098 -0.00264749 -0.004909   -0.00496774
 -0.00802175 -0.00778174 -0.00455603 -0.00127801 -0.00510316  0.00614038
 -0.00951903 -0.00530913  0.00943649  0.00698812  0.00767476  0.00423312
  0.00050608 -0.00598371  0.00601578  0.00263353  0.00770308  0.00639519
  0.00793947  0.00865565 -0.00989587 -0.00675828  0.00133848  0.006442
  0.00737591  0.00551461  0.0076603  -0.00512752  0.00658372 -0.00410538
 -0.00905433  0.00914211  0.00133122 -0.00276238 -0.00247521 -0.00422346
  0.00481142  0.00439959 -0.00265342 -0.00734195 -0.00356582 -0.00033561
  0.00609489 -0.0028371  -0.00011981  0.00087562 -0.0070936   0.00206835
 -0.00143702  0.00280175  0.00484492 -0.00135458 -0.00278321  0.00773507
  0.00504821  0.00671288  0.00451923  0.00866465  0.00747495 -0.00107953
  0.00874805  0.00460283  0.00543659 -0.001389   -0.00204127 -0.00442064
 -0.00851286  0.00304053  0.00888407  0.00892206 -0.00194521  0.00608881
  0.00377916 -0

# Negative Sampling

Negative sampling is a technique used in word embedding models, particularly in algorithms like Word2Vec, to efficiently train the model by sampling a small number of negative examples (words that do not appear in the context) for each training instance. The goal is to distinguish between true context words and randomly sampled negative words, making the training process computationally more efficient.

In [43]:
import gensim.models
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Sample corpus
corpus = [
    "I love natural language processing.",
    "Word embeddings capture semantic relationships.",
    "CBOW is a technique in NLP.",
    "It learns from contextual words.",
    "Training a CBOW model is straightforward."
]

# Tokenize the corpus
tokenized_corpus = [simple_preprocess(sentence, deacc=True) for sentence in corpus]

# Create the Word2Vec model with negative sampling
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=2,
    sg=0,
    min_count=1,
    negative=15
)
target_word = "processing"
context_words = ["language", "natural"]
model.train([[target_word], context_words], epochs=model.epochs, total_examples=len(tokenized_corpus))



(4, 15)

# Similarity and Analogy
    Similarity: In the context of word embeddings like Word2Vec, similarity refers to the measure of closeness or relatedness between two words in the vector space. Similar words should have similar vector representations, indicating a shared semantic meaning.

    Analogy: Analogy tasks involve finding a word that completes a given analogy relationship. For example, if "man" is to "woman" as "king" is to... (the expected answer is "queen"). Analogies showcase the ability of word embeddings to capture relationships between words.

In [47]:
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api

path = api.load('word2vec-google-news-300', return_path=True)
model = KeyedVectors.load_word2vec_format(path, binary=True)

def similarity(word1, word2):
    similarity = model.similarity(word1, word2)
    print("Similarity:", similarity)

def analogy(positive, negative, topn=1):
    analogy_result = model.most_similar(positive=positive, negative=negative, topn=topn)
    print("Analogy:", analogy_result[0][0])

similarity("king", "queen")
analogy(['man', 'queen'], ['woman'])


Similarity: 0.6510957
Analogy: king
