# Word Embedding
> Word embedding is a technique used in Natural Language Processing (NLP) to represent words as continuous vectors of real numbers in a lower-dimensional space. These vectors capture the semantic meaning of words, so that words with similar meanings have similar vector representations. Word embeddings are learned from large text corpora and enable models to understand the context and relationships between words more effectively than traditional methods such as one-hot encoding. Popular word embedding methods include Word2Vec, GloVe, and FastText.

# Techniques:
1. Predictive Models: Word2Vec, FastText
2. Count-based Models: GloVe
3. Contextualized Models: ELMo, BERT, GPT

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Sample corpus (list of sentences). Four short sentences are enough to
# demonstrate the API, but far too few to learn meaningful semantics.
sentences = [
    "I love natural language processing.",
    "Natural language processing is fun.",
    "I enjoy learning about data science.",
    "Machine learning is a part of data science.",
]

# Lowercase and tokenize each sentence. Tokens made up solely of punctuation
# (e.g. the trailing ".") are dropped so they do not enter the vocabulary and
# show up as "similar words" below.
tokenized_sentences = [
    [
        token
        for token in word_tokenize(sentence.lower())
        if any(char.isalnum() for char in token)
    ]
    for sentence in sentences
]

# Train the Word2Vec model.
# A fixed seed together with a single worker thread makes the printed results
# repeatable from run to run; multi-threaded training introduces ordering
# jitter. (Across interpreter launches PYTHONHASHSEED may also need to be set.)
model_word2vec = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every word, even those seen only once
    sg=0,             # CBOW model
    seed=42,
    workers=1,
)

# Get the vector for a specific word
word_vector = model_word2vec.wv['data']

# Print the word vector for 'data' (first 5 dimensions)
print(f"Word Vector for 'data': {word_vector[:5]}")

# Find similar words to 'data' (list of (word, similarity) pairs)
similar_words = model_word2vec.wv.most_similar('data')
print("\nWords similar to 'data':", similar_words)



In [None]:
from gensim.models import FastText
from nltk.tokenize import word_tokenize

# Sample corpus (list of sentences). Four short sentences are enough to
# demonstrate the API, but far too few to learn meaningful semantics.
sentences = [
    "I love natural language processing.",
    "Natural language processing is fun.",
    "I enjoy learning about data science.",
    "Machine learning is a part of data science.",
]

# Lowercase and tokenize each sentence. Tokens made up solely of punctuation
# (e.g. the trailing ".") are dropped so they do not enter the vocabulary and
# show up as "similar words" below.
tokenized_sentences = [
    [
        token
        for token in word_tokenize(sentence.lower())
        if any(char.isalnum() for char in token)
    ]
    for sentence in sentences
]

# Train the FastText model.
# A fixed seed together with a single worker thread makes the printed results
# repeatable from run to run; multi-threaded training introduces ordering
# jitter. (Across interpreter launches PYTHONHASHSEED may also need to be set.)
model_fasttext = FastText(
    sentences=tokenized_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every word, even those seen only once
    seed=42,
    workers=1,
)

# Get the vector for a specific word
word_vector_fasttext = model_fasttext.wv['data']

# Print the word vector for 'data' (first 5 dimensions)
print(f"FastText Word Vector for 'data': {word_vector_fasttext[:5]}")

# Find similar words to 'data' (list of (word, similarity) pairs)
similar_words_fasttext = model_fasttext.wv.most_similar('data')
print("\nWords similar to 'data' using FastText:", similar_words_fasttext)
