#  Bag of Words (BoW)
 A bar chart showing the frequency of each word in the document.

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Single-document corpus for the word-count demo.
documents = ["apple banana apple orange"]

# The vocabulary is supplied up front, so the vectorizer is used without a
# fit step: transform() is called directly on the documents.
vectorizer = CountVectorizer(vocabulary=["apple", "banana", "orange"])
X = vectorizer.transform(documents)

# Feature names label the matrix columns; row 0 of the dense matrix holds
# the counts for the only document.
words = vectorizer.get_feature_names_out()
word_counts = X.toarray()[0]

# One bar per vocabulary term, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(words, word_counts, color='skyblue')
ax.set_title('Bag of Words - Word Frequency')
ax.set_xlabel('Words')
ax.set_ylabel('Count')
plt.show()

# Unique Vocabulary Creation
A simple plot showing the size of the vocabulary.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two short documents that share part of their vocabulary.
documents = ["apple banana apple orange", "banana orange orange"]

# Learn the vocabulary and build the document-term matrix in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# The document-term matrix has one column per learned term, so its column
# count is the vocabulary size.
vocab_size = X.shape[1]

# Single-bar chart displaying that size.
fig, ax = plt.subplots(figsize=(4, 4))
ax.bar(['Vocabulary Size'], [vocab_size], color='green')
ax.set_title('Unique Vocabulary Size')
ax.set_ylabel('Number of Unique Words')
plt.show()


#  Term Frequency-Inverse Document Frequency (TF-IDF)
* Graph: TF-IDF Scores Heatmap
* A heatmap representing TF-IDF scores for each word across documents.

In [None]:
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

# Two-document corpus to score.
documents = ["apple banana apple orange", "banana orange orange"]

# Fit the TF-IDF model and transform the corpus in one call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Dense score matrix (rows: documents, columns: terms) plus the labels
# used for each axis of the heatmap.
tfidf_scores = X.toarray()
words = vectorizer.get_feature_names_out()
doc_ids = [f'Document {i+1}' for i, _ in enumerate(documents)]

# Labelled table so seaborn can pick up the tick labels from the frame.
df = pd.DataFrame(data=tfidf_scores, index=doc_ids, columns=words)

# Annotated heatmap drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df, annot=True, cmap='Blues', ax=ax)
ax.set_title('TF-IDF Scores Heatmap')
ax.set_xlabel('Words')
ax.set_ylabel('Documents')
plt.show()


In [None]:
import spacy

# Load the small English spaCy pipeline once; the word-embeddings cell
# below reuses it through `nlp`.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)



# Word Embeddings
* Graph: Word Embeddings Visualization (2D Projection)
* A scatter plot of word embeddings reduced to 2 dimensions using PCA.

In [None]:
from sklearn.decomposition import PCA
# Space-separated fruit names to run through the spaCy pipeline.
text = "apple banana orange grape mango"

# Collect each token's text and its vector.
# NOTE(review): `nlp` is loaded from "en_core_web_sm"; the small spaCy
# pipelines reportedly ship without static word vectors, so `token.vector`
# may be a context tensor rather than a true embedding — confirm, and
# consider an md/lg model if real embeddings are intended.
doc = nlp(text)
words = [tok.text for tok in doc]
vectors = [tok.vector for tok in doc]

# Project the vectors onto their first two principal components.
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)

# One scatter call per word (so each point takes the next colour in the
# cycle), with the word written just beside its point.
fig, ax = plt.subplots(figsize=(8, 6))
for word, (x, y) in zip(words, reduced_vectors):
    ax.scatter(x, y)
    ax.text(x + 0.01, y + 0.01, word)

ax.set_title('Word Embeddings Visualization (PCA Reduced)')
ax.set_xlabel('PCA Dimension 1')
ax.set_ylabel('PCA Dimension 2')
ax.grid(True)
plt.show()


# One-Hot Encoding
* Graph: One-Hot Encoding Matrix Heatmap
* A heatmap representing the one-hot encoded vectors.

In [None]:
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Imported here so this cell does not rely on an earlier cell having
# imported pandas.
import pandas as pd

# Sample words, reshaped into a single column (one word per row) before
# being handed to the encoder.
words = ["apple", "banana", "orange"]
words_reshaped = np.array(words).reshape(-1, 1)

# Create a dense-output OneHotEncoder and fit it.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4, where passing it raises TypeError. Try the current
# name first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
one_hot_encoded = encoder.fit_transform(words_reshaped)

# Labelled table for seaborn: one row per word, one column per encoded
# feature.
df = pd.DataFrame(one_hot_encoded, index=words, columns=encoder.get_feature_names_out())

# Plot the one-hot matrix as an annotated heatmap.
plt.figure(figsize=(6, 4))
sns.heatmap(df, annot=True, cmap='Greens')
plt.title('One-Hot Encoding Heatmap')
plt.xlabel('Features')
plt.ylabel('Words')
plt.show()
