# One Hot Encoding 
>  One-Hot Encoding is a technique used to represent words as vectors. Each word in the vocabulary is converted into a binary vector with a length equal to the size of the vocabulary.

# Step 1: Define the Sample Texts

In [None]:
documents = [
    "I love natural language processing",
    "Natural language processing is fascinating",
    "I love programming and data science"
]

# Step 2: Tokenize the Text

In [None]:
# Tokenize each document by lowercasing and splitting by spaces
tokenized_docs = [doc.lower().split() for doc in documents]

# Print each tokenized document
for i, tokens in enumerate(tokenized_docs, 1):
    print(f"Document {i}: {tokens}")


# Step 3: Create a Vocabulary

In [None]:
all_tokens = [word for tokens in tokenized_docs for word in tokens]

# Unique words (Vocabulary)
vocabulary = sorted(set(all_tokens))
vocabulary

In [None]:
# Flatten the list of tokenized words into a single list
all_tokens = [word for tokens in tokenized_docs for word in tokens]

# Unique words (Vocabulary)
vocabulary = sorted(set(all_tokens))

# Display vocabulary in comma-separated row form
vocabulary_row = ", ".join(vocabulary)
vocabulary_row


# Step 4: Create One-Hot Encodings

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer for One-Hot Encoding
vectorizer = CountVectorizer(binary=True)


In [None]:

# Fit and transform the documents
one_hot_matrix = vectorizer.fit_transform(documents)

In [None]:
# Convert to DataFrame
one_hot_df = pd.DataFrame(one_hot_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the result
print(one_hot_df)

# Step 5: Display the One-Hot Encoded Matrix

In [None]:
# Convert to DataFrame
one_hot_df = pd.DataFrame(one_hot_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Add a column for document labels
one_hot_df.insert(0, 'Document', [f'Document {i+1}' for i in range(len(documents))])

# Display the DataFrame in tabular form
display(one_hot_df)

# Bag of Words (BoW)
 Bag of Words is a simple and commonly used technique in NLP that converts text into a matrix of token counts, ignoring the grammar and word order but capturing the frequency of each word in the document

# Steps to Create a BoW Model
* Step 1: Tokenize each document.
* Step 2: Create a vocabulary of all unique words across the documents.
* Step 3: Count the occurrence of each word in each document.

# Step 1: Tokenization

In [None]:
# Your list of documents
documents = [
    "I love natural language processing.",
    "Natural language processing is fascinating.",
    "I love programming and data science."
]

# Tokenize each document by lowercasing and splitting by spaces
tokenized_docs = [doc.lower().split() for doc in documents]

# Print each tokenized document
for i, tokens in enumerate(tokenized_docs, 1):
    print(f"Document {i}: {tokens}")



# Step 2: Combine Tokens

In [None]:
# Flatten the list of tokenized words into a single list
all_tokens = [word for tokens in tokenized_docs for word in tokens]

# Print the combined list of all tokens
print(all_tokens)


In [None]:
# Create a set of unique words (vocabulary)
vocabulary = set(all_tokens)
print(vocabulary)

# VOCABULARY 

In [None]:
# Convert to sorted list
sorted_vocabulary = sorted(vocabulary)
print(sorted_vocabulary)


# Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Create the Bag of Words Model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Convert to DataFrame for better visualization
import pandas as pd
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(bow_df)


In [None]:
# Convert to DataFrame
one_hot_df = pd.DataFrame(one_hot_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Add a column for document labels
one_hot_df.insert(0, 'Document', [f'Document {i+1}' for i in range(len(documents))])

# Display the DataFrame in tabular form
display(one_hot_df)

In [None]:
import pandas as pd
import numpy as np

# Assuming X is the vectorized data and vectorizer is your vectorizer object
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Function to add a random background color to each cell
def random_color(val):
    color = "#{:06x}".format(np.random.randint(0, 0xFFFFFF))
    return f'background-color: {color}'

# Apply the random_color function to each cell in the DataFrame
styled_bow_df = bow_df.style.applymap(random_color)

# Display the styled DataFrame
styled_bow_df

# Vectorization without stopwords

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample Texts
documents = [
    "I love natural language processing.",
    "Natural language processing is fascinating.",
    "I love programming and data science."
]

# Create the Bag of Words Model with stop words removal
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Convert to DataFrame for better visualization
import pandas as pd
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(bow_df)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Sample Texts
documents = [
    "I love natural language processing.",
    "Natural language processing is fascinating.",
    "I love programming and data science."
]

# Create the Bag of Words Model with stop words removal
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [None]:
print(X)

# EXAMPLE # 02 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
documents = [
    "I love programming in Python. Python programming is great.",
    "Python programming is fun. Programming is very interesting.",
    "I love learning new programming languages. Programming is exciting."
]


# . By One Hot Encodeing 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer for One-Hot Encoding
vectorizer = CountVectorizer(binary=True)

# Fit and transform the documents
one_hot_matrix = vectorizer.fit_transform(documents)

# Convert to DataFrame
one_hot_df = pd.DataFrame(one_hot_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the result
print(one_hot_df)

In [None]:
# Convert to DataFrame
one_hot_df = pd.DataFrame(one_hot_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Add a column for document labels
one_hot_df.insert(0, 'Document', [f'Document {i+1}' for i in range(len(documents))])

# Display the DataFrame in tabular form
display(one_hot_df)

 # By Bag Of Words : 

In [None]:
# Define the documents
documents = [
    "I love programming in Python. Python programming is great.",
    "Python programming is fun. Programming is very interesting.",
    "I love learning new programming languages. Programming is exciting."
]

# Initialize the CountVectorizer for Bag of Words with frequency counts
vectorizer = CountVectorizer(binary=False)

# Fit and transform the documents
bow_matrix = vectorizer.fit_transform(documents)

# Convert to DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Add a column for document labels
bow_df.insert(0, 'Document', [f'Document {i+1}' for i in range(len(documents))])

# Display the DataFrame in tabular form
display(bow_df)

# Other Examples 

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = ["I love programming", "Programming is fun Programming", "I love coding"]

# Initialize CountVectorizer (default is binary=False, so it counts frequencies)
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(documents)

# Output results
print("Feature Names:", vectorizer.get_feature_names_out())
print("Frequency Count Matrix:\n", X.toarray())


Feature Names: ['coding' 'fun' 'is' 'love' 'programming']
Frequency Count Matrix:
 [[0 0 0 1 1]
 [0 1 1 0 1]
 [1 0 0 1 0]]


In [55]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = ["I love programming", "Programming is fun Programming", "I love coding"]

# Initialize CountVectorizer (default is binary=False, so it counts frequencies)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Output results
print("Feature Names:", vectorizer.get_feature_names_out())
print("Frequency Count Matrix:\n", X.toarray())


Feature Names: ['coding' 'fun' 'is' 'love' 'programming']
Frequency Count Matrix:
 [[0 0 0 1 1]
 [0 1 1 0 2]
 [1 0 0 1 0]]


# With Text Precessing 

In [4]:
# Sample documents with higher frequency words
documents = [
    "I love programming programming programming!",
    "Programming is fun fun fun!",
    "I love coding coding coding, especially in Python Python.",
    "Coding in Python Python is great great, and programming programming is fun!"
]


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()  # Split into words
    filtered_words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return ' '.join(filtered_words)
# Apply preprocessing to each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Step 2: One-Hot Encoding
vectorizer_one_hot = CountVectorizer(binary=True)
X_one_hot = vectorizer_one_hot.fit_transform(preprocessed_documents)

print("Preprocessed Documents:")
for doc in preprocessed_documents:
    print(doc)

print("\nOne-Hot Encoding Feature Names:", vectorizer_one_hot.get_feature_names_out())
print("One-Hot Encoding Matrix:\n", X_one_hot.toarray())

# Step 3: Bag of Words with Frequency Counts
vectorizer_bow = CountVectorizer(binary=False)
X_bow = vectorizer_bow.fit_transform(preprocessed_documents)

print("\nBag of Words Feature Names:", vectorizer_bow.get_feature_names_out())
print("Bag of Words Frequency Count Matrix:\n", X_bow.toarray())


NameError: name 'ENGLISH_STOP_WORDS' is not defined

# TASK 1
Given a set of documents, how can you create a binary representation where each word is represented as a binary feature (1 for presence, 0 for absence)?

**Question**: Write a Python script to perform One-Hot Encoding on the following documents:

1. "**I love programming in Python.**"
1. "**Python programming is fun.**"
1. "**I love learning new languages.**"

Display the result in a tabular format with rows representing documents and columns representing words.

# Task 2

Given a set of documents, create a frequency-based representation where each word is represented by its count in the document. Analyze the frequency of words that appear more than once.

**Question**: Write a Python script to perform Bag of Words with frequency counting on the following documents:

1. "**Data science is fun. Data science involves statistics and coding.**"
1. "**Coding is an essential skill for data science.**"
1. "**Statistics and data analysis are crucial for data science.**"

Display the result in a tabular format showing the frequency of each word in each document. Make sure to highlight words that appear more than once within each document and across documents.