## 1. One-Hot Encoding

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer


In [16]:
# Three short example sentences used as the toy corpus
text_data = ["The quick brown fox", "Jumped over the lazy dog", "it is very difficult"]

In [23]:
from nltk.tokenize import word_tokenize

In [25]:
# Split every sentence of the corpus into its list of word tokens
tokenized_sentences = list(map(word_tokenize, text_data))

In [27]:
# Display the per-sentence token lists (one inner list per input sentence)
tokenized_sentences

[['The', 'quick', 'brown', 'fox'],
 ['Jumped', 'over', 'the', 'lazy', 'dog'],
 ['it', 'is', 'very', 'difficult']]

In [30]:
# Print the token list of each sentence on its own line
for tokens in tokenized_sentences:
    print(tokens)

['The', 'quick', 'brown', 'fox']
['Jumped', 'over', 'the', 'lazy', 'dog']
['it', 'is', 'very', 'difficult']


In [31]:
# Build a bag-of-words representation: CountVectorizer learns a vocabulary from
# text_data and fit_transform returns the document-term count matrix X
# (one row per sentence, one column per vocabulary word).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text_data)

In [39]:
# Convert the count matrix into a regular NumPy array and display it
X_dense = X.toarray()
X_dense

array([[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [41]:
# Initialize OneHotEncoder with its default settings
onehot_encoder = OneHotEncoder()
# Dense copy of the count matrix that is fed to the encoder
X_dense = X.toarray()
# Fit the encoder on the count matrix and encode it in one step.
# NOTE(review): OneHotEncoder treats every column of X_dense as a separate
# categorical feature, so each vocabulary word is expanded into one column per
# distinct count value seen (0 and 1 here -> 24 columns for 12 words). For this
# corpus X_dense is already a 0/1 word-presence matrix; confirm that doubling
# the columns is really the intended "one-hot" result.
X_onehot = onehot_encoder.fit_transform(X_dense)

In [42]:
# Show the heading and the densified encoder output on separate lines
print("One-hot encoded data:", X_onehot.toarray(), sep="\n")

One-hot encoded data:
[[0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0.]
 [1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0.]
 [1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.]]


In [43]:
# Show the heading and the feature names learned by the vectorizer on separate lines
print("Vocabulary:", vectorizer.get_feature_names_out(), sep="\n")

Vocabulary:
['brown' 'difficult' 'dog' 'fox' 'is' 'it' 'jumped' 'lazy' 'over' 'quick'
 'the' 'very']


In [44]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Tokens to be one-hot encoded
data = ['ansiya', 'shahir']

# Vocabulary: the distinct tokens in sorted order
vocab = sorted(set(data))

# Build the encoder and fit it on the vocabulary, passed as a single-column
# 2-D array (one row per vocabulary entry)
vocab_column = np.array(vocab).reshape(-1, 1)
onehot_encoder = OneHotEncoder().fit(vocab_column)

# Encode the tokens (the encoder returns a sparse matrix), then densify the
# result so that it prints readably
data_column = np.array(data).reshape(-1, 1)
encoded_data = onehot_encoder.transform(data_column)
encoded_data_dense = encoded_data.toarray()

# Show the encoded rows, then the vocabulary they were encoded against
print("One-hot encoded array:", encoded_data_dense, sep="\n")
print("Vocabulary:", vocab, sep="\n")


One-hot encoded array:
[[1. 0.]
 [0. 1.]]
Vocabulary:
['ansiya', 'shahir']


In [None]:
# Nested input for the next example: two groups of two names each.
# The names are written as string literals so that this cell evaluates on a
# fresh kernel instead of raising NameError on undefined identifiers.
[['ansiya', 'shahir'], ['hamad', 'muhammed']]

In [45]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Nested input: every inner list is one group of tokens
data = [['ansiya', 'shahir'], ['hamad', 'muhammed']]

# Gather the tokens of all groups into one flat list (duplicates are kept;
# they are removed when the vocabulary is built below)
flat_data = []
for group in data:
    flat_data.extend(group)

# Vocabulary: the distinct tokens in sorted order
vocab = sorted(set(flat_data))

# Build the encoder and fit it on the vocabulary, passed as a single-column
# 2-D array (one row per vocabulary entry)
vocab_column = np.array(vocab).reshape(-1, 1)
onehot_encoder = OneHotEncoder().fit(vocab_column)

# Encode each group on its own: the sparse encoder output is densified so that
# every group becomes one (tokens x vocabulary) array
encoded_data = [
    onehot_encoder.transform(np.array(group).reshape(-1, 1)).toarray()
    for group in data
]

# Combine the per-group arrays into a single NumPy array
encoded_data_array = np.array(encoded_data)

# Show the encoded groups, then the vocabulary they were encoded against
print("One-hot encoded array:", encoded_data_array, sep="\n")
print("Vocabulary:", vocab, sep="\n")


One-hot encoded array:
[[[1. 0. 0. 0.]
  [0. 0. 0. 1.]]

 [[0. 1. 0. 0.]
  [0. 0. 1. 0.]]]
Vocabulary:
['ansiya', 'hamad', 'muhammed', 'shahir']
