In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize

In [15]:
# Input data: one short sentence per document
text_data = [
    "Martha waited for the train",
    "The train was late",
    "Tim and Jim took the bus",
]

# Tokenization: split every sentence into its list of word tokens
tokens = [word_tokenize(sentence) for sentence in text_data]

# Create Vocabulary: the unique tokens across all sentences, in sorted order.
# A bare set of strings can iterate in a different order after every kernel
# restart (string hashing is randomized per process), which would shuffle the
# bag-of-words columns from run to run; sorting pins the column order.
# Tokens are kept case-sensitive, so "The" and "the" stay separate entries.
vocab = sorted({word for sentence in tokens for word in sentence})

In [17]:
# Create Bag-of-Words Model
# One row per tokenized sentence; each entry is the number of times the
# corresponding vocabulary word occurs among that sentence's tokens.
bow_model = [
    [sentence_tokens.count(word) for word in vocab]
    for sentence_tokens in tokens
]


In [4]:
# Materialise the vocabulary as a list so it can label the columns,
# then wrap the count rows in a DataFrame (one row per sentence).
vocab_list = [*vocab]
bow_matrix = np.array(bow_model)
df = pd.DataFrame(data=bow_matrix, columns=vocab_list)



In [5]:
# Display the bag-of-words DataFrame as the cell output.
# NOTE(review): the saved output for this cell lists words such as "Mary",
# "Ma" and "Samantha" that do not occur in the current text_data, and the
# execution counts are out of order, so the output looks stale -- restart
# the kernel and run all cells to refresh it.
df

Unnamed: 0,took,train,bus,Mary,for,late,was,The,Ma,the,Samantha,and,waited
0,0,1,0,0,1,0,0,0,1,1,0,0,1
1,0,1,0,0,0,1,1,1,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,1,1,1,0


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
# Input data
text_data = [
    "Dexter waited for the bus in the rain",
    "The bus was late",
    "Jane and Nicole took the bus",
]

# Fit a CountVectorizer on the sentences and obtain the document-term counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text_data)

# Build a DataFrame of the counts, one column per learned feature name
# (the table itself is displayed by a later cell)
count_columns = vectorizer.get_feature_names_out()
df = pd.DataFrame(X.toarray(), columns=count_columns)

In [8]:
# Display the per-sentence token-count table as the cell output
df

Unnamed: 0,and,bus,dexter,for,in,jane,late,nicole,rain,the,took,waited,was
0,0,1,1,1,1,0,0,0,1,2,0,1,0
1,0,1,0,0,0,0,1,0,0,1,0,0,1
2,1,1,0,0,0,1,0,1,0,1,1,0,0


In [9]:
# Pairwise cosine similarity between every pair of rows of X; with a single
# argument the matrix is compared against itself, giving a square result.
cosine_sim = cosine_similarity(X)

In [10]:
# Label both axes of the similarity matrix with the sentences themselves,
# then show the resulting table as the cell output.
df = pd.DataFrame(data=cosine_sim, index=text_data, columns=text_data)
df

Unnamed: 0,Dexter waited for the bus in the rain,The bus was late,Jane and Nicole took the bus
Dexter waited for the bus in the rain,1.0,0.474342,0.387298
The bus was late,0.474342,1.0,0.408248
Jane and Nicole took the bus,0.387298,0.408248,1.0


In [11]:
# Imports for the TF-IDF example
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [12]:
# Input data: three short sentences, one per document
text_data = [
    "Joe waited for the train",
    "The train was late",
    "Mary and Samantha took the bus",
]


In [13]:
# Create TfidfVectorizer and compute the TF-IDF matrix for the sentences
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)

# Build a DataFrame of the weights: rows labelled by sentence, columns by
# learned feature name (the table itself is displayed by a later cell)
tfidf_columns = vectorizer.get_feature_names_out()
df = pd.DataFrame(X.toarray(), index=text_data, columns=tfidf_columns)

In [14]:
# Display the TF-IDF weight table as the cell output
df

Unnamed: 0,and,bus,for,joe,late,mary,samantha,the,took,train,waited,was
Joe waited for the train,0.0,0.0,0.504611,0.504611,0.0,0.0,0.0,0.298032,0.0,0.38377,0.504611,0.0
The train was late,0.0,0.0,0.0,0.0,0.584483,0.0,0.0,0.345205,0.0,0.444514,0.0,0.584483
Mary and Samantha took the bus,0.432385,0.432385,0.0,0.0,0.0,0.432385,0.432385,0.255374,0.432385,0.0,0.0,0.0


In [21]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

string = "Joe waited for the train The train was late Mary and Samantha took the bus"

# Unique characters, sorted. A bare set of strings can iterate in a different
# order after every kernel restart (string hashing is randomized per process),
# which would change the character -> column assignment between runs.
characters = sorted(set(string))

# Map each character to its position in the sorted character list
char_to_int = {c: i for i, c in enumerate(characters)}

# Integer code for every character of the string, in original order
integer_encoded = [char_to_int[char] for char in string]

# OneHotEncoder expects a 2-D array: one sample per row, a single feature column
integer_column = np.array(integer_encoded).reshape(-1, 1)

# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (replaced by `sparse_output`), so passing it fails on current releases.
# Keeping the default sparse output and densifying with .toarray() yields the
# same dense float matrix on both old and new scikit-learn versions.
encoder = OneHotEncoder()
onehot_encoded = encoder.fit_transform(integer_column).toarray()



In [22]:
# Print the one-hot matrix (one row per character of `string`); NumPy
# abbreviates the middle of large arrays with "..." in the printed form.
print(onehot_encoded)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
