Apply the bag-of-words approach (raw count occurrence and normalized count occurrence)
and TF-IDF to the data, then create word embeddings using Word2Vec.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch every NLTK resource requested by this notebook exactly once
# (the original list asked for 'punkt' twice).
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sample passage used by every later cell; adjacent string literals are
# concatenated by Python into one single-line string.
text = (
    "NLTK is great! I'm loving NLP with Python. "
    "Learning NLP is fun and useful for real-world applications. "
    "New York is a big city and many developers use Python for data science."
)


In [None]:

# English stop-word list as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Tokenize the lower-cased text, then keep only purely alphabetic tokens
# that are not stop words (punctuation and hyphenated tokens are dropped
# by the isalpha() check).
tokens = [
    tok
    for tok in word_tokenize(text.lower())
    if tok.isalpha() and tok not in stop_words
]
print(tokens)

['nltk', 'great', 'loving', 'nlp', 'python', 'learning', 'nlp', 'fun', 'useful', 'applications', 'new', 'york', 'big', 'city', 'many', 'developers', 'use', 'python', 'data', 'science']


In [None]:
# Bag of words


In [None]:
# Count occurrence: how many times each filtered token appears.
from collections import Counter

bow_counter = Counter()
bow_counter.update(tokens)
print("Bag-of-Words Counter:", bow_counter, sep="\n")

Bag-of-Words Counter:
Counter({'nlp': 2, 'python': 2, 'nltk': 1, 'great': 1, 'loving': 1, 'learning': 1, 'fun': 1, 'useful': 1, 'applications': 1, 'new': 1, 'york': 1, 'big': 1, 'city': 1, 'many': 1, 'developers': 1, 'use': 1, 'data': 1, 'science': 1})


In [None]:
# Normalized count occurrence: each token's count divided by the total
# number of filtered tokens.
total_tokens = len(tokens)
normalized_bow = {
    term: freq / total_tokens
    for term, freq in bow_counter.items()
}
print("\nNormalized Bag-of-Words:", normalized_bow, sep="\n")


Normalized Bag-of-Words:
{'nltk': 0.05, 'great': 0.05, 'loving': 0.05, 'nlp': 0.1, 'python': 0.1, 'learning': 0.05, 'fun': 0.05, 'useful': 0.05, 'applications': 0.05, 'new': 0.05, 'york': 0.05, 'big': 0.05, 'city': 0.05, 'many': 0.05, 'developers': 0.05, 'use': 0.05, 'data': 0.05, 'science': 0.05}


In [None]:
#Tf-IDF
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# TF-IDF needs more than one document: with a one-document corpus every term
# gets the same IDF, so the scores collapse to normalised term frequencies.
# Treat each sentence of `text` as its own document instead.
corpus = nltk.sent_tokenize(text)
tfidf = TfidfVectorizer()

# Rows of the matrix follow the order of `corpus`; columns follow
# `feature_names`.
tfidf_matrix = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names_out()
print("Tf-idf feature names\n")
print(feature_names)
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())

Tf-idf feature names

['and' 'applications' 'big' 'city' 'data' 'developers' 'for' 'fun' 'great'
 'is' 'learning' 'loving' 'many' 'new' 'nlp' 'nltk' 'python' 'real'
 'science' 'use' 'useful' 'with' 'world' 'york']

TF-IDF Matrix:
[[0.30151134 0.15075567 0.15075567 0.15075567 0.15075567 0.15075567
  0.30151134 0.15075567 0.15075567 0.45226702 0.15075567 0.15075567
  0.15075567 0.15075567 0.30151134 0.15075567 0.30151134 0.15075567
  0.15075567 0.15075567 0.15075567 0.15075567 0.15075567 0.15075567]]


In [None]:
#word embedding
!pip install gensim
from gensim.models import Word2Vec

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Wrap the filtered token list in an outer list: the result is a
# one-sentence corpus, which is what gets passed to Word2Vec.
sentences = [tokens]

In [None]:
# Train Word2Vec on the one-sentence corpus.
# A fixed seed plus a single worker thread makes the learned vectors
# repeatable between runs; per the gensim docs, full determinism across
# interpreter launches additionally requires PYTHONHASHSEED to be set.
word2vec_model = Word2Vec(sentences,
                          vector_size=100,  # dimensionality of each embedding
                          window=5,         # context words considered on each side
                          min_count=1,      # keep every token, even if it occurs once
                          workers=1,        # multi-threaded training is not reproducible
                          seed=42)

In [None]:
# Show the trained vector stored for the token 'python'.
print("\nWord2Vec Embedding for 'python':", word2vec_model.wv['python'], sep="\n")


Word2Vec Embedding for 'python':
[-5.4488966e-04  2.4447922e-04  5.1108180e-03  9.0133119e-03
 -9.3160355e-03 -7.1283686e-03  6.4630071e-03  8.9879055e-03
 -5.0211898e-03 -3.7687025e-03  7.3831934e-03 -1.5469143e-03
 -4.5535350e-03  6.5554827e-03 -4.8726299e-03 -1.8234092e-03
  2.8759521e-03  9.9476194e-04 -8.2885372e-03 -9.4592180e-03
  7.3150024e-03  5.0821584e-03  6.7643146e-03  7.6687959e-04
  6.3473321e-03 -3.4060215e-03 -9.5025613e-04  5.7758666e-03
 -7.5251590e-03 -3.9416379e-03 -7.5241765e-03 -9.3960669e-04
  9.5483046e-03 -7.3259678e-03 -2.3425706e-03 -1.9351302e-03
  8.0944169e-03 -5.9369262e-03  4.1266387e-05 -4.7499435e-03
 -9.5993718e-03  5.0056833e-03 -8.7733148e-03 -4.3891678e-03
 -3.3015378e-05 -2.9254478e-04 -7.6746726e-03  9.6176388e-03
  4.9874675e-03  9.2359614e-03 -8.1652086e-03  4.4940533e-03
 -4.1290578e-03  8.3123095e-04  8.4988112e-03 -4.4639944e-03
  4.5358767e-03 -6.7961290e-03 -3.5461746e-03  9.4133997e-03
 -1.5779968e-03  3.1575607e-04 -4.1404814e-03 -7.69