# Word Frequency Distribution

In [3]:
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

def word_frequency(text):
  words = word_tokenize(text)
  return Counter(words)

text = "NLP is amazing. NLP makes machines understand text."
print("Word Frequencies : ",word_frequency(text))

Word Frequencies :  Counter({'NLP': 2, '.': 2, 'is': 1, 'amazing': 1, 'makes': 1, 'machines': 1, 'understand': 1, 'text': 1})


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Bag Of Words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    "NLP is fun and exciting",
    "Machines understand NLP and text",
    "Text processing is part of NLP"
]

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(corpus)

print("Feature Names:",vectorizer.get_feature_names_out())
print("Bow Representation:")
print(x.toarray())

Feature Names: ['and' 'exciting' 'fun' 'is' 'machines' 'nlp' 'of' 'part' 'processing'
 'text' 'understand']
Bow Representation:
[[1 1 1 1 0 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 0 1 1]
 [0 0 0 1 0 1 1 1 1 1 0]]


# TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)

print("Feature Names:",vectorizer.get_feature_names_out())
print("TF-IDF Representation:")
print(x.toarray())

Feature Names: ['and' 'exciting' 'fun' 'is' 'machines' 'nlp' 'of' 'part' 'processing'
 'text' 'understand']
TF-IDF Representation:
[[0.40619178 0.53409337 0.53409337 0.40619178 0.         0.31544415
  0.         0.         0.         0.         0.        ]
 [0.40619178 0.         0.         0.         0.53409337 0.31544415
  0.         0.         0.         0.40619178 0.53409337]
 [0.         0.         0.         0.35829137 0.         0.27824521
  0.4711101  0.4711101  0.4711101  0.35829137 0.        ]]


# NLP Task - Keyword Extraction

In [7]:
import numpy as np
feature_array = np.array(vectorizer.get_feature_names_out())
importance = np.argsort(x.toarray()).flatten()[::-1]

keywords = feature_array[importance[:5]]
print("Top Keywords:",keywords)

Top Keywords: ['processing' 'of' 'part' 'is' 'text']
