In [1]:
sentence1 = "I will walk 500 miles and I would walk 500 more. Just to be the man who walks a thousand miles to fall down at your door!"
sentence2 = "I played the play playfully as the players were playing in the play with playfullness"

#NLTK
The Natural Language Toolkit (NLTK) is a platform used for building Python programs that work with human language data for applying in statistical natural language processing (NLP).

#Tokenization
Tokenization is essentially splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words or terms. Each of these smaller units are called tokens.

In [2]:
import nltk
from nltk import word_tokenize , sent_tokenize
# to use word_tokenize and sent_tokenize. we must have to download punkt from nltk
nltk.download('punkt') #punkt is sentence boundary detection algorithm 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
sent_tokenize(sentence1) #split a document or paragraph into sentences

['I will walk 500 miles and I would walk 500 more.',
 'Just to be the man who walks a thousand miles to fall down at your door!']

In [8]:
token1=word_tokenize(sentence1) #split a sentence into tokens or words
token1

['I',
 'will',
 'walk',
 '500',
 'miles',
 'and',
 'I',
 'would',
 'walk',
 '500',
 'more',
 '.',
 'Just',
 'to',
 'be',
 'the',
 'man',
 'who',
 'walks',
 'a',
 'thousand',
 'miles',
 'to',
 'fall',
 'down',
 'at',
 'your',
 'door',
 '!']

In [9]:
token2=word_tokenize(sentence2)
token2

['I',
 'played',
 'the',
 'play',
 'playfully',
 'as',
 'the',
 'players',
 'were',
 'playing',
 'in',
 'the',
 'play',
 'with',
 'playfullness']

In [6]:
sent_tokenize(sentence2)

['I played the play playfully as the players were playing in the play with playfullness']

#Part of Speech (PoS) Tagging
A task of labelling each word in a sentence with its appropriate part of speech. Parts of speech include nouns, verb, adverbs, adjectives, pronouns, conjunction and their sub-categories.


In [7]:
from nltk import pos_tag
# to use pos tagger we must have to download averaged_perceptron_tagger from nltk.download()
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [10]:
tagged_token1=pos_tag(token1)
tagged_token1

[('I', 'PRP'),
 ('will', 'MD'),
 ('walk', 'VB'),
 ('500', 'CD'),
 ('miles', 'NNS'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ('would', 'MD'),
 ('walk', 'VB'),
 ('500', 'CD'),
 ('more', 'JJR'),
 ('.', '.'),
 ('Just', 'NNP'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('the', 'DT'),
 ('man', 'NN'),
 ('who', 'WP'),
 ('walks', 'VBZ'),
 ('a', 'DT'),
 ('thousand', 'NN'),
 ('miles', 'NNS'),
 ('to', 'TO'),
 ('fall', 'VB'),
 ('down', 'RP'),
 ('at', 'IN'),
 ('your', 'PRP$'),
 ('door', 'NN'),
 ('!', '.')]

In [11]:
tagged_token2=pos_tag(token2)
tagged_token2

[('I', 'PRP'),
 ('played', 'VBD'),
 ('the', 'DT'),
 ('play', 'NN'),
 ('playfully', 'RB'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('players', 'NNS'),
 ('were', 'VBD'),
 ('playing', 'VBG'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('play', 'NN'),
 ('with', 'IN'),
 ('playfullness', 'NN')]

#StopWords
The words which are generally filtered out before processing a natural language are called stop words. These are actually the most common words in any language (like articles, prepositions, pronouns, conjunctions, etc) and does not add much information to the text.

In [12]:
from nltk.corpus import stopwords
# to get all the stop words of any language. we first have to download it from nltk.download('stopwords)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
stop_words = stopwords.words('english')

In [14]:
clean_token1 = []
clean_token2 = []
for word in token1:
  if word not in stop_words:
    clean_token1.append(word)

for word in token2:
  if word not in stop_words:
    clean_token2.append(word)

In [15]:
clean_token1

['I',
 'walk',
 '500',
 'miles',
 'I',
 'would',
 'walk',
 '500',
 '.',
 'Just',
 'man',
 'walks',
 'thousand',
 'miles',
 'fall',
 'door',
 '!']

In [16]:
clean_token2

['I',
 'played',
 'play',
 'playfully',
 'players',
 'playing',
 'play',
 'playfullness']

#Stemming
It is normalization technique where the tokenized words are shortened to avoid the redundancy

In [17]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed1 = [stemmer.stem(word) for word in token1]
print(" ".join(stemmed1))

i will walk 500 mile and i would walk 500 more . just to be the man who walk a thousand mile to fall down at your door !


In [18]:
stemmed2 = [stemmer.stem(word) for word in token2]
print(" ".join(stemmed2))

i play the play play as the player were play in the play with playful


#Lemmatization
Considers the context and converts the word to its meaningful base form

The purpose of lemmatization is same as that of stemming but overcomes the drawbacks of stemming. In stemming, for some words, it may not give may not give meaningful representation such as “Histori”. Here, lemmatization comes into picture as it gives meaningful word.

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# to use lemmatizer, we must donwload wordnet from nltk.download('wordnet)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
lemmatized_output1 = [lemmatizer.lemmatize(word) for word in token1]
lemmatized_output2 = [lemmatizer.lemmatize(word) for word in token2]
print(" ".join(lemmatized_output1))

I will walk 500 mile and I would walk 500 more . Just to be the man who walk a thousand mile to fall down at your door !


In [21]:
print(" ".join(lemmatized_output2))

I played the play playfully a the player were playing in the play with playfullness


#Term Frequency (TF)
It calculates the frequency of each word in a document. It represents how often a word appears in the document

#Inverse Document Frequency (IDF) 
Itmeasures the importance of a word in a collection of documents. It penalizes common words and gives more weight to rare words.

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine the sentences into a single list
sentences = [
    "I will walk 500 miles and I would walk 500 more. Just to be the man who walks a thousand miles to fall down at your door!",
    "I played the play playfully as the players were playing in the play with playfullness"
]

# Create the TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit and transform the sentences
tfidf_matrix = vectorizer.fit_transform(sentences)

# Get the feature names
feature_names = vectorizer.get_feature_names_out() 

# Print the TF-IDF matrix
print("Term Frequency-Inverse Document Frequency:")
for i in range(len(sentences)):
    print(f"\nSentence {i+1}:")
    feature_index = tfidf_matrix[i].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    for feature_idx, tfidf_score in tfidf_scores:
        print(f"Term: {feature_names[feature_idx]}, TF-IDF: {tfidf_score:.4f}")


Term Frequency-Inverse Document Frequency:

Sentence 1:
Term: door, TF-IDF: 0.1782
Term: your, TF-IDF: 0.1782
Term: at, TF-IDF: 0.1782
Term: down, TF-IDF: 0.1782
Term: fall, TF-IDF: 0.1782
Term: thousand, TF-IDF: 0.1782
Term: walks, TF-IDF: 0.1782
Term: who, TF-IDF: 0.1782
Term: man, TF-IDF: 0.1782
Term: the, TF-IDF: 0.1268
Term: be, TF-IDF: 0.1782
Term: to, TF-IDF: 0.3563
Term: just, TF-IDF: 0.1782
Term: more, TF-IDF: 0.1782
Term: would, TF-IDF: 0.1782
Term: and, TF-IDF: 0.1782
Term: miles, TF-IDF: 0.3563
Term: 500, TF-IDF: 0.3563
Term: walk, TF-IDF: 0.3563
Term: will, TF-IDF: 0.1782

Sentence 2:
Term: playfullness, TF-IDF: 0.2387
Term: with, TF-IDF: 0.2387
Term: in, TF-IDF: 0.2387
Term: playing, TF-IDF: 0.2387
Term: were, TF-IDF: 0.2387
Term: players, TF-IDF: 0.2387
Term: as, TF-IDF: 0.2387
Term: playfully, TF-IDF: 0.2387
Term: play, TF-IDF: 0.4773
Term: played, TF-IDF: 0.2387
Term: the, TF-IDF: 0.5094
