In [1]:
pip install nltk scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
sample_text = """
Natural language processing (NLP) is a field of artificial intelligence that enables computers to understand, interpret, and generate human language. 
NLP is used in various applications, including chatbots, language translation, and sentiment analysis.
"""


In [32]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\a

True

In [28]:
# Step 1: Tokenization
tokens = word_tokenize(sample_text)
print("Tokens:", tokens)

Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'enables', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', '.', 'NLP', 'is', 'used', 'in', 'various', 'applications', ',', 'including', 'chatbots', ',', 'language', 'translation', ',', 'and', 'sentiment', 'analysis', '.']


In [38]:
# Step 2: POS Tagging (Part of Speech)
pos_tags = pos_tag(tokens)
print("\nPOS Tags:", pos_tags)



POS Tags: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('that', 'WDT'), ('enables', 'VBZ'), ('computers', 'NNS'), ('to', 'TO'), ('understand', 'VB'), (',', ','), ('interpret', 'VB'), (',', ','), ('and', 'CC'), ('generate', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('NLP', 'NNP'), ('is', 'VBZ'), ('used', 'VBN'), ('in', 'IN'), ('various', 'JJ'), ('applications', 'NNS'), (',', ','), ('including', 'VBG'), ('chatbots', 'NNS'), (',', ','), ('language', 'NN'), ('translation', 'NN'), (',', ','), ('and', 'CC'), ('sentiment', 'NN'), ('analysis', 'NN'), ('.', '.')]


In [34]:
# Step 3: Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nFiltered Tokens (No stop words):", filtered_tokens)


Filtered Tokens (No stop words): ['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', 'enables', 'computers', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', '.', 'NLP', 'used', 'various', 'applications', ',', 'including', 'chatbots', ',', 'language', 'translation', ',', 'sentiment', 'analysis', '.']


In [36]:
# Step 4: Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nStemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['natur', 'languag', 'process', '(', 'nlp', ')', 'field', 'artifici', 'intellig', 'enabl', 'comput', 'understand', ',', 'interpret', ',', 'gener', 'human', 'languag', '.', 'nlp', 'use', 'variou', 'applic', ',', 'includ', 'chatbot', ',', 'languag', 'translat', ',', 'sentiment', 'analysi', '.']


In [40]:
# Step 5: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nLemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'artificial', 'intelligence', 'enables', 'computer', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', '.', 'NLP', 'used', 'various', 'application', ',', 'including', 'chatbots', ',', 'language', 'translation', ',', 'sentiment', 'analysis', '.']


In [42]:
# Step 6: TF-IDF Calculation
# Here, we are considering our sample document as a collection of one document for the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([sample_text])

In [44]:
# Get the TF-IDF values as a dataframe
import pandas as pd
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(df_tfidf)


TF-IDF Matrix:
   analysis       and  applications  artificial  chatbots  computers  \
0  0.150756  0.301511      0.150756    0.150756  0.150756   0.150756   

    enables     field  generate     human  ...       nlp        of  \
0  0.150756  0.150756  0.150756  0.150756  ...  0.301511  0.150756   

   processing  sentiment      that        to  translation  understand  \
0    0.150756   0.150756  0.150756  0.150756     0.150756    0.150756   

       used   various  
0  0.150756  0.150756  

[1 rows x 27 columns]


In [46]:

# Term Frequency (TF)
print("\nTerm Frequency (TF):")
tf_values = df_tfidf.sum(axis=0)
print(tf_values)


Term Frequency (TF):
analysis        0.150756
and             0.301511
applications    0.150756
artificial      0.150756
chatbots        0.150756
computers       0.150756
enables         0.150756
field           0.150756
generate        0.150756
human           0.150756
in              0.150756
including       0.150756
intelligence    0.150756
interpret       0.150756
is              0.301511
language        0.452267
natural         0.150756
nlp             0.301511
of              0.150756
processing      0.150756
sentiment       0.150756
that            0.150756
to              0.150756
translation     0.150756
understand      0.150756
used            0.150756
various         0.150756
dtype: float64


In [48]:
# Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")
idf_values = vectorizer.idf_  # Get the IDF values from the vectorizer
idf_df = pd.DataFrame(idf_values, index=vectorizer.get_feature_names_out(), columns=["IDF"])  # Create DataFrame
print(idf_df)


Inverse Document Frequency (IDF):
              IDF
analysis      1.0
and           1.0
applications  1.0
artificial    1.0
chatbots      1.0
computers     1.0
enables       1.0
field         1.0
generate      1.0
human         1.0
in            1.0
including     1.0
intelligence  1.0
interpret     1.0
is            1.0
language      1.0
natural       1.0
nlp           1.0
of            1.0
processing    1.0
sentiment     1.0
that          1.0
to            1.0
translation   1.0
understand    1.0
used          1.0
various       1.0


In [50]:
# Tokenization
# Breaking text into smaller parts like words or sentences.
# Example: "I love NLP" → ["I", "love", "NLP"]
# POS (Part-of-Speech) Tagging
# Labeling each word with its grammar role, like noun, verb, adjective.
# Example: "run" → ("run", Verb) or ("run", Noun) depending on usage.
# Stop Words Removal
# Removing common words that don’t add much meaning.
# Example: "I am learning NLP" → "learning NLP" (removed: "I", "am")
#  Stemming
# Reducing words to their root form by chopping suffixes.
# Example: "playing", "played" → "play"
# Lemmatization
# Reducing words to their dictionary form (lemma) using grammar rules.
# Example: "better" → "good" (correct base word)
# TF (Term Frequency)
# It shows how often a word appears in a document.
# Formula: TF = (Number of times term appears in a document) / (Total number of terms in the document)
# Example: "I love NLP. NLP is great." → TF of "NLP" = 2 / 6 = 0.33
# IDF (Inverse Document Frequency)
# It shows how rare a word is across all documents.
# Formula: IDF = log(Total number of documents / Number of documents containing the term)
# Example: If "NLP" appears in 1 out of 10 documents → IDF = log(10 / 1) = 1
# TF-IDF
# It is the product of TF and IDF and highlights important words in a document.
# Formula: TF-IDF = TF × IDF
