In [1]:
# Text Analytics
# 1. Extract a sample document and apply the following document preprocessing methods:
#    tokenization, POS tagging, stop-word removal, stemming and lemmatization.
# 2. Create a representation of the document by calculating Term Frequency and
#    Inverse Document Frequency.

In [1]:
# Step 1: Import libraries
import string

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download required NLTK data
# Recent NLTK releases (3.9+) load 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# in place of the older pickled 'punkt' / 'averaged_perceptron_tagger' packages.
# Fetch both generations so word_tokenize and pos_tag work on old and new installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Step 2: Sample document
# A single English sentence that every later step operates on.
doc = (
    "Natural Language Processing is a branch of artificial intelligence "
    "that helps computers understand, interpret and manipulate human language."
)
print("Original Document:\n", doc)

Original Document:
 Natural Language Processing is a branch of artificial intelligence that helps computers understand, interpret and manipulate human language.


In [5]:
# Step 3: Tokenization
# Split the sentence into word and punctuation tokens; "english" is
# word_tokenize's default language, spelled out here for the reader.
tokens = word_tokenize(doc, language="english")
print("\nTokenization:\n", tokens)


Tokenization:
 ['Natural', 'Language', 'Processing', 'is', 'a', 'branch', 'of', 'artificial', 'intelligence', 'that', 'helps', 'computers', 'understand', ',', 'interpret', 'and', 'manipulate', 'human', 'language', '.']


In [6]:
# Step 4: POS Tagging
# Produces a list of (token, tag) pairs; "eng" is pos_tag's default language,
# spelled out here for the reader.
pos_tags = pos_tag(tokens, lang="eng")
print("\nPOS Tagging:\n", pos_tags)


POS Tagging:
 [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('branch', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('that', 'WDT'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), (',', ','), ('interpret', 'JJ'), ('and', 'CC'), ('manipulate', 'JJ'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


In [7]:
# Step 5: Stop Words Removal
# Keep a token only if it is purely alphabetic (this drops punctuation) and its
# lowercase form is not an English stop word. Original casing is preserved.
stop_words = set(stopwords.words('english'))
tokens_no_stop = list(
    filter(lambda word: word.isalpha() and word.lower() not in stop_words, tokens)
)
print("\nAfter Stop Words Removal:\n", tokens_no_stop)


After Stop Words Removal:
 ['Natural', 'Language', 'Processing', 'branch', 'artificial', 'intelligence', 'helps', 'computers', 'understand', 'interpret', 'manipulate', 'human', 'language']


In [8]:
# Step 6: Stemming
# Apply the Porter stemmer to each remaining token, in order.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens_no_stop))
print("\nAfter Stemming:\n", stemmed_words)


After Stemming:
 ['natur', 'languag', 'process', 'branch', 'artifici', 'intellig', 'help', 'comput', 'understand', 'interpret', 'manipul', 'human', 'languag']


In [9]:
# Step 7: Lemmatization
# WordNetLemmatizer.lemmatize() treats every word as a noun unless a part of
# speech is passed, so verb and adjective forms would be left unreduced.
# Reuse the Penn Treebank tags from Step 4, which were computed on the full
# sentence, to choose the WordNet POS for each word.
WORDNET_POS = {'J': 'a', 'V': 'v', 'R': 'r'}  # first tag letter -> WordNet POS; anything else is a noun

lemmatizer = WordNetLemmatizer()
lemmatized_words = [
    lemmatizer.lemmatize(word.lower(), WORDNET_POS.get(tag[:1], 'n'))
    for word, tag in pos_tags
    # Same filter as Step 5, applied to the tagged tokens so each word keeps its tag.
    if word.lower() not in stop_words and word.isalpha()
]
print("\nAfter Lemmatization:\n", lemmatized_words)


After Lemmatization:
 ['natural', 'language', 'processing', 'branch', 'artificial', 'intelligence', 'help', 'computer', 'understand', 'interpret', 'manipulate', 'human', 'language']


In [10]:
# Step 8: TF-IDF Vectorization
# Build a two-document corpus: the sample sentence plus a second simple
# sentence for context, then fit the vectorizer and transform in one call.
docs = [doc] + [
    "Machine learning is an important part of artificial intelligence that enables computers to learn from data."
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=docs)

In [11]:
# Convert to DataFrame
# Densify the TF-IDF matrix and label each column with its vocabulary term.
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Representation:\n")
print(tfidf_df)


TF-IDF Representation:

         an       and  artificial    branch  computers      data   enables  \
0  0.000000  0.249708    0.177669  0.249708   0.177669  0.000000  0.000000   
1  0.276951  0.000000    0.197053  0.000000   0.197053  0.276951  0.276951   

       from     helps     human  ...  learning   machine  manipulate  \
0  0.000000  0.249708  0.249708  ...  0.000000  0.000000    0.249708   
1  0.276951  0.000000  0.000000  ...  0.276951  0.276951    0.000000   

    natural        of      part  processing      that        to  understand  
0  0.249708  0.177669  0.000000    0.249708  0.177669  0.000000    0.249708  
1  0.000000  0.197053  0.276951    0.000000  0.197053  0.276951    0.000000  

[2 rows x 26 columns]
