In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [None]:
# Sample text for the preprocessing demo: six English sentences built around
# the "quick brown fox" pangram. Written one sentence per line for readability;
# adjacent literals are concatenated, each separated by a single space.
paragraph = (
    "The quick brown fox jumps over the lazy dog. "
    "This pangram is often used to test typewriters and keyboards. "
    "It contains every letter of the English alphabet, making it a perfect sentence for testing fonts and layouts. "
    "Despite its simplicity, it serves as a great example of how language can be used efficiently. "
    "Typists have used this sentence for decades to improve their speed and accuracy. "
    "The origins of this sentence are unclear, but its utility is universally acknowledged."
)

In [None]:
# Download the NLTK resources this notebook needs (already-present packages are
# reported as up-to-date and skipped):
#   punkt / punkt_tab - models behind nltk.sent_tokenize / nltk.word_tokenize
#   stopwords         - word lists used for stopword filtering
#   wordnet           - dictionary used by WordNetLemmatizer
# NLTK 3.8.2 and later load the tokenizer from 'punkt_tab' instead of 'punkt',
# so both are fetched to keep the tokenizer calls working on old and new releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Tokenize the paragraph into sentences.
# The result is a list with one string per sentence, punctuation still attached.
sentences = nltk.sent_tokenize(paragraph)

In [None]:
# Inspect the tokenized sentences (bare expression so the notebook renders the list)
sentences

['The quick brown fox jumps over the lazy dog.',
 'This pangram is often used to test typewriters and keyboards.',
 'It contains every letter of the English alphabet, making it a perfect sentence for testing fonts and layouts.',
 'Despite its simplicity, it serves as a great example of how language can be used efficiently.',
 'Typists have used this sentence for decades to improve their speed and accuracy.',
 'The origins of this sentence are unclear, but its utility is universally acknowledged.']

In [None]:
# Create one stemmer and one lemmatizer instance; the later cells reuse these
# objects instead of constructing new ones per word.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Build the cleaned corpus: one lowercase, letters-only string per sentence.
# Every character outside a-z / A-Z (punctuation, digits) is replaced by a
# space, so word boundaries survive but doubled or trailing spaces can remain.
corpus = [re.sub('[^a-zA-Z]', ' ', sentence).lower() for sentence in sentences]

In [None]:
# Example of stemming and lemmatization on a single word
print(stemmer.stem("running"))  # Example of stemming: prints "run"
print(lemmatizer.lemmatize("better", pos='a'))  # Example of lemmatization: pos='a' treats the word as an adjective; prints "good"

run
good


In [None]:
# Tokenization and stopword removal with stemming
# Build the English stopword set once per cell: it does not change between
# tokens, so rebuilding it from the NLTK word list for every word is wasted work.
stop_words = set(stopwords.words('english'))
for cleaned_sentence in corpus:
    words = nltk.word_tokenize(cleaned_sentence)
    for word in words:
        if word not in stop_words:
            # Porter stemming strips suffixes, so the printed stems are not
            # always dictionary words (e.g. "lazi", "everi").
            print(stemmer.stem(word))

quick
brown
fox
jump
lazi
dog
pangram
often
use
test
typewrit
keyboard
contain
everi
letter
english
alphabet
make
perfect
sentenc
test
font
layout
despit
simplic
serv
great
exampl
languag
use
effici
typist
use
sentenc
decad
improv
speed
accuraci
origin
sentenc
unclear
util
univers
acknowledg


In [None]:
# Tokenization and stopword removal with lemmatization
# Build the English stopword set once per cell: it does not change between
# tokens, so rebuilding it from the NLTK word list for every word is wasted work.
stop_words = set(stopwords.words('english'))
for cleaned_sentence in corpus:
    words = nltk.word_tokenize(cleaned_sentence)
    for word in words:
        if word not in stop_words:
            # lemmatize() is called without a pos argument, so each word is
            # looked up as a noun (the NLTK default). That is why verb forms
            # such as "used" and "making" print unchanged and "serves" prints
            # as "serf"; pass pos='v' for verbs if proper verb lemmas are needed.
            print(lemmatizer.lemmatize(word))

quick
brown
fox
jump
lazy
dog
pangram
often
used
test
typewriter
keyboard
contains
every
letter
english
alphabet
making
perfect
sentence
testing
font
layout
despite
simplicity
serf
great
example
language
used
efficiently
typist
used
sentence
decade
improve
speed
accuracy
origin
sentence
unclear
utility
universally
acknowledged


In [None]:
# Display an example sentence from the corpus (index 1 = second sentence).
# The trailing space in the output is the sentence's final period, which the
# cleaning regex replaced with a space.
print(corpus[1])

this pangram is often used to test typewriters and keyboards 


In [None]:
# TF-IDF Vectorization
# ngram_range=(1, 2): features are built from single words and two-word sequences.
# max_features = 3: only three vocabulary entries are kept, so every sentence
# becomes a 3-column vector.
# NOTE(review): the input is `corpus` (lowercased, letters-only text). The stemmed,
# lemmatized and stopword-filtered tokens from the earlier cells are only printed,
# never stored, and no stop_words argument is passed here — confirm that keeping
# stopwords as candidate features is intended.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = 3)
X = vectorizer.fit_transform(corpus)

In [None]:
# Example: Display the TF-IDF vector for a specific sentence
# X was fitted on corpus, so row i of X belongs to corpus[i]; toarray() turns
# the selected row into a dense array for printing.
print(X[1].toarray())  # vector for the second sentence (its text is not printed here)
print(corpus[2])  # cleaned text of the third sentence ...
print(X[2].toarray())  # ... followed by its vector

[[0.70710678 0.         0.70710678]]
it contains every letter of the english alphabet  making it a perfect sentence for testing fonts and layouts 
[[0.70710678 0.70710678 0.        ]]
