In [1]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
document = """
   "I love programming in Python. It's very interesting and fun to solve problems using Python This is the Seventh Practical."
"""

In [None]:
tokens = word_tokenize(document)
tokens

['``',
 'I',
 'love',
 'programming',
 'in',
 'Python',
 '.',
 'It',
 "'s",
 'very',
 'interesting',
 'and',
 'fun',
 'to',
 'solve',
 'problems',
 'using',
 'Python',
 'This',
 'is',
 'the',
 'Seventh',
 'Practical',
 '.',
 "''"]

In [None]:
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('``', '``'), ('I', 'PRP'), ('love', 'VBP'), ('programming', 'VBG'), ('in', 'IN'), ('Python', 'NNP'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('very', 'RB'), ('interesting', 'JJ'), ('and', 'CC'), ('fun', 'NN'), ('to', 'TO'), ('solve', 'VB'), ('problems', 'NNS'), ('using', 'VBG'), ('Python', 'NNP'), ('This', 'DT'), ('is', 'VBZ'), ('the', 'DT'), ('Seventh', 'JJ'), ('Practical', 'NNP'), ('.', '.'), ("''", "''")]


In [None]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens (No Stop Words):", filtered_tokens)

Filtered Tokens (No Stop Words): ['``', 'love', 'programming', 'Python', '.', "'s", 'interesting', 'fun', 'solve', 'problems', 'using', 'Python', 'Seventh', 'Practical', '.', "''"]


In [None]:

stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['``', 'love', 'program', 'python', '.', "'s", 'interest', 'fun', 'solv', 'problem', 'use', 'python', 'seventh', 'practic', '.', "''"]


In [None]:

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['``', 'love', 'programming', 'Python', '.', "'s", 'interesting', 'fun', 'solve', 'problem', 'using', 'Python', 'Seventh', 'Practical', '.', "''"]


In [None]:
from collections import Counter
word_counts = Counter(filtered_tokens)
total_words = len(filtered_tokens)
tf = {word: count / total_words for word, count in word_counts.items()}
print("Term Frequency (TF):", tf)

Term Frequency (TF): {'``': 0.0625, 'love': 0.0625, 'programming': 0.0625, 'Python': 0.125, '.': 0.125, "'s": 0.0625, 'interesting': 0.0625, 'fun': 0.0625, 'solve': 0.0625, 'problems': 0.0625, 'using': 0.0625, 'Seventh': 0.0625, 'Practical': 0.0625, "''": 0.0625}
