In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Sample document
sample_document = """Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves various tasks, such as tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization."""

# Tokenization
tokens = word_tokenize(sample_document)

# POS Tagging
pos_tags = pos_tag(tokens)

# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Stemming
porter_stemmer = PorterStemmer()
stemmed_tokens = [porter_stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

# TF-IDF Representation
documents = [sample_document]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Display Results
print("Original Document:\n", sample_document, "\n")
print("Tokenization:\n", tokens, "\n")
print("POS Tagging:\n", pos_tags, "\n")
print("Stop Words Removal:\n", filtered_tokens, "\n")
print("Stemming:\n", stemmed_tokens, "\n")
print("Lemmatization:\n", lemmatized_tokens, "\n")
print("TF-IDF Representation:\n", tfidf_matrix.toarray(), "\n")
print("Feature Names:\n", feature_names)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Document:
 Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves various tasks, such as tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization. 

Tokenization:
 ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.', 'It', 'enables', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human-like', 'text', '.', 'NLP', 'involves', 'various', 'tasks', ',', 'such', 'as', 'tokenization', ',', 'part-of-speech', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.'] 

POS Tagging:
 [('Natural', 'JJ

In [2]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...


True

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Sample document
sample_document = """Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves various tasks, such as tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization."""

# Tokenization
tokens = word_tokenize(sample_document)

# POS Tagging
pos_tags = pos_tag(tokens)

# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Stemming
porter_stemmer = PorterStemmer()
stemmed_tokens = [porter_stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

# TF-IDF Representation
documents = [sample_document]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Display Results
print("Original Document:\n", sample_document, "\n")
print("Tokenization:\n", tokens, "\n")
print("POS Tagging:\n", pos_tags, "\n")
print("Stop Words Removal:\n", filtered_tokens, "\n")
print("Stemming:\n", stemmed_tokens, "\n")
print("Lemmatization:\n", lemmatized_tokens, "\n")
print("TF-IDF Representation:\n", tfidf_matrix.toarray(), "\n")
print("Feature Names:\n", feature_names)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Document:
 Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves various tasks, such as tokenization, part-of-speech tagging, stop words removal, stemming, and lemmatization. 

Tokenization:
 ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.', 'It', 'enables', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human-like', 'text', '.', 'NLP', 'involves', 'various', 'tasks', ',', 'such', 'as', 'tokenization', ',', 'part-of-speech', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.'] 

POS Tagging:
 [('Natural', 'JJ

In [2]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

sentence = "Hello I am AMAN MANSURI. I am from Nashik District. I will be an Engineer in few months."
#Tokenization
nltk.download('punkt')
tokenized_words = word_tokenize(sentence)
tokenized_sentences = sent_tokenize(sentence)
print(tokenized_words)
print(tokenized_sentences)

['Hello', 'I', 'am', 'AMAN', 'MANSURI', '.', 'I', 'am', 'from', 'Nashik', 'District', '.', 'I', 'will', 'be', 'an', 'Engineer', 'in', 'few', 'months', '.']
['Hello I am AMAN MANSURI.', 'I am from Nashik District.', 'I will be an Engineer in few months.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#Stop words removal
nltk.download('stopwords')
stop_words = stopwords.words('english')
cleaned_token = []
for i in tokenized_words:
 if i not in stop_words:
     cleaned_token.append(i)
print("Unclean version ", tokenized_words)
print("Clean Version", cleaned_token)


Unclean version  ['Hello', 'I', 'am', 'AMAN', 'MANSURI', '.', 'I', 'am', 'from', 'Nashik', 'District', '.', 'I', 'will', 'be', 'an', 'Engineer', 'in', 'few', 'months', '.']
Clean Version ['Hello', 'I', 'AMAN', 'MANSURI', '.', 'I', 'Nashik', 'District', '.', 'I', 'Engineer', 'months', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Stemming
snowball_stemmer = SnowballStemmer('english')
stemmed_words = []
for i in tokenized_words:
 stemmed = snowball_stemmer.stem(i)
 stemmed_words.append(stemmed)
print(stemmed_words)

['hello', 'i', 'am', 'aman', 'mansuri', '.', 'i', 'am', 'from', 'nashik', 'district', '.', 'i', 'will', 'be', 'an', 'engin', 'in', 'few', 'month', '.']


In [6]:
#Lemmatization
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_words = []
for i in tokenized_words:
 lemmatized = wordnet_lemmatizer.lemmatize(i)
 lemmatized_words.append(lemmatized)
print(lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['Hello', 'I', 'am', 'AMAN', 'MANSURI', '.', 'I', 'am', 'from', 'Nashik', 'District', '.', 'I', 'will', 'be', 'an', 'Engineer', 'in', 'few', 'month', '.']


In [7]:
#Pos Tagging
# dt - determinnant
# NN - noun
# In - prep / conjunc
nltk.download('averaged_perceptron_tagger')
pos_tag = nltk.pos_tag(tokenized_words)
print(pos_tag)
'''NNP- proper noun
PRP-personal pronoun
VBP-verm base form
JJ-adjective
NN-Noun
IN-preposition/conjunction
RB-adverb
DT-determiner
MD-model auxiliary'''

[('Hello', 'NNP'), ('I', 'PRP'), ('am', 'VBP'), ('AMAN', 'NNP'), ('MANSURI', 'NNP'), ('.', '.'), ('I', 'PRP'), ('am', 'VBP'), ('from', 'IN'), ('Nashik', 'NNP'), ('District', 'NNP'), ('.', '.'), ('I', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('an', 'DT'), ('Engineer', 'NNP'), ('in', 'IN'), ('few', 'JJ'), ('months', 'NNS'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenono\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


'NNP- proper noun\nPRP-personal pronoun\nVBP-verm base form\nJJ-adjective\nNN-Noun\nIN-preposition/conjunction\nRB-adverb\nDT-determiner\nMD-model auxiliary'

In [8]:

d0 = "Good Morning"
d1 = "Do daily exercise in the morning "
d2 = "exercise is good for health"
series = [d0, d1, d2]
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(series)
print("Word Indexing: ", tfidf.vocabulary_)
print("tf-idf in matrix form: \n", result.toarray())

Word Indexing:  {'good': 4, 'morning': 8, 'do': 1, 'daily': 0, 'exercise': 2, 'in': 6, 'the': 9, 'is': 7, 'for': 3, 'health': 5}
tf-idf in matrix form: 
 [[0.         0.         0.         0.         0.70710678 0.
  0.         0.         0.70710678 0.        ]
 [0.44036207 0.44036207 0.3349067  0.         0.         0.
  0.44036207 0.         0.3349067  0.44036207]
 [0.         0.         0.37302199 0.49047908 0.37302199 0.49047908
  0.         0.49047908 0.         0.        ]]
