In [14]:
import nltk

Step 1: **Tokenization**

Tokenization is the process of breaking text down into individual tokens — words and punctuation marks. We'll use NLTK's `word_tokenize` function to tokenize the text:

In [15]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize depends on the Punkt tokenizer models, which are a separate
# NLTK download. Fetch them here (quietly, and only if missing) so this cell
# runs on a fresh environment, the same way the later cells fetch their own
# resources.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt', quiet=True)

text = "This is an example sentence."
# Punctuation is emitted as its own token, so the final '.' appears in the list.
tokens = word_tokenize(text)
print(tokens)  # Output: ['This', 'is', 'an', 'example', 'sentence', '.']

['This', 'is', 'an', 'example', 'sentence', '.']


Step 2: **Stopword Removal**

Stopwords are common words such as "the", "and", and "a" that add little to the meaning of the text. We'll remove them using NLTK's English stopword list. Tokens are lowercased before the lookup, so capitalized stopwords such as "This" are removed too:

In [16]:
from nltk.corpus import stopwords
# The stopword lists are a separate NLTK corpus. Fetch it quietly so this cell
# also runs on a machine that has never downloaded NLTK data.
nltk.download('stopwords', quiet=True)
# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

# Compare on the lowercased token so capitalised stopwords ("This") are dropped
# as well. Punctuation is not in the stopword list, so '.' is kept.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)  # Output: ['example', 'sentence', '.']

['example', 'sentence', '.']


Step 3: **Stemming or Lemmatization**

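Stemming and lemmatization both reduce words to a base form. Stemming strips suffixes by rule and can produce stems that are not real words (e.g. "exampl"), whereas lemmatization maps each word to its dictionary form. We'll start with stemming, using NLTK's Porter stemmer, and then lemmatize the same tokens: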

In [17]:
from nltk.stem import PorterStemmer
# One stemmer instance is reused for every token.
stemmer = PorterStemmer()

# Apply the stemmer to each stopword-filtered token. The stems need not be
# dictionary words (e.g. 'exampl'), and the '.' token comes through unchanged.
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)  # Output: ['exampl', 'sentenc', '.']

['exampl', 'sentenc', '.']


In [18]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
# The lemmatizer needs the WordNet corpus, so download it before first use.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# lemmatize() is called with its default part-of-speech argument. For this
# sentence the filtered tokens are already in dictionary form, so they come
# back unchanged.
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print(lemmatized_tokens)  # Output: ['example', 'sentence', '.']

['example', 'sentence', '.']


[nltk_data] Downloading package wordnet to /home/qdr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
# Part-of-speech (POS) tagging
# The tagger model is a separate NLTK resource; download it before tagging.
nltk.download('averaged_perceptron_tagger')
# Tagging runs on the stopword-filtered tokens, not on the full sentence, so
# the result is a list of (token, tag) pairs for those tokens only.
tagged_tokens = nltk.tag.pos_tag(filtered_tokens)
print(tagged_tokens)  # Output: [('example', 'NN'), ('sentence', 'NN'), ('.', '.')]

[('example', 'NN'), ('sentence', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/qdr/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# Named-entity recognition (NER)
# Two extra resources are downloaded before chunking: the chunker model and
# the 'words' corpus.
nltk.download('maxent_ne_chunker')
nltk.download('words')
# ne_chunk takes the POS-tagged tokens and returns a tree. No entity subtrees
# are produced for this sentence, so the tree is flat.
entities = nltk.ne_chunk(tagged_tokens)
print(entities)  # Output: (S example/NN sentence/NN ./.)

(S example/NN sentence/NN ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/qdr/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/qdr/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [22]:
# vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts for the raw sentence. CountVectorizer does its own
# tokenization and lowercasing, so it is fed `text`, not the NLTK tokens.
vectorizer = CountVectorizer()
documents = [text]  # fit_transform expects an iterable of documents
X = vectorizer.fit_transform(documents)

# Vocabulary in column order, followed by the per-document count matrix.
print(vectorizer.get_feature_names_out())  # Output: ['an', 'example', 'is', 'sentence', 'this']
print(X.toarray())  # Output: [[1 1 1 1 1]]


['an' 'example' 'is' 'sentence' 'this']
[[1 1 1 1 1]]
