NLP Lab Implementation Using Python & NLTK

In [1]:
# Install necessary packages if not already installed
!pip install nltk scikit-learn




In [2]:
# Import required libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, RegexpParser
from sklearn.feature_extraction.text import TfidfVectorizer
import string


In [3]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

1. Tokenization
Word Tokenization

In [5]:
import nltk
nltk.download('punkt_tab')

text = "NLTK is a powerful Python library for NLP. It helps process text data easily."

words = word_tokenize(text)
print("Word Tokenization:", words)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word Tokenization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'NLP', '.', 'It', 'helps', 'process', 'text', 'data', 'easily', '.']


Sentence Tokenization

In [6]:
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)


Sentence Tokenization: ['NLTK is a powerful Python library for NLP.', 'It helps process text data easily.']


2. Stopword Removal

In [7]:
stop_words = set(stopwords.words('english'))

filtered_words = [word for word in words if word.lower() not in stop_words and word not in string.punctuation]
print("After Stopword Removal:", filtered_words)


After Stopword Removal: ['NLTK', 'powerful', 'Python', 'library', 'NLP', 'helps', 'process', 'text', 'data', 'easily']


3. Stemming

In [8]:
# Using Porter Stemmer
porter = PorterStemmer()
stemmed_words_porter = [porter.stem(word) for word in filtered_words]
print("Porter Stemmer:", stemmed_words_porter)

# Using Snowball Stemmer
snowball = SnowballStemmer('english')
stemmed_words_snowball = [snowball.stem(word) for word in filtered_words]
print("Snowball Stemmer:", stemmed_words_snowball)


Porter Stemmer: ['nltk', 'power', 'python', 'librari', 'nlp', 'help', 'process', 'text', 'data', 'easili']
Snowball Stemmer: ['nltk', 'power', 'python', 'librari', 'nlp', 'help', 'process', 'text', 'data', 'easili']


4. Parts of Speech (POS) Tagging

In [11]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = pos_tag(filtered_words)
print("POS Tags:", pos_tags)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


POS Tags: [('NLTK', 'NNP'), ('powerful', 'JJ'), ('Python', 'NNP'), ('library', 'NN'), ('NLP', 'NNP'), ('helps', 'VBZ'), ('process', 'VB'), ('text', 'JJ'), ('data', 'NNS'), ('easily', 'RB')]


5. Lemmatization

In [12]:
lemmatizer = WordNetLemmatizer()

# Function to map POS tag to WordNet POS tag
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['NLTK', 'powerful', 'Python', 'library', 'NLP', 'help', 'process', 'text', 'data', 'easily']


Chunking

In [14]:
# Define a simple NP (Noun Phrase) chunk grammar
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = RegexpParser(grammar)
tree = chunk_parser.parse(pos_tags)
print(tree)
# tree.draw()  # Opens a tree diagram window

(S
  NLTK/NNP
  powerful/JJ
  Python/NNP
  (NP library/NN)
  NLP/NNP
  helps/VBZ
  process/VB
  text/JJ
  data/NNS
  easily/RB)


7. Named Entity Recognition (NER)

In [16]:
import nltk
nltk.download('maxent_ne_chunker_tab')

ner_tree = ne_chunk(pos_tags)
print(ner_tree)
# ner_tree.draw()  # Opens a tree diagram window for named entities

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


(S
  NLTK/NNP
  powerful/JJ
  (PERSON Python/NNP)
  library/NN
  (ORGANIZATION NLP/NNP)
  helps/VBZ
  process/VB
  text/JJ
  data/NNS
  easily/RB)


8. TFâ€“IDF Calculation

In [17]:
documents = [
    "NLTK is a powerful Python library for NLP.",
    "It helps process text data easily.",
    "TF-IDF is used to find the importance of words in documents."
]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert to array and display
import pandas as pd
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(df)


       data  documents    easily      find       for     helps       idf  \
0  0.000000   0.000000  0.000000  0.000000  0.389888  0.000000  0.000000   
1  0.408248   0.000000  0.408248  0.000000  0.000000  0.408248  0.000000   
2  0.000000   0.293884  0.000000  0.293884  0.000000  0.000000  0.293884   

   importance        in        is  ...        of  powerful   process  \
0    0.000000  0.000000  0.296520  ...  0.000000  0.389888  0.000000   
1    0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.408248   
2    0.293884  0.293884  0.223506  ...  0.293884  0.000000  0.000000   

     python      text        tf       the        to      used     words  
0  0.389888  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
1  0.000000  0.408248  0.000000  0.000000  0.000000  0.000000  0.000000  
2  0.000000  0.000000  0.293884  0.293884  0.293884  0.293884  0.293884  

[3 rows x 24 columns]
