In [2]:
import pandas as pd
import nltk

### 1.word_tokenize, sent_tokenize

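punkt is a pre-trained sentence tokenizer model in NLTK (Natural Language Toolkit). It was trained with an unsupervised algorithm, so it can find sentence boundaries without explicit rules for every language. `sent_tokenize` uses it directly, and `word_tokenize` uses it to split text into sentences before splitting each sentence into words.
punkt_tab is the same tokenizer data in the format that newer NLTK releases load, so both resources are downloaded below.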

In [3]:

# Fetch both Punkt resources (same order as before) and only then
# bring the tokenizer helpers into scope.
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [4]:


text = "NLP -- dff is >, amazing! I love learning it."

# Split the same string two ways: into sentences and into word-level tokens.
# The two calls are independent of each other.
sentence_tokens = sent_tokenize(text)
word_tokens = word_tokenize(text)

print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)


Word Tokens: ['NLP', '--', 'dff', 'is', '>', ',', 'amazing', '!', 'I', 'love', 'learning', 'it', '.']
Sentence Tokens: ['NLP -- dff is >, amazing!', 'I love learning it.']


In [6]:
from nltk.tokenize import sent_tokenize

# Short fragments, each closed by '.' or '?': the recorded output shows
# every one of them coming back as its own sentence.
text = "Hello. there. How are. you? I'm learning NLP."
sentences = sent_tokenize(text)
print(sentences)


['Hello.', 'there.', 'How are.', 'you?', "I'm learning NLP."]


In [7]:
# A parenthetical inside a sentence: the recorded output keeps the whole
# string as a single sentence.
text = "The event (held yesterday) was amazing!"
sentences = sent_tokenize(text)
print(sentences)


['The event (held yesterday) was amazing!']


In [8]:
# Abbreviations with periods ("Dr.", "Ph.D.") and a quoted title ending in
# a period: the recorded output splits this into three sentences without
# breaking at the abbreviations.
text = "Dr. Brown, Ph.D., is an expert. He wrote a paper titled 'AI & NLP: A Revolution.' It's quite famous."
sentences = sent_tokenize(text)
print(sentences)


['Dr. Brown, Ph.D., is an expert.', "He wrote a paper titled 'AI & NLP: A Revolution.'", "It's quite famous."]


### 2. Stopwords Removal

Stopwords are common words that don’t carry much meaning and are often removed from text before further processing.
Examples: "is", "the", "and", "in", "on", "at".
Removing them reduces the size of the text data.
It also improves efficiency by focusing on meaningful words.


In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
# Fetch the stopwords corpus read through nltk.corpus.stopwords.
# Kept as the cell's last expression, so its return value is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:

text = "This is an amazing NLP tutorial."
words = word_tokenize(text)

# Build the stopword lookup once, as a set. Calling stopwords.words('english')
# inside the comprehension would re-evaluate it for every token and test
# membership against a sequence instead of a hash set.
english_stopwords = set(stopwords.words('english'))

# Compare on the lowercase form so capitalised tokens such as "This" are
# removed too; the original casing is kept for the tokens that survive.
filtered_words = [word for word in words if word.lower() not in english_stopwords]

print("Original:", words)
print("Without Stopwords:", filtered_words)


Original: ['This', 'is', 'an', 'amazing', 'NLP', 'tutorial', '.']
Without Stopwords: ['amazing', 'NLP', 'tutorial', '.']


Advanced: Custom Stopword List
We can define our own stopword list:

In [12]:
text = "This is an amazing NLP tutorial.i love this topic, i do not like something"
custom_stopwords = {"nlp", "tutorial", "do", "like"}

# Keep only the tokens whose lowercase form is not in the custom list.
# Note: there is no space after the period in "tutorial.i", so it comes out
# as a single token (see the recorded output) and "tutorial" does not match it.
words = word_tokenize(text)
filtered_words = list(
    filter(lambda token: token.lower() not in custom_stopwords, words)
)
print(filtered_words)


['This', 'is', 'an', 'amazing', 'tutorial.i', 'love', 'this', 'topic', ',', 'i', 'not', 'something']


In [14]:


text = "This is not just an NLP tutorial, but a great learning experience."
words = word_tokenize(text)

# Start from NLTK's English stopword list, then take out the words that
# should survive filtering ("not" and "but" both occur in the sentence).
words_to_keep = {"not", "but"}
default_stopwords = set(stopwords.words("english"))
modified_stopwords = default_stopwords.difference(words_to_keep)

# Drop every token whose lowercase form is still in the reduced stopword set.
filtered_words = list(
    filter(lambda token: token.lower() not in modified_stopwords, words)
)

print("Original Words:", words)
print("Filtered Words:", filtered_words)


Original Words: ['This', 'is', 'not', 'just', 'an', 'NLP', 'tutorial', ',', 'but', 'a', 'great', 'learning', 'experience', '.']
Filtered Words: ['not', 'NLP', 'tutorial', ',', 'but', 'great', 'learning', 'experience', '.']


### Stemming (Reducing Words to Their Root Form)

For example, "running" → "run", "flies" → "fli" (but this can be inaccurate).


In [15]:
from nltk.stem import PorterStemmer
text = "The running dogs are studying harder than others."
ps = PorterStemmer()

# Stem each token on its own; the recorded output shows the stems are not
# always dictionary words (e.g. "studying" becomes "studi").
words = word_tokenize(text)
stemmed_words = list(map(ps.stem, words))
print(stemmed_words)


['the', 'run', 'dog', 'are', 'studi', 'harder', 'than', 'other', '.']


### Lemmatization (Context-Aware Root Word Extraction)

Lemmatization is smarter because it converts words to their dictionary form (lemma) using linguistic rules.
For example, "running" → "run", "flies" → "fly" (correct!).

In [16]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data before WordNetLemmatizer is used in the next cell.
# Kept as the cell's last expression, so its return value is displayed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:


text = "The running dogs are studying harder than others."
lemmatizer = WordNetLemmatizer()

# lemmatize() is called with the token only, so the lemmatizer falls back to
# its default part of speech. In the recorded output "running" and "studying"
# are left unchanged while "dogs" becomes "dog".
words = word_tokenize(text)
lemmatized_words = list(map(lemmatizer.lemmatize, words))
print(lemmatized_words)


['The', 'running', 'dog', 'are', 'studying', 'harder', 'than', 'others', '.']


### 🔹 4. Part-of-Speech (POS) Tagging
POS tagging assigns grammatical labels (noun, verb, adjective, etc.) to words.

In [18]:
from nltk import pos_tag

# Fetch the tagger data under both resource names before pos_tag is called.
# The second call stays last so its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [19]:
# Despite its name, `sentence` holds the list of tokens; the tagging cell
# that follows passes it to pos_tag.
raw_sentence = "John is running fast."
sentence = word_tokenize(raw_sentence)
print(sentence)

['John', 'is', 'running', 'fast', '.']


In [20]:

# Tag the token list built in the previous cell; the printed result is a
# list of (token, tag) pairs.
pos_tags = pos_tag(sentence)
print(pos_tags)


[('John', 'NNP'), ('is', 'VBZ'), ('running', 'VBG'), ('fast', 'RB'), ('.', '.')]


Lemmatization with POS (Better Results)

In [21]:


# Lemmatize every token as if it were a verb (pos="v"). The tags produced by
# pos_tag are not consulted here; the same part of speech is used for all tokens.
# `words` and `lemmatizer` are the values left in the kernel by the earlier
# lemmatization cell ("The running dogs are studying harder than others.").
lemmatized_words = [lemmatizer.lemmatize(token, pos="v") for token in words]

print("Lemmatized Words (with POS):", lemmatized_words)


Lemmatized Words (with POS): ['The', 'run', 'dog', 'be', 'study', 'harder', 'than', 'others', '.']


### Named Entity Recognition (NER)

NER identifies real-world entities in text, like names, places, dates, and organizations. 😊

In [22]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

In [23]:
# Fetch the named-entity chunker data under both resource names, plus the
# `words` corpus. The last call stays last so its return value is displayed.
nltk.download("maxent_ne_chunker")
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')  # needed when using nltk.ne_chunk() for NER; spaCy does not need it.


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Sandeep.C\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [24]:
text = "Elon Musk founded SpaceX in California in 2002."

# Pipeline: tokens -> (token, tag) pairs -> tree with named-entity subtrees.
words = word_tokenize(text)
pos_tags = pos_tag(words)

# NOTE(review): `ber_tree` looks like a typo for `ner_tree`; the name is kept
# because it is a notebook-global that other cells may read.
ber_tree = ne_chunk(pos_tags)
print(ber_tree)

(S
  (PERSON Elon/NNP)
  (PERSON Musk/NNP)
  founded/VBD
  (ORGANIZATION SpaceX/NNP)
  in/IN
  (GPE California/NNP)
  in/IN
  2002/CD
  ./.)


spaCy provides faster and more accurate NER than nltk

In [1]:
# ! pip install spacy

In [26]:
import spacy

# Load the small English pipeline and run it over one example sentence.
nlp = spacy.load("en_core_web_sm")
text = "Bill Gates founded Microsoft in 1975 in the United States."
doc = nlp(text)

# One "<entity text> -> <label>" line per entity found in the document.
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


Bill Gates -> PERSON
Microsoft -> ORG
1975 -> DATE
the United States -> GPE


### Chunking & Chinking

Chunking is like grouping words that belong together in a sentence. For example, in "The quick brown fox", all these words describe a noun (fox). Chunking helps us extract meaningful phrases like noun phrases (NP).
Chinking is removing unwanted words from a chunk.
For example, if we chunk "The quick brown fox jumps," but we don’t want the verb "jumps", we chink (remove) it.

Why Do We Need chunk_grammar and POS Tagging in Chunking?
The chunk_grammar is a set of rules written using Regular Expressions (RegEx) to define which POS tags should be grouped into a chunk.
Why Do We Use POS Tagging Before Chunking?
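POS tagging tells us the role of each word in a sentence. Without it, we wouldn't know which words are determiners, adjectives, or nouns, so the chunk grammar would have nothing to match.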

In [27]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser

# Noun-phrase rule: an optional determiner, any number of adjectives, then
# one token tagged NN.
chunk_grammar = r"NP: {<DT>?<JJ>*<NN>}"
sentence = "The quick brown fox jumps over the lazy dog"

# The parser is given (token, tag) pairs, so tokenize and POS-tag first.
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)

# Build the parser from the grammar and apply it to the tagged tokens.
chunk_parser = RegexpParser(chunk_grammar)
chunk_result = chunk_parser.parse(pos_tags)

# Text rendering of the chunk tree; chunk_result.draw() would open a tree
# diagram instead.
print(chunk_result)


(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN))


###  Dependency Parsing

Dependency Parsing is a technique in NLP that helps understand the grammatical structure of a sentence by analyzing how words are related to each other.
It identifies:
Which word is the main verb (root)
Which words depend on others (subjects, objects, modifiers, etc.)

    The cat chased the mouse
    chased (ROOT)
     ├── cat (Subject)
     │    └── The (Determiner for "cat")
     └── mouse (Object)
          └── the (Determiner for "mouse")


In [14]:
import spacy
# Load the English pipeline and parse a single sentence.
nlp = spacy.load("en_core_web_sm")
sentence = "The cat chased the mouse."
doc = nlp(sentence)

# One line per token: the token, its dependency label, and the head token
# it attaches to.
for token in doc:
    print(f"{token.text} --> {token.dep_} --> {token.head.text}")


The --> det --> cat
cat --> nsubj --> chased
chased --> ROOT --> chased
the --> det --> mouse
mouse --> dobj --> chased
. --> punct --> chased


#### Coreference Resolution

Coreference resolution is the process of linking pronouns and noun phrases to the correct entities in a sentence or document.

🔹 Example
👉 "John went to the market. He bought some apples."
✅ Coreference Resolution: "He" → "John"

👉 "Sara met Priya. She gave her a book."
✅ Coreference Resolution: "She" → "Sara", "her" → "Priya"

In [None]:
# Coreference resolution with neuralcoref.
# This cell has no recorded execution count or output in the notebook.
# NOTE(review): neuralcoref is reported to support only spaCy 2.x — confirm
# against the installed spaCy version before running this cell.
import spacy
import neuralcoref  # Third-party package that adds coreference resolution to spaCy
nlp = spacy.load("en_core_web_sm") # Load the spaCy model
neuralcoref.add_to_pipe(nlp) # Add NeuralCoref to the spaCy pipeline
text = "John went to the market. He bought some apples." # Example sentence
doc = nlp(text) # Process the text
print(doc._.coref_resolved) # Print the `coref_resolved` extension attribute of the processed doc


In [None]:
# Alternative: coreference resolution with an AllenNLP predictor.
# This cell has no recorded execution count or output in the notebook.
from allennlp.predictors.predictor import Predictor
# NOTE(review): presumably imported for its side effect of registering model
# classes; a coreference archive may need a different allennlp_models
# submodule — confirm.
import allennlp_models.tagging
# Load the pre-trained coreference model archive from the URL below.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
text = "Sara met Priya. She gave her a book."
# The text is passed under the `document` keyword; the raw result is printed as-is.
result = predictor.predict(document=text)
print(result)
