# Using NLTk

In [13]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('punkt_tab')

text = "Hello world! NLP is amazing . Let's tokenize this sentence"

print("Word tokenize:",word_tokenize(text))

print("sSentence tokenize:",sent_tokenize(text))

Word tokenize: ['Hello', 'world', '!', 'NLP', 'is', 'amazing', '.', 'Let', "'s", 'tokenize', 'this', 'sentence']
sSentence tokenize: ['Hello world!', 'NLP is amazing .', "Let's tokenize this sentence"]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Using Spacy

In [14]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
print("word token:",[token.text for token in doc])
print("Sentence token:",[sent.text for sent in doc.sents])

word token: ['Hello', 'world', '!', 'NLP', 'is', 'amazing', '.', 'Let', "'s", 'tokenize', 'this', 'sentence']
Sentence token: ['Hello world!', 'NLP is amazing .', "Let's tokenize this sentence"]


NLP Tasks

# Basic Word Count & Frequency Distribution

In [15]:
from collections import Counter
word = word_tokenize(text.lower())
word_freq = Counter(word)
print("Total word count:",len(word))
print("Word frequency distribution :",word_freq.most_common(5))

Total word count: 12
Word frequency distribution : [('hello', 1), ('world', 1), ('!', 1), ('nlp', 1), ('is', 1)]


# Stopword removal with frequency analysis

In [16]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
filterd_words = [word for word in word if word not in stop_words]
filterd_freq = Counter(filterd_words)

print("Filterd word frequency distribution:", filterd_freq.most_common(5))

Filterd word frequency distribution: [('hello', 1), ('world', 1), ('!', 1), ('nlp', 1), ('amazing', 1)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# NER with Context Analysis

In [17]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm") # load spacy model
text = "Apple is looking at buying U.K. startup for $1 billion."
doc = nlp(text)

entity_freq = Counter([ent.text for ent in doc.ents])
for ent,freq in entity_freq.most_common():
  print(f"Entity:{ent},count:{freq},Lable:{nlp(ent)[0].ent_type}")

Entity:Apple,count:1,Lable:383
Entity:U.K.,count:1,Lable:384
Entity:$1 billion,count:1,Lable:394


# N-Grams with frequency analysis

In [18]:
from nltk.util import ngrams
bigrams = list(ngrams(word,2))
trigrams = list(ngrams(word,3))
bigram_freq = Counter(bigrams)
trigram_freq = Counter(trigrams)

print("Most common bigrams:",bigram_freq.most_common(5))
print("Most common trigrams:",trigram_freq.most_common(5))

Most common bigrams: [(('hello', 'world'), 1), (('world', '!'), 1), (('!', 'nlp'), 1), (('nlp', 'is'), 1), (('is', 'amazing'), 1)]
Most common trigrams: [(('hello', 'world', '!'), 1), (('world', '!', 'nlp'), 1), (('!', 'nlp', 'is'), 1), (('nlp', 'is', 'amazing'), 1), (('is', 'amazing', '.'), 1)]


# Keyword Extraction using TF-IDF with Tokenization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([text])
feature_array = vectorizer.get_feature_names_out()
importance = X.toarray().flatten()
important_words = sorted(zip(feature_array, importance), key=lambda x: x[1], reverse=True)[:5]
print("Top Keywords:", [word for word, _ in important_words])

Top Keywords: ['apple', 'at', 'billion', 'buying', 'for']


# POS Tagging

In [22]:
for token in doc:
  print(f"Token: {token.text},POS:{token.pos_}")

Token: Apple,POS:PROPN
Token: is,POS:AUX
Token: looking,POS:VERB
Token: at,POS:ADP
Token: buying,POS:VERB
Token: U.K.,POS:PROPN
Token: startup,POS:VERB
Token: for,POS:ADP
Token: $,POS:SYM
Token: 1,POS:NUM
Token: billion,POS:NUM
Token: .,POS:PUNCT


# Sentence similarity

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
sentence = ["I love NLP", "NLP is great for text processing", "Machine learning is amazing"]
vectorizeer = CountVectorizer()
x = vectorizer.fit_transform(sentence)
print("Sentence Similarity Matrix:")
print((x*x.T).toarray())

Sentence Similarity Matrix:
[[1.         0.20273527 0.        ]
 [0.20273527 1.         0.13464597]
 [0.         0.13464597 1.        ]]


# Text Summarization

In [26]:
sentence_score = {}
for sent in doc.sents:
  for word in word_tokenize(sent.text.lower()):
    if word in filterd_freq:
      sentence_score[sent.text] = sentence_score.get(sent.text,0) + filterd_freq[word]
sorted_sentences = sorted(sentence_score,key = sentence_score.get,reverse=True)
summary = " ".join(sorted_sentences[:2])
print("Summary:",summary)

Summary: Apple is looking at buying U.K. startup for $1 billion.


In [27]:
import spacy
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Download required NLTK tokenizer
nltk.download('punkt')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Input text
text = """Natural Language Processing (NLP) is a field of AI that enables machines to understand human language.
It is used in chatbots, sentiment analysis, and language translation.
One of the key challenges in NLP is understanding context and ambiguity in sentences.
Deep learning models like transformers have significantly improved NLP applications."""

# Process text with spaCy
doc = nlp(text)

# Tokenize words and compute frequency
words = [word.lower() for word in word_tokenize(text) if word.isalnum()]
filtered_freq = Counter(words)

# Compute sentence scores
sentence_scores = {}
for sent in doc.sents:
    for word in word_tokenize(sent.text.lower()):
        if word in filtered_freq:
            sentence_scores[sent.text] = sentence_scores.get(sent.text, 0) + filtered_freq[word]

# Sort sentences by score
sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)

# Select top-ranked sentences for summary
summary = " ".join(sorted_sentences[:2])

# Print summary
print("Summary:", summary)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Summary: Natural Language Processing (NLP) is a field of AI that enables machines to understand human language.
 One of the key challenges in NLP is understanding context and ambiguity in sentences.

