#### Import Libraries & Load Models

In [1]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [None]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#### Input Text

In [3]:
text="""Apple Inc. is planning to open a new office in Bengaluru in 2025.
This will create thousands of job opportunities for software engineers.
Tim Cook announced this in a press conference in California."""

#### Sentence Tokenization

In [4]:
sentences=sent_tokenize(text)
print(sentences)

['Apple Inc. is planning to open a new office in Bengaluru in 2025.', 'This will create thousands of job opportunities for software engineers.', 'Tim Cook announced this in a press conference in California.']


#### Word Tokenization

In [5]:
words=word_tokenize(text)
print(words)

['Apple', 'Inc.', 'is', 'planning', 'to', 'open', 'a', 'new', 'office', 'in', 'Bengaluru', 'in', '2025', '.', 'This', 'will', 'create', 'thousands', 'of', 'job', 'opportunities', 'for', 'software', 'engineers', '.', 'Tim', 'Cook', 'announced', 'this', 'in', 'a', 'press', 'conference', 'in', 'California', '.']


#### Stopwords Removal


In [6]:
stop_words_list=stopwords.words('english')
filtered_words=[i for i in words if i.lower() not in stop_words_list]

#### Stemming

In [7]:
stemmer=PorterStemmer()
stemmed=[stemmer.stem(i) for i in filtered_words]

#### Lemmatization

In [8]:
lemma=WordNetLemmatizer()
lemmatized=[lemma.lemmatize(i) for i in filtered_words]

#### Part of Speech tagging

In [10]:
nltk.download('averaged_perceptron_tagger_eng')
#nltk.download('maxent_')
pos_tags=pos_tag(filtered_words)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


#### Named Entity Recognition

In [12]:
nltk.download('maxent_ne_chunker_tab')
ne_tree=ne_chunk(pos_tags)
#ne_tree.draw()
tree2conlltags(ne_tree)

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


[('Apple', 'NNP', 'B-PERSON'),
 ('Inc.', 'NNP', 'O'),
 ('planning', 'VBG', 'O'),
 ('open', 'JJ', 'O'),
 ('new', 'JJ', 'O'),
 ('office', 'NN', 'O'),
 ('Bengaluru', 'NNP', 'O'),
 ('2025', 'CD', 'O'),
 ('.', '.', 'O'),
 ('create', 'VB', 'O'),
 ('thousands', 'NNS', 'O'),
 ('job', 'NN', 'O'),
 ('opportunities', 'NNS', 'O'),
 ('software', 'NN', 'O'),
 ('engineers', 'NNS', 'O'),
 ('.', '.', 'O'),
 ('Tim', 'NNP', 'B-PERSON'),
 ('Cook', 'NNP', 'I-PERSON'),
 ('announced', 'VBD', 'O'),
 ('press', 'NN', 'O'),
 ('conference', 'NN', 'O'),
 ('California', 'NNP', 'B-GPE'),
 ('.', '.', 'O')]

#### Document similarity using Cosine

In [13]:
text1="I love natural language processing"
text2="Personally i feel nlp is what i love"
text3="I love natural language processing"
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer()
vectors=cv.fit_transform([text1,text2,text3])

from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(vectors[0],vectors[1])

array([[0.20412415]])

Nouns

NN	Noun, singular	book
NNS	Noun, plural	books
NNP	Proper noun, singular	London
NNPS	Proper noun, plural	Indians



---

Verbs

VB	Base form	eat
VBD	Past tense	ate
VBG	Gerund/Present participle	eating
VBN	Past participle	eaten
VBP	Present (non-3rd person)	eat (I/you eat)
VBZ	Present (3rd person)	eats (he eats)



---

Adjectives

JJ	Adjective	beautiful
JJR	Comparative	prettier
JJS	Superlative	prettiest



---

Adverbs

RB	Adverb	quickly
RBR	Comparative adv.	faster
RBS	Superlative adv.	fastest



---

Pronouns

PRP	Personal pronoun	I, you, he
PRP$	Possessive pronoun	my, your
WP	Wh-pronoun	who
WP$	Possessive wh-pronoun	whose



---

Determiners & Conjunctions

DT	Determiner	the, a, an
CC	Coordinating conj.	and, but
IN	Preposition/Subconj.	in, of, that



---

Others

TO	"to" as preposition	to run
MD	Modal verb	can, must, will
EX	Existential "there"	there is
UH	Interjection	oh, wow
CD	Cardinal number	one, 2023
FW	Foreign word	déjà vu
SYM	Symbol	%, $, +

In [None]:
# Step 1: Install Required Libraries
!pip install wikipedia wordcloud

In [32]:

#web scrapping
# Step 2: Import Libraries
import wikipedia
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Step 3: Ask for or Set the Article Title
article_title = "Indian people"  # You can change this to any topic


# Get Wikipedia content
text = wikipedia.page(article_title).content
print(text)


Indian people or Indians are the citizens and nationals of the Republic of India or people who trace their ancestry to India. While the demonym "Indian" applies to people originating from the present-day India, it was also used as the identifying term for people originating from what is now Pakistan and Bangladesh prior to the Partition of India in 1947.
In 2022, the population of India stood at 1.4 billion people, of various ethnic groups. According to United Nations forecasts, India overtook China as the world's most populous country by the end of April 2023, containing 17.50 percent of the global population. In addition to the Indian population, the Indian overseas diaspora also boasts large numbers, particularly in former British colonies due to the historical Indian indenture system, Arab states of the Persian Gulf, and the Western world.
Particularly in North America and the Caribbean, the terms "Asian Indian" and "East Indian" are sometimes used to differentiate Indians from the

In [None]:

# Generate WordCloud
wordcloud = WordCloud(
    width=1000,
    height=500,
    background_color='white',
    max_font_size=120,
    max_words=100,
).generate(text)

# Display WordCloud
plt.figure(figsize=(15, 7.5))
plt.imshow(wordcloud)
plt.title(f"WordCloud for Wikipedia Article: {article_title}", fontsize=18)
plt.show()
