In [19]:
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Subhadeep_Sarkar\AppData\Roaming\nltk_data...


True

## Input (Text)

In [1]:
text = "India, located in South Asia, is the world's seventh-largest country by land area and the second-most populous country, home to over 1.3 billion people. It boasts a rich cultural heritage, characterized by its diverse languages, religions, and traditions. India's economy is one of the fastest-growing in the world, driven by industries such as information technology, agriculture, and manufacturing. The country is known for its iconic landmarks such as the Taj Mahal, vibrant festivals like Diwali and Holi, and delicious cuisine featuring dishes like biryani and curry. India's political landscape is complex, with a parliamentary democratic system and a history shaped by various dynasties, colonial rule, and independence movements."

## Words and sentences tokenization (separation)

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
sentences = sent_tokenize(text)

for idx, sentence in enumerate(sentences):
    print(idx+1,".",sentence)

1 . India, located in South Asia, is the world's seventh-largest country by land area and the second-most populous country, home to over 1.3 billion people.
2 . It boasts a rich cultural heritage, characterized by its diverse languages, religions, and traditions.
3 . India's economy is one of the fastest-growing in the world, driven by industries such as information technology, agriculture, and manufacturing.
4 . The country is known for its iconic landmarks such as the Taj Mahal, vibrant festivals like Diwali and Holi, and delicious cuisine featuring dishes like biryani and curry.
5 . India's political landscape is complex, with a parliamentary democratic system and a history shaped by various dynasties, colonial rule, and independence movements.


In [4]:
words = word_tokenize(text)

for word in words:
    print(word)

print("Total words",len(words))

India
,
located
in
South
Asia
,
is
the
world
's
seventh-largest
country
by
land
area
and
the
second-most
populous
country
,
home
to
over
1.3
billion
people
.
It
boasts
a
rich
cultural
heritage
,
characterized
by
its
diverse
languages
,
religions
,
and
traditions
.
India
's
economy
is
one
of
the
fastest-growing
in
the
world
,
driven
by
industries
such
as
information
technology
,
agriculture
,
and
manufacturing
.
The
country
is
known
for
its
iconic
landmarks
such
as
the
Taj
Mahal
,
vibrant
festivals
like
Diwali
and
Holi
,
and
delicious
cuisine
featuring
dishes
like
biryani
and
curry
.
India
's
political
landscape
is
complex
,
with
a
parliamentary
democratic
system
and
a
history
shaped
by
various
dynasties
,
colonial
rule
,
and
independence
movements
.
Total words 130


## Stopwords

Stopwords are those irrelevant words that can be removed from a sentence without changing its meaning

In [5]:
from nltk.corpus import stopwords

In [8]:
new_words = []

for word in words:
    if word.lower() not in stopwords.words('english') and len(word) > 1:
        new_words.append(word.lower())

print(new_words)
print("Total",len(new_words),"words")

['india', 'located', 'south', 'asia', 'world', "'s", 'seventh-largest', 'country', 'land', 'area', 'second-most', 'populous', 'country', 'home', '1.3', 'billion', 'people', 'boasts', 'rich', 'cultural', 'heritage', 'characterized', 'diverse', 'languages', 'religions', 'traditions', 'india', "'s", 'economy', 'one', 'fastest-growing', 'world', 'driven', 'industries', 'information', 'technology', 'agriculture', 'manufacturing', 'country', 'known', 'iconic', 'landmarks', 'taj', 'mahal', 'vibrant', 'festivals', 'like', 'diwali', 'holi', 'delicious', 'cuisine', 'featuring', 'dishes', 'like', 'biryani', 'curry', 'india', "'s", 'political', 'landscape', 'complex', 'parliamentary', 'democratic', 'system', 'history', 'shaped', 'various', 'dynasties', 'colonial', 'rule', 'independence', 'movements']
Total 72 words


## Remove duplicate words

In [10]:
words = list(set(new_words))

print(len(words))
words

64


['political',
 'system',
 'delicious',
 'people',
 'cultural',
 'information',
 'rule',
 'featuring',
 'land',
 'complex',
 'parliamentary',
 'known',
 'biryani',
 'diverse',
 'shaped',
 'area',
 'holi',
 '1.3',
 'india',
 "'s",
 'cuisine',
 'second-most',
 'curry',
 'movements',
 'boasts',
 'dynasties',
 'heritage',
 'country',
 'mahal',
 'dishes',
 'colonial',
 'landmarks',
 'religions',
 'billion',
 'languages',
 'traditions',
 'vibrant',
 'independence',
 'one',
 'diwali',
 'populous',
 'various',
 'south',
 'driven',
 'technology',
 'home',
 'industries',
 'manufacturing',
 'characterized',
 'seventh-largest',
 'landscape',
 'festivals',
 'world',
 'like',
 'democratic',
 'asia',
 'rich',
 'located',
 'economy',
 'fastest-growing',
 'agriculture',
 'iconic',
 'history',
 'taj']

## Lamination and Stemmer

Both are used to get the root words but Lamination is more accurate but it takes huge time than stemmer

In [11]:
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [12]:
lamination = WordNetLemmatizer()
stemmer = PorterStemmer()

In [22]:
laminated_words = [lamination.lemmatize(word) for word in words]
print(laminated_words)
print(len(laminated_words))

['political', 'system', 'delicious', 'people', 'cultural', 'information', 'rule', 'featuring', 'land', 'complex', 'parliamentary', 'known', 'biryani', 'diverse', 'shaped', 'area', 'holi', '1.3', 'india', "'s", 'cuisine', 'second-most', 'curry', 'movement', 'boast', 'dynasty', 'heritage', 'country', 'mahal', 'dish', 'colonial', 'landmark', 'religion', 'billion', 'language', 'tradition', 'vibrant', 'independence', 'one', 'diwali', 'populous', 'various', 'south', 'driven', 'technology', 'home', 'industry', 'manufacturing', 'characterized', 'seventh-largest', 'landscape', 'festival', 'world', 'like', 'democratic', 'asia', 'rich', 'located', 'economy', 'fastest-growing', 'agriculture', 'iconic', 'history', 'taj']
64


In [18]:
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)
print(len(stemmed_words))

['polit', 'system', 'delici', 'peopl', 'cultur', 'inform', 'rule', 'featur', 'land', 'complex', 'parliamentari', 'known', 'biryani', 'divers', 'shape', 'area', 'holi', '1.3', 'india', "'s", 'cuisin', 'second-most', 'curri', 'movement', 'boast', 'dynasti', 'heritag', 'countri', 'mahal', 'dish', 'coloni', 'landmark', 'religion', 'billion', 'languag', 'tradit', 'vibrant', 'independ', 'one', 'diwali', 'popul', 'variou', 'south', 'driven', 'technolog', 'home', 'industri', 'manufactur', 'character', 'seventh-largest', 'landscap', 'festiv', 'world', 'like', 'democrat', 'asia', 'rich', 'locat', 'economi', 'fastest-grow', 'agricultur', 'icon', 'histori', 'taj']
64


## Vectorization

Assigning unique numbers to each unique words

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [26]:
token = Tokenizer()

token.fit_on_texts(laminated_words)

token.word_index

{'political': 1,
 'system': 2,
 'delicious': 3,
 'people': 4,
 'cultural': 5,
 'information': 6,
 'rule': 7,
 'featuring': 8,
 'land': 9,
 'complex': 10,
 'parliamentary': 11,
 'known': 12,
 'biryani': 13,
 'diverse': 14,
 'shaped': 15,
 'area': 16,
 'holi': 17,
 '1': 18,
 '3': 19,
 'india': 20,
 "'s": 21,
 'cuisine': 22,
 'second': 23,
 'most': 24,
 'curry': 25,
 'movement': 26,
 'boast': 27,
 'dynasty': 28,
 'heritage': 29,
 'country': 30,
 'mahal': 31,
 'dish': 32,
 'colonial': 33,
 'landmark': 34,
 'religion': 35,
 'billion': 36,
 'language': 37,
 'tradition': 38,
 'vibrant': 39,
 'independence': 40,
 'one': 41,
 'diwali': 42,
 'populous': 43,
 'various': 44,
 'south': 45,
 'driven': 46,
 'technology': 47,
 'home': 48,
 'industry': 49,
 'manufacturing': 50,
 'characterized': 51,
 'seventh': 52,
 'largest': 53,
 'landscape': 54,
 'festival': 55,
 'world': 56,
 'like': 57,
 'democratic': 58,
 'asia': 59,
 'rich': 60,
 'located': 61,
 'economy': 62,
 'fastest': 63,
 'growing': 64,
 'a

In [28]:
# How all 5 lines will be vectored
token.texts_to_sequences(sentences)

[[20, 61, 45, 59, 52, 53, 30, 9, 16, 23, 24, 43, 30, 48, 18, 19, 36, 4],
 [60, 5, 29, 51, 14],
 [62, 41, 63, 64, 56, 46, 6, 47, 65, 50],
 [30, 12, 66, 68, 31, 39, 57, 42, 17, 3, 22, 8, 57, 13, 25],
 [1, 54, 10, 11, 58, 2, 67, 15, 44, 33, 7, 40]]