In [163]:
import nltk

In [164]:
text = "India is a large democracy. It is 7th largest country in the world. Also population wise it is the 2nd largest among all the countries in the world just after China."

In [165]:
sentences = nltk.tokenize.sent_tokenize(text)

sentences

['India is a large democracy.',
 'It is 7th largest country in the world.',
 'Also population wise it is the 2nd largest among all the countries in the world just after China.']

In [166]:
words = nltk.tokenize.word_tokenize(text)

words

['India',
 'is',
 'a',
 'large',
 'democracy',
 '.',
 'It',
 'is',
 '7th',
 'largest',
 'country',
 'in',
 'the',
 'world',
 '.',
 'Also',
 'population',
 'wise',
 'it',
 'is',
 'the',
 '2nd',
 'largest',
 'among',
 'all',
 'the',
 'countries',
 'in',
 'the',
 'world',
 'just',
 'after',
 'China',
 '.']

## Text Preprocessing

In [167]:
import string

In [168]:
lam = nltk.stem.WordNetLemmatizer()

def lemitization(text):
    return [lam.lemmatize(txt) for txt in text.split()]

print("Punctuation words :",string.punctuation)

remove_punc = dict((ord(char),None) for char in string.punctuation)

def normalization(text):
    return lemitization(text.lower().translate(remove_punc))

Punctuation words : !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [169]:
for sent in sentences:
    print(sent)
    print(lemitization(sent))

India is a large democracy.
['India', 'is', 'a', 'large', 'democracy.']
It is 7th largest country in the world.
['It', 'is', '7th', 'largest', 'country', 'in', 'the', 'world.']
Also population wise it is the 2nd largest among all the countries in the world just after China.
['Also', 'population', 'wise', 'it', 'is', 'the', '2nd', 'largest', 'among', 'all', 'the', 'country', 'in', 'the', 'world', 'just', 'after', 'China.']


In [170]:
test = "How's my car?"

test.lower().translate(remove_punc)

'hows my car'

In [171]:
for sent in sentences:
    print(sent)
    print(lemitization(sent))

India is a large democracy.
['India', 'is', 'a', 'large', 'democracy.']
It is 7th largest country in the world.
['It', 'is', '7th', 'largest', 'country', 'in', 'the', 'world.']
Also population wise it is the 2nd largest among all the countries in the world just after China.
['Also', 'population', 'wise', 'it', 'is', 'the', '2nd', 'largest', 'among', 'all', 'the', 'country', 'in', 'the', 'world', 'just', 'after', 'China.']


## TF-IDF Vectorization

In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [173]:
tf_idf = TfidfVectorizer(tokenizer=lemitization, stop_words='english')

t = tf_idf.fit_transform(sentences)

print(tf_idf.vocabulary_)

{'india': 5, 'large': 7, 'democracy.': 4, '7th': 1, 'largest': 8, 'country': 3, 'world.': 12, 'population': 9, 'wise': 10, '2nd': 0, 'world': 11, 'just': 6, 'china.': 2}


In [174]:
print(tf_idf.idf_)

[1.69314718 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718]


## Cosine Similarity

In [175]:
test = "India has a huge population in the world"

In [176]:
test_tf = tf_idf.transform([test])

print(test_tf)

  (0, 11)	0.5773502691896257
  (0, 9)	0.5773502691896257
  (0, 5)	0.5773502691896257


In [177]:
similarity = cosine_similarity(test_tf,t)
similarity

array([[0.33333333, 0.        , 0.43162835]])

getting the index of highest similar value

In [178]:
idx = similarity.argsort()[0][-1]

print(idx)

2


In [179]:
flat = similarity.flatten()
flat.sort()
ans = flat[-1]

if ans == 0.0:
    print("Could not find similar line")
else:
    print("Similar line found")

Similar line found


getting the answer line

In [180]:
print("Answer =",sentences[idx])

Answer = Also population wise it is the 2nd largest among all the countries in the world just after China.
