In [1]:
import nltk

# Importing Doc

In [2]:
doc1 = """After a tiring snorkelling session in the clear waters of Koh Phangan, instructors Captain Pumpui and Captain Poo are shepherding me and five others on a private speedboat to nearby Bottle Beach for a picnic lunch of sandwiches, macarons and fruit juices. I am about 10-12 kilometres from Belmond Napasai’s lush tropical resort in Koh Samui, my home for the last day-and-a-half, and the sun is blindingly bright overhead. This is ideal snorkelling weather; we have had a field day gasping at eels and corals underwater. But the heat has stymied chatter on our boat.

However, the aquamen on board won’t have any lull. Blasting dance numbers from the speakers, Pumpui swivels his hips throughout the ride, Shakira-style, exhorting us to join in. We muster a few, muted rhythmic claps. Sometimes he pulls faces through a snorkel mask and, in another instant, casually hangs from outside the boat window like an impertinent daredevil. When Pumpui tires, he steers the boat and Poo takes over as cheerleader. I crack a bemused smile at their relentlessly sunny Laurel & Hardy routine. Perhaps every resort should have a slapstick comedy pairing on hand to liven up spirits."""
doc2 = """The world’s greatest cities are brutal, unsentimental places, precisely the reason why so many of us fall so irrevocably under their spell. In its worst hour, this bond can curdle into bitter complaints of unrequited affection and everyday torment. “The subway doesn’t work, trash is overflowing and it’s too crowded; this is over.” Let me assure you that right now someone somewhere is uttering these words about your dream metropolis, New York, Rome, Rio De Janeiro. Like an unrepentant cad, the city laughs in their face, “Go on… live without me.” Wresting long-term connections comes with the occasional pang of nostalgic regret. Those who can’t escape their love of cities are destined to keep replaying that first flush of romance, that moment when a city went from a destination to home."""
doc3 = """New and exciting has always received top billing in food—the latest fad, the newest restaurant, the trendiest neighbourhood, the healthiest diet. Often, these developments are accompanied by breathless pundit-like pronouncements: Lebanese is the new Chinese; Chinese is new Italian (wait, where does leave that Italian then?) A cuisine or dish has a moment, peaks and then becomes passé.

While there is no shortage of cover versions, originals stand the test of time. And at National Geographic Traveller India, we confess to being partial to one classic. Creating and editing magazines in this day and age, with limited or, sometimes, binding resources, is a mental endurance test. Nothing makes the looming pressures more palatable than food. It is always on our mind. What gets us through the final sweaty hours, month after month, is an absolute Bombay fixture—the vada pav. We are a group of ardent foodies, and the deadline-hour vada pav toast is our modest tradition. Vada pavs are to Mumbai what hot dogs are to New York. Naysayers can knock the street staple all they want but we, at the magazine, are quite misty-eyed about our Bombaiya tastes."""


# Removing stopwords

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
stop_words = list(set(stopwords.words('english')))
stop_words.extend(['(',')',':','?','.','``',',','\'\''])

doc1_tokens = word_tokenize(doc1)
doc2_tokens = word_tokenize(doc2)
doc3_tokens = word_tokenize(doc3)
clean_doc = []
clean_doc.append([w for w in doc1_tokens if not w in stop_words])
clean_doc.append([w for w in doc2_tokens if not w in stop_words])
clean_doc.append([w for w in doc3_tokens if not w in stop_words])

In [6]:
# Lemmatization step
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [7]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [8]:
docs_lemmatized = [[lemmatizer.lemmatize(j) for j in i]for i in clean_doc]
docs_lemmatized

[['After',
  'tiring',
  'snorkelling',
  'session',
  'clear',
  'water',
  'Koh',
  'Phangan',
  'instructor',
  'Captain',
  'Pumpui',
  'Captain',
  'Poo',
  'shepherding',
  'five',
  'others',
  'private',
  'speedboat',
  'nearby',
  'Bottle',
  'Beach',
  'picnic',
  'lunch',
  'sandwich',
  'macarons',
  'fruit',
  'juice',
  'I',
  '10-12',
  'kilometre',
  'Belmond',
  'Napasai',
  '’',
  'lush',
  'tropical',
  'resort',
  'Koh',
  'Samui',
  'home',
  'last',
  'day-and-a-half',
  'sun',
  'blindingly',
  'bright',
  'overhead',
  'This',
  'ideal',
  'snorkelling',
  'weather',
  ';',
  'field',
  'day',
  'gasping',
  'eel',
  'coral',
  'underwater',
  'But',
  'heat',
  'stymied',
  'chatter',
  'boat',
  'However',
  'aquamen',
  'board',
  '’',
  'lull',
  'Blasting',
  'dance',
  'number',
  'speaker',
  'Pumpui',
  'swivel',
  'hip',
  'throughout',
  'ride',
  'Shakira-style',
  'exhorting',
  'u',
  'join',
  'We',
  'muster',
  'muted',
  'rhythmic',
  'clap',
 

In [9]:
#TF-IDF matrix formation
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [' '.join(i) for i in docs_lemmatized]
tfidf = TfidfVectorizer(use_idf = True).fit_transform(corpus)
tfidf_matrix = tfidf.toarray()
tfidf_matrix

array([[0.08375479, 0.08375479, 0.        , 0.        , 0.        ,
        0.08375479, 0.        , 0.        , 0.06369768, 0.08375479,
        0.08375479, 0.        , 0.        , 0.08375479, 0.        ,
        0.08375479, 0.08375479, 0.        , 0.        , 0.        ,
        0.08375479, 0.08375479, 0.08375479, 0.25126437, 0.        ,
        0.        , 0.        , 0.08375479, 0.        , 0.08375479,
        0.        , 0.08375479, 0.        , 0.16750958, 0.08375479,
        0.08375479, 0.08375479, 0.        , 0.        , 0.08375479,
        0.        , 0.08375479, 0.        , 0.08375479, 0.        ,
        0.        , 0.        , 0.08375479, 0.        , 0.08375479,
        0.        , 0.        , 0.        , 0.        , 0.08375479,
        0.08375479, 0.12739535, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.08375479, 0.        , 0.        ,
        0.08375479, 0.        , 0.        , 0.08

In [10]:
# Cosine similarity 
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(tfidf_matrix)
similarity

array([[1.        , 0.01328267, 0.02272228],
       [0.01328267, 1.        , 0.06427621],
       [0.02272228, 0.06427621, 1.        ]])

In [11]:
import numpy as np
avg = (np.sum(similarity)-4)/12