In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy corpus from which the bag-of-words vocabulary is learned.
corpus = [
    "I have a German Shepherd",
    "German Shepherd is from Germany",
    "Germans love gossiping",
]

# binary=True records presence/absence of each term (non-zero counts become 1,
# per the scikit-learn CountVectorizer documentation) rather than raw counts.
vect = CountVectorizer(binary=True)

# Kept as the cell's last expression so the fitted estimator's repr is echoed.
vect.fit(corpus)

CountVectorizer(binary=True)

In [3]:
# vocabulary_ maps each learned term to its column index in the encoded matrix.
vocab = vect.vocabulary_

# Terms are unique, so sorting the (term, column) pairs orders them
# alphabetically by term.
for term, column in sorted(vocab.items()):
    print("{}:{}".format(term, column))

from:0
german:1
germans:2
germany:3
gossiping:4
have:5
is:6
love:7
shepherd:8


In [4]:
# Encode an unseen sentence with the fitted vocabulary. Words that were not
# learned during fit (here "has") get no column, so only three entries are set.
unseen_sentence = ["Germany has German Shepherd"]
print(vect.transform(unseen_sentence).toarray())

[[0 1 0 1 0 0 0 0 1]]


In [5]:
# Cosine similarity between two unseen sentences in the binary bag-of-words
# space. cosine_similarity accepts scipy sparse matrices directly, so the
# transformed vectors are passed as-is rather than densified with .toarray();
# the result is still a dense (1, 1) array.
# Only "germany" from the second sentence is in the fitted vocabulary, while
# the first sentence hits three terms, so the score is 1 / sqrt(3) ~= 0.577.
similarity = cosine_similarity(
    vect.transform(["Germany has German Shepherd"]),
    vect.transform(["Germany has Berlin as capital"]),
)
print(similarity)

[[0.57735027]]


In [6]:
# TF-IDF vectorizer used for the essay/news comparison below. Per the
# scikit-learn docs, binary=True only makes the term-frequency part 0/1; the
# resulting features are still idf-weighted floats, not binary indicators.
vectorizer = TfidfVectorizer(binary=True)

In [7]:
# Echo the estimator's repr to confirm the parameters it was constructed with.
vectorizer

TfidfVectorizer(binary=True)

In [14]:
# Ten essay excerpts (e1..e10) that form the corpus the TF-IDF vectorizer is
# fitted on. Each is a single string built from adjacent string literals; the
# text is data and must be kept verbatim.

# e1: annual summer trips to Kerala.
e1 = (
    "Every summer, the pastoral call of their motherland, Kerala, beckoned my parents."
    " Their children would plead that they change direction at least for one year but"
    " these entreaties were dismissed as flakiness. Summer months were set in stone."
    " And contrary to the breezy, game-for-anything attitude that the sun induced in"
    " many folks, for my mother and father, it was a season for constancy."
)
# e2: Tolkien's Ents as stoic figures.
e2 = (
    "J R.R. Tolkien was right to portray the Ents, hulking pieces of tall barks with"
    " sad eyes and mossy facial hair who guarded forests, as perhaps the most stoic"
    " figures in his Lord of the Rings trilogy. Their whole manner spoke to an"
    " endurance that seemed constantly on the verge of dissipation. They had seen and"
    " tolerated far too much in a world at war with itself. Yet they remained calm,"
    " like a steady lighthouse in torrential downpour."
)
# e3: "once-in-a-lifetime" as an overused travel expression.
e3 = (
    "Once-in-a-lifetime is an overused expression in travel. Like everything else, the"
    " phrase has succumbed to a culture of instant hype. When every mountain hike,"
    " underwater dive or local meal is described with the glowing praise that should be"
    " meant for only the rarest of occurrences, it is difficult to tell a wonderful"
    " experience from a bucket list one. A bucket list just doesn’t translate with the"
    " same impact it was originally intended for."
)
# e4: fads and trends in food.
e4 = (
    "New and exciting has always received top billing in food—the latest fad, the"
    " newest restaurant, the trendiest neighbourhood, the healthiest diet. Often, these"
    " developments are accompanied by breathless pundit-like pronouncements: Lebanese"
    " is the new Chinese; Chinese is new Italian (wait, where does leave that Italian"
    " then?) A cuisine or dish has a moment, peaks and then becomes passé."
)
# e5: pop culture and its fans.
e5 = (
    "In this modern agnostic world, pop culture is the closest thing we have to a"
    " shared religion. Matinee idols, artists and rock stars are our gods and"
    " goddesses, feeding us an endless supply of enchanting lore and myths. For this"
    " movement to thrive though, it needs fans; followers who have gone beyond aloof"
    " observation. Pop culture requires that fans click below to subscribe, not let it"
    " just play in the background."
)
# e6: the allure of remote places (Mount Everest).
e6 = (
    "Part of the allure of a remote place is that the closer we get to it, the more we"
    " risk everything that made it precious. For years, Mount Everest inspired a sense"
    " of forbidden majesty, which was surmounted only by a few courageous adventurers."
    " It helped that they were entertaining and we were content to immerse ourselves in"
    " the stories they wove from these monumental expeditions."
)
# e7: love/hate relationships with great cities.
e7 = (
    "The world’s greatest cities are brutal, unsentimental places, precisely the reason"
    " why so many of us fall so irrevocably under their spell. In its worst hour, this"
    " bond can curdle into bitter complaints of unrequited affection and everyday"
    " torment. “The subway doesn’t work, trash is overflowing and it’s too crowded;"
    " this is over.” Let me assure you that right now someone somewhere is uttering"
    " these words about your dream metropolis, New York, Rome, Rio De Janeiro. Like an"
    " unrepentant cad, the city laughs in their face, “Go on… live without me.”"
    " Wresting long-term connections comes with the occasional pang of nostalgic"
    " regret. Those who can’t escape their love of cities are destined to keep"
    " replaying that first flush of romance, that moment when a city went from a"
    " destination to home."
)
# e8: boarding the Eastern & Oriental Express at Hua Hin.
e8 = (
    "As soon as a long, gleaming dark green train carriage, emblazoned with ‘Eastern &"
    " Oriental Express’ in engraved gold lettering slides into Hua Hin station, I feel"
    " a tinge of self-consciousness. Neither I nor my co-passengers from India are"
    " dressed for something this imperial. For the last hour, we have been milling"
    " about Hua Hin, Thailand’s oldest railway station—a quirky but fading royal"
    " artefact—in our baggy tees, jeans and dusty shoes, lugging backpacks and"
    " satchels. Once I hop aboard the train though, the air is less intimidating. While"
    " this is a majestic luxury locomotive boasting every accoutrement of refined"
    " sophistication, I see open, friendly faces in casual wear and summer hats."
)
# e9: a snorkelling trip off Koh Phangan.
e9 = (
    "After a tiring snorkelling session in the clear waters of Koh Phangan, instructors"
    " Captain Pumpui and Captain Poo are shepherding me and five others on a private"
    " speedboat to nearby Bottle Beach for a picnic lunch of sandwiches, macarons and"
    " fruit juices. I am about 10-12 kilometres from Belmond Napasai’s lush tropical"
    " resort in Koh Samui, my home for the last day-and-a-half, and the sun is"
    " blindingly bright overhead. This is ideal snorkelling weather; we have had a"
    " field day gasping at eels and corals underwater. But the heat has stymied chatter"
    " on our boat."
)
# e10: visiting Wat Rong Khun.
e10 = (
    "Leaning out from a bridge leading into Wat Rong Khun, I squint at a stucco moat of"
    " outstretched hands and grisly skeletons. Two massive horns arch over the walkway,"
    " while a few steps ahead, giant statues of Death and Rahu guard the entrance, like"
    " burly bouncers poised to restrict my entry into heaven. “The bridge of rebirth,”"
    " I overhear a foreign couple talking. A swampland of desire—enslaved arms—lie in"
    " wait blocking a mortal’s road to nirvana. This was rebirth as an infernal"
    " spectre, not Elton John’s PG-13 “Circle of Life.”"
)

In [15]:
# Collect the ten essay excerpts, in order, as the TF-IDF training corpus.
edit_corpus = [
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    e9,
    e10,
]

# Learn vocabulary and idf weights; kept as the last expression so the fitted
# estimator's repr is echoed.
vectorizer.fit(edit_corpus)

TfidfVectorizer(binary=True)

In [11]:
# List every learned term with its column index, alphabetically by term.
# NOTE(review): the saved output of this cell carries execution count 11, which
# is earlier than the fit on all ten essays (count 15), and it lists only terms
# that occur in e1 and e2. Re-run after fitting to refresh it.
vocabulary = vectorizer.vocabulary_

# Terms are unique, so sorting the (term, column) pairs orders them by term.
for term, column in sorted(vocabulary.items()):
    print("{}:{}".format(term, column))

an:0
and:1
anything:2
as:3
at:4
attitude:5
barks:6
beckoned:7
breezy:8
but:9
call:10
calm:11
change:12
children:13
constancy:14
constantly:15
contrary:16
direction:17
dismissed:18
dissipation:19
downpour:20
endurance:21
entreaties:22
ents:23
every:24
eyes:25
facial:26
far:27
father:28
figures:29
flakiness:30
folks:31
for:32
forests:33
game:34
guarded:35
had:36
hair:37
his:38
hulking:39
in:40
induced:41
it:42
itself:43
kerala:44
least:45
lighthouse:46
like:47
lord:48
manner:49
many:50
months:51
mossy:52
most:53
mother:54
motherland:55
much:56
my:57
of:58
on:59
one:60
parents:61
pastoral:62
perhaps:63
pieces:64
plead:65
portray:66
remained:67
right:68
rings:69
sad:70
season:71
seemed:72
seen:73
set:74
spoke:75
steady:76
stoic:77
stone:78
summer:79
sun:80
tall:81
that:82
the:83
their:84
these:85
they:86
to:87
tolerated:88
tolkien:89
too:90
torrential:91
trilogy:92
verge:93
war:94
was:95
were:96
who:97
whole:98
with:99
world:100
would:101
year:102
yet:103


In [16]:
# Two news items used as out-of-corpus query documents. Each is a single string
# built from adjacent string literals; the text is data and is kept verbatim.

# new1: the EU's proposed online Schengen visa application system.
new1 = (
    "In a bid to ease paperwork for millions of tourists, the EU has proposed to make"
    " the Schengen visa process an online application system in the next few years. The"
    " Schengen visa allows travellers to visit and travel across 26 EU member nations"
    " for up to 90 days. Right now, tourists submit their visa application to a"
    " country's local consulate and collect their passports in person once the visa is"
    " issued—a time-consuming process which became complicated during the last two"
    " years of the pandemic. The new proposal envisions a single digital visa"
    " application platform for all EU countries. Instead of a physical sticker, the"
    " visa is expected to be a cryptographically signed 2D bar code."
)
# new2: direct Pune-Singapore flights.
# NOTE(review): new2 is not referenced by any cell visible here.
new2 = (
    "Travellers can now fly directly between Pune and Singapore starting December 2,"
    " courtesy Vistara which will kick off three to four flights every week."
    " International connections from Pune are rare (right now, Dubai is the sole one);"
    " visitors normally rely on Mumbai airport for the same. However, authorities are"
    " now planning to connect the city to two more Southeast Asian countries once the"
    " Singapore route is up and running."
)

In [17]:
# Compare each essay with the news item new1 in TF-IDF space and print one
# (1, 1) cosine-similarity array per essay, in corpus order.

# The query vector does not depend on the essay, so encode it once up front
# instead of re-transforming new1 on every iteration.
new1_tfidf = vectorizer.transform([new1])

# Iterate the corpus itself rather than a hard-coded range(10), so the loop
# stays correct if essays are added to or removed from edit_corpus.
for essay in edit_corpus:
    # cosine_similarity accepts sparse matrices directly; no .toarray() needed.
    print(cosine_similarity(vectorizer.transform([essay]), new1_tfidf))

[[0.07118881]]
[[0.09253847]]
[[0.236114]]
[[0.08621068]]
[[0.0683277]]
[[0.18032464]]
[[0.12731542]]
[[0.07996824]]
[[0.09001059]]
[[0.09893809]]
