In [None]:
import nltk
# Fetch every NLTK resource the cells below rely on.
# Recent NLTK releases (3.9+) look up the tokenizer model as 'punkt_tab' and
# the English tagger as 'averaged_perceptron_tagger_eng', while older releases
# use the un-suffixed names. Requesting both sets keeps word_tokenize() and
# pos_tag() working on either side of that change. With its default
# arguments nltk.download() reports a package it cannot find by printing a
# message and returning False instead of raising, so the extra names do not
# break older installs.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
):
    nltk.download(resource)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk import pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
!pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer



In [None]:
# Sample text used as the input for the tokenization and TF-IDF cells.
# The literal is runtime data: its casing, punctuation and spacing are part of
# what the later cells process, so it is kept exactly as written.
document = """
Caltech astrophysicist finds proof of alien life.
Amy you are neuroscientist and you know the latest research into  deja-vu is nothing but the frontal regions of the brain attempting to correct an inaccurate memory.
Sheldon Cooper is a theoretical physicist at caltech university and is running up for nobel prize for his theory of asymmetry and symmetry in universe.
Sheldon does not think that dr. leonard should demand to be in charge of a plasma project.
"""

In [None]:
# Split the raw text into word and punctuation tokens
# ('english' is word_tokenize's default language, spelled out here).
tokens = word_tokenize(document, language='english')

# Show the result one token per line.
for token in tokens:
    print(token)

Caltech
astrophysicist
finds
proof
of
alien
life
.
Amy
you
are
neuroscientist
and
you
know
the
latest
research
into
deja-vu
is
nothing
but
the
frontal
regions
of
the
brain
attempting
to
correct
an
inaccurate
memory
.
Sheldon
Cooper
is
a
theoretical
physicist
at
caltech
university
and
is
running
up
for
nobel
prize
for
his
theory
of
asymmetry
and
symmetry
in
universe
.
Sheldon
does
not
think
that
dr.
leonard
should
demand
to
be
in
charge
of
a
plasma
project
.


In [None]:
# Attach a part-of-speech tag to every token, giving (token, tag) pairs.
# tagset=None and lang='eng' are pos_tag's defaults, written out explicitly.
pos_tags = pos_tag(tokens, tagset=None, lang='eng')

# Show one (token, tag) tuple per line.
for tag in pos_tags:
    print(tag)

('Caltech', 'NNP')
('astrophysicist', 'NN')
('finds', 'VBZ')
('proof', 'NN')
('of', 'IN')
('alien', 'JJ')
('life', 'NN')
('.', '.')
('Amy', 'NNP')
('you', 'PRP')
('are', 'VBP')
('neuroscientist', 'JJ')
('and', 'CC')
('you', 'PRP')
('know', 'VBP')
('the', 'DT')
('latest', 'JJS')
('research', 'NN')
('into', 'IN')
('deja-vu', 'NN')
('is', 'VBZ')
('nothing', 'NN')
('but', 'CC')
('the', 'DT')
('frontal', 'JJ')
('regions', 'NNS')
('of', 'IN')
('the', 'DT')
('brain', 'NN')
('attempting', 'VBG')
('to', 'TO')
('correct', 'VB')
('an', 'DT')
('inaccurate', 'JJ')
('memory', 'NN')
('.', '.')
('Sheldon', 'NNP')
('Cooper', 'NNP')
('is', 'VBZ')
('a', 'DT')
('theoretical', 'JJ')
('physicist', 'NN')
('at', 'IN')
('caltech', 'NN')
('university', 'NN')
('and', 'CC')
('is', 'VBZ')
('running', 'VBG')
('up', 'RP')
('for', 'IN')
('nobel', 'JJ')
('prize', 'NN')
('for', 'IN')
('his', 'PRP$')
('theory', 'NN')
('of', 'IN')
('asymmetry', 'NN')
('and', 'CC')
('symmetry', 'NN')
('in', 'IN')
('universe', 'NN')
('.', 

In [None]:
# Load NLTK's English stop-word list into a set for fast membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Show every stop word. A set has no defined order, so the sequence printed
# here can differ from one kernel session to the next.
for word in stop_words:
    print(word)

hasn
both
d
own
those
did
down
it's
nor
them
which
wouldn't
ma
after
shan
o
in
what
shouldn
won't
ours
she's
themselves
couldn
more
for
shan't
these
haven
will
the
herself
with
are
yourself
its
couldn't
here
yourselves
isn't
than
a
theirs
wasn't
won
s
hers
i
an
hadn
mightn't
mightn
don't
aren
myself
other
hasn't
aren't
weren't
him
itself
haven't
when
whom
until
wouldn
now
m
where
you
is
isn
this
then
should
was
you're
doing
it
few
ain
were
had
y
such
because
each
ourselves
mustn
at
has
about
me
am
our
but
or
re
above
have
same
why
my
we
before
that'll
been
only
doesn
don
all
be
any
who
you'll
wasn
needn
hadn't
shouldn't
some
that
during
between
being
his
too
so
she
if
further
as
again
you'd
himself
very
mustn't
from
to
out
having
does
their
they
no
ll
on
her
most
do
while
your
yours
against
needn't
of
below
weren
should've
ve
there
once
didn
just
up
t
over
off
you've
he
how
didn't
under
can
through
and
into
doesn't
by
not


In [None]:
# Keep only the tokens whose lower-cased form is not a stop word.
# The original casing of the kept tokens is preserved, and punctuation tokens
# pass through because they are not in the stop-word set.
filtered_tokens = list(
    filter(lambda candidate: candidate.lower() not in stop_words, tokens)
)

# Show the surviving tokens one per line.
for token in filtered_tokens:
    print(token)

Caltech
astrophysicist
finds
proof
alien
life
.
Amy
neuroscientist
know
latest
research
deja-vu
nothing
frontal
regions
brain
attempting
correct
inaccurate
memory
.
Sheldon
Cooper
theoretical
physicist
caltech
university
running
nobel
prize
theory
asymmetry
symmetry
universe
.
Sheldon
think
dr.
leonard
demand
charge
plasma
project
.


In [None]:
# Reduce every filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

# Show the stems one per line.
for token in stemmed_tokens:
    print(token)

caltech
astrophysicist
find
proof
alien
life
.
ami
neuroscientist
know
latest
research
deja-vu
noth
frontal
region
brain
attempt
correct
inaccur
memori
.
sheldon
cooper
theoret
physicist
caltech
univers
run
nobel
prize
theori
asymmetri
symmetri
univers
.
sheldon
think
dr.
leonard
demand
charg
plasma
project
.


In [None]:
# Lemmatize every filtered token with WordNet.
# pos="n" is lemmatize()'s default (treat each word as a noun), written out
# here to make clear that no part-of-speech information is being supplied.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(candidate, pos="n") for candidate in filtered_tokens
]

# Show the lemmas one per line.
for token in lemmatized_tokens:
    print(token)

Caltech
astrophysicist
find
proof
alien
life
.
Amy
neuroscientist
know
latest
research
deja-vu
nothing
frontal
region
brain
attempting
correct
inaccurate
memory
.
Sheldon
Cooper
theoretical
physicist
caltech
university
running
nobel
prize
theory
asymmetry
symmetry
universe
.
Sheldon
think
dr.
leonard
demand
charge
plasma
project
.


In [None]:
# Build a one-document corpus from the sample text.
corpus = [document]

# Learn the vocabulary and compute the TF-IDF weights in a single pass.
tfid_vectorizer = TfidfVectorizer()
tfid_matrix = tfid_vectorizer.fit_transform(raw_documents=corpus)

# fit_transform returns a sparse matrix; convert it to a dense array so the
# individual weights are printed.
print(tfid_matrix.toarray())

[[0.09245003 0.09245003 0.09245003 0.2773501  0.09245003 0.09245003
  0.09245003 0.09245003 0.09245003 0.09245003 0.09245003 0.09245003
  0.18490007 0.09245003 0.09245003 0.09245003 0.09245003 0.09245003
  0.09245003 0.09245003 0.09245003 0.18490007 0.09245003 0.09245003
  0.18490007 0.09245003 0.09245003 0.2773501  0.09245003 0.09245003
  0.09245003 0.09245003 0.09245003 0.09245003 0.09245003 0.09245003
  0.09245003 0.36980013 0.09245003 0.09245003 0.09245003 0.09245003
  0.09245003 0.09245003 0.09245003 0.09245003 0.18490007 0.09245003
  0.09245003 0.09245003 0.2773501  0.09245003 0.09245003 0.09245003
  0.18490007 0.09245003 0.09245003 0.09245003 0.09245003 0.18490007]]
