In [None]:
!pip install spacy

In [None]:
# Run this once in your terminal to download the English model:
#   python -m spacy download en_core_web_sm

In [1]:
import spacy
# Load the English pipeline by name; every later cell calls this `nlp` object to process text.
# The model must already be installed (see the download command in the previous cell).
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run one example sentence through the spaCy pipeline.
doc = nlp("Tesla is looking at buying U.S. startup for $6 million")

# spaCy splits the sentence into tokens intelligently: in the output below "$" and "6"
# come out as separate tokens even though they are written together, while "U.S."
# stays a single token.
#
# For every token we show, one value per line:
#   - text     : the token's text
#   - shape_   : its orthographic shape (e.g. "Xxxxx" for "Tesla", "d" for "6")
#   - is_alpha : whether it consists of alphabetic characters
#   - is_stop  : whether it is a stop word
# Per the original author's note, `token.shape` (no trailing underscore) gives an ID
# rather than the readable string, so the underscore variant is used here.
for tok in doc:
    print(tok.text, tok.shape_, tok.is_alpha, tok.is_stop, "---------", sep="\n")

Tesla
Xxxxx
True
False
---------
is
xx
True
True
---------
looking
xxxx
True
False
---------
at
xx
True
True
---------
buying
xxxx
True
False
---------
U.S.
X.X.
False
False
---------
startup
xxxx
True
False
---------
for
xxx
True
True
---------
$
$
False
False
---------
6
d
False
False
---------
million
xxxx
True
False
---------


In [3]:
# A Doc can be indexed like a sequence; show its first eight tokens as a tuple.
tuple(doc[i] for i in range(8))

(Tesla, is, looking, at, buying, U.S., startup, for)

In [4]:
# A longer, multi-line passage. The text is kept exactly as-is (including the stray
# "10" line and the line breaks) because the token positions used below depend on it.
quote_passage = '''Although commmonly attributed to John Lennon from his song "Beautiful Boy", 
10 
the phrase "Life is what happens to us while we are making other plans" was written by 
cartoonist Allen Saunders and 
published in Reader\'s Digest in 1957, when Lennon was 17. 
'''
doc2 = nlp(quote_passage)

# Slicing a Doc yields a span of tokens; tokens 20..31 (end index exclusive)
# are the words of the quoted sentence, without the surrounding quote marks.
life_quote = doc2[20:32]
print(life_quote)

Life is what happens to us while we are making other plans


In [5]:
# Show the first seventeen tokens of the passage: punctuation marks and the
# line breaks inside the text come out as tokens of their own.
tuple(doc2[i] for i in range(17))

(Although,
 commmonly,
 attributed,
 to,
 John,
 Lennon,
 from,
 his,
 song,
 ",
 Beautiful,
 Boy,
 ",
 ,,
 ,
 10,
 )

In [6]:
# Index 0 returns the first token of the passage.
doc2[0]

Although

In [7]:
# A short sentence with a contraction, an abbreviation and trailing punctuation;
# it is used below to compare spaCy tokenization with plain whitespace splitting.
mystring = "We're moving to L.A.!"
mystring

"We're moving to L.A.!"

In [8]:
# Tokenize the sentence and print every token followed by "|" on a single line.
# In the output the contraction is split into "We" + "'re", "L.A." stays whole,
# and the final "!" becomes its own token.
doc3 = nlp(mystring)
print("".join(f"{tok.text}|" for tok in doc3), end="")

We|'re|moving|to|L.A.|!|

In [9]:
# For comparison: plain str.split() only breaks on whitespace, so the contraction
# "We're" stays in one piece and the "!" remains attached to "L.A." (see the output).
mystring.split()

["We're", 'moving', 'to', 'L.A.!']

In [10]:
# A sentence containing a hyphenated word, an e-mail address and a URL.
# Note: this reassigns `mystring`, replacing the shorter sentence used earlier.
mystring = "We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!"

# Print one token per line. In the output the e-mail address and the URL are each
# kept as a single token, while "snail-mail" is split around the hyphen.
doc4 = nlp(mystring)
print(*doc4, sep="\n")

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [11]:
# Doc objects are read-only containers: tokens can be read by index but not reassigned.
doc7 = nlp("My dinner was horrible.")
doc8 = nlp("Your dinner was delicious.")

try:
    # Attempt to overwrite the 4th token of doc7 with the 4th token of doc8.
    doc7[3] = doc8[3]
except TypeError as err:
    # The failure is the point of this demonstration. Catch it and show the message
    # so that "Restart Kernel -> Run All" does not stop at this cell.
    print(f"{type(err).__name__}: {err}")

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [12]:
import nltk
# Download the 'punkt' tokenizer data; the log below shows it goes to the user's
# nltk_data directory and is skipped when already up to date. The call's return value (True) is echoed.
# NOTE(review): newer NLTK releases may look for a 'punkt_tab' resource instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Word-level tokenizer from the NLTK library.
from nltk.tokenize import word_tokenize

# Sample text with a title abbreviation ("Mr."), a hyphenated word and a contraction.
EXAMPLE_TEXT = """ 
Hello Mr. Smith, how are you doing today? The weather is great, 
and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard. 
""" 
# Tokenize the whole text at once. In the output "Mr." stays one token, "pinkish-blue"
# is kept together, and "shouldn't" is split into "should" + "n't".
word_tokenize(EXAMPLE_TEXT)

['Hello',
 'Mr.',
 'Smith',
 ',',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'The',
 'weather',
 'is',
 'great',
 ',',
 'and',
 'Python',
 'is',
 'awesome',
 '.',
 'The',
 'sky',
 'is',
 'pinkish-blue',
 '.',
 'You',
 'should',
 "n't",
 'eat',
 'cardboard',
 '.']

In [14]:
# Compare, line by line, naive whitespace splitting with NLTK's word_tokenize.
# At most `line_limit` lines are processed and at most `token_limit` tokens are shown per result.
line_limit = 20
token_limit = 10

for raw_line in EXAMPLE_TEXT.split("\n")[:line_limit]:
    # Whitespace split first: punctuation stays glued to the neighbouring word.
    print(raw_line.split()[:token_limit], "----------------------------------", sep="\n")
    # Then NLTK: punctuation and contractions become separate tokens.
    print(word_tokenize(raw_line)[:token_limit], "==================================", sep="\n")

[]
----------------------------------
[]
['Hello', 'Mr.', 'Smith,', 'how', 'are', 'you', 'doing', 'today?', 'The', 'weather']
----------------------------------
['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?']
['and', 'Python', 'is', 'awesome.', 'The', 'sky', 'is', 'pinkish-blue.', 'You', "shouldn't"]
----------------------------------
['and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.']
[]
----------------------------------
[]
