In [None]:
# What is Spacy?
# * It is an open-source natural language processing library.
# * It is designed to effectively handle NLP tasks with the 
# most efficient implementation of common algorithms.
# * For many NLP tasks, Spacy implements only one method,
# chosen to be the most efficient algorithm currently available.

# What is NLTK?
# * NLTK (Natural Language Toolkit) is an open-source NLP library.
# * It was first released in 2001, whereas Spacy was first released in 2015.
# * It includes less efficient implementations than Spacy.

# NLTK vs Spacy
# * For many common NLP tasks, Spacy is much faster and more efficient.
# This comes at the cost of offering only a single implementation of each
# algorithm.
# * On the other hand, Spacy does not include pre-created models for some applications, such as
# sentiment analysis, which is typically easier to perform with NLTK.
# * See https://spacy.io/usage/facts-figures for more information on how NLP libraries
# such as NLTK, Spacy, and CoreNLP compare.

# What is Natural Language Processing (NLP)?

# Natural Language Processing (NLP) is an area of computer science and artificial intelligence
# concerned with the interactions between computers and human (natural) languages, in particular
# how to program computers to process and analyze large amounts of natural language data.

# Natural Language Processing attempts to use a variety of techniques in order to 
# create structure out of text data.

# Text data is highly unstructured and can be in multiple languages.

# Some example use cases of NLP:
# Classifying Emails as Spam vs Legitimate
# Sentiment Analysis of Movie Reviews' Texts
# Analyzing trends from written customer feedback forms.
# Understanding text commands like "Hey Siri, play this song".

# Spacy Basics:
# * Loading the language library
# * Building a pipeline object
# * Using tokens
# * Parts-of-Speech Tagging
# * Understanding Token Attributes


# Spacy works with a pipeline object. 
# Calling the loaded pipeline object (conventionally named nlp) on raw text performs a series of operations to
# tag, parse, and describe the text data. These operations include tokenization, parsing, named entity
# Tokenization 



# Stemming



# Lemmatization


# Stop Words

In [None]:
# conda install -c conda-forge spacy

In [None]:
# To download the core English language library, use the command below.
# It is a shell command, not Python: run it from a terminal, or prefix it
# with "!" to run it from a notebook cell. It is kept commented out (like the
# other installation cells) so that "Run All" does not hit a SyntaxError.
#
# The "en" shortcut is the legacy (spaCy v2) name and is deprecated since
# spaCy v3; use the full package name instead:
#   python -m spacy download en_core_web_sm
#
# python -m spacy download en

In [None]:
# pip install spacy

In [11]:
# For downloading the small core english language library, we can use below command.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.9/13.9 MB 3.7 MB/s eta 0:00:00
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0)
  Downloading pydantic-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl (2.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.7/2.7 MB 2.9 MB/s eta 0:00:00
Installing collected packages: pydantic, en-core-web-sm
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.2
    Uninstalling pydantic-1.10.2:
      Successfully uninstalled pydantic-1.10.2
Successfully installed en-core-web-sm-3.2.0 pydantic-1.8.2
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')

In [2]:
import spacy
# Load the small English package by name; the returned object is bound to
# `nlp` and is what later cells call on raw text.
nlp = spacy.load('en_core_web_sm')
# Printing it shows only the object's type and memory address.
print(nlp)

<spacy.lang.en.English object at 0x7f88a53a38e0>


In [27]:
# Run one raw sentence through the pipeline; the returned document object
# holds the processed text.
document = nlp(u'Tesla is looking at buying U.S. startup for $10 million dollars.')

# Show the text of the document, then its type, one per line.
print(document, type(document), sep='\n')

# Three empty lines to separate the summary above from the token table below.
print('\n' * 2)

# Legend for the part-of-speech names that appear in the table:
#   PROPN = Proper Noun      VERB  = Verb
#   SYM   = Symbol           NUM   = Number
#   PUNCT = Punctuation      NOUN  = Noun
#   ADP   = Adposition       AUX   = Auxiliary Verb

# One row per token, space-separated: the token itself, its text, the numeric
# part-of-speech id (token.pos), the readable part-of-speech name (token.pos_),
# and the syntactic dependency label (token.dep_).
for token in document:
    print(' '.join(
        str(field)
        for field in (token, token.text, token.pos, token.pos_, token.dep_)
    ))

Tesla is looking at buying U.S. startup for $10 million dollars.
<class 'spacy.tokens.doc.Doc'>



Tesla Tesla 96 PROPN nsubj
is is 87 AUX aux
looking looking 100 VERB ROOT
at at 85 ADP prep
buying buying 100 VERB pcomp
U.S. U.S. 96 PROPN dobj
startup startup 100 VERB dep
for for 85 ADP prep
$ $ 99 SYM quantmod
10 10 93 NUM compound
million million 93 NUM nummod
dollars dollars 92 NOUN pobj
. . 97 PUNCT punct


In [28]:
# The NLP pipeline for the nlp object named 'nlp': a list of
# (component name, component object) pairs. Left as the cell's last
# expression so the notebook pretty-prints it.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f853847a520>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f853847a7c0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f8535eb7f90>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f853a50dbc0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f853a516300>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f8545e75740>)]

In [29]:
# Same pipeline list as a bare `nlp.pipeline` expression, but print() emits
# the plain single-line repr instead of the notebook's pretty-printed form.
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7f853847a520>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7f853847a7c0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7f8535eb7f90>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7f853a50dbc0>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7f853a516300>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7f8545e75740>)]


In [30]:
# Get just the names of the subparts of the NLP pipeline (without the
# component objects themselves).
pipes = nlp.pipe_names
print(pipes)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [6]:
# NOTE(review): "anyore" looks like a typo for "anymore". It is left as-is so
# the recorded output still matches; the misspelled word is likely why the
# parse labels "startups" as compound and "anyore" as pobj. Fix the word and
# re-run the cell to refresh the output.
second_document = nlp(u"Tesla isn't looking into startups anyore.")

# Spacy splits "isn't" into two tokens: the verb "is" and the negation "n't"
# attached to it. Each row shows the token text, its part of speech, and its
# syntactic dependency.
for token in second_document:
    print(' '.join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN compound
anyore NOUN pobj
. PUNCT punct
