# Natural Language Processing with spaCy

#### A modern, high-performance NLP library with an advanced machine learning-based pipeline.

In [1]:
# Setup: import spaCy and load the English pipeline that every later cell reuses.
import spacy

# The model package must already be installed in the kernel's environment,
# e.g. with `python -m spacy download en_core_web_sm`.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

### Basic Usage

In [2]:
# Run the sample sentence through the pipeline.
doc = nlp("spaCy is an amazing NLP library for Python!")

# Print one line per token showing three of its attributes:
#   token.text - the token's text
#   token.pos_ - part-of-speech tag
#   token.dep_ - dependency relation
for token in doc:
    word, pos, dep = token.text, token.pos_, token.dep_
    print(word, pos, dep)

spaCy NUM nsubj
is AUX ROOT
an DET det
amazing ADJ amod
NLP PROPN compound
library NOUN attr
for ADP prep
Python PROPN pobj
! PUNCT punct


## 1. Lemmatization

In [3]:
# Lemmatization maps each word to its base (dictionary) form.

doc = nlp("The cars are being driven carefully while the dogs are barking.")

# Build the "word = lemma" lines first, then print them one per line.
lemma_lines = [f"{token.text} = {token.lemma_}" for token in doc]
print("\n".join(lemma_lines))

The = the
cars = car
are = be
being = be
driven = drive
carefully = carefully
while = while
the = the
dogs = dog
are = be
barking = bark
. = .


## 2. Tokenization

In [4]:
# Tokenization breaks text into words, punctuation marks and other meaningful units.

doc = nlp("I am learning natural language processing using spaCy.")

# One token per line, followed by the total count.
token_texts = [token.text for token in doc]
print(*token_texts, sep="\n")
print("Number of tokens:", len(token_texts))

I
am
learning
natural
language
processing
using
spaCy
.
Number of tokens: 9


## 3. Named Entity Recognition (NER)

In [5]:
# Named Entity Recognition finds and labels entities mentioned in the text,
# such as people, organizations and locations.

doc = nlp("Barack Obama served as the 44th president of the United States and was born in Hawaii.")

# doc.ents holds the recognized entities; print each one with its label.
entities = [(ent.text, ent.label_) for ent in doc.ents]
for entity_text, entity_label in entities:
    print(f"Entity: {entity_text}, Label: {entity_label}")

Entity: Barack Obama, Label: PERSON
Entity: 44th, Label: ORDINAL
Entity: the United States, Label: GPE
Entity: Hawaii, Label: GPE


## 4. Sentence Segmentation 

In [6]:
# Sentence segmentation divides the text into its individual sentences.

doc = nlp("Machine learning is fascinating. It enables to learn from the data. Natural language processing is a subfield of AI.")

# doc.sents yields the sentences in order; print each on its own line.
sentences = [sent.text for sent in doc.sents]
print("\n".join(sentences))

Machine learning is fascinating.
It enables to learn from the data.
Natural language processing is a subfield of AI.


## 5. Stop Word Removal

In [7]:
# Stop word removal drops very common words (e.g. "and", "the") that carry little meaning.

text = "Stop words are common words that often do not add much meaning."
doc = nlp(text)

# Keep only tokens not flagged as stop words. Punctuation is not a stop word,
# so the final "." stays in the result.
filtered_tokens = []
for token in doc:
    if token.is_stop:
        continue
    filtered_tokens.append(token.text)

print("Original Text:", text)
print("Filtered Tokens:", " ".join(filtered_tokens))

Original Text: Stop words are common words that often do not add much meaning.
Filtered Tokens: Stop words common words add meaning .


## 6. Part-of-Speech (POS) Tagging

In [8]:
# Part-of-speech tagging assigns a grammatical category to each word in the text.

text = "Natural language processing is an exciting field of artificial intelligence."
doc = nlp(text)

# POS tagging table: the header and every row share one template so the
# columns line up (15-wide token column, 10-wide tag column, free-width explanation).
row_template = "{:<15} {:<10} {}"
print(row_template.format('Token', 'POS', 'Explanation'))
print("-" * 40)
for token in doc:
    # spacy.explain turns a tag such as "ADJ" into a readable description.
    print(row_template.format(token.text, token.pos_, spacy.explain(token.pos_)))

Token           POS        Explanation
----------------------------------------
Natural         ADJ        adjective
language        NOUN       noun
processing      NOUN       noun
is              AUX        auxiliary
an              DET        determiner
exciting        ADJ        adjective
field           NOUN       noun
of              ADP        adposition
artificial      ADJ        adjective
intelligence    NOUN       noun
.               PUNCT      punctuation


# Lab Wrap-Up Exercise

In [9]:
# Wrap-up: apply every technique from the sections above to one short passage.
# NOTE(review): "ofthe" in the sample text looks like a typo for "of the". It is
# treated as a single token and is not removed as a stop word. Correcting it
# changes the token count and the printed results, so re-run the cell afterwards.
text = "Albert Einstein, a theoretical physicist, was born in Germany. He developed the theory of relativity, which is one ofthe two pillars of modern physics."

# Run the pipeline once; every section below reads from this same doc.
doc = nlp(text)

# --- Sentence segmentation: number the sentences starting from 1 ---
print("\nSentence Segmentation:")
for number, sentence in enumerate(doc.sents, start=1):
    print(f"Sentence {number}: {sentence.text}")

# --- Tokenization: one token per line, then the total ---
print("\nTokenization:")
print(*(token.text for token in doc), sep="\n")
print("Number of tokens:", len(doc))

# --- Stop word removal: keep only tokens not flagged as stop words ---
print("\nStop Word Removal:")
filtered_tokens = []
for token in doc:
    if not token.is_stop:
        filtered_tokens.append(token.text)
print("Original Text:", text)
print("Filtered Tokens:", filtered_tokens)

# --- Lemmatization: show each token next to its base form ---
print("\nLemmatization:")
print("\n".join(f"{token.text} = {token.lemma_}" for token in doc))

# --- POS tagging: two left-aligned columns (15 and 10 characters wide) ---
print("\nPOS Tagging:")
pos_row = "{:<15} {:<10}"
print(pos_row.format('Token', 'POS'))
print("-" * 20)
for token in doc:
    print(pos_row.format(token.text, token.pos_))

# --- Named Entity Recognition: list each entity with its label ---
print("\nNamed Entity Recognition:")
for entity in doc.ents:
    print(f"Entity: {entity.text}, Label: {entity.label_}")


Sentence Segmentation:
Sentence 1: Albert Einstein, a theoretical physicist, was born in Germany.
Sentence 2: He developed the theory of relativity, which is one ofthe two pillars of modern physics.

Tokenization:
Albert
Einstein
,
a
theoretical
physicist
,
was
born
in
Germany
.
He
developed
the
theory
of
relativity
,
which
is
one
ofthe
two
pillars
of
modern
physics
.
Number of tokens: 29

Stop Word Removal:
Original Text: Albert Einstein, a theoretical physicist, was born in Germany. He developed the theory of relativity, which is one ofthe two pillars of modern physics.
Filtered Tokens: ['Albert', 'Einstein', ',', 'theoretical', 'physicist', ',', 'born', 'Germany', '.', 'developed', 'theory', 'relativity', ',', 'ofthe', 'pillars', 'modern', 'physics', '.']

Lemmatization:
Albert = Albert
Einstein = Einstein
, = ,
a = a
theoretical = theoretical
physicist = physicist
, = ,
was = be
born = bear
in = in
Germany = Germany
. = .
He = he
developed = develop
the = the
theory = theory
of = 