In [7]:
#1.Create a Doc object from the file peterrabbit.txt
import spacy

nlp = spacy.load("en_core_web_sm")
with open("/content/peterrabbit.txt", "r", encoding="utf-8") as file:
    text = file.read()
doc = nlp(text)

In [8]:
#2. For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag.

third_sent = list(doc.sents)[2]
for token in third_sent:
    print(f"{token.text:<15} {token.pos_:<10} {token.tag_:<10} {spacy.explain(token.tag_)}")

They            PRON       PRP        pronoun, personal
lived           VERB       VBD        verb, past tense
with            ADP        IN         conjunction, subordinating or preposition
their           PRON       PRP$       pronoun, possessive
Mother          PROPN      NNP        noun, proper singular
in              ADP        IN         conjunction, subordinating or preposition
a               DET        DT         determiner
sand            NOUN       NN         noun, singular or mass
-               PUNCT      HYPH       punctuation mark, hyphen
bank            NOUN       NN         noun, singular or mass
,               PUNCT      ,          punctuation mark, comma
underneath      ADP        IN         conjunction, subordinating or preposition
the             DET        DT         determiner
root            NOUN       NN         noun, singular or mass
of              ADP        IN         conjunction, subordinating or preposition
a               DET        DT         determi

In [9]:
#3. Provide a frequency list of POS tags from the entire document
from collections import Counter

pos_counts = Counter([token.pos_ for token in doc])
for pos, count in pos_counts.items():
    print(f"{pos}: {count}")

DET: 90
PROPN: 75
ADP: 124
PUNCT: 172
NUM: 8
SPACE: 99
ADV: 65
SCONJ: 20
NOUN: 173
PRON: 108
VERB: 131
ADJ: 54
CCONJ: 61
AUX: 50
PART: 28


In [10]:
#4. CHALLENGE: What percentage of tokens are nouns?
noun_count = sum(1 for token in doc if token.pos_ == "NOUN")
total_tokens = len(doc)
percentage_nouns = (noun_count / total_tokens) * 100
print(f"Percentage of nouns: {percentage_nouns:.2f}%")

Percentage of nouns: 13.75%


In [11]:
#5.Display the Dependency Parse for the third sentence.
for token in third_sent:
    print(f"{token.text:<15} {token.dep_:<10} {token.head.text}")

They            nsubj      lived
lived           ROOT       lived
with            prep       lived
their           poss       Mother
Mother          pobj       with
in              prep       lived
a               det        bank
sand            compound   bank
-               punct      bank
bank            pobj       in
,               punct      bank
underneath      prep       bank
the             det        root
root            pobj       underneath
of              prep       root
a               det        tree

               dep        a
very            advmod     big
big             amod       tree
fir             compound   tree
-               punct      tree
tree            pobj       of
.               punct      lived


              dep        .


In [12]:
#6. Show the first two named entities from Beatrix Potter's The Tale of Peter Rabbit
for ent in doc.ents[:2]:
    print(f"{ent.text} ({ent.label_}) - {spacy.explain(ent.label_)}")

The Tale of Peter Rabbit (WORK_OF_ART) - Titles of books, songs, etc.
Beatrix Potter (PERSON) - People, including fictional


In [13]:
#7. How many sentences are contained in The Tale of Peter Rabbit?
num_sentences = len(list(doc.sents))
print(f"Total sentences: {num_sentences}")

Total sentences: 57


In [14]:
#8. CHALLENGE: How many sentences contain named entities?
sentences_with_ents = [sent for sent in doc.sents if any(ent.start >= sent.start and ent.end <= sent.end for ent in doc.ents)]
print(f"Sentences with named entities: {len(sentences_with_ents)}")

Sentences with named entities: 38


In [15]:
#9.Display the named entity visualization for list_of_sents[0] from the previous problem
from spacy import displacy

list_of_sents = list(doc.sents)
displacy.render(list_of_sents[0], style="ent", jupyter=True)