In [1]:
# load the dataset
from datasets import load_dataset

# Load the IMDb dataset
dataset = load_dataset("imdb")

In [2]:
import spacy

# Import the displaCy library
from spacy import displacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Select a sample from the dataset
sample_text = dataset['train'][1]['text']

# Process the text with spaCy
doc = nlp(sample_text)


In [3]:
doc

"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cann

In [4]:
len(doc)

261

**How many sentences are contained in the doc ?**


In [5]:
len([sent for sent in doc.sents])

11

# tokenization and part-of-speech

For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag.

In [6]:
for token in list(doc.sents)[2]:
    print(f'{token.text:{12}} {token.pos_:{6}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

As           ADP    IN     conjunction, subordinating or preposition
for          ADP    IN     conjunction, subordinating or preposition
the          DET    DT     determiner
claim        NOUN   NN     noun, singular or mass
that         SCONJ  IN     conjunction, subordinating or preposition
frontal      ADJ    JJ     adjective (English), other noun-modifier (Chinese)
male         PROPN  NNP    noun, proper singular
nudity       NOUN   NN     noun, singular or mass
is           AUX    VBZ    verb, 3rd person singular present
an           DET    DT     determiner
automatic    ADJ    JJ     adjective (English), other noun-modifier (Chinese)
NC-17        NOUN   NN     noun, singular or mass
,            PUNCT  ,      punctuation mark, comma
that         PRON   WDT    wh-determiner
is           AUX    VBZ    verb, 3rd person singular present
n't          PART   RB     adverb
true         ADJ    JJ     adjective (English), other noun-modifier (Chinese)
.            PUNCT  .      punctuati

In [7]:
# Print tokens and their POS tags
print("Tokenization and POS tagging:")
for token in doc:
    print(f"{token.text} -> {token.pos_}")

Tokenization and POS tagging:
" -> PUNCT
I -> PRON
Am -> AUX
Curious -> ADJ
: -> PUNCT
Yellow -> PROPN
" -> PUNCT
is -> AUX
a -> DET
risible -> ADJ
and -> CCONJ
pretentious -> ADJ
steaming -> NOUN
pile -> NOUN
. -> PUNCT
It -> PRON
does -> AUX
n't -> PART
matter -> VERB
what -> PRON
one -> PRON
's -> PART
political -> ADJ
views -> NOUN
are -> AUX
because -> SCONJ
this -> DET
film -> NOUN
can -> AUX
hardly -> ADV
be -> AUX
taken -> VERB
seriously -> ADV
on -> ADP
any -> DET
level -> NOUN
. -> PUNCT
As -> ADP
for -> ADP
the -> DET
claim -> NOUN
that -> SCONJ
frontal -> ADJ
male -> PROPN
nudity -> NOUN
is -> AUX
an -> DET
automatic -> ADJ
NC-17 -> NOUN
, -> PUNCT
that -> PRON
is -> AUX
n't -> PART
true -> ADJ
. -> PUNCT
I -> PRON
've -> AUX
seen -> VERB
R -> NOUN
- -> PUNCT
rated -> VERB
films -> NOUN
with -> ADP
male -> ADJ
nudity -> NOUN
. -> PUNCT
Granted -> VERB
, -> PUNCT
they -> PRON
only -> ADV
offer -> VERB
some -> DET
fleeting -> ADJ
views -> NOUN
, -> PUNCT
but -> CCONJ
where ->

# lemmatization
Lemmatization is the process of reducing words to their base or root form. This is useful in tasks where you want to treat different forms of a word as the same word.

In [8]:
for token in doc:
    print(f"{token.text} -> {token.lemma_}")


" -> "
I -> I
Am -> be
Curious -> curious
: -> :
Yellow -> Yellow
" -> "
is -> be
a -> a
risible -> risible
and -> and
pretentious -> pretentious
steaming -> steaming
pile -> pile
. -> .
It -> it
does -> do
n't -> not
matter -> matter
what -> what
one -> one
's -> 's
political -> political
views -> view
are -> be
because -> because
this -> this
film -> film
can -> can
hardly -> hardly
be -> be
taken -> take
seriously -> seriously
on -> on
any -> any
level -> level
. -> .
As -> as
for -> for
the -> the
claim -> claim
that -> that
frontal -> frontal
male -> male
nudity -> nudity
is -> be
an -> an
automatic -> automatic
NC-17 -> nc-17
, -> ,
that -> that
is -> be
n't -> not
true -> true
. -> .
I -> I
've -> have
seen -> see
R -> r
- -> -
rated -> rate
films -> film
with -> with
male -> male
nudity -> nudity
. -> .
Granted -> grant
, -> ,
they -> they
only -> only
offer -> offer
some -> some
fleeting -> fleeting
views -> view
, -> ,
but -> but
where -> where
are -> be
the -> the
R -> r
- -

**Find the frequency of POS**

In [9]:
POS_counts = doc.count_by(spacy.attrs.POS)
for k,v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

84. ADJ  : 28
85. ADP  : 29
86. ADV  : 9
87. AUX  : 17
89. CCONJ: 10
90. DET  : 23
92. NOUN : 49
93. NUM  : 1
94. PART : 11
95. PRON : 14
96. PROPN: 10
97. PUNCT: 33
98. SCONJ: 5
100. VERB : 22


**We can find what percentage of nouns are there?**

In [10]:
# the attribute ID for 'NOUN' is 92
percent = 100*POS_counts[92]/len(doc)
print(f'{POS_counts[92]}/{len(doc)} = {percent:{.4}}%')

49/261 = 18.77%


In [11]:
# Visualize the dependency parse
displacy.render(doc, style="dep", jupyter=True)

# Morphological Analysis
Description: Morphological analysis involves understanding the structure of words (like prefixes, suffixes, etc.). This includes analyzing grammatical features like tense, gender, and number.

In [12]:
for token in doc:
    print(f"{token.text} -> {token.morph}")


" -> PunctSide=Ini|PunctType=Quot
I -> Case=Nom|Number=Sing|Person=1|PronType=Prs
Am -> Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
Curious -> Degree=Pos
: -> 
Yellow -> Number=Sing
" -> PunctSide=Fin|PunctType=Quot
is -> Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
a -> Definite=Ind|PronType=Art
risible -> Degree=Pos
and -> ConjType=Cmp
pretentious -> Degree=Pos
steaming -> Number=Sing
pile -> Number=Sing
. -> PunctType=Peri
It -> Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
does -> Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
n't -> Polarity=Neg
matter -> VerbForm=Inf
what -> 
one -> PronType=Prs
's -> 
political -> Degree=Pos
views -> Number=Plur
are -> Mood=Ind|Tense=Pres|VerbForm=Fin
because -> 
this -> Number=Sing|PronType=Dem
film -> Number=Sing
can -> VerbForm=Fin
hardly -> 
be -> VerbForm=Inf
taken -> Aspect=Perf|Tense=Past|VerbForm=Part
seriously -> 
on -> 
any -> 
level -> Number=Sing
. -> PunctType=Peri
As -> 
for -> 
the -> Definite=De

# Named Entity Recognition (NER)

NER is the process of identifying named entities (e.g., people, organizations, locations) in text.

In [13]:
print("\nNamed Entity Recognition:")
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


Named Entity Recognition:
I Am Curious: Yellow -> WORK_OF_ART
The Brown Bunny -> FAC
Vincent Gallo's -> PERSON
johnson -> PERSON
Chloe Sevigny -> PERSON
American -> NORP


In [14]:
# Visualize named entities
displacy.render(doc, style="ent")

# Dependency Parsing
Dependency parsing analyzes the grammatical structure of a sentence and establishes relationships between "head" words and words that modify those heads.

In [15]:
print("\nDependency Parsing:")
for token in doc:
    print(f"{token.text} -> {token.dep_} (Head: {token.head.text})")



Dependency Parsing:
" -> punct (Head: is)
I -> nsubj (Head: Am)
Am -> ccomp (Head: is)
Curious -> acomp (Head: Am)
: -> punct (Head: Yellow)
Yellow -> nsubj (Head: is)
" -> punct (Head: Yellow)
is -> ROOT (Head: is)
a -> det (Head: pile)
risible -> amod (Head: pile)
and -> cc (Head: risible)
pretentious -> conj (Head: risible)
steaming -> compound (Head: pile)
pile -> attr (Head: is)
. -> punct (Head: is)
It -> nsubj (Head: matter)
does -> aux (Head: matter)
n't -> neg (Head: matter)
matter -> ROOT (Head: matter)
what -> det (Head: views)
one -> poss (Head: views)
's -> case (Head: one)
political -> amod (Head: views)
views -> nsubj (Head: are)
are -> ccomp (Head: matter)
because -> mark (Head: taken)
this -> det (Head: film)
film -> nsubjpass (Head: taken)
can -> aux (Head: taken)
hardly -> advmod (Head: taken)
be -> auxpass (Head: taken)
taken -> advcl (Head: are)
seriously -> advmod (Head: taken)
on -> prep (Head: taken)
any -> det (Head: level)
level -> pobj (Head: on)
. -> punct 

**Display the Dependency Parse for the fourth sentence**

In [16]:
displacy.render(list(doc.sents)[3], style='dep', jupyter=True, options={'distance': 110})

# Sentence Segmentation
Sentence segmentation divides the text into individual sentences.



In [17]:
print("\nSentence Segmentation:")
for sent in doc.sents:
    print(f"Sentence: {sent.text}")



Sentence Segmentation:
Sentence: "I Am Curious: Yellow" is a risible and pretentious steaming pile.
Sentence: It doesn't matter what one's political views are because this film can hardly be taken seriously on any level.
Sentence: As for the claim that frontal male nudity is an automatic NC-17, that isn't true.
Sentence: I've seen R-rated films with male nudity.
Sentence: Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia?
Sentence: Nowhere, because they don't exist.
Sentence: The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight.
Sentence: And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny.
Sentence: Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical

# Text Similarity
spaCy can be also  used to compare the similarity between texts, sentences, or individual tokens. This is useful in tasks like clustering, duplicate detection, or recommendation systems.

In [18]:
doc1 = nlp("I am a bit tired")
doc2 = nlp("I feel so ecstatic")
print(f"Similarity: {doc1.similarity(doc2)}")



Similarity: 0.5591253855795965


  print(f"Similarity: {doc1.similarity(doc2)}")


In [19]:
displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "bg": "#09a3d5", "color": "white", "font": "Source Sans Pro"})


In [20]:
## language detection

In [21]:
#from langdetect import detect
#text = "ami to make bhalobasi"
#language = detect(text) # Output: 'fr'
#print(f"The detected language is: {language}")


# Matcher 

In [22]:
from spacy.matcher import Matcher

In [23]:
# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Define a pattern to match adjectives followed by nouns (e.g., "great movie")
adj_noun_pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}]
matcher.add("ADJ_NOUN_PATTERN", [adj_noun_pattern])

# Define a pattern to match monetary amounts (e.g., "$100 million")
money_pattern = [{"IS_CURRENCY": True}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["million", "billion", "trillion"]}}]
matcher.add("MONEY_PATTERN", [money_pattern])



In [24]:
# Process a sample of text from the dataset
for i in range(7):  # Let's process the first 7 reviews
    text = dataset['train'][i]['text']
    doc = nlp(text)
    
    # Apply the matcher to the document
    matches = matcher(doc)
    
    print(f"Review {i + 1}:")
    for match_id, start, end in matches:
        matched_span = doc[start:end]
        print(f"  Matched span: {matched_span.text}")
    print("\n")


Review 1:
  Matched span: Swedish drama
  Matched span: average Swede
  Matched span: political issues
  Matched span: ordinary denizens
  Matched span: major staple
  Matched span: Swedish cinema
  Matched span: old boy
  Matched span: artistic purposes
  Matched span: pornographic theaters
  Matched span: good film
  Matched span: Swedish cinema


Review 2:
  Matched span: pretentious steaming
  Matched span: political views
  Matched span: automatic NC-17
  Matched span: male nudity
  Matched span: fleeting views
  Matched span: crappy cable
  Matched span: indie movies
  Matched span: anatomical difference
  Matched span: female genitals
  Matched span: American film
  Matched span: explicit erotica
  Matched span: double standard
  Matched span: depressing ability


Review 3:
  Matched span: IMPORTANT issues
  Matched span: discernable motive
  Matched span: new perspectives
  Matched span: pointless film).<br


Review 4:
  Matched span: strong elements
  Matched span: realistic a