In [1]:
# pip install spacy
# python -m spacy download en

In [2]:
import nltk
import spacy

In [3]:
# NLTK
from nltk.tokenize import sent_tokenize,word_tokenize

sent_tokenize = sent_tokenize("Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi")
word_tokenize = word_tokenize("Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi")

## SpaCy

### spacy.blank('en')

In [4]:
nlp = spacy.blank("en")
nlp

<spacy.lang.en.English at 0x26d7723dc70>

In [5]:
doc = nlp('Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi')

for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
.
Hulk
loves
chaat
of
delhi


In [6]:
dir(doc[0]) # all methods available
print(type(doc))
print(type(doc[1:4]))
print(type(doc[0]))

<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.token.Token'>


In [7]:
doc2 = nlp('i bought two laptops for my job on 2000$')
doc2[2].like_num

True

In [8]:
doc2[-1].is_currency

True

In [9]:
nlp.add_pipe('sentencizer')
nlp.pipe_names
doc = nlp('Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi.')
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chaat of delhi.


In [10]:
nlp.pipe_names

['sentencizer']

### spacy.load('en_core_web_sm')

In [11]:
nlp2 = spacy.load('en_core_web_sm')
nlp2

<spacy.lang.en.English at 0x26d776b1a00>

In [12]:
nlp2.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [13]:
new_doc = nlp('Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi')
new_doc

Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi

In [14]:
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chaat of delhi.


In [15]:
for sentence in doc.sents:
    for word in sentence:
        print(word)

Dr.
Strange
loves
pav
bhaji
of
mumbai
.
Hulk
loves
chaat
of
delhi
.


#### POS - Part Of Speech

In [16]:
txt = nlp2("Captain america ate 100$ samosa. Then he said i can do this all day.")

for token in txt:
    print(token , '\t' , token.pos_ , '\t |' , spacy.explain(token.pos_) , '\t |' , token.tag_ , 
          '\t |' , spacy.explain(token.tag_))

Captain 	 PROPN 	 | proper noun 	 | NNP 	 | noun, proper singular
america 	 PROPN 	 | proper noun 	 | NNP 	 | noun, proper singular
ate 	 VERB 	 | verb 	 | VBD 	 | verb, past tense
100 	 NUM 	 | numeral 	 | CD 	 | cardinal number
$ 	 SYM 	 | symbol 	 | $ 	 | symbol, currency
samosa 	 NOUN 	 | noun 	 | NNS 	 | noun, plural
. 	 PUNCT 	 | punctuation 	 | . 	 | punctuation mark, sentence closer
Then 	 ADV 	 | adverb 	 | RB 	 | adverb
he 	 PRON 	 | pronoun 	 | PRP 	 | pronoun, personal
said 	 VERB 	 | verb 	 | VBD 	 | verb, past tense
i 	 PRON 	 | pronoun 	 | PRP 	 | pronoun, personal
can 	 AUX 	 | auxiliary 	 | MD 	 | verb, modal auxiliary
do 	 VERB 	 | verb 	 | VB 	 | verb, base form
this 	 PRON 	 | pronoun 	 | DT 	 | determiner
all 	 DET 	 | determiner 	 | DT 	 | determiner
day 	 NOUN 	 | noun 	 | NN 	 | noun, singular or mass
. 	 PUNCT 	 | punctuation 	 | . 	 | punctuation mark, sentence closer


In [17]:
# count = txt.count_by(spacy.attrs.POS)

# for i,j in count.items():
#     print(txt.vacab[i].text,'\t|',j)

#### NER - Named Entity Recognition

In [18]:
txt2 = nlp2("Apple Inc and Tesla Inc is going to acquire Twitter for 45$ billion")

for ent in txt2.ents:
        print(ent.text ,'\t', ent.label_ ,'\t', spacy.explain(ent.label_))
        

from spacy import displacy

displacy.render(txt2,style='ent')

Apple Inc 	 ORG 	 Companies, agencies, institutions, etc.
Tesla Inc 	 ORG 	 Companies, agencies, institutions, etc.
Twitter 	 PRODUCT 	 Objects, vehicles, foods, etc. (not services)
45$ billion 	 MONEY 	 Monetary values, including unit


In [19]:
from spacy.tokens import Span

s1 = Span(txt2,0,2)
s1

Apple Inc

### Stemming & Lemmatization
- Stem : talking -> talk
- Lemma : ate -> eat

In [20]:
#spacy doent have support for stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

words = ["eats","eating","eat","ate","adjustable","rafting","ability","meeting"]
for word in words:
    print(word,'\t',stemmer.stem(word))

eats 	 eat
eating 	 eat
eat 	 eat
ate 	 ate
adjustable 	 adjust
rafting 	 raft
ability 	 abil
meeting 	 meet


In [21]:
#lemmatization
lemmas = nlp2("eats eating eat ate adjustable rafting ability meeting better is was bro bruh")

for word in lemmas:
    print(word, '\t', word.lemma_)

eats 	 eats
eating 	 eat
eat 	 eat
ate 	 eat
adjustable 	 adjustable
rafting 	 raft
ability 	 ability
meeting 	 meeting
better 	 well
is 	 be
was 	 be
bro 	 bro
bruh 	 bruh


In [22]:
nlp2.pipe_names
attribute_ruler = nlp2.get_pipe('attribute_ruler')
attribute_ruler.add([[{"TEXT" : "bro"}] , [{"TEXT" : "bruh"}]],
                    {"LEMMA" : "Brother"})

lemmas = nlp2("eats eating eat ate adjustable rafting ability meeting better is was bro bruh")

for word in lemmas:
    print(word, '\t', word.lemma_)

eats 	 eats
eating 	 eat
eat 	 eat
ate 	 eat
adjustable 	 adjustable
rafting 	 raft
ability 	 ability
meeting 	 meeting
better 	 well
is 	 be
was 	 be
bro 	 Brother
bruh 	 Brother


## Stop words

In [23]:
from spacy.lang.en import STOP_WORDS

len(STOP_WORDS)

txt = "we just opened our wings, the flying part is coming soon"
nlp = spacy.load("en_core_web_sm")
doc = nlp(txt)

type(doc)

for token in doc:
    if not token.is_stop and not token.is_punct:
        print(token)

opened
wings
flying
coming
soon


## BOW

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text data
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create an instance of the CountVectorizer
vectorizer = CountVectorizer()

# Learn the vocabulary and transform the documents into vectors
X = vectorizer.fit_transform(documents)

# Get the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Print the vocabulary
print("Vocabulary:")
for feature in feature_names:
    print(feature)

# Print the vector representation of the documents
print("\nVector representation of the documents:")
print(X.toarray())


Vocabulary:
and
document
first
is
one
second
the
third
this

Vector representation of the documents:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


## Bag Of n-grams

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "I love NLP",
    "NLP is fascinating",
    "I enjoy learning NLP"
]

# Create an instance of CountVectorizer with n-gram range (e.g., unigrams and bigrams)
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary and transform the documents into a Bag of n-grams representation
X = vectorizer.fit_transform(documents)

# Get the feature names (n-grams in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Print the vocabulary
print("Vocabulary:")
for feature in feature_names:
    print(feature)

# Print the Bag of n-grams representation of the documents
print("\nBag of n-grams representation of the documents:")
print(X.toarray())


Vocabulary:
enjoy
enjoy learning
fascinating
is
is fascinating
learning
learning nlp
love
love nlp
nlp
nlp is

Bag of n-grams representation of the documents:
[[0 0 0 0 0 0 0 1 1 1 0]
 [0 0 1 1 1 0 0 0 0 1 1]
 [1 1 0 0 0 1 1 0 0 1 0]]


## TF-IDF

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary and transform the documents into a TF-IDF representation
X = vectorizer.fit_transform(documents)

# Get the feature names (terms in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Print the vocabulary
print("Vocabulary:")
for feature in feature_names:
    print(feature)

# Print the TF-IDF representation of the documents
print("\nTF-IDF representation of the documents:")
print(X.toarray())


Vocabulary:
and
document
first
is
one
second
the
third
this

TF-IDF representation of the documents:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Word Embeddings

In [39]:
from gensim.models import Word2Vec

# Example corpus
corpus = [["I", "love", "NLP"],
          ["Word", "embeddings", "are", "useful"],
          ["Machine", "learning", "is", "interesting"],
          ["king", "man", "woman", "queen"]]

# Train the Word2Vec model
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1)

# Get the word vector for a specific word
word_vector = model.wv["NLP"]

# Find similar words to a given word
similar_words = model.wv.most_similar("learning")

# Perform vector arithmetic
result = model.wv.most_similar(positive=["king", "woman"], negative=["man"])


In [42]:
similar_words

[('man', 0.14595064520835876),
 ('embeddings', 0.05048208683729172),
 ('king', 0.04157736897468567),
 ('interesting', 0.03476494178175926),
 ('is', 0.019152291119098663),
 ('queen', 0.01613471284508705),
 ('love', 0.01281161978840828),
 ('useful', 0.00882619246840477),
 ('I', 0.007729304954409599),
 ('Machine', 0.004842511378228664)]

## Word vector in spacy

In [52]:
# python -m spacy download en_core_web_lg

nlp = spacy.load("en_core_web_lg")

doc = nlp('dog cat banana hejfhz')

for token in doc:
    print(token.text,'\tVector',token.has_vector,'\tOOV',token.is_oov)

dog 	Vector True 	OOV False
cat 	Vector True 	OOV False
banana 	Vector True 	OOV False
hejfhz 	Vector False 	OOV True


In [56]:
base_token = nlp('bread')

doc2 = nlp('bread sandwich burger car tiger human wheat')

for token in doc2:
    print(f'{token.text} <-> {base_token.text}:', token.similarity(base_token))

bread <-> bread: 1.0
sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451532596945217
tiger <-> bread: 0.04764611272488976
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.615036141030184


In [63]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
queen = nlp.vocab['queen'].vector
woman = nlp.vocab['woman'].vector

result = king - man + woman

from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([result] , [queen])

array([[0.6178014]], dtype=float32)

## Word vector in gensim

In [65]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

wv.similarity(w1="great",w2="good")
wv.most_similar('good')
# king - women + man = queen
# franse - paris + berlin = germany
wv.most_similar(positive=['Franse','Berlin'],negative='Paris')
wv.doesnt_match(['facebook','cat','google','microsoft'])

In [68]:
glv = api.load('glove-twitter-25')
glv.most_similar('good')
glv.doesnt_match('facebook cat google microsoft')


## Word vector in fastText

## Chatbots