<a href="https://colab.research.google.com/github/BALAJIHARIDASAN/Natural-Language-Processing/blob/main/Text%20preprocessing%20and%20Woed%20Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline and bind it to `nlp`.
nlp = spacy.load('en_core_web_sm')


In [30]:
# Run the pipeline loaded in the previous cell over a sample sentence to
# create a Doc. spaCy is already imported and the model already loaded there,
# so neither is repeated here.
doc = nlp("He went to play cricket")

In [31]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [32]:
# Components can be switched off when they are not needed, e.g.:
#   nlp.disable_pipes('tagger', 'parser')
#   nlp.disable_pipes('parser')
# If a component such as the parser or tagger is disabled, the attributes that
# depend on it (e.g. .pos_ or .dep_) might not work, so enable or disable
# components according to what the analysis needs.

# Add the sentencizer (helps with splitting text into sentences) only when it
# is not in the pipeline yet: adding a component under a name that already
# exists raises an error, which would make this cell fail when it is re-run.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')
nlp.get_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7fb76148f000>

In [33]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'sentencizer']

In [34]:
text = "Taylor is learning music"

In [35]:
doc = nlp(text)

In [36]:
print(doc)

Taylor is learning music


In [37]:
print(len(text))

24


In [38]:
print(len(doc))

4


In [39]:
for token in doc:
    print(token.text)

Taylor
is
learning
music


In [40]:
text = "I am going where Taylor went yesterday"

In [41]:
doc = nlp(text)

In [42]:
# Show each token next to its lemma.
for tok in doc:
    print(f"{tok.text} - {tok.lemma_}")

I - I
am - be
going - go
where - where
Taylor - Taylor
went - go
yesterday - yesterday


In [43]:
text = "Taylor is learning music. I am going where Taylor went yesterday. I like listening to Taylor's music"

In [44]:
doc = nlp(text)

In [45]:
# Collect the text of every sentence span found in `doc`.
sentences = [span.text for span in doc.sents]
sentences

['Taylor is learning music.',
 'I am going where Taylor went yesterday.',
 "I like listening to Taylor's music"]

In [46]:
# Keep only the tokens that are not punctuation, in document order.
token_without_punc = [tok for tok in doc if not tok.is_punct]
token_without_punc

[Taylor,
 is,
 learning,
 music,
 I,
 am,
 going,
 where,
 Taylor,
 went,
 yesterday,
 I,
 like,
 listening,
 to,
 Taylor,
 's,
 music]

In [47]:
all_stopwords = nlp.Defaults.stop_words

In [48]:
len(all_stopwords)

326

In [49]:
# From the punctuation-free tokens, keep those not flagged as stop words.
token_without_stop = [tok for tok in token_without_punc if not tok.is_stop]
token_without_stop

[Taylor,
 learning,
 music,
 going,
 Taylor,
 went,
 yesterday,
 like,
 listening,
 Taylor,
 music]

In [50]:
# Print every token together with its part-of-speech tag.
for tok in doc:
    print(f"{tok.text} {tok.pos_}")

Taylor PROPN
is AUX
learning VERB
music NOUN
. PUNCT
I PRON
am AUX
going VERB
where SCONJ
Taylor PROPN
went VERB
yesterday NOUN
. PUNCT
I PRON
like VERB
listening VERB
to ADP
Taylor PROPN
's PART
music NOUN


In [51]:
# Print every token together with its dependency label.
for tok in doc:
    print(f"{tok.text} {tok.dep_}")

Taylor nsubj
is aux
learning ROOT
music dobj
. punct
I nsubj
am aux
going ROOT
where advmod
Taylor nsubj
went ccomp
yesterday npadvmod
. punct
I nsubj
like ROOT
listening xcomp
to prep
Taylor poss
's case
music pobj


In [52]:
# Print every named entity found in `doc` together with its label.
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Taylor PERSON
Taylor PERSON
yesterday DATE
Taylor PERSON


In [55]:
# Import displacy explicitly so this cell works after Restart & Run All;
# without the import the call below raises NameError.
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [58]:
# Highlight the named entities of `doc` inline in the notebook.
displacy.render(doc, jupyter=True, style="ent")

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
doc1 = "The quick brown fox jumps over the lazy dog."
doc2 = "Early bird catches the worm."
doc3 = "A stitch in time saves nine."

In [62]:
docs = [doc1, doc2, doc3]

In [63]:
docs

['The quick brown fox jumps over the lazy dog.',
 'Early bird catches the worm.',
 'A stitch in time saves nine.']

In [64]:
vectorizer = CountVectorizer(stop_words='english')

In [65]:
vectorizer

In [66]:
bow_vectors = vectorizer.fit_transform(docs)

In [67]:
bow_vectors

<3x13 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [68]:
vocab = vectorizer.get_feature_names_out()

In [69]:
vocab

array(['bird', 'brown', 'catches', 'dog', 'early', 'fox', 'jumps', 'lazy',
       'quick', 'saves', 'stitch', 'time', 'worm'], dtype=object)

In [70]:
print("Vocabulary: ", vocab)

Vocabulary:  ['bird' 'brown' 'catches' 'dog' 'early' 'fox' 'jumps' 'lazy' 'quick'
 'saves' 'stitch' 'time' 'worm']


In [71]:
# Print one dense count vector per input document. Only the index is needed,
# and looping over it avoids rebinding `doc`, which earlier cells use for the
# spaCy Doc.
for i in range(len(docs)):
    print("BOW vector for document", i+1, ": ", bow_vectors[i].toarray())

BOW vector for document 1 :  [[0 1 0 1 0 1 1 1 1 0 0 0 0]]
BOW vector for document 2 :  [[1 0 1 0 1 0 0 0 0 0 0 0 1]]
BOW vector for document 3 :  [[0 0 0 0 0 0 0 0 0 1 1 1 0]]


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
doc1 = "The quick brown fox jumped over the lazy dog"
doc2 = "The dog and the fox were good friends"
doc3 = "The quick brown cat jumped over the lazy dog"

In [74]:
documents = [doc1, doc2, doc3]

In [75]:
vectorizer = TfidfVectorizer()

In [76]:
tfidf_matrix = vectorizer.fit_transform(documents)

In [77]:
vocab = vectorizer.get_feature_names_out()

In [79]:
vocab

array(['and', 'brown', 'cat', 'dog', 'fox', 'friends', 'good', 'jumped',
       'lazy', 'over', 'quick', 'the', 'were'], dtype=object)

In [78]:
# Convert the sparse TF-IDF matrix to a dense array once, instead of
# converting the whole matrix again on every loop iteration.
tfidf_dense = tfidf_matrix.toarray()

# The loop variable is named `text` to avoid reusing `doc`, which earlier
# cells bind to a spaCy Doc.
for i, text in enumerate(documents):
    print(f"Document {i+1}: {text}")
    print(tfidf_dense[i])

Document 1: The quick brown fox jumped over the lazy dog
[0.         0.33304752 0.         0.25864111 0.33304752 0.
 0.         0.33304752 0.33304752 0.33304752 0.33304752 0.51728221
 0.        ]
Document 2: The dog and the fox were good friends
[0.39769885 0.         0.         0.23488735 0.30246022 0.39769885
 0.39769885 0.         0.         0.         0.         0.46977469
 0.39769885]
Document 3: The quick brown cat jumped over the lazy dog
[0.         0.32034893 0.42122034 0.24877952 0.         0.
 0.         0.32034893 0.32034893 0.32034893 0.32034893 0.49755904
 0.        ]


In [80]:
from gensim.models import Word2Vec

In [81]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it
# from the 'punkt' package, newer ones look for 'punkt_tab'; request both so
# the tokenization cell works with either. nltk.download reports a failed
# download by returning False rather than raising, so an unavailable package
# does not stop the cell.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [82]:
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "I have a cat named Bob",
    "Mary had a little lamb",
    "The cat chased the mouse"
]

In [83]:
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]

In [85]:
tokenized_sentences

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
 ['i', 'have', 'a', 'cat', 'named', 'bob'],
 ['mary', 'had', 'a', 'little', 'lamb'],
 ['the', 'cat', 'chased', 'the', 'mouse']]

In [84]:
# Train a CBOW model (sg=0) on the tokenized toy corpus.
# workers=1 and an explicit seed keep training repeatable between runs: with
# several worker threads the order of updates depends on OS thread scheduling,
# so the learned vectors change from run to run. seed=1 is gensim's default,
# written out here on purpose. (gensim's documentation additionally mentions
# fixing PYTHONHASHSEED for reproducibility across interpreter launches.)
model_cbow = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=10,
    window=2,
    sg=0,
    min_count=1,
    workers=1,
    epochs=100,
    seed=1,
)


In [86]:
model_cbow

<gensim.models.word2vec.Word2Vec at 0x7fb757645960>

In [87]:
vocabulary = model_cbow.wv.index_to_key

In [88]:
vocabulary

['the',
 'cat',
 'a',
 'i',
 'quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'lazy',
 'dog',
 'mouse',
 'chased',
 'named',
 'bob',
 'mary',
 'had',
 'little',
 'lamb',
 'have']

In [89]:
vectors = model_cbow.wv[vocabulary]

In [90]:
print("Vocabulary:", vocabulary)
print("Vectors:", vectors)

Vocabulary: ['the', 'cat', 'a', 'i', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'mouse', 'chased', 'named', 'bob', 'mary', 'had', 'little', 'lamb', 'have']
Vectors: [[-0.00579906  0.00216587  0.05165328  0.09021517 -0.09243679 -0.07150614
   0.06579404  0.08985182 -0.05098763 -0.03704023]
 [ 0.07356491 -0.01552288 -0.04470968  0.06539246 -0.04794506 -0.01825717
   0.0302473   0.010129   -0.08385248 -0.0938779 ]
 [ 0.0731724   0.05033677  0.06809691  0.00783     0.06384286 -0.0342177
  -0.00855822  0.05823005 -0.07600778 -0.03897595]
 [-0.07522599 -0.00942782  0.09582447 -0.07332443 -0.02284672 -0.01961325
   0.08160633 -0.05896096 -0.00014829 -0.04732473]
 [-0.09607138  0.04996368 -0.08710957 -0.04383304  0.00020548 -0.0028955
  -0.07576942  0.09639003  0.04925632  0.09292925]
 [-0.08216061  0.04484733 -0.04079904  0.00843407  0.08569652 -0.04498284
   0.04685031 -0.06757127 -0.03652569  0.09466898]
 [-0.01596777  0.00298292 -0.04100777 -0.07673436 -0.01455512  0.0247766


In [91]:
import gensim
import numpy as np

In [92]:
sentences = [["apple", "banana", "orange"], ["orange", "grape", "apple"], ["banana", "grape", "cherry"]]


In [93]:
# Train Word2Vec on the pre-tokenized sentences; min_count=1 keeps every word.
# A single worker and an explicit seed (gensim's default value, written out)
# make the learned vectors repeatable between runs instead of depending on
# thread scheduling.
model = gensim.models.Word2Vec(sentences=sentences, min_count=1, workers=1, seed=1)


In [94]:
model

<gensim.models.word2vec.Word2Vec at 0x7fb757646770>

In [95]:
def get_sentence_vector(model, sentence):
    """Average the word vectors of the in-vocabulary words of a sentence.

    Parameters
    ----------
    model
        Object exposing ``model.wv`` (indexable by word, with a
        ``key_to_index`` mapping of known words) and ``model.vector_size``.
    sentence
        Iterable of word tokens.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the words found in the
        vocabulary, or a float32 zero vector of length ``model.vector_size``
        when none of the words is known (including an empty sentence).
    """
    # key_to_index is a mapping, so each membership test is a hash lookup
    # rather than a linear scan of the index_to_key list.
    known_words = model.wv.key_to_index
    word_vectors = [model.wv[word] for word in sentence if word in known_words]
    if not word_vectors:
        return np.zeros((model.vector_size,), dtype=np.float32)
    return np.mean(word_vectors, axis=0)

In [96]:
# One averaged vector per sentence, in the same order as `sentences`.
vectors = [get_sentence_vector(model, words) for words in sentences]


In [97]:
print("Vocabulary:", list(model.wv.index_to_key))
print("Vectors:\n", np.array(vectors))

Vocabulary: ['grape', 'orange', 'banana', 'apple', 'cherry']
Vectors:
 [[-5.5892672e-03  5.3474712e-03 -6.0680759e-04  7.9973222e-04
   6.5797097e-03 -9.7219396e-04  5.8476930e-05  5.2099838e-03
  -1.6972515e-03 -2.4929435e-03  1.4449350e-03 -2.2870821e-03
   2.5591548e-03  5.7813973e-04  6.4344076e-03 -3.3285268e-04
   6.9286474e-03  4.0305206e-03 -6.7919884e-03  3.7568729e-04
  -1.0150893e-03 -3.8600645e-03  4.7079227e-03 -2.9668361e-03
   1.6487334e-03  1.9190244e-04  3.2158245e-03  4.5110714e-03
  -2.0889379e-03  3.6999192e-03  5.3695440e-03 -5.2032056e-03
  -5.0484356e-03 -3.4199134e-03 -2.8366843e-04  3.3412457e-03
   5.2489317e-03  1.1416145e-03  5.2590203e-04  6.0521238e-03
   1.9112648e-03 -3.1007145e-05 -1.2569638e-03  1.2112252e-03
   2.7321510e-03  5.0846864e-03  3.4146283e-03 -2.2086294e-03
  -1.9946997e-03 -5.3739187e-04  4.5103449e-03 -4.0267124e-03
  -5.6211981e-03 -4.6672183e-03 -5.4029841e-03  7.7267765e-04
   3.0535916e-03  3.8815343e-03 -5.2931352e-04 -9.9204633e-05

In [98]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [99]:
sequences = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]

In [100]:
# Pad every sequence to length 5 twice: once with the zeros placed after the
# values ('post') and once with the zeros placed before them ('pre').
padded_sequences_post, padded_sequences_pre = (
    pad_sequences(sequences, maxlen=5, padding=side) for side in ('post', 'pre')
)


In [101]:
padded_sequences_pre

array([[0, 0, 1, 2, 3],
       [0, 0, 0, 4, 5],
       [0, 6, 7, 8, 9]], dtype=int32)

In [102]:
padded_sequences_post

array([[1, 2, 3, 0, 0],
       [4, 5, 0, 0, 0],
       [6, 7, 8, 9, 0]], dtype=int32)