In [1]:
import warnings
warnings.filterwarnings('ignore')

# A. Basic Techniques

## 1. Lowering Case

In [2]:
sentence="The steps should be explained ONE by one" 
sentence_lower=str(sentence).lower()
print("Original Sentence:", sentence)
print("--"*60)
print("Lowered Sentence:", sentence_lower)

Original Sentence: The steps should be explained ONE by one
------------------------------------------------------------------------------------------------------------------------
Lowered Sentence: the steps should be explained one by one


## 2. Removing Punctuations

In [3]:
import string 
punc=string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
sentence="I got $ 100 today, this is my happiest day !"
without_punc=[word for word in sentence.split(" ") if word not in list(punc)]
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence without Punctuations:", " ".join(without_punc))

Original Sentence: I got $ 100 today, this is my happiest day !
------------------------------------------------------------------------------------------------------------------------
Sentence without Punctuations: I got 100 today, this is my happiest day


## 3. Removing Special Characters and Numbers

In [5]:
import re
sentence="Find the remainder when [math]23^{24}[/math] is divided by 24,23?" 
sentence_clean=re.sub("[^a-zA-Z]", " ", sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Clean Sentence:", sentence_clean)

Original Sentence: Find the remainder when [math]23^{24}[/math] is divided by 24,23?
------------------------------------------------------------------------------------------------------------------------
Clean Sentence: Find the remainder when  math          math  is divided by       


In [6]:
import re
sentence="Area of trapezoid is : A = ½ (a + b) h" 
sentence_clean=re.sub("[^a-zA-Z]", " ", sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Clean Sentence:", sentence_clean)

Original Sentence: Area of trapezoid is : A = ½ (a + b) h
------------------------------------------------------------------------------------------------------------------------
Clean Sentence: Area of trapezoid is   A      a   b  h


## 4. Removal of HTML Tags

In [7]:
sentence='''<h3 style="color:red; font-family:Arial Black">This is NLP</h3>''' 
clean_sentence=re.sub("<.*?>", "", sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Clean Sentence:", clean_sentence)

Original Sentence: <h3 style="color:red; font-family:Arial Black">This is NLP</h3>
------------------------------------------------------------------------------------------------------------------------
Clean Sentence: This is NLP


## 5. Removing URL's

In [8]:
sentence="I visited https://github.com/Suthir24 "
clean_sentence=re.sub("(http|https|www)\S+", "", sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Clean Sentence:", clean_sentence)

Original Sentence: I visited https://github.com/Suthir24 
------------------------------------------------------------------------------------------------------------------------
Clean Sentence: I visited  


## 6. Removing Extra Spaces

In [9]:
sentence="I'm     Learning     NLP" 
clean_sentence=re.sub(" +"," ", sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Clean Sentence:", clean_sentence)

Original Sentence: I'm     Learning     NLP
------------------------------------------------------------------------------------------------------------------------
Clean Sentence: I'm Learning NLP


## 7. Expanding Contraction

In [10]:
import contractions

In [11]:
sentence="I can't wait to see you"
clear_sentence=contractions.fix(sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Clear Sentence:", clear_sentence)

Original Sentence: I can't wait to see you
------------------------------------------------------------------------------------------------------------------------
Clear Sentence: I cannot wait to see you


## 8. Text Correction

In [12]:
from textblob import TextBlob
sentence="Proteens are essential building muscles"
textblob=TextBlob(sentence)
correct_sentence=textblob.correct()
print("Original Sentence:", sentence)
print("--"*60)
print("Correct Sentence:", correct_sentence)

Original Sentence: Proteens are essential building muscles
------------------------------------------------------------------------------------------------------------------------
Correct Sentence: Proteins are essential building muscles



# B. Advanced Techniques

# 1. Apply Tokenization

## a. Sentence Tokenization

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/suka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
from nltk.tokenize import sent_tokenize
sentence='''My name is Suthir. I work for an MNC in ANtartica. It is very hot out there.'''
tokens=sent_tokenize(sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence Tokens:", tokens)

Original Sentence: My name is Suthir. I work for an MNC in ANtartica. It is very hot out there.
------------------------------------------------------------------------------------------------------------------------
Sentence Tokens: ['My name is Suthir.', 'I work for an MNC in ANtartica.', 'It is very hot out there.']


## b. Word Tokenization

In [15]:
sentence='''Roger Federer is the greatest player of all time in Tennis'''
tokens=sentence.split(" ")
print("Original Sentence:", sentence)
print("--"*60)
print("Word Tokens:", tokens)

Original Sentence: Roger Federer is the greatest player of all time in Tennis
------------------------------------------------------------------------------------------------------------------------
Word Tokens: ['Roger', 'Federer', 'is', 'the', 'greatest', 'player', 'of', 'all', 'time', 'in', 'Tennis']


In [16]:
from nltk.tokenize import word_tokenize
sentence='''Is Roger Federer is the greatest player of all time in Tennis ?'''
tokens=word_tokenize(sentence)
print("Original Sentence:", sentence)
print("--"*60)
print("Word Tokens:", tokens)

Original Sentence: Is Roger Federer is the greatest player of all time in Tennis ?
------------------------------------------------------------------------------------------------------------------------
Word Tokens: ['Is', 'Roger', 'Federer', 'is', 'the', 'greatest', 'player', 'of', 'all', 'time', 'in', 'Tennis', '?']


### We can easily see the difference, when we tokenize using string method, it will consider all the special characters & punctuation attached to a word as a part of that word, but when we tokenize using NLTK word_tokenizer it consider those special characters & punctuation as a seperate toke.

## c. Sub-Word(n-gram character) Tokenization

In [17]:
from nltk import ngrams

In [18]:
sentence='''The Laver Cup is usually played between Europe vs The World '''
n_gram_tokens=list(ngrams((sentence.split(" ")), n=3))
print("Original Sentence:", sentence)
print("--"*60)
print("N-gram Tokens:", n_gram_tokens)


Original Sentence: The Laver Cup is usually played between Europe vs The World 
------------------------------------------------------------------------------------------------------------------------
N-gram Tokens: [('The', 'Laver', 'Cup'), ('Laver', 'Cup', 'is'), ('Cup', 'is', 'usually'), ('is', 'usually', 'played'), ('usually', 'played', 'between'), ('played', 'between', 'Europe'), ('between', 'Europe', 'vs'), ('Europe', 'vs', 'The'), ('vs', 'The', 'World'), ('The', 'World', '')]


## 2. Remove Stop Words

In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/suka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords 
stopwords_en=stopwords.words("english")
print("Total Stop Words in English=", len(stopwords_en))

Total Stop Words in English= 179


In [21]:
sentence="Lion is considered as the king of the jungle"
sentence_non_stopword=[word for word in sentence.split(" ") if not word in stopwords_en] 
print("Sentence with StopWOrds:", sentence)
print("--"*60)
print("Sentence without StopWOrds:", " ".join(sentence_non_stopword))

Sentence with StopWOrds: Lion is considered as the king of the jungle
------------------------------------------------------------------------------------------------------------------------
Sentence without StopWOrds: Lion considered king jungle


## 3. Apply Stemming

### Types of Stemmer in NLP:    

###    a. Porter Stemmer 

In [22]:
from nltk.stem import PorterStemmer
porter=PorterStemmer()
sentence= "running jumps jumping chocolates better goodness"
porter_stem=[porter.stem(word) for word in sentence.split(" ")]
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence after Porter Stemming:", " ".join(porter_stem))

Original Sentence: running jumps jumping chocolates better goodness
------------------------------------------------------------------------------------------------------------------------
Sentence after Porter Stemming: run jump jump chocol better good


###    b. SnowBall Stemmer  

In [23]:
from nltk.stem import SnowballStemmer
snowball=SnowballStemmer(language="english")
sentence="running jumps jumping chocolates better goodness"
snowball_stem=[snowball.stem(word) for word in sentence.split(" ")]
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence after Porter Stemming:", " ".join(snowball_stem))

Original Sentence: running jumps jumping chocolates better goodness
------------------------------------------------------------------------------------------------------------------------
Sentence after Porter Stemming: run jump jump chocol better good


###    c.Lancaster Stemmer  

In [24]:
from nltk.stem import LancasterStemmer
lancaster=LancasterStemmer()
sentence="running jumps jumping chocolates better goodness "
lancaster_stem=[lancaster.stem(word) for word in sentence.split(" ")]
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence after Porter Stemming:", " ".join(lancaster_stem))

Original Sentence: running jumps jumping chocolates better goodness 
------------------------------------------------------------------------------------------------------------------------
Sentence after Porter Stemming: run jump jump chocol bet good 


###    d. Regexp Stemmer

In [25]:
from nltk.stem import RegexpStemmer
regex=RegexpStemmer(regexp="ing$|s$|e$", min=0)
sentence="running jumps jumping chocolates better kindness"
regex_stem=[regex.stem(word) for word in sentence.split(" ")]
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence after Porter Stemming:", " ".join(regex_stem))

Original Sentence: running jumps jumping chocolates better kindness
------------------------------------------------------------------------------------------------------------------------
Sentence after Porter Stemming: runn jump jump chocolate better kindnes


### We can see the Stemmer is not able to produce the correct outcomes thus we use Lemmatization

## 4. Apply Lemmatization

## Types of Lemmatization in NLP:    

### a. Wordnet Lemmatizer

In [26]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/suka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/suka/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
from nltk.stem import WordNetLemmatizer
lemma=WordNetLemmatizer()
sentence="The cats are playing with the mice and the birds." 
sentence_lemma=[lemma.lemmatize(word, 'v') for word in sentence.split(" ")] 
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence after Lemmatization:", " ".join(sentence_lemma))

Original Sentence: The cats are playing with the mice and the birds.
------------------------------------------------------------------------------------------------------------------------
Sentence after Lemmatization: The cat be play with the mice and the birds.


### b. TextBlob Lemmatizer

In [29]:
from textblob import TextBlob, Word
sentence="The bats are hanging on their feet in upright positions" 
sent=TextBlob(sentence)
texblob_lemma=[w.lemmatize() for w in sent.words]
print("Original Sentence:", sentence)
print("--"*60)
print("Sentence after Lemmatization:", " ".join(texblob_lemma))

Original Sentence: The bats are hanging on their feet in upright positions
------------------------------------------------------------------------------------------------------------------------
Sentence after Lemmatization: The bat are hanging on their foot in upright position


### c. More Advanced Techniques

#### These Techniques are not used in all the tasks, these are problem specific. These techniques are mainly used in QA System(Question Answer), Word Sense Disambiguiation etc.


## 1. POS Tagging

### Adding a Part of Speech tags to every word in the corpus is called POS tagging. If we want to perform POS tagging then no need to remove stopwords. This is one of the essential steps in the text analysis where we know the sentence structure and which word is connected to the other, which word is rooted from which, eventually, to figure out hidden connections between words which can later boost the performance of our Machine Learning Model.

##### The Pos taglist is given:
* CC - Coordinating conjunction
* CD - Cardinal number
* DT - Determiner
* EX - Existential there
* FW - Foreign word
* IN - Preposition or subordinating conjunction
* JJ - Adjective
* JJR -Adjective, comparative
* JJS- Adjective, superlative
* LS - List item marker
* MD - Modal
* NN - Noun, singular or mass
* NNS- Noun, plural
* NNP- Proper noun, singular
* NNPS-    Proper noun, plural
* PDT -Predeterminer
* POS -Possessive ending
* PRP -Personal pronoun
* PRP -   Possessive pronoun
* RB - Adverb
* RBR -Adverb, comparative
* RBS- Adverb, superlative
* RP - Particle
* SYM -Symbol
* TO - to
* UH - Interjection
* VB - Verb, base form
* VBD - Verb, past tense
* VBG - Verb, gerund or present participle
* VBN  - Verb, past participle
* VBP- Verb, non-3rd person singular present
* VBZ- Verb, 3rd person singular present
* WDT- Wh-determiner
* WP - Wh-pronoun
* WP -Possessive wh-pronoun
* WRB -Wh-adverb
###### -- for WP and PRP if $ is used , it indicates possesive pronoun

## POS Tagging can be performed using two Libraries
###         a. POS Tagging using NLTK    

In [30]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/suka/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [31]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
doc=word_tokenize("The quick brown fox jumps over the lazy dog.") 
for i in range(len(doc)):
    print("Word:",pos_tag(doc)[i][0], "||", "POS Tag:", pos_tag(doc)[i][1])

Word: The || POS Tag: DT
Word: quick || POS Tag: JJ
Word: brown || POS Tag: NN
Word: fox || POS Tag: NN
Word: jumps || POS Tag: VBZ
Word: over || POS Tag: IN
Word: the || POS Tag: DT
Word: lazy || POS Tag: JJ
Word: dog || POS Tag: NN
Word: . || POS Tag: .


### b. POS Tagging using Spacy

In [32]:
import spacy

In [33]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("What is the step by step guide to invest in share market in India")
for word in doc:
    print("Word:", word.text, "||", "POS:", word.pos_, "||", "POS Tag:", word.tag_, "||", "POS Tag Description:", word.pos_)


Word: What || POS: PRON || POS Tag: WP || POS Tag Description: PRON
Word: is || POS: AUX || POS Tag: VBZ || POS Tag Description: AUX
Word: the || POS: DET || POS Tag: DT || POS Tag Description: DET
Word: step || POS: NOUN || POS Tag: NN || POS Tag Description: NOUN
Word: by || POS: ADP || POS Tag: IN || POS Tag Description: ADP
Word: step || POS: NOUN || POS Tag: NN || POS Tag Description: NOUN
Word: guide || POS: NOUN || POS Tag: NN || POS Tag Description: NOUN
Word: to || POS: PART || POS Tag: TO || POS Tag Description: PART
Word: invest || POS: VERB || POS Tag: VB || POS Tag Description: VERB
Word: in || POS: ADP || POS Tag: IN || POS Tag Description: ADP
Word: share || POS: NOUN || POS Tag: NN || POS Tag Description: NOUN
Word: market || POS: NOUN || POS Tag: NN || POS Tag Description: NOUN
Word: in || POS: ADP || POS Tag: IN || POS Tag Description: ADP
Word: India || POS: PROPN || POS Tag: NNP || POS Tag Description: PROPN


### Spacy is more powerful than NLTK. Spacy is faster and Grammatically accurate.


## 2. NER Tagging

### Named entity recognition (NER) is a natural language processing (NLP) method that extracts information from text. NER involves detecting and categorizing important information in text known as named entities. Named entities refer to the key subjects of a piece of text, such as names, locations, companies, events and products, as well as themes, topics, times, monetary values and percentages.

## NER can be performed using two Libraries
   ## a. NER using NLTK

In [34]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_en = stopwords.words("english")
print(stopwords_en)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/suka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
sentence="John Doe is the CEO of XYZ Corp in New York"
words=[word for word in sentence.split(" ") if word not in stopwords_en] 
tagged_tokens=nltk.pos_tag(words)
entities=nltk.ne_chunk(tagged_tokens)
for entity in entities: print(entity)

(PERSON John/NNP)
(ORGANIZATION Doe/NNP)
('CEO', 'NNP')
('XYZ', 'NNP')
('Corp', 'NNP')
(GPE New/NNP York/NNP)


## b. NER using Spacy

In [36]:
nlp = spacy.load("en_core_web_sm")
sentence="Apple Inc. was founded by Steve Jobs and Steve Wozniak on April 1, 1976, in Cupertino, California. It is a leading technology company." 
doc = nlp(sentence)
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple Inc. ORG
Steve Jobs PERSON
Steve Wozniak PERSON
April 1, 1976 DATE
Cupertino GPE
California GPE


### Spacy is a faster and more efficient library for NER. It provides a pre-trained NER model that is highly accurate than NLTK and can recognize a wide range of named entities. Additionally, SpaCy has more advanced features such as named entity linking and coreference resolution.

### --------------------------------------------------------------------------------------------------------------------------------

# Text to Numerical Vector Conversion Techniques :

## Our Machine Learning and Deep Learning models take only numerical data as an input to train the model and do prediction, Hence it is necessary to perform conversion step to make texual data into equivalent numerical representation. There are many text to numerical vector conversion techniques, these techniques are,
## 1. BOW(Bag Of Word): Count Vectorizer

### It is a Collection of words represent a sentence with word count. Steps invloved in this process are Clean Text, Tookenize, Build Vocabulary and Generate Vecors. We can create vocabulory of size 1 to n using uni-ngram, bi- gram, n-gram.

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus (collection of text documents)
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Step 1: Create a CountVectorizer object
vectorizer = CountVectorizer()

# Step 2: Fit the vectorizer on the corpus and transform the data
X = vectorizer.fit_transform(corpus)

# The output is a sparse matrix where each row represents a document,
# and each column represents a unique word in the corpus.

# Step 3: Print the feature names (unique words) found in the corpus
print("Feature names (unique words):")
print(vectorizer.get_feature_names_out())

# Step 4: Print the BOW representation of the first document
print("\nBOW representation of the first document:")
print(X.toarray()[0])

# Step 5: Print the BOW representation of all the documents
print("\nBOW representation of all documents:")
print(X.toarray())


Feature names (unique words):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

BOW representation of the first document:
[0 1 1 1 0 0 1 0 1]

BOW representation of all documents:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


## Advantages:
###   a. Simple Procedure and easy to implement.
###   b. Easy to Understand
## Disadvantages:
###   a. Does not consider the symmentic meaning of the word.    b. Due to large vector size computational time is high.    c. Count Vectorizer Generates Spars matrix.
###   d. Out of Vocabulary words are not captured.

## 2. TF-IDF(Term Frequence-Inverse Document Frequency)

## It is a Statistical method. It measures how important a term or word is within a document or setence relative to a collection of documents or Corpus. Words within a text document are transformed into importance numbers by a text vectorization process.

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Step 1: Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Step 2: Fit and transform the documents to obtain the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Step 3: Print the vocabulary (unique words) and the TF-IDF matrix
print("Vocabulary:")
print(vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())


Vocabulary:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Advantages:
   ### a. Simple Procedure and easy to implement.
   ### b. Easy to Understand
   ### c. Here unlike BOW, weightage for those words is given high if that word occuring in that document but occuring less in corpus.

## Disadvantages:
### a. It cannot assist in carrying semantic meaning

## 3. Word2Vec(Word to Vector)

## It is a pre-trained word embedded model. Word2Vec creates vectors of the words that are distributed numerical representations of word features. These word features represents the context for the each words present in vocabulary. Two different model architectures that can be used by Word2Vec to create the word embeddings are the Continuous Bag of Words (CBOW) model(Used when dataset is small) & the Skip-Gram model(Used when the dataset is large)

In [41]:
import gensim
from gensim.models import Word2Vec

# Sample corpus
corpus = [
    ['I', 'love', 'natural', 'language', 'processing'],
    ['Word2Vec', 'is', 'a', 'word', 'embedding', 'technique'],
    ['It', 'learns', 'meaningful', 'representations', 'for', 'words'],
    ['Word', 'embeddings', 'are', 'useful', 'in', 'NLP'],
]

# Training the Word2Vec model
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

# You can save the model to disk for later use
model.save("word2vec_model.bin")

# To load the model from disk later
# model = Word2Vec.load("word2vec_model.bin")

# Get the word vector for a specific word
word_vector = model.wv['Word']
print("Word vector for 'Word':", word_vector)

# Find the most similar words to a given word
similar_words = model.wv.most_similar('Word2Vec')
print("Most similar words to 'Word2Vec':", similar_words)


Word vector for 'Word': [ 0.00973568 -0.00978038 -0.00649912  0.00278432  0.00643145 -0.00536809
  0.00275295  0.00912222 -0.00681566 -0.00610047 -0.00498906 -0.00367669
  0.00184911  0.0096832   0.0064373   0.00039713  0.00247128  0.00844069
  0.00912851  0.00562807  0.00594696 -0.00762046 -0.00382721 -0.00568039
  0.00618223 -0.00225665 -0.00877957  0.00761974  0.00839924 -0.00332041
  0.00911608 -0.00073844 -0.00362591 -0.00038552  0.00019424 -0.00350519
  0.00281408  0.00572937  0.00686912 -0.00890355 -0.00219323 -0.0054816
  0.00752046  0.00650147 -0.00436071  0.00232704 -0.00595426  0.00023711
  0.00946227 -0.00260905 -0.00518821 -0.00739698 -0.00291221 -0.00086426
  0.00352841  0.00974166 -0.00338875  0.00190141  0.00968072  0.00153253
  0.00098627  0.00980247  0.00929506  0.00770758 -0.00617046  0.00998438
  0.00584908  0.00907294 -0.00199519  0.00335013  0.00683401 -0.00389338
  0.00664285  0.00256217  0.00931417 -0.00303588 -0.00310872  0.00621545
 -0.00907828 -0.00725467 -0.

## Advantages:
   ### a. Word embeddings eventually help in establishing the association of a word with another similar meaning word through the created vectors.
   ### b. Captures symmantic meaning.
   ### c. Low Dimensional vectors hence the computational time reduces.
   ### d. Dense vectors.

## Disadvantages:
   ### a. Contexual meaning only captured within the window size. or in other word it has local context scope.
   ### b. Not able to generate vectors for unseen words.

## 4. GloVe(Global Vector)

## It is also a Pre-trained word embedding technique used to overcome drawback of Word2Vec.

In [42]:
import numpy as np

# Load pre-trained GloVe word embeddings (You can download these from the official GloVe website)
# For demonstration purposes, we'll use a tiny example with just a few words.
glove_embeddings = {
    "cat": np.array([0.1, 0.2, 0.3]),
    "dog": np.array([0.4, 0.5, 0.6]),
    "fish": np.array([0.7, 0.8, 0.9]),
    "bird": np.array([1.0, 1.1, 1.2])
}

def get_word_vector(word):
    # Get the word vector from the GloVe embeddings
    return glove_embeddings.get(word, np.zeros(3))  # Return zeros for unknown words

def cosine_similarity(vector1, vector2):
    # Calculate the cosine similarity between two vectors
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

# Example usage
word1 = "cat"
word2 = "dog"
word3 = "fish"
word4 = "bird"

vector1 = get_word_vector(word1)
vector2 = get_word_vector(word2)
vector3 = get_word_vector(word3)
vector4 = get_word_vector(word4)

print(f"Similarity between '{word1}' and '{word2}': {cosine_similarity(vector1, vector2)}")
print(f"Similarity between '{word1}' and '{word3}': {cosine_similarity(vector1, vector3)}")
print(f"Similarity between '{word2}' and '{word4}': {cosine_similarity(vector2, vector4)}")


Similarity between 'cat' and 'dog': 0.9746318461970762
Similarity between 'cat' and 'fish': 0.9594119455666702
Similarity between 'dog' and 'bird': 0.9961498555841325


## Advantages:
   ### a. Contexual meaning captured for both local and global scope.    b. Captures symmantic meaning.
   ### c. Powerful than all previous wod embedding techniques.

## Disadvantages:
   ### a. Utilizes massive memory and takes time to load and train.

## 5. BERT(Bidirectional Encoder Representations from Transformers)


## BERT is the Pre-trained birectional trasformer for Language understanding. It has trained on 2500M Wikipedia words and 800M+ Books words. And BERT used by Google search Engine. BERT uses the encoder part of the Transformer, since it’s goal is to create a model that performs a number of different NLP tasks.

In [43]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Define the text you want to classify
text = "I really enjoyed the movie. It was fantastic!"

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt")

# Make the prediction
with torch.no_grad():
    outputs = model(**inputs)

# Get the prediction scores and predicted label
predictions = outputs.logits
predicted_label = torch.argmax(predictions, dim=1)

# Print the results
print(f"Text: {text}")
print(f"Predicted Label: {predicted_label.item()}")
print(f"Prediction Scores: {predictions}")


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Text: I really enjoyed the movie. It was fantastic!
Predicted Label: 0
Prediction Scores: tensor([[ 0.5397, -0.2734]])


## Advantages:
   ### a. Contexual meaning captured for both local and global scope.    
   ### b. Captures symmantic meaning.
   ### c. Powerful than all previous wod embedding techniques.

## Disadvantages:
   ### a. Utilizes massive memory and takes time to load and train.