In [7]:
# ! pip install nltk

In [8]:
# ! pip show nltk

# Import Libraries

In [1]:
import nltk
import re # Regular Expression
# nltk.download('punkt') # Used for Tokenization
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

from nltk.corpus import stopwords # Used for stopwords
from nltk.stem.porter import PorterStemmer # Used for stemming
from nltk.stem.wordnet import WordNetLemmatizer # Used for Lemmatization
from nltk.tokenize import sent_tokenize, word_tokenize



### Input Text

In [11]:
# Two-sentence sample text used by the tokenization and cleaning cells below.
text = (
    "Natural Language processing is an exciting area. "
    "Huge budget have been allocated for this."
)

### Tokenization

In [12]:
# Sent tokenize: split `text` into a list of sentence strings
sent_tokenize(text)

['Natural Language processing is an exciting area.',
 'Huge budget have been allocated for this.']

In [19]:
# Word Tokenize: split `text` into word tokens
# (in the output below the full stops are kept as separate '.' tokens)
print(word_tokenize(text))

['Natural', 'Language', 'processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


### Lower case Conversion

In [21]:
# Normalise the text: lower-case it, replace every character that is not a
# letter or digit with a space, then split on whitespace.
lowered = text.lower()
text = re.sub(r"[^a-zA-Z0-9]", " ", lowered)
words = text.split()
print(words)

['natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', 'huge', 'budget', 'have', 'been', 'allocated', 'for', 'this']


### Stop word removal

In [22]:
# Show NLTK's English stop-word list (the entries in the output below are lower-case)
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [25]:
# Remove English stop words from `words`.
# The stop-word list is fetched once and kept in a set; the original called
# stopwords.words("english") again for every token and scanned the list each time.
english_stopwords = set(stopwords.words("english"))
final_words = [w for w in words if w not in english_stopwords]

In [29]:
# Same stop-word filter, this time overwriting `words` so the stemming and
# lemmatization cells below operate on the filtered tokens.
# The stop-word set is built once instead of once per token.
english_stopwords = set(stopwords.words("english"))
words = [w for w in words if w not in english_stopwords]
words

['natural',
 'language',
 'processing',
 'exciting',
 'area',
 'huge',
 'budget',
 'allocated']

In [26]:
# Result of the explicit for-loop filter (same tokens as the comprehension output)
final_words

['natural',
 'language',
 'processing',
 'exciting',
 'area',
 'huge',
 'budget',
 'allocated']

### Stemming

In [30]:
# Create one Porter stemmer and try it on a single word.
# The stem is not necessarily a dictionary word ('natural' -> 'natur' below).
stemmer = PorterStemmer()
stemmer.stem('natural')

'natur'

In [32]:
# Stem every remaining token, reusing the `stemmer` instance created in the
# previous cell instead of constructing a new PorterStemmer for each word.
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['natur', 'languag', 'process', 'excit', 'area', 'huge', 'budget', 'alloc']


### Lemmatization

In [33]:
# Lemmatize each token with a single WordNetLemmatizer instance (the original
# built a new one per word). lemmatize() is called without a POS argument here;
# in the output below 'processing' and 'allocated' come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Executing in a single cell

In [36]:
# End-to-end cleaning pipeline: normalise -> tokenize -> drop stop words -> lemmatize.
text = "Natural Language processing is an exciting area. Huge budget have been allocated for this."
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Lower-case, then replace special characters with spaces
words = text.split()  # Whitespace tokenization

# Build the stop-word set and the lemmatizer once rather than once per token.
english_stopwords = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

stopText = [w for w in words if w not in english_stopwords]  # Stop-word removal
finalWords = [lemmatizer.lemmatize(w) for w in stopText]  # Lemmatization
print(finalWords)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Parts of Speech

In [37]:
# nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LAP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [38]:
text = "Natural Language processing is an exciting area. Huge budget have been allocated for this."

# Build the stop-word set once instead of reloading the list for every token.
english_stopwords = set(stopwords.words("english"))

# Tag each sentence separately after dropping its stop words.
tokenized = sent_tokenize(text)
for sentence in tokenized:
    # Tokens keep their original case here, while the stop-word list printed
    # earlier is lower-case, so compare on the lower-cased token. Otherwise a
    # capitalised stop word (e.g. a sentence-initial "This") would slip through.
    wordList = [w for w in word_tokenize(sentence) if w.lower() not in english_stopwords]
    tagged = nltk.pos_tag(wordList)
    print(tagged)

[('Natural', 'JJ'), ('Language', 'NNP'), ('processing', 'NN'), ('exciting', 'VBG'), ('area', 'NN'), ('.', '.')]
[('Huge', 'NNP'), ('budget', 'NN'), ('allocated', 'VBD'), ('.', '.')]


In [None]:
# Meanings of the POS tag abbreviations (Penn Treebank tag set)

CC coordinating conjunction 
CD cardinal number
DT determiner 
EX existential there (like: “there is” … think of it like “there exists”) 
FW foreign word 
IN preposition/subordinating conjunction 
JJ adjective – ‘big’ 
JJR adjective, comparative – ‘bigger’ 
JJS adjective, superlative – ‘biggest’ 
LS list item marker – ‘1)’
MD modal – could, will 
NN noun, singular – ‘desk’
NNS noun plural – ‘desks’ 
NNP proper noun, singular – ‘Harrison’ 
NNPS proper noun, plural – ‘Americans’ 
PDT predeterminer – ‘all the kids’ 
POS possessive ending – parent’s
PRP personal pronoun –  I, he, she 
PRP$ possessive pronoun – my, his, hers 
RB adverb – very, silently
RBR adverb, comparative – better 
RBS adverb, superlative – best 
RP particle – give up 
TO to – go ‘to’ the store
UH interjection – errrrrrrrm 
VB verb, base form – take 
VBD verb, past tense – took 
VBG verb, gerund/present participle – taking 
VBN verb, past participle – taken 
VBP verb, non-3rd person singular present – take
VBZ verb, 3rd person sing. present – takes 
WDT wh-determiner – which 
WP wh-pronoun – who, what 
WP$ possessive wh-pronoun, eg- whose 
WRB wh-adverb, eg- where, when

# Bag of Words

In [1]:
# Three short documents that share most of their vocabulary; the toy corpus
# for the Bag of Words cells below.
doc1, doc2, doc3 = (
    'Game of Thrones is an amazing tv series!',
    'Game of Thrones is the best tv series!',
    'Game of Thrones is so great',
)

In [5]:
# Manual clean-up of each document: lower-case, replace non-alphanumeric
# characters with spaces, then split on whitespace.
lower_doc1, lower_doc2, lower_doc3 = [
    re.sub(r"[^a-zA-Z0-9]", " ", doc.lower()).split()
    for doc in (doc1, doc2, doc3)
]

In [7]:
# The manual lower-casing above is not needed for this step: CountVectorizer
# lower-cases and tokenizes the raw documents itself (its `lowercase` option
# defaults to True), and stop_words='english' drops English stop words.

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
# fit_transform learns the vocabulary and returns the document-term count matrix
x = vectorizer.fit_transform([doc1,doc2,doc3])

In [8]:
# Displaying x shows only a summary line: it is a sparse matrix (see repr below)
x

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [9]:
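# Displaying x prints only a summary because it is a sparse matrix (Compressed Sparse Row format).
# 'numpy.int64' in the repr is the dtype of the stored counts, not the type of the container.
# A sparse matrix stores only the non-zero entries (13 of the 3x7 = 21 cells here).
# toarray() converts it to a dense NumPy array so that the values can be inspected.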

In [12]:
# Dense view of the counts: one row per document, one column per vocabulary term
x.toarray()

array([[1, 0, 1, 0, 1, 1, 1],
       [0, 1, 1, 0, 1, 1, 1],
       [0, 0, 1, 1, 0, 1, 0]], dtype=int64)

In [13]:
# Vocabulary terms learned by the vectorizer (used as column labels in the cells below)
vectorizer.get_feature_names_out()

array(['amazing', 'best', 'game', 'great', 'series', 'thrones', 'tv'],
      dtype=object)

In [2]:
import pandas as pd

In [19]:
# Label the count matrix with its vocabulary terms so each column is readable.
terms = vectorizer.get_feature_names_out()
counts = x.toarray()
dfBow = pd.DataFrame(counts, columns=terms)
dfBow.head()

Unnamed: 0,amazing,best,game,great,series,thrones,tv
0,1,0,1,0,1,1,1
1,0,1,1,0,1,1,1
2,0,0,1,1,0,1,0


In [21]:
# Unigram and Bigram
# ngram_range=(1, 2): features are single terms plus pairs of consecutive terms.

vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
x = vectorizer.fit_transform([doc1, doc2, doc3])

terms = vectorizer.get_feature_names_out()
counts = x.toarray()
dfBow = pd.DataFrame(counts, columns=terms)
dfBow.head()

Unnamed: 0,amazing,amazing tv,best,best tv,game,game thrones,great,series,thrones,thrones amazing,thrones best,thrones great,tv,tv series
0,1,1,0,0,1,1,0,1,1,1,0,0,1,1
1,0,0,1,1,1,1,0,1,1,0,1,0,1,1
2,0,0,0,0,1,1,1,0,1,0,0,1,0,0


In [25]:
# Bigram only
# ngram_range=(2, 2): features are pairs of consecutive terms, no single terms.

vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
x = vectorizer.fit_transform([doc1, doc2, doc3])

terms = vectorizer.get_feature_names_out()
counts = x.toarray()
dfBow = pd.DataFrame(counts, columns=terms)
dfBow.head()

Unnamed: 0,amazing tv,best tv,game thrones,thrones amazing,thrones best,thrones great,tv series
0,1,0,1,1,0,0,1
1,0,1,1,0,1,0,1
2,0,0,1,0,0,1,0


In [26]:
# Unigram, Bigram and Trigram
# ngram_range=(1, 3): features are runs of one, two or three consecutive terms.

vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
x = vectorizer.fit_transform([doc1, doc2, doc3])

terms = vectorizer.get_feature_names_out()
counts = x.toarray()
dfBow = pd.DataFrame(counts, columns=terms)
dfBow.head()

Unnamed: 0,amazing,amazing tv,amazing tv series,best,best tv,best tv series,game,game thrones,game thrones amazing,game thrones best,...,great,series,thrones,thrones amazing,thrones amazing tv,thrones best,thrones best tv,thrones great,tv,tv series
0,1,1,1,0,0,0,1,1,1,0,...,0,1,1,1,1,0,0,0,1,1
1,0,0,0,1,1,1,1,1,0,1,...,0,1,1,0,0,1,1,0,1,1
2,0,0,0,0,0,0,1,1,0,0,...,1,0,1,0,0,0,0,1,0,0


In [28]:
# vocabulary_ maps each term to its column index in the count matrix;
# use it to look up the columns of the terms you want to keep.
vectorizer.vocabulary_

{'game': 6,
 'thrones': 13,
 'amazing': 0,
 'tv': 19,
 'series': 12,
 'game thrones': 7,
 'thrones amazing': 14,
 'amazing tv': 1,
 'tv series': 20,
 'game thrones amazing': 8,
 'thrones amazing tv': 15,
 'amazing tv series': 2,
 'best': 3,
 'thrones best': 16,
 'best tv': 4,
 'game thrones best': 9,
 'thrones best tv': 17,
 'best tv series': 5,
 'great': 11,
 'thrones great': 18,
 'game thrones great': 10}

# TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Two short documents with overlapping vocabulary for the TF-IDF example.
d1, d2 = (
    "petrol cars are cheaper than diesel cars",
    "diesel is cheaper than petrol",
)

In [7]:
# Corpus passed to the TF-IDF vectorizers below: one string per document
doc_corpus=[d1,d2]

In [10]:
# Fit TF-IDF on the two-document corpus; English stop words are dropped.
tfVec = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and returns the weighted document-term matrix
tfVec_fit = tfVec.fit_transform(doc_corpus)
print(f"Feature Name found - {tfVec.get_feature_names_out()}")

Feature Name found - ['cars' 'cheaper' 'diesel' 'petrol']


In [11]:
# Dense TF-IDF weights: one row per document, one column per feature name
tfVec_fit.toarray()

array([[0.85135433, 0.30287281, 0.30287281, 0.30287281],
       [0.        , 0.57735027, 0.57735027, 0.57735027]])

In [16]:
# Single Execution
# Fit TF-IDF on 1- to 3-grams and show the weights as a labelled table.

tfVec = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
tfVec_fit = tfVec.fit_transform(doc_corpus)
print(f"Feature Name found - {tfVec.get_feature_names_out()}")

weights = tfVec_fit.toarray()
dfTFIDF = pd.DataFrame(weights, columns=tfVec.get_feature_names_out())
dfTFIDF.head()

Feature Name found - ['cars' 'cars cheaper' 'cars cheaper diesel' 'cheaper' 'cheaper diesel'
 'cheaper diesel cars' 'cheaper petrol' 'diesel' 'diesel cars'
 'diesel cheaper' 'diesel cheaper petrol' 'petrol' 'petrol cars'
 'petrol cars cheaper']


Unnamed: 0,cars,cars cheaper,cars cheaper diesel,cheaper,cheaper diesel,cheaper diesel cars,cheaper petrol,diesel,diesel cars,diesel cheaper,diesel cheaper petrol,petrol,petrol cars,petrol cars cheaper
0,0.565262,0.282631,0.282631,0.201094,0.282631,0.282631,0.0,0.201094,0.282631,0.0,0.0,0.201094,0.282631,0.282631
1,0.0,0.0,0.0,0.334712,0.0,0.0,0.470426,0.334712,0.0,0.470426,0.470426,0.334712,0.0,0.0


In [17]:
# Single Execution

tfVec = TfidfVectorizer(stop_words='english',ngram_range=(1,3),max_features=10)  

# max_features=10 limits the vocabulary to the 10 terms with the highest term
# frequency across the corpus; the remaining terms are dropped (14 features
# without the limit, 10 with it - compare the two outputs).
# NOTE(review): many terms tie on frequency in this tiny corpus, so which of the
# tied terms survive is decided by the library - confirm in the sklearn docs.

tfVec_fit = tfVec.fit_transform(doc_corpus)
print(f"Feature Name found - {tfVec.get_feature_names_out()}")
dfTFIDF = pd.DataFrame(tfVec_fit.toarray(),columns=tfVec.get_feature_names_out())
dfTFIDF.head()

Feature Name found - ['cars' 'cars cheaper' 'cars cheaper diesel' 'cheaper' 'cheaper diesel'
 'cheaper diesel cars' 'cheaper petrol' 'diesel' 'diesel cars' 'petrol']


Unnamed: 0,cars,cars cheaper,cars cheaper diesel,cheaper,cheaper diesel,cheaper diesel cars,cheaper petrol,diesel,diesel cars,petrol
0,0.616664,0.308332,0.308332,0.219381,0.308332,0.308332,0.0,0.219381,0.308332,0.219381
1,0.0,0.0,0.0,0.448321,0.0,0.0,0.630099,0.448321,0.0,0.448321


### Cosine Similarity

In [22]:
import numpy as np
from numpy.linalg import norm

# d1 and d2 are rebound here to a new pair of sentences; doc_corpus (built in the
# TF-IDF section) still holds the earlier petrol/diesel strings.
d1 = "Food is good and great"  # +ve statement  
d2 = "The food is not good"    # -ve statement

# CountVectorizer was already imported in the Bag of Words section; re-importing is harmless.
from sklearn.feature_extraction.text import CountVectorizer
# 'not' is in the English stop-word list printed earlier, so the negation is
# removed: only food / good / great remain as features (see output below).
vectorizer = CountVectorizer(stop_words='english')
x = vectorizer.fit_transform([d1,d2])
dfBow = pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out())
dfBow.head()

Unnamed: 0,food,good,great
0,1,1,1
1,1,1,0


In [24]:
# Count vectors copied by hand from the dfBow table (columns: food, good, great).
d1_count = [1, 1, 1]
d2_count = [1, 1, 0]

# Cosine similarity = dot product divided by the product of the vector lengths.
dot_product = np.dot(d1_count, d2_count)
length_product = norm(d1_count) * norm(d2_count)
cosine = dot_product / length_product
print(f"Cosine similarity is {cosine}")

Cosine similarity is 0.8164965809277259


In [25]:
# Same two sentences, now weighted with TF-IDF instead of raw counts.
tfVec = TfidfVectorizer(stop_words='english')
tfVec_fit = tfVec.fit_transform([d1,d2])
print(f"Feature Name found - {tfVec.get_feature_names_out()}")
# Tabulate the weights so they can be read off for the cosine calculation below
dfTFIDF = pd.DataFrame(tfVec_fit.toarray(),columns=tfVec.get_feature_names_out())
dfTFIDF.head()

Feature Name found - ['food' 'good' 'great']


Unnamed: 0,food,good,great
0,0.501549,0.501549,0.704909
1,0.707107,0.707107,0.0


In [26]:
# TF-IDF weights copied by hand from the dfTFIDF table (rounded to 6 decimals;
# columns: food, good, great).
d1_count = [0.501549, 0.501549, 0.704909]
d2_count = [0.707107, 0.707107, 0.000000]

# Cosine similarity = dot product divided by the product of the vector lengths.
dot_product = np.dot(d1_count, d2_count)
length_product = norm(d1_count) * norm(d2_count)
cosine = dot_product / length_product
print(f"Cosine similarity is {cosine}")

Cosine similarity is 0.7092975763535904
