In [1]:
#! pip install nltk

### Import Libraries

In [2]:
import nltk
import re # regular expression
# nltk.download('punkt')  # used for tokenization
# nltk.download('wordnet') 
# nltk.download('omw-1.4')
# nltk.download('stopwords') 

from nltk.corpus import stopwords # used for stopwords
from nltk.stem.porter import PorterStemmer # used for stemming
from nltk.stem.wordnet import WordNetLemmatizer # used for lemmatization
from nltk.tokenize import sent_tokenize, word_tokenize

import pandas as pd


### Input Text

In [3]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."

### Tokenization

In [4]:
# sentence tokenization
print(sent_tokenize(text))

# word tokenization
print(word_tokenize(text))

['Natural Language Processing is an exciting area.', 'Huge budget have been allocated for this.']
['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


### Lower case conversion

In [5]:
text=re.sub(r'[^a-zA-Z0-9]',' ',text.lower())
print(text)
words=text.split()
print(words)


natural language processing is an exciting area  huge budget have been allocated for this 
['natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', 'huge', 'budget', 'have', 'been', 'allocated', 'for', 'this']


### Stop word Removal

In [6]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
words=[w for w in words if w not in stopwords.words('english')]

In [8]:
print(words)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Stemming

In [9]:
stemmer=PorterStemmer()
stemmer.stem('allocated')

'alloc'

In [10]:
stemmed=[PorterStemmer().stem(w) for w in words]
print(stemmed)

['natur', 'languag', 'process', 'excit', 'area', 'huge', 'budget', 'alloc']


### Lemmatization

In [11]:
lemmed=[WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Executing in a single cell

In [12]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."
text=re.sub(r'[^a-zA-Z0-9]',' ',text.lower()) # Removing special characters and lowering
words=text.split() # Tokenization activity executed
stopText=[w for w in words if w not in stopwords.words('english')] # Stopwords executed
finalWords=[WordNetLemmatizer().lemmatize(w) for w in stopText] # Lemmetization executed

### Parts of speech

In [13]:
#nltk.download('averaged_perceptron_tagger')


In [14]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."

tokenized=sent_tokenize(text)
for i in tokenized:
    wordList=word_tokenize(i)
    wordList=[w for w in wordList if w not in stopwords.words('english')]
    tagged=nltk.pos_tag(wordList)
    print(tagged)


[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('exciting', 'VBG'), ('area', 'NN'), ('.', '.')]
[('Huge', 'NNP'), ('budget', 'NN'), ('allocated', 'VBD'), ('.', '.')]


### BAG OF WORDS

In [15]:
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'


In [16]:
l_doc1=re.sub(r'[^a-zA-Z0-9]',' ',doc1.lower()).split()
l_doc2=re.sub(r'[^a-zA-Z0-9]',' ',doc2.lower()).split()
l_doc2=re.sub(r'[^a-zA-Z0-9]',' ',doc3.lower()).split()


In [17]:
l_doc1

['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(stop_words='english',ngram_range=(1,2))
x=vectorizer.fit_transform([doc1,doc2,doc3])

In [19]:
x.toarray()

array([[1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [20]:
vectorizer.get_feature_names_out()

array(['amazing', 'amazing tv', 'best', 'best tv', 'game', 'game thrones',
       'great', 'series', 'thrones', 'thrones amazing', 'thrones best',
       'thrones great', 'tv', 'tv series'], dtype=object)

In [21]:
dfBow=pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out())
dfBow

Unnamed: 0,amazing,amazing tv,best,best tv,game,game thrones,great,series,thrones,thrones amazing,thrones best,thrones great,tv,tv series
0,1,1,0,0,1,1,0,1,1,1,0,0,1,1
1,0,0,1,1,1,1,0,1,1,0,1,0,1,1
2,0,0,0,0,1,1,1,0,1,0,0,1,0,0


### TF-IDF

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
d1='petrol cars are cheaper than diesel cars'

d2='diesel is cheaper than petrol'

In [24]:
doc_corpus=[d1,d2]

In [25]:
tfVec=TfidfVectorizer(stop_words='english')
tfVec_Fit=tfVec.fit_transform(doc_corpus)
print(f'Feature Name found - {tfVec.get_feature_names_out()}')

Feature Name found - ['cars' 'cheaper' 'diesel' 'petrol']


In [26]:
tfVec_Fit.toarray()

array([[0.85135433, 0.30287281, 0.30287281, 0.30287281],
       [0.        , 0.57735027, 0.57735027, 0.57735027]])

In [29]:
# Single execution

tfVec=TfidfVectorizer(stop_words='english',ngram_range=(1,3),max_features=5)
tfVec_Fit=tfVec.fit_transform(doc_corpus)
print(f'Feature Name found - {tfVec.get_feature_names_out()}')
dfTFIDF=pd.DataFrame(tfVec_Fit.toarray(),columns=tfVec.get_feature_names_out())
dfTFIDF

Feature Name found - ['cars' 'cars cheaper' 'cheaper' 'diesel' 'petrol']


Unnamed: 0,cars,cars cheaper,cheaper,diesel,petrol
0,0.783337,0.391668,0.278675,0.278675,0.278675
1,0.0,0.0,0.57735,0.57735,0.57735


### Cosine Similarity

In [31]:
import numpy as np
from numpy.linalg import norm

d1='The food is good and great'
d2='The food is not good'

#using countervectorizer

from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(stop_words='english')
x=vectorizer.fit_transform([d1,d2])
dfbow=pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out())
dfbow

Unnamed: 0,food,good,great
0,1,1,1
1,1,1,0


In [32]:
d1_count=[1,1,1]
d2_count=[1,1,0]

cosine=np.dot(d1_count,d2_count)/(norm(d1_count)*norm(d2_count))
print(f'Cosine Similarity is {cosine}')

Cosine Similarity is 0.8164965809277259


In [34]:
#using tfidfVectorizer

tfVec=TfidfVectorizer(stop_words='english')
tfVec_Fit =tfVec.fit_transform([d1,d2])
print(f"Feature Name found - {tfVec.get_feature_names_out()}")
dfTFIDF= pd.DataFrame(tfVec_Fit.toarray(),columns=tfVec.get_feature_names_out())
dfTFIDF.head()


Feature Name found - ['food' 'good' 'great']


Unnamed: 0,food,good,great
0,0.501549,0.501549,0.704909
1,0.707107,0.707107,0.0


In [35]:
d1_count=[0.000000,0.449436,0.631667,0.631667]
d2_count=[0.814802,0.579739,0.000000,0.000000]

cosine = np.dot(d1_count,d2_count)/(norm(d1_count)*norm(d2_count))
print(f"Cosine Similarity is {cosine}")


Cosine Similarity is 0.2605557435429249
