# Stemming in NLP

In [1]:
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
word = ['change', 'changing', 'changes', 'changed']

In [3]:
word

['change', 'changing', 'changes', 'changed']

In [4]:
from nltk.stem import PorterStemmer

In [5]:
p = PorterStemmer()

In [6]:
p

<PorterStemmer>

In [7]:
p.stem('change')

'chang'

In [8]:
for w in word:
    print(p.stem(w))

chang
chang
chang
chang


In [9]:
for w in word:
    print(w, p.stem(w))

change chang
changing chang
changes chang
changed chang


In [10]:
sen = 'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [11]:
sen

'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [12]:
from nltk.tokenize import word_tokenize

In [13]:
token = word_tokenize(sen)

In [14]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [15]:
len(token)

26

In [16]:
len(sen.split())

24

In [17]:
for w in token:
    print(p.stem(w))

the
constant
flux
of
life
necessit
embrac
chang
,
whether
it
adapt
to
the
chang
around
us
or
activ
chang
ourselv
to
meet
new
challeng
.


# Lemmatization in NLP

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
le = WordNetLemmatizer()

In [20]:
le

<WordNetLemmatizer>

In [21]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [22]:
for w in token:
    print(le.lemmatize(w))

The
constant
flux
of
life
necessitates
embracing
change
,
whether
it
adapting
to
the
change
around
u
or
actively
changing
ourselves
to
meet
new
challenge
.


In [23]:
le.lemmatize('changing')

'changing'

In [24]:
word

['change', 'changing', 'changes', 'changed']

In [25]:
for w in word:
    print(le.lemmatize(w))

change
changing
change
changed


# Tokenization in NLP

In python there are several libraries and tools available for performing tokenization and other NLP tasks. Here are few examples

# NLTK

NLTK ( Natural Language Tool kit) is a widely used library for NLP tasks. To perform tokenization using NLTK, you need to install it first. You can do so by running pip install nltk.

In [26]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [27]:
sentence = "I'm from aiQuest Intelligence. I live in germany. I am learning NLP. It is fascinating!"

In [28]:
word_token = word_tokenize(sentence)
sentence_token = sent_tokenize(sentence)

In [29]:
print(word_token)

['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'live', 'in', 'germany', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']


In [30]:
print(sentence_token)

["I'm from aiQuest Intelligence.", 'I live in germany.', 'I am learning NLP.', 'It is fascinating!']


# Spacy

Spacy is another powerful library for NLP. To install spaCy you need to install pip install spacy and then download the appropriate language model. Here is an example of tokenization using spaCy

In [31]:
import spacy

## Load the English Language Model

In [32]:
spc = spacy.load('en_core_web_sm')

In [33]:
spc

<spacy.lang.en.English at 0x24724523390>

In [34]:
sentence = "I'm from aiQuest Intelligence. I live in germany. I am learning NLP. It is fascinating!"

In [35]:
doc = spc(sentence)

In [36]:
doc

I'm from aiQuest Intelligence. I live in germany. I am learning NLP. It is fascinating!

In [37]:
word_token = [token.text for token in doc]

In [38]:
word_token

['I',
 "'m",
 'from',
 'aiQuest',
 'Intelligence',
 '.',
 'I',
 'live',
 'in',
 'germany',
 '.',
 'I',
 'am',
 'learning',
 'NLP',
 '.',
 'It',
 'is',
 'fascinating',
 '!']

# Transformer

Transformer is a library  built by Hugging face that provides state-of-the-art pre-trained model for NLP. It offers various functionalities 
including tokenization. To install transformers, run pip install transformers. Here is an example of
tokenization using transformer

In [39]:
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [40]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [41]:
sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"

In [42]:
tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', "'", 'm', 'from', 'ai', '##quest', 'intelligence', '.', 'i', 'am', 'learning', 'nl', '##p', '.', 'it', 'is', 'fascinating', '!']


# Named Entity Tokenization using NLTK

To perform named entity tokenization using NLTK, you can utilize named entity recognition(NER) functionality provided by NLTK.

In [43]:
[x*2 for x in range(5)]

[0, 2, 4, 6, 8]

In [44]:
([x*2 for x in range(5)])

[0, 2, 4, 6, 8]

In [45]:
tuple([x*2 for x in range(5)])

(0, 2, 4, 6, 8)

In [46]:
import sys
sys.getsizeof([x*2 for x in range(5)])

120

In [47]:
sys.getsizeof(tuple([x*2 for x in range(5)]))

80

In [48]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [49]:
sentence = "I'm from aiQuest Intelligence. I live in Germany. I am learning NLP. It is fascinating!, Hasan khan, my name is Joe"

In [50]:
tokens = word_tokenize(sentence) # Tokenize the sentence into words
pos_tags = pos_tag(tokens) # Perform parts of speech tagging
ner_tag = ne_chunk(pos_tags) # Perform Named Entity Recognition

In [51]:
named_entity_tokens = []
for chunk in ner_tag:
    if hasattr(chunk, 'label'):
        named_entity_tokens.append(' '.join(c[0] for c in chunk))
named_entity_tokens

['aiQuest Intelligence', 'Germany', 'NLP', 'Hasan', 'Joe']

In [52]:
sentence2 = "Shakil lives in Germany"
token = word_tokenize(sentence2)
pos_tag = pos_tag(token)

In [53]:
pos_tag

[('Shakil', 'NNP'), ('lives', 'VBZ'), ('in', 'IN'), ('Germany', 'NNP')]

# Text Vectorizer

In [54]:
import pandas as pd
df = pd.read_excel("D:/DL_ML_AiQuest_PacticeWork/class_practice_project/Datasets/data.xlsx")

In [55]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


# Text Processing

In [56]:
from nltk.corpus import stopwords

In [57]:
en_stopwords = set(stopwords.words('english'))

In [58]:
print(en_stopwords)

{'through', "wasn't", 'into', 'ourselves', 'which', 'we', 'haven', 'than', 'ma', 'has', 'same', 'above', 'over', "won't", 'my', 'only', 'so', 'isn', 'i', 'hers', "hasn't", 'wouldn', 'mustn', "aren't", 'all', 'have', 'myself', 'very', 'wasn', 'about', 'no', 'nor', 'him', 'here', 'for', 'that', 'is', 'they', 'these', 'do', 'to', 'itself', 'this', 'having', 'again', "you've", 'once', 'yours', 're', 'there', "shan't", 'such', "don't", 'too', 'are', 'm', "you're", 'yourself', 'more', 'did', 'were', 'her', "mustn't", 'against', 'any', "should've", 'how', 'been', 'what', 'aren', 'y', 'theirs', 'doesn', 'each', 'hasn', "she's", 'now', 'had', 'won', 'of', "needn't", 'own', 'while', 'whom', 'other', 'those', "mightn't", 'the', 'should', 'herself', 'he', 'or', 'needn', 'as', 'their', 'before', 'shan', 'doing', 'she', 'mightn', "you'll", 'hadn', 'between', "you'd", 'both', 'down', 'will', 'yourselves', 'was', 'on', 'few', 'am', "couldn't", 'ain', 'just', "haven't", 'our', 'shouldn', 'didn', "doesn

In [59]:
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [60]:
print(stopwords.words('bengali'))

['অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য', 'অর্থাত', 'আই', 'আগামী', 'আগে', 'আগেই', 'আছে', 'আজ', 'আদ্যভাগে', 'আপনার', 'আপনি', 'আবার', 'আমরা', 'আমাকে', 'আমাদের', 'আমার', 'আমি', 'আর', 'আরও', 'ই', 'ইত্যাদি', 'ইহা', 'উচিত', 'উত্তর', 'উনি', 'উপর', 'উপরে', 'এ', 'এঁদের', 'এঁরা', 'এই', 'একই', 'একটি', 'একবার', 'একে', 'এক্', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটা', 'এটাই', 'এটি', 'এত', 'এতটাই', 'এতে', 'এদের', 'এব', 'এবং', 'এবার', 'এমন', 'এমনকী', 'এমনি', 'এর', 'এরা', 'এল', 'এস', 'এসে', 'ঐ', 'ও', 'ওঁদের', 'ওঁর', 'ওঁরা', 'ওই', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'ওরা', 'কখনও', 'কত', 'কবে', 'কমনে', 'কয়েক', 'কয়েকটি', 'করছে', 'করছেন', 'করতে', 'করবে', 'করবেন', 'করলে', 'করলেন', 'করা', 'করাই', 'করায়', 'করার', 'করি', 'করিতে', 'করিয়া', 'করিয়ে', 'করে', 'করেই', 'করেছিলেন', 'করেছে', 'করেছেন', 'করেন', 'কাউকে', 'কাছ', 'কাছে', 'কাজ', 'কাজে', 'কারও', 'কারণ', 'কি', 'কিংবা', 'কিছু', 'কিছুই', 'কিন্তু', 'কী', 'কে', 'কেউ', 'কেউই', 'কেখা', 'কেন', 'কোটি', 'কোন', 'কোনও'

In [61]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [62]:
len(string.punctuation)

32

In [63]:
'sHakIl'.lower()

'shakil'

In [64]:
li = [1, 2, 3, 4, 54]
[l for l in li]

[1, 2, 3, 4, 54]

In [65]:
[l for l in li if l%2 !=0]

[1, 3]

In [66]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


In [67]:
[x*2 for x in range(5) if x % 3 == 0]

[0, 6]

In [68]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [69]:
def preprocess_text(text):
    remove_punc = [char for char in text if char not in string.punctuation]
    clean_words = ''.join(remove_punc)
    split_words = clean_words.split()
    text = [word for word in split_words if word.lower() not in stopwords.words('english')]
    return text

In [70]:
df['text'] = df['text'].apply(preprocess_text)
df['text']

0     [Hey, love, Bangladesh]
1    [Good, afternoon, happy]
2             [live, Germany]
3           [Nice, meet, man]
4                    [iPhone]
Name: text, dtype: object

In [71]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_text = ''.join([lemmatizer.lemmatize(word) for word in text])
    return lemmatized_text

In [72]:
df['text'] = df['text'].apply(lemmatize_text)
df['text']

0     HeyloveBangladesh
1    Goodafternoonhappy
2           liveGermany
3           Nicemeetman
4                iPhone
Name: text, dtype: object

# Count Vectorizer

In [73]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [74]:
cv = CountVectorizer()

In [75]:
cv

In [76]:
cv_x = cv.fit_transform(df['text'])
cv_x

<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [77]:
cv_x.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0]], dtype=int64)

In [78]:
cv_df = pd.DataFrame(cv_x.toarray())
cv_df

Unnamed: 0,0,1,2,3,4
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,0,0,1
4,0,0,1,0,0


In [79]:
cv.get_feature_names_out()

array(['goodafternoonhappy', 'heylovebangladesh', 'iphone', 'livegermany',
       'nicemeetman'], dtype=object)

In [80]:
cv_df = pd.DataFrame(cv_x.toarray(), index=df['text'], columns=cv.get_feature_names_out())
cv_df

Unnamed: 0_level_0,goodafternoonhappy,heylovebangladesh,iphone,livegermany,nicemeetman
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HeyloveBangladesh,0,1,0,0,0
Goodafternoonhappy,1,0,0,0,0
liveGermany,0,0,0,1,0
Nicemeetman,0,0,0,0,1
iPhone,0,0,1,0,0


In [81]:
tf = TfidfVectorizer()
tf

In [82]:
tf_z = tf.fit_transform(df['text'])
tf_z

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [83]:
cv_tf = pd.DataFrame(tf_z.toarray(), index=df['text'], columns=tf.get_feature_names_out())
cv_tf

Unnamed: 0_level_0,goodafternoonhappy,heylovebangladesh,iphone,livegermany,nicemeetman
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HeyloveBangladesh,0.0,1.0,0.0,0.0,0.0
Goodafternoonhappy,1.0,0.0,0.0,0.0,0.0
liveGermany,0.0,0.0,0.0,1.0,0.0
Nicemeetman,0.0,0.0,0.0,0.0,1.0
iPhone,0.0,0.0,1.0,0.0,0.0


# Word2Vec

In [85]:
from gensim.models import Word2Vec, KeyedVectors

In [86]:
text_vector = [nltk.word_tokenize(test) for test in df['text']]
text_vector

[['HeyloveBangladesh'],
 ['Goodafternoonhappy'],
 ['liveGermany'],
 ['Nicemeetman'],
 ['iPhone']]

In [87]:
model = Word2Vec(text_vector, min_count=1)
model

<gensim.models.word2vec.Word2Vec at 0x2473b90ef50>

In [96]:
model.wv.most_similar('liveGermany')

[('HeyloveBangladesh', 0.17018887400627136),
 ('Goodafternoonhappy', -0.013514947146177292),
 ('Nicemeetman', -0.023671656847000122),
 ('iPhone', -0.05234673246741295)]