# Stemming in NLP

In [1]:
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
word = ['change', 'changing', 'changes', 'changed']

In [3]:
word

['change', 'changing', 'changes', 'changed']

In [4]:
from nltk.stem import PorterStemmer

In [5]:
p = PorterStemmer()

In [6]:
p

<PorterStemmer>

In [7]:
p.stem('change')

'chang'

In [8]:
for w in word:
    print(p.stem(w))

chang
chang
chang
chang


In [9]:
for w in word:
    print(w, p.stem(w))

change chang
changing chang
changes chang
changed chang


In [10]:
sen = 'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [11]:
sen

'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [12]:
from nltk.tokenize import word_tokenize

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
token = word_tokenize(sen)

In [15]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [16]:
len(token)

26

In [17]:
len(sen.split())

24

In [18]:
for w in token:
    print(p.stem(w))

the
constant
flux
of
life
necessit
embrac
chang
,
whether
it
adapt
to
the
chang
around
us
or
activ
chang
ourselv
to
meet
new
challeng
.


# Lemmatization in NLP

In [19]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
le = WordNetLemmatizer()

In [22]:
le

<WordNetLemmatizer>

In [23]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [24]:
for w in token:
    print(le.lemmatize(w))

The
constant
flux
of
life
necessitates
embracing
change
,
whether
it
adapting
to
the
change
around
u
or
actively
changing
ourselves
to
meet
new
challenge
.


In [25]:
le.lemmatize('changing')

'changing'

In [26]:
word

['change', 'changing', 'changes', 'changed']

In [27]:
for w in word:
    print(le.lemmatize(w))

change
changing
change
changed


# Tokenization in NLP

In Python, there are several libraries and tools available for performing tokenization and other NLP tasks. Here are a few examples using popular libraries

# NLTK

### NLTK (Natural Language Toolkit) is a widely used library for NLP tasks. To perform tokenization using NLTK, you need to install it first. You can do so by running pip install nltk. Here's an example of tokenizing a sentence using NLTK

In [28]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [29]:
sentence = "I'm from aiQuest Intelligence. I live in germany. I am learning NLP. It is fascinating!"

In [30]:
word_token = word_tokenize(sentence)
sentence_token = sent_tokenize(sentence)

In [31]:
print(word_token)

['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'live', 'in', 'germany', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']


In [32]:
sentence_token

["I'm from aiQuest Intelligence.",
 'I live in germany.',
 'I am learning NLP.',
 'It is fascinating!']

# spaCy

spaCy is another powerful library for NLP. To install spaCy, you can run pip install spacy and then download the appropriate language model. Here's an example of tokenization using spaCy

In [33]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [34]:
import spacy

### Load the English Language Model

In [35]:
spc = spacy.load('en_core_web_sm')

In [36]:
sentence = "I'm from aiQuest Intelligence. I live in germany. I am learning NLP. It is fascinating!"

In [37]:
doc = spc(sentence)

In [38]:
word_token = [token.text for token in doc]

In [39]:
print(word_token)

['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'live', 'in', 'germany', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']


# Transformer

Transformers is a library built by Hugging Face that provides state-of-the-art pre-trained models for NLP. It offers various functionalities, including tokenization. To install Transformers, run pip install transformers. Here's an example of tokenization using Transformers

In [40]:
pip install transformers




In [41]:
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [42]:
pip install -U jupyterlab ipywidgets jupyterlab-widgets

Note: you may need to restart the kernel to use updated packages.


In [43]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [44]:
sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"

In [45]:
tokens = tokenizer.tokenize(sentence)

print(tokens)

['i', "'", 'm', 'from', 'ai', '##quest', 'intelligence', '.', 'i', 'am', 'learning', 'nl', '##p', '.', 'it', 'is', 'fascinating', '!']


# Named Entity Tokenization using NLTK

To perform named entity tokenization using NLTK (Natural Language Toolkit), you can utilize the named entity recognition (NER) functionality provided by NLTK. Here's an example of how to extract named entity tokens from a sentence using NLTK

In [46]:
[x*2 for x in range(5)]

[0, 2, 4, 6, 8]

In [47]:
(x*2 for x in range(5))

<generator object <genexpr> at 0x000001BD0A142670>

In [48]:
tuple((x*2 for x in range(5)))

(0, 2, 4, 6, 8)

In [49]:
import sys
sys.getsizeof([x*2 for x in range(5)])

120

In [50]:
sys.getsizeof(tuple(x*2 for x in range(5)))

80

In [51]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [52]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [53]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [54]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [55]:
sentence = "I'm from aiQuest Intelligence. I live in Germany. I am learning NLP. It is fascinating!, Hasan khan, my name is Joe"

In [56]:
tokens = word_tokenize(sentence) # Tokenize the sentence into words
pos_tags = pos_tag(tokens) # Perform parts of speech tagging
ner_tags = ne_chunk(pos_tags) # Perform named entity recognition

In [57]:
named_entity_tokens = []
for chunk in ner_tags:
    if hasattr(chunk, 'label'):
        named_entity_tokens.append(' '.join(c[0] for c in chunk))

print(named_entity_tokens)

['aiQuest Intelligence', 'Germany', 'NLP', 'Hasan', 'Joe']


In [58]:
sentence2 = "Shakil lives in Germany"
token = word_tokenize(sentence2)
pos_tags = pos_tag(token)

In [59]:
pos_tags

[('Shakil', 'NNP'), ('lives', 'VBZ'), ('in', 'IN'), ('Germany', 'NNP')]

# Text Vectorizer

In [60]:
import pandas as pd
df = pd.read_excel("D:/DL_ML_AiQuest_PacticeWork/class_practice_project/Datasets/data.xlsx")

In [61]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


# Text Processing

In [62]:
from nltk.corpus import stopwords

In [63]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [64]:
en_stopwords = set(stopwords.words('english'))

In [65]:
en_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [66]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [67]:
stopwords.words('bengali')

['অতএব',
 'অথচ',
 'অথবা',
 'অনুযায়ী',
 'অনেক',
 'অনেকে',
 'অনেকেই',
 'অন্তত',
 'অন্য',
 'অবধি',
 'অবশ্য',
 'অর্থাত',
 'আই',
 'আগামী',
 'আগে',
 'আগেই',
 'আছে',
 'আজ',
 'আদ্যভাগে',
 'আপনার',
 'আপনি',
 'আবার',
 'আমরা',
 'আমাকে',
 'আমাদের',
 'আমার',
 'আমি',
 'আর',
 'আরও',
 'ই',
 'ইত্যাদি',
 'ইহা',
 'উচিত',
 'উত্তর',
 'উনি',
 'উপর',
 'উপরে',
 'এ',
 'এঁদের',
 'এঁরা',
 'এই',
 'একই',
 'একটি',
 'একবার',
 'একে',
 'এক্',
 'এখন',
 'এখনও',
 'এখানে',
 'এখানেই',
 'এটা',
 'এটাই',
 'এটি',
 'এত',
 'এতটাই',
 'এতে',
 'এদের',
 'এব',
 'এবং',
 'এবার',
 'এমন',
 'এমনকী',
 'এমনি',
 'এর',
 'এরা',
 'এল',
 'এস',
 'এসে',
 'ঐ',
 'ও',
 'ওঁদের',
 'ওঁর',
 'ওঁরা',
 'ওই',
 'ওকে',
 'ওখানে',
 'ওদের',
 'ওর',
 'ওরা',
 'কখনও',
 'কত',
 'কবে',
 'কমনে',
 'কয়েক',
 'কয়েকটি',
 'করছে',
 'করছেন',
 'করতে',
 'করবে',
 'করবেন',
 'করলে',
 'করলেন',
 'করা',
 'করাই',
 'করায়',
 'করার',
 'করি',
 'করিতে',
 'করিয়া',
 'করিয়ে',
 'করে',
 'করেই',
 'করেছিলেন',
 'করেছে',
 'করেছেন',
 'করেন',
 'কাউকে',
 'কাছ',
 'কাছে',
 'কাজ',
 'কাজে',
 'কারও',
 '

In [68]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [69]:
len(string.punctuation)

32

In [70]:
'ShaKIl'.lower()

'shakil'

In [71]:
li = [1, 2, 3, 4, 54]
[l for l in li]

[1, 2, 3, 4, 54]

In [72]:
[l for l in li if l%2 != 0]

[1, 3]

In [73]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


In [74]:
[x*2 for x in range(5) if x % 3 == 0]

[0, 6]

In [75]:
for x in range(5):
    print(x)

0
1
2
3
4


In [76]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [77]:
def preprocess_text(text):
    remove_punc = [char for char in text if char not in string.punctuation] # Remove Punctuation
    clean_words = ''.join(remove_punc) # char joining
    split_words = clean_words.split()
    #Remove stopwords
    text = ([word for word in split_words if word.lower() not in en_stopwords])  # en_stopwords = stopwords.words('english')
    return text

In [78]:
df['text'] = df['text'].apply(preprocess_text)

In [79]:
df['text']

0     [Hey, love, Bangladesh]
1    [Good, afternoon, happy]
2             [live, Germany]
3           [Nice, meet, man]
4                    [iPhone]
Name: text, dtype: object

In [80]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_text = ''.join([lemmatizer.lemmatize(word) for word in text])
    return lemmatized_text

In [81]:
df['text'] = df['text'].apply(lemmatize_text)

In [82]:
df['text']

0     HeyloveBangladesh
1    Goodafternoonhappy
2           liveGermany
3           Nicemeetman
4                iPhone
Name: text, dtype: object

In [83]:
df

Unnamed: 0,text,class
0,HeyloveBangladesh,1
1,Goodafternoonhappy,1
2,liveGermany,1
3,Nicemeetman,1
4,iPhone,0


# Count Vectorizer

In [84]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [85]:
cv = CountVectorizer()
cv

In [86]:
cv_x = cv.fit_transform(df['text'])
cv_x

<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [88]:
cv_x.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0]], dtype=int64)

In [89]:
cv_df = pd.DataFrame(cv_x.toarray())
cv_df

Unnamed: 0,0,1,2,3,4
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,0,0,1
4,0,0,1,0,0


In [90]:
cv.get_feature_names_out()

array(['goodafternoonhappy', 'heylovebangladesh', 'iphone', 'livegermany',
       'nicemeetman'], dtype=object)

In [91]:
cv_df = pd.DataFrame(cv_x.toarray(), index=df['text'], columns=cv.get_feature_names_out())
cv_df

Unnamed: 0_level_0,goodafternoonhappy,heylovebangladesh,iphone,livegermany,nicemeetman
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HeyloveBangladesh,0,1,0,0,0
Goodafternoonhappy,1,0,0,0,0
liveGermany,0,0,0,1,0
Nicemeetman,0,0,0,0,1
iPhone,0,0,1,0,0


# TfidfVectorizer

In [92]:
tf = TfidfVectorizer()

In [93]:
tf

In [95]:
tf_z = tf.fit_transform(df['text'])
tf_z

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [96]:
cv_tf = pd.DataFrame(tf_z.toarray(), index=df['text'], columns=tf.get_feature_names_out())
cv_tf

Unnamed: 0_level_0,goodafternoonhappy,heylovebangladesh,iphone,livegermany,nicemeetman
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HeyloveBangladesh,0.0,1.0,0.0,0.0,0.0
Goodafternoonhappy,1.0,0.0,0.0,0.0,0.0
liveGermany,0.0,0.0,0.0,1.0,0.0
Nicemeetman,0.0,0.0,0.0,0.0,1.0
iPhone,0.0,0.0,1.0,0.0,0.0


# Word2Vec

In [97]:
pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)
Collecting scipy>=1.7.0 (from gensim)
  Downloading scipy-1.10.1-cp311-cp311-win_amd64.whl.metadata (58 kB)
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.0 kB ? eta -:--:--
     ------ ---------------

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\hp\anaconda3\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp\anaconda3\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp\anaconda3\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 98, in read
    data: bytes = self.__fp.read(amt)
                  ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp\anaconda3\Lib\http\client.py", line 473, in read
    s = self.fp.read(amt)
        ^^^^^^^^^^^^^^^^^
  File "C:\Users\hp\anaconda3\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^


   ----------------- ---------------------- 18.0/42.2 MB 130.5 kB/s eta 0:03:06
   ----------------- ---------------------- 18.0/42.2 MB 130.5 kB/s eta 0:03:06
   ----------------- ---------------------- 18.0/42.2 MB 130.5 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.6 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.6 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.6 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.6 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.6 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.3 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.3 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.3 kB/s eta 0:03:06
   ----------------- ---------------------- 18.1/42.2 MB 130.3 kB/s eta 0:03:06
   ----------------- ------------------

In [98]:
from gensim.models import Word2Vec, KeyedVectors

In [99]:
text_vector = [nltk.word_tokenize(test) for test in df['text']]
text_vector

[['HeyloveBangladesh'],
 ['Goodafternoonhappy'],
 ['liveGermany'],
 ['Nicemeetman'],
 ['iPhone']]

In [100]:
model = Word2Vec(text_vector, min_count=1)
model

<gensim.models.word2vec.Word2Vec at 0x1bd12ecaad0>

In [106]:
model.wv.most_similar('iPhone')

[('Nicemeetman', -0.010839171707630157),
 ('HeyloveBangladesh', -0.027750372886657715),
 ('liveGermany', -0.052346739917993546),
 ('Goodafternoonhappy', -0.1116705909371376)]