# Import Libraries

In [1]:
# spaCy supplies the pipeline object `nlp`, which is called on TEXT in a later
# cell to produce the token sequence `doc`.
import spacy
nlp = spacy.load('en_core_web_sm')

# NLTK supplies the sentence and word tokenizers imported here.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Load and tokenize the data

In [2]:
# Sample sentence that every later cell operates on.
TEXT = "Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."

# Split the sample into sentences with NLTK and show the resulting list.
sentences = sent_tokenize(TEXT)
print(sentences)

['Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.']


In [3]:
# Run the spaCy pipeline once; `doc` is reused by the later cells.
doc = nlp(TEXT)

# Show one token per line.
for token in doc:
    print(token)

Over
the
last
quarter
Apple
sold
nearly
20
thousand
iPods
for
a
profit
of
$
6
million
.


# Stemming vs. Lemmatization


Stemming is preferred when you need a quick and straightforward way to reduce words to a common base form. It is useful in information retrieval and text indexing, where speed is crucial, but it does not always produce valid dictionary words.
For example, it is a good fit for sentence-classification projects.

Lemmatization is the better choice when you require the correct base form of words for in-depth text analysis, semantic understanding, or tasks where word meaning is essential. It is more accurate but also more computationally intensive.
For example, it suits chatbot and text-generation projects, where producing valid dictionary words also matters.

# Stemming

In [3]:
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# NOTE(review): these two names are already imported in the first cell; this
# repeat is harmless but redundant.
from nltk.tokenize import sent_tokenize, word_tokenize

# One instance of each stemmer; both are reused by the cells below.
ps = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

In [4]:
## Porter stemmer applied to a small list of related word forms
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']

for word in words:
    # One "original --> stem" line per word.
    print(f'{word} --> {ps.stem(word)}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [5]:
### Snowball stemmer applied to the same `words` list, for comparison
for word in words:
    print(f'{word} --> {s_stemmer.stem(word)}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


# Lemmatizer

In [6]:
from nltk import WordNetLemmatizer

# The instance is reused by the following cells, so the name is kept as-is.
lemitizer = WordNetLemmatizer()

# No part-of-speech hint is passed in this cell; compare with the next cell,
# which passes pos='v'.
for word in words:
    print(f'{word} --> {lemitizer.lemmatize(word)}')

run --> run
runner --> runner
running --> running
ran --> ran
runs --> run
easily --> easily
fairly --> fairly


In [7]:
# Same words again, this time passing pos='v' so each one is lemmatized as a verb.
for word in words:
    lemma = lemitizer.lemmatize(word, pos='v')
    print(f'{word} --> {lemma}')

run --> run
runner --> runner
running --> run
ran --> run
runs --> run
easily --> easily
fairly --> fairly


In [8]:
## Apply a stemmer to the spaCy tokens of our text.

def preprocess(stemmer, tokens=None):
    """Stem every non-numeric token and join the results with single spaces.

    Parameters
    ----------
    stemmer : object
        Anything with a ``stem(str) -> str`` method (e.g. the Porter or
        Snowball stemmer instances created earlier in the notebook).
    tokens : iterable, optional
        Token-like objects exposing ``.text`` and ``.like_num``. When omitted,
        the notebook-level ``doc`` is used, which keeps existing
        ``preprocess(stemmer)`` calls working unchanged.

    Returns
    -------
    str
        The processed tokens joined by ``" "``; an empty string when there
        are no tokens.
    """
    if tokens is None:
        # Backward-compatible default: fall back to the global `doc` only when
        # the caller did not supply tokens, so the function can also be run on
        # other documents without relying on hidden notebook state.
        tokens = doc

    processed_tokens = [
        # Number-like tokens are kept verbatim; everything else is stemmed.
        token.text if token.like_num else stemmer.stem(token.text)
        for token in tokens
    ]
    return " ".join(processed_tokens)

In [9]:
# Run the same preprocessing once per stemmer.
porter_text = preprocess(ps)
snowball_text = preprocess(s_stemmer)

# Porter result first, Snowball result second, for a line-by-line comparison.
print(porter_text)
print(snowball_text)

over the last quarter appl sold nearli 20 thousand ipod for a profit of $ 6 million .
over the last quarter appl sold near 20 thousand ipod for a profit of $ 6 million .


In [10]:
# Number-like tokens pass through untouched; every other token's text is
# lemmatized with the NLTK lemmatizer (no `pos` argument is given).
processed_tokens = [
    token.text if token.like_num else lemitizer.lemmatize(token.text)
    for token in doc
]

processed_text = " ".join(processed_tokens)
processed_text

'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $ 6 million .'

# Lemmatization can also be done with the spaCy library itself; the code below shows how

We already created the tokens with the spaCy library and then applied stemming and lemmatization to those tokens using the NLTK library. However, spaCy can give us the lemma directly through a token attribute, `token.lemma_`, so no separate NLTK call is needed for lemmatization.

In [11]:
def show_lemmas(text):
    """Print one aligned row per token: surface text, POS tag, lemma.

    `text` is any iterable of token-like objects exposing `.text`, `.pos_`
    and `.lemma_` (the notebook passes the spaCy `doc`).
    """
    for tok in text:
        # Pad the text to 12 columns and the POS tag to 6 so the rows line up.
        row = f'{tok.text:12} {tok.pos_:6}  {tok.lemma_}'
        print(row)

In [12]:
# Print text / POS tag / lemma for every token of the spaCy `doc`.
show_lemmas(doc)

Over         ADP     over
the          DET     the
last         ADJ     last
quarter      NOUN    quarter
Apple        PROPN   Apple
sold         VERB    sell
nearly       ADV     nearly
20           NUM     20
thousand     NUM     thousand
iPods        PROPN   iPods
for          ADP     for
a            DET     a
profit       NOUN    profit
of           ADP     of
$            SYM     $
6            NUM     6
million      NUM     million
.            PUNCT   .
