In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
# Fetch the Punkt tokenizer data used by word_tokenize; NLTK reports the
# package as up to date and skips the transfer when it is already installed.
# NOTE(review): newer NLTK releases look up 'punkt_tab' instead - confirm which
# resource the installed version needs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Sample sentence for the Porter example; it mixes inflected forms
# ("Stemming", "reducing", "words") with words that are already stems.
text = "Stemming is the process of reducing words to their word stem, base or root form." # stemming works by stripping suffixes

In [5]:
# Split the sentence into tokens with NLTK's word_tokenize; punctuation marks
# such as "," and "." come out as separate tokens.
tokens = word_tokenize(text)

### Porter Stemmer Algorithm

#### The Porter Stemmer is relatively simple and fast but may produce stems that are not always real words, as it prioritizes speed and simplicity over linguistic accuracy.

In [6]:
# Create the Porter stemmer instance that the following cells use to stem each token
stemmer = PorterStemmer()

In [7]:
# Run every token through the Porter stemmer and collect the stems in a list
stemmed_text = list(map(stemmer.stem, tokens))

In [8]:
# Build the space-separated stemmed string directly from `tokens`.
# Joining the previous value of `stemmed_text` would make this cell
# non-idempotent: on a second run that value is already a string, and
# ' '.join() over a string puts a space between every character.
stemmed_text = ' '.join(stemmer.stem(token) for token in tokens)

In [9]:
# Show the input sentence and its stemmed form, each under its own label,
# with a blank line separating the two sections.
print("Original Text:", text, "\nStemmed Text:", stemmed_text, sep="\n")

Original Text:
Stemming is the process of reducing words to their word stem, base or root form.

Stemmed Text:
stem is the process of reduc word to their word stem , base or root form .


### Snowball Stemmer Algorithm

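#### The Snowball Stemmer (also known as Porter2) is a refinement of the Porter algorithm. It is built with Snowball, a small language for writing stemming rules, which is why stemmers are available for many languages and new ones can be defined for other languages or domains.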

In [10]:
from nltk.stem import SnowballStemmer


In [11]:
# Rebind `text` and `tokens` (previously the Porter example sentence) to a
# short list of inflected words for the Snowball example.
text = "running walks danced laughing"
tokens = word_tokenize(text)

In [12]:
# English-language Snowball stemmer
snowball_stemmer = SnowballStemmer('english')

# Stem each token and assemble the stems into one space-separated string
snowball_stemmed_text = ' '.join(
    snowball_stemmer.stem(token) for token in tokens
)

In [13]:
# Label (preceded by a blank line) followed by the stemmed string on its own line
print("\nSnowball Stemmed Text:", snowball_stemmed_text, sep="\n")


Snowball Stemmed Text:
run walk danc laugh


In [14]:
pip install snowballstemmer

Collecting snowballstemmer
  Downloading snowballstemmer-2.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading snowballstemmer-2.2.0-py2.py3-none-any.whl (93 kB)
   ---------------------------------------- 0.0/93.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/93.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/93.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/93.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/93.0 kB ? eta -:--:--
   ------------- -------------------------- 30.7/93.0 kB 163.8 kB/s eta 0:00:01
   ------------- -------------------------- 30.7/93.0 kB 163.8 kB/s eta 0:00:01
   -------------------------- ------------- 61.4/93.0 kB 233.8 kB/s eta 0:00:01
   ----------------------------------- ---- 81.9/93.0 kB 286.7 kB/s eta 0:00:01
   ---------------------------------------- 93.0/93.0 kB 252.3 kB/s eta 0:00:00
Installing collected packages: snowballstemmer
Successfully installed snowballstemmer-2.2.0

## Snowball for Multilingual Processing

In [15]:
# Import the module rather than its bare `stemmer` factory: the notebook
# already uses the global name `stemmer` for the PorterStemmer instance, and
# `from snowballstemmer import stemmer` would silently rebind it.
import snowballstemmer

def snowball_stemming(text, lang='english'):
    """Stem every whitespace-separated token of ``text`` with a Snowball stemmer.

    Args:
        text: Input string. It is split on whitespace only, so punctuation
            stays attached to the neighbouring word.
        lang: Algorithm name passed to ``snowballstemmer.stemmer``
            (default ``'english'``).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Build the stemmer for the requested language
    stemmer_obj = snowballstemmer.stemmer(lang)

    # Whitespace tokenization
    tokens = text.split()

    # Stem each token individually
    stemmed_tokens = [stemmer_obj.stemWord(token) for token in tokens]

    # Re-assemble the stems into a single string
    stemmed_text = ' '.join(stemmed_tokens)

    return stemmed_text

# One sample sentence per language
english_text = "I am running in the beautiful gardens"
french_text = "Je cours dans les beaux jardins"

# Stem each sentence with the Snowball algorithm for its own language
stemmed_english = snowball_stemming(english_text, 'english')
stemmed_french = snowball_stemming(french_text, 'french')

# Show both results
print("Stemmed English Text:", stemmed_english)
print("Stemmed French Text:", stemmed_french)

Stemmed English Text: I am run in the beauti garden
Stemmed French Text: Je cour dan le beau jardin


## Custom Stemming Rules Compared with the Snowball Stemmer

In [16]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Define custom stemming rules
def custom_stem(word):
    """Strip a trailing 'ers' or 'ing' from ``word`` using hand-written rules.

    The suffixes are tried in order and the first match wins. A suffix is
    only removed when at least one character remains, so a token that
    consists of nothing but the suffix (e.g. "ing") is returned unchanged
    instead of being reduced to an empty string.

    Args:
        word: The token to stem.

    Returns:
        ``word`` without the matched suffix, or ``word`` itself when no rule
        applies.
    """
    # Add further custom suffix rules to this tuple
    for suffix in ('ers', 'ing'):
        if word.endswith(suffix) and len(word) > len(suffix):
            return word[:-len(suffix)]
    return word

# English Snowball stemmer, used as a reference for the hand-written rules
snowball_stemmer = SnowballStemmer('english')

# Sample input: a handful of inflected word forms, split into tokens
text = "running walks danced laughing walkers"
tokens = word_tokenize(text)

# Stem every token twice: once with the custom rules, once with Snowball
stemmed_tokens = list(map(custom_stem, tokens))
snowball_stemmed_tokens = list(map(snowball_stemmer.stem, tokens))

# Turn each list of stems back into a space-separated string
snowball_stemmed_text = ' '.join(snowball_stemmed_tokens)
stemmed_text = ' '.join(stemmed_tokens)

# Show the input next to both results
print("Original Text:", text)
print("Custom Stemmed Text:", stemmed_text)
print("Snowball Stemmed Text:", snowball_stemmed_text)

Original Text: running walks danced laughing walkers
Custom Stemmed Text: runn walks danced laugh walk
Snowball Stemmed Text: run walk danc laugh walker


### Lancaster Stemmer Algorithm

#### The Lancaster Stemmer is known for its aggressive stemming behavior, which can sometimes result in stems that are more drastically truncated compared to the Porter and Snowball stemmers.

#### It is faster and more aggressive than the Porter and Snowball stemmers but may produce stems that are less recognizable as real words.

#### The Lancaster Stemmer is particularly useful in applications where speed and simplicity are prioritized over linguistic accuracy, such as information retrieval systems.

In [19]:
from nltk.stem import LancasterStemmer

# Create the Lancaster stemmer
lancaster_stemmer = LancasterStemmer()

# Stem the `tokens` produced by the earlier tokenization cell and join the
# stems into one space-separated string
lancaster_stemmed_text = ' '.join(
    lancaster_stemmer.stem(token) for token in tokens
)

In [20]:
# Label (preceded by a blank line) followed by the stemmed string on its own line
print("\nLancaster Stemmed Text:", lancaster_stemmed_text, sep="\n")


Lancaster Stemmed Text:
run walk dant laugh walk
