# Using NLTK

In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

In [30]:
# Two-sentence sample that every step of the NLTK walkthrough operates on.
text = (
    "Tokenization is the process of splitting text into tokens. "
    "Stemming reduces words to their root form."
)

In [31]:
# Lower-case the whole sample; the tokenization cell that follows reads this
# rebound `text`. Re-running the cell is harmless because lower() is idempotent.
text = text.lower()
# Bare trailing expression: the notebook shows the string's repr as the cell output.
text

'tokenization is the process of splitting text into tokens. stemming reduces words to their root form.'

In [32]:
# Tokenization
# Split the lower-cased text into word and punctuation tokens, then keep only
# the tokens that contain at least one non-punctuation character.
#
# A plain `token not in string.punctuation` is a *substring* test against one
# long string: multi-character punctuation tokens such as "..." or the "``"
# and "''" quote tokens would survive it, and a token that happens to be a
# contiguous slice of that string (e.g. "<=>") is dropped only by accident.
# Checking character by character against a set removes both problems.
punctuation_chars = set(string.punctuation)
tokens = [
    token
    for token in word_tokenize(text)
    if any(char not in punctuation_chars for char in token)
]
print(tokens)

['tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens', 'stemming', 'reduces', 'words', 'to', 'their', 'root', 'form']


In [33]:
# Stemming
# Run every token through the Porter stemmer; the stemmer rewrites suffixes by
# rule, so a stem is not guaranteed to be a dictionary word.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, tokens))
print(stems)

['token', 'is', 'the', 'process', 'of', 'split', 'text', 'into', 'token', 'stem', 'reduc', 'word', 'to', 'their', 'root', 'form']


In [34]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
# pos="n" is the lemmatizer's default, spelled out here for the reader: every
# token is looked up as a noun, so plural nouns are reduced while verb forms
# are returned unchanged.
lemmas = [lemmatizer.lemmatize(word, pos="n") for word in tokens]
print(lemmas)

['tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'token', 'stemming', 'reduces', 'word', 'to', 'their', 'root', 'form']


# Using Hugging Face

In [35]:
from transformers import AutoTokenizer
import re

In [36]:
# Reset `text` to the original mixed-case sample for the Hugging Face walkthrough.
text = (
    "Tokenization is the process of splitting text into tokens. "
    "Stemming reduces words to their root form."
)

In [37]:
# Lower-case the sample before punctuation removal and tokenization.
# `text` is rebound in place; lower() is idempotent, so re-running is harmless.
text = text.lower()
print(text)

tokenization is the process of splitting text into tokens. stemming reduces words to their root form.


In [38]:
# Remove punctuation using regex
# Delete every character that is neither a word character (\w: letters, digits,
# underscore) nor whitespace (\s); nothing is inserted in its place.
text = re.compile(r'[^\w\s]').sub('', text)
print(text)

tokenization is the process of splitting text into tokens stemming reduces words to their root form


In [39]:
# Hugging Face Tokenization
# Load the tokenizer registered under the "bert-base-uncased" checkpoint name.
# NOTE(review): presumably fetched from the Hugging Face Hub on first use and
# cached afterwards -- confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Split the cleaned text into the tokenizer's vocabulary pieces. This rebinds
# `tokens`, replacing the NLTK token list built earlier in the notebook.
# WordPiece marks a piece that continues the previous word with a "##" prefix,
# so one word may be spread over several entries of this list.
tokens = tokenizer.tokenize(text)

In [40]:
# The tokenizer output is a list of WordPiece pieces: a word missing from the
# vocabulary is split into fragments and every continuation fragment carries a
# "##" prefix (e.g. "token", "##ization"). Stemming the fragments one by one
# yields meaningless stems such as "##izat" or a bare "##", so glue the pieces
# back into whole words before handing them to the stemmer.
words = []
for piece in tokens:
    if piece.startswith("##") and words:
        words[-1] += piece[2:]  # continuation piece: append to the previous word
    else:
        words.append(piece)  # start of a new word

stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in words]
print(stems)

['token', 'is', 'the', 'process', 'of', 'split', 'text', 'into', 'token', 'stem', 'reduc', 'word', 'to', 'their', 'root', 'form']
