# Using NLTK

In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

In [17]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
text = "Tokenization is the process of splitting text into tokens. Stemming reduces words to their root form."

In [19]:
text = text.lower()
text

'tokenization is the process of splitting text into tokens. stemming reduces words to their root form.'

In [20]:
# Tokenization
tokens = word_tokenize(text)
tokens = [token for token in tokens if token not in string.punctuation]
print(tokens)

['tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens', 'stemming', 'reduces', 'words', 'to', 'their', 'root', 'form']


In [21]:
# Stemming
stemmer = PorterStemmer()
stems = [stemmer.stem(token) for token in tokens]
print(stems)

['token', 'is', 'the', 'process', 'of', 'split', 'text', 'into', 'token', 'stem', 'reduc', 'word', 'to', 'their', 'root', 'form']


In [22]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in tokens]
print(lemmas)

['tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'token', 'stemming', 'reduces', 'word', 'to', 'their', 'root', 'form']


# Using Hugging Face

In [23]:
from transformers import AutoTokenizer
import re

In [24]:
text = "Tokenization is the process of splitting text into tokens. Stemming reduces words to their root form."

In [25]:
text = text.lower()
print(text)

tokenization is the process of splitting text into tokens. stemming reduces words to their root form.


In [26]:
# Remove punctuation using regex
text = re.sub(r'[^\w\s]', '', text)
print(text)

tokenization is the process of splitting text into tokens stemming reduces words to their root form


In [27]:
# Hugging Face Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [28]:
stemmer = PorterStemmer()
stems = [stemmer.stem(token) for token in tokens]
print(stems)

['token', '##izat', 'is', 'the', 'process', 'of', 'split', 'text', 'into', 'token', '##', 'stem', 'reduc', 'word', 'to', 'their', 'root', 'form']
