In [1]:
# Uncomment the next line to install the library if it is missing
# (for example, on Google Colab). `%pip` targets the running kernel's environment.
# %pip install transformers

In [2]:
# Import the BertTokenizer from the transformers package.
from transformers import BertTokenizer

2023-11-30 10:07:24.264846: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the tokenizer that matches the pre-trained 'bert-base-uncased' checkpoint.
# As the outputs below show, this uncased variant lowercases the text it tokenizes.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Define an input text.
text = "I am learning about subword tokenization."

# Split the text into subword tokens. Some words are broken into several
# pieces; a "##" prefix marks a piece that continues the preceding token
# (e.g. 'sub' + '##word').
subwords = tokenizer.tokenize(text)
# Leave the list as the cell's last expression so the notebook displays it.
subwords

['i', 'am', 'learning', 'about', 'sub', '##word', 'token', '##ization', '.']

## NLTK tokenization

In [5]:
# Uncomment the next line to install NLTK if it is missing (e.g. on Google Colab).
# %pip install nltk

import nltk
# Reader for the Reuters news corpus.
from nltk.corpus import reuters
# Sentence-level and word-level tokenizers.
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the data the following cells rely on: the Reuters corpus itself and
# the "punkt" sentence tokenizer.
nltk.download("reuters")
# Kept as the last expression so its return value is what the cell displays.
nltk.download("punkt")

[nltk_data] Downloading package reuters to /Users/tberton/nltk_data...
[nltk_data]   Unzipping corpora/reuters.zip.
[nltk_data] Downloading package punkt to /Users/tberton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# List every category label in the Reuters corpus; 'cocoa' is the one used below.
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [7]:
# Look up the file ID of the first document tagged with the 'cocoa' category.
reuters.fileids(categories="cocoa")[0]

'test/15095'

In [8]:
# Load the raw text of the first cocoa article.
# The file ID is queried from the corpus instead of being pasted in as a
# literal, so this cell cannot drift out of sync with the category lookup.
cocoa_fileid = reuters.fileids(categories='cocoa')[0]
article = reuters.raw(cocoa_fileid)
print(article)

COCOA EXPORTERS EXPECTED TO LIMIT SALES
  Major cocoa exporters are likely to
  limit sales in the weeks ahead in an effort to boost world
  prices, sources close to a meeting of the Cocoa Producers
  Alliance (CPA) said.
      The sources said the depressed world market had been one of
  the main topics discussed in a closed door meeting of the
  11-member CPA which began on Monday.
      They said producers agreed that cutting sales would aid the
  buffer stock manager of a new international cocoa pact in his
  effort to support prices.
      Major cocoa producing and consuming nations agreed
  operation rules for the buffer stock at a meeting in London
  last month and the stock manager is expected to enter the
  market soon.
      Prices, under the weight of three successive cocoa
  surpluses, recently fell to the level at which the manager has
  to buy cocoa under stock rules.
      The buffer stock aims to keep prices within a pre-set range
  by buying when prices fall and sellin

In [9]:
# Split the article into a list of sentences with NLTK's `sent_tokenize`.
# Line breaks inside a sentence are kept in the returned strings.
sent_tokenize(article)

['COCOA EXPORTERS EXPECTED TO LIMIT SALES\n  Major cocoa exporters are likely to\n  limit sales in the weeks ahead in an effort to boost world\n  prices, sources close to a meeting of the Cocoa Producers\n  Alliance (CPA) said.',
 'The sources said the depressed world market had been one of\n  the main topics discussed in a closed door meeting of the\n  11-member CPA which began on Monday.',
 'They said producers agreed that cutting sales would aid the\n  buffer stock manager of a new international cocoa pact in his\n  effort to support prices.',
 'Major cocoa producing and consuming nations agreed\n  operation rules for the buffer stock at a meeting in London\n  last month and the stock manager is expected to enter the\n  market soon.',
 'Prices, under the weight of three successive cocoa\n  surpluses, recently fell to the level at which the manager has\n  to buy cocoa under stock rules.',
 'The buffer stock aims to keep prices within a pre-set range\n  by buying when prices fall and 

In [10]:
# Keep the first sentence of the article in `sent` (the tokenizer cells
# below all operate on it) and print it.
sent = sent_tokenize(article)[0]
print(sent)

COCOA EXPORTERS EXPECTED TO LIMIT SALES
  Major cocoa exporters are likely to
  limit sales in the weeks ahead in an effort to boost world
  prices, sources close to a meeting of the Cocoa Producers
  Alliance (CPA) said.


In [11]:
# Split the first sentence into word-level tokens with `word_tokenize`.
# Punctuation such as ',' and '(' comes back as separate tokens.
word_tokenize(sent)

['COCOA',
 'EXPORTERS',
 'EXPECTED',
 'TO',
 'LIMIT',
 'SALES',
 'Major',
 'cocoa',
 'exporters',
 'are',
 'likely',
 'to',
 'limit',
 'sales',
 'in',
 'the',
 'weeks',
 'ahead',
 'in',
 'an',
 'effort',
 'to',
 'boost',
 'world',
 'prices',
 ',',
 'sources',
 'close',
 'to',
 'a',
 'meeting',
 'of',
 'the',
 'Cocoa',
 'Producers',
 'Alliance',
 '(',
 'CPA',
 ')',
 'said',
 '.']

## Tokenizing using spaCy

In [12]:
# Import the spaCy library
import spacy
# Load the small English language model for spaCy.
# NOTE(review): this presumably requires the `en_core_web_sm` package to be
# installed beforehand (e.g. `python -m spacy download en_core_web_sm`) —
# confirm for your environment.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Run the spaCy pipeline on the first sentence and list each token's text.
# Unlike the NLTK result above, whitespace runs such as '\n  ' show up as tokens.
spacy_sent = nlp(sent)
[tok.text for tok in spacy_sent]

['COCOA',
 'EXPORTERS',
 'EXPECTED',
 'TO',
 'LIMIT',
 'SALES',
 '\n  ',
 'Major',
 'cocoa',
 'exporters',
 'are',
 'likely',
 'to',
 '\n  ',
 'limit',
 'sales',
 'in',
 'the',
 'weeks',
 'ahead',
 'in',
 'an',
 'effort',
 'to',
 'boost',
 'world',
 '\n  ',
 'prices',
 ',',
 'sources',
 'close',
 'to',
 'a',
 'meeting',
 'of',
 'the',
 'Cocoa',
 'Producers',
 '\n  ',
 'Alliance',
 '(',
 'CPA',
 ')',
 'said',
 '.']

## Tokenizing the first sentence using BERT (`bert-base-uncased`)

In [14]:
# Tokenize the first sentence into subwords with the BERT tokenizer loaded earlier.
# The text comes back lowercased and some words are split into pieces
# (e.g. 'export' + '##ers').
sentence_subwords = tokenizer.tokenize(sent)
# Last expression of the cell, so the notebook displays the token list.
sentence_subwords

['cocoa',
 'export',
 '##ers',
 'expected',
 'to',
 'limit',
 'sales',
 'major',
 'cocoa',
 'export',
 '##ers',
 'are',
 'likely',
 'to',
 'limit',
 'sales',
 'in',
 'the',
 'weeks',
 'ahead',
 'in',
 'an',
 'effort',
 'to',
 'boost',
 'world',
 'prices',
 ',',
 'sources',
 'close',
 'to',
 'a',
 'meeting',
 'of',
 'the',
 'cocoa',
 'producers',
 'alliance',
 '(',
 'cp',
 '##a',
 ')',
 'said',
 '.']