# Basic Tokenization and Text Preprocessing

In [3]:
import warnings
# Ignore warnings of category UserWarning from here on.
# NOTE(review): presumably placed before the imports below so that warnings
# raised while loading those libraries are hidden as well — confirm.
warnings.simplefilter("ignore", UserWarning)

# English: NLTK, the classic NLP toolkit.
import nltk
# One-time setup: uncomment the next line to download NLTK's data packages
# ("all" fetches every package). The tokenizer, stop-word and WordNet cells
# below rely on NLTK data being installed locally.
# nltk.download("all")

# Chinese: jieba, a word-segmentation tool.
import jieba



In [4]:
# Sample sentences shared by the cells below: one English sentence for the
# NLTK pipeline and one Traditional Chinese sentence for jieba.
example_en = (
    "Anthropic is a public benefit corporation dedicated to securing its benefits and mitigating its risks."
)
example_ch = (
    "TAIDE團隊今日釋出模型，已完成基本測試的最新具臺灣文化的大型繁體中文模型。"
)

## Tokenization

In [5]:
from nltk.tokenize import word_tokenize

# Tokenize the English sample (punctuation comes back as its own token) and
# show the tokens on a single line separated by " | ".
tok_en = word_tokenize(example_en)
print(*tok_en, sep=" | ")

Anthropic | is | a | public | benefit | corporation | dedicated | to | securing | its | benefits | and | mitigating | its | risks | .


In [6]:
# Segment the Chinese sample with jieba in full mode (cut_all=True).
# jieba.cut returns a one-shot generator, so it is materialised here: sl_list
# is then a real list that is still populated after the print below and can
# be reused by later cells, instead of being silently exhausted.
sl_list = list(jieba.cut(example_ch, cut_all=True))
print(" | ".join(sl_list))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/py/cw4ghzp92ld3ckngb734svy00000gn/T/jieba.cache
Loading model cost 0.232 seconds.
Prefix dict has been built successfully.


TAIDE | 團 | 隊 | 今日 | 釋 | 出 | 模型 | ， | 已 | 完成 | 基本 | 測 | 試 | 的 | 最新 | 具 | 臺 | 灣 | 文化 | 的 | 大型 | 繁 | 體 | 中文 | 模型 | 。


## Stop Word Removal

In [7]:
from nltk.corpus import stopwords

In [8]:
# English stop words as a set, so each membership check is a hash lookup.
stop_words = {*stopwords.words('english')}


def _is_content_token(token):
    """Return True when the token, compared case-insensitively, is not a stop word."""
    return token.lower() not in stop_words


# Keep the original casing of every token that survives the filter.
swr_tok_en = list(filter(_is_content_token, tok_en))

# Before / after, for comparison.
print(tok_en)
print(swr_tok_en)


['Anthropic', 'is', 'a', 'public', 'benefit', 'corporation', 'dedicated', 'to', 'securing', 'its', 'benefits', 'and', 'mitigating', 'its', 'risks', '.']
['Anthropic', 'public', 'benefit', 'corporation', 'dedicated', 'securing', 'benefits', 'mitigating', 'risks', '.']


## Stemming

In [9]:
from nltk.stem import PorterStemmer, SnowballStemmer

In [10]:
# Build the Porter stemmer once and reuse it for every token; the original
# constructed a fresh PorterStemmer inside the comprehension on each iteration.
porter_stemmer = PorterStemmer()
stem_swr_tok_en = [porter_stemmer.stem(w) for w in swr_tok_en]

# Before / after, for comparison.
print(swr_tok_en)
print(stem_swr_tok_en)

['Anthropic', 'public', 'benefit', 'corporation', 'dedicated', 'securing', 'benefits', 'mitigating', 'risks', '.']
['anthrop', 'public', 'benefit', 'corpor', 'dedic', 'secur', 'benefit', 'mitig', 'risk', '.']


In [11]:
# Build the English Snowball stemmer once and reuse it for every token; the
# original constructed a fresh SnowballStemmer on each iteration.
snowball_stemmer = SnowballStemmer(language="english")

# NOTE: this rebinds stem_swr_tok_en, replacing the Porter result computed in
# the previous cell. Tokens are lower-cased before stemming.
stem_swr_tok_en = [snowball_stemmer.stem(w.lower()) for w in swr_tok_en]

# Before / after, for comparison.
print(swr_tok_en)
print(stem_swr_tok_en)

['Anthropic', 'public', 'benefit', 'corporation', 'dedicated', 'securing', 'benefits', 'mitigating', 'risks', '.']
['anthrop', 'public', 'benefit', 'corpor', 'dedic', 'secur', 'benefit', 'mitig', 'risk', '.']


## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

In [14]:
# Build the WordNet lemmatizer once and reuse it for every token; the original
# constructed a fresh WordNetLemmatizer on each iteration.
wordnet_lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a pos argument, so NLTK falls back to its
# documented default of treating each token as a noun. That is why plural
# nouns are reduced ('benefits' -> 'benefit') while verb forms such as
# 'securing' come back unchanged in the output below.
lemm_swr_tok_en = [wordnet_lemmatizer.lemmatize(w.lower()) for w in swr_tok_en]

# Before / after, for comparison.
print(swr_tok_en)
print(lemm_swr_tok_en)

['Anthropic', 'public', 'benefit', 'corporation', 'dedicated', 'securing', 'benefits', 'mitigating', 'risks', '.']
['anthropic', 'public', 'benefit', 'corporation', 'dedicated', 'securing', 'benefit', 'mitigating', 'risk', '.']
