In [1]:
#针对特定NLP任务的分词器
from nltk import data
data.path.append("/Users/leonwong/nltk_data")

from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import StanfordSegmenter

In [2]:
# Sample input shared by the tokenizer demonstrations in this notebook.
# It contains a contraction ("Don't"), sentence-ending periods and an
# @handle, which is where the tokenizers visibly differ.
sentence = "Don't miss the great hit. Tomorrow is another day @priscillachan728 is one of my favorite song."

In [3]:
def tokenize_with_word_tokenizer(text):
    """Tokenize ``text`` with NLTK's general-purpose ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text`` (a list of token
        strings in the recorded output of this cell).
    """
    return word_tokenize(text)

tokenize_with_word_tokenizer(sentence)

['Do',
 "n't",
 'miss',
 'the',
 'great',
 'hit',
 '.',
 'Tomorrow',
 'is',
 'another',
 'day',
 '@',
 'priscillachan728',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'song',
 '.']

In [6]:
# Tweet-oriented tokenization: @handles can be dropped while emoticons and
# similar special symbols are kept as tokens. Two typical configurations:
#   1. TweetTokenizer() with no arguments -- only splits off punctuation.
#   2. TweetTokenizer(strip_handles=True, reduce_len=True) -- removes @xxx
#      handles and shortens any run of more than three identical characters
#      inside a word to exactly three.
def tokenize_with_tweet_tokenizer(text, strip_handles=True, reduce_len=False):
    """Tokenize ``text`` with NLTK's ``TweetTokenizer``.

    Args:
        text: The string to tokenize.
        strip_handles: Forwarded to ``TweetTokenizer``; when true, @handles
            are removed from the output. Defaults to ``True``, which is what
            this function has always used.
        reduce_len: Forwarded to ``TweetTokenizer``; when true, long runs of
            a repeated character are shortened. Defaults to ``False``, the
            library default, so existing calls behave as before.

    Returns:
        The result of ``TweetTokenizer.tokenize(text)``.

    NOTE(review): in the recorded output the handle "@priscillachan728"
    leaves a stray '8' token behind -- presumably the installed NLTK caps the
    handle length it strips; confirm against the NLTK version in use.
    """
    tweet_tokenizer = TweetTokenizer(strip_handles=strip_handles, reduce_len=reduce_len)
    return tweet_tokenizer.tokenize(text)

tokenize_with_tweet_tokenizer(sentence)

["Don't",
 'miss',
 'the',
 'great',
 'hit',
 '.',
 'Tomorrow',
 'is',
 'another',
 'day',
 '8',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'song',
 '.']

In [7]:
# Merge a specific multi-word phrase into a single token.
def tokenize_with_mwe(text):
    """Tokenize ``text`` and merge the phrase "Tomorrow is another day".

    ``text`` is first split with ``str.split()``, so punctuation stays
    attached to the neighbouring word (e.g. ``'hit.'``). The resulting word
    list is then passed through an ``MWETokenizer`` that knows one
    multi-word expression.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``MWETokenizer.tokenize`` on the whitespace-split
        words; in the recorded output the phrase appears as the single token
        ``'Tomorrow_is_another_day'``.
    """
    words = text.split()
    # Register the expression through the constructor rather than add_mwe().
    phrase_merger = MWETokenizer([("Tomorrow", "is", "another", "day")])
    return phrase_merger.tokenize(words)

tokenize_with_mwe(sentence)

["Don't",
 'miss',
 'the',
 'great',
 'hit.',
 'Tomorrow_is_another_day',
 '@priscillachan728',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'song.']

In [8]:
# Tokenize with a regular expression, e.g. to keep money amounts together
# or to pick up any other non-whitespace sequence.
def tokenize_with_regex_tokenizer(text):
    """Tokenize ``text`` with a ``RegexpTokenizer``.

    The pattern has three alternatives, tried in order at each position:
      * ``\\w+``        -- a run of word characters,
      * ``\\$[\\d\\.]+``  -- a dollar sign followed by digits and/or dots,
      * ``\\S``         -- any single remaining non-whitespace character.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``RegexpTokenizer.tokenize(text)``.
    """
    # Raw string: the backslash sequences must reach the regex engine
    # verbatim. In a plain string literal they are invalid escape sequences,
    # which newer Python versions warn about.
    reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S')
    return reg_tokenizer.tokenize(text)

tokenize_with_regex_tokenizer(sentence)

['Don',
 "'",
 't',
 'miss',
 'the',
 'great',
 'hit',
 '.',
 'Tomorrow',
 'is',
 'another',
 'day',
 '@',
 'priscillachan728',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'song',
 '.']

In [9]:
# Split on whitespace only.
def tokenize_with_wst(text):
    """Tokenize ``text`` with NLTK's ``WhitespaceTokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``WhitespaceTokenizer.tokenize(text)``; in the recorded
        output punctuation stays attached to words (``'hit.'``, ``'song.'``).
    """
    return WhitespaceTokenizer().tokenize(text)

tokenize_with_wst(sentence)

["Don't",
 'miss',
 'the',
 'great',
 'hit.',
 'Tomorrow',
 'is',
 'another',
 'day',
 '@priscillachan728',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'song.']

In [10]:
# Split into words and punctuation.
def tokenize_with_wordpunct_tokenizer(text):
    """Tokenize ``text`` with NLTK's ``WordPunctTokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``WordPunctTokenizer.tokenize(text)``; in the recorded
        output "Don't" is split into ``'Don'``, ``"'"`` and ``'t'``.
    """
    splitter = WordPunctTokenizer()
    tokens = splitter.tokenize(text)
    return tokens

tokenize_with_wordpunct_tokenizer(sentence)

['Don',
 "'",
 't',
 'miss',
 'the',
 'great',
 'hit',
 '.',
 'Tomorrow',
 'is',
 'another',
 'day',
 '@',
 'priscillachan728',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'song',
 '.']