In [1]:
"""
English tokenize
"""
import nltk

# word_tokenize emits punctuation as separate tokens, so the comma in the
# sample sentence comes back as an element of its own.
sentence = "hello, world"
tokens = nltk.word_tokenize(text=sentence)
print(tokens)

['hello', ',', 'world']


In [2]:
"""
Chinese tokenize
"""
import jieba

campus_sentence = "我来到福建福州大学"

# Full mode (cut_all=True)
seg_list = jieba.cut(campus_sentence, cut_all=True)
print("Full Mode:", "/ ".join(seg_list))

# Accurate mode, requested explicitly (cut_all=False)
seg_list = jieba.cut(campus_sentence, cut_all=False)
print("Default Mode:", "/ ".join(seg_list))

# Accurate mode is also what you get when cut_all is omitted
seg_list = jieba.cut("他来到了望京大厦")
print(", ".join(seg_list))

# Search-engine mode
seg_list = jieba.cut_for_search(
    "刘涛硕士毕业于厦门大学，后在日本东京大学深造"
)
print(", ".join(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.010 seconds.
Prefix dict has been built succesfully.


Full Mode: 我/ 来到/ 福建/ 福州/ 福州大学/ 大学
Default Mode: 我/ 来到/ 福建/ 福州大学
他, 来到, 了, 望京, 大厦
刘涛, 硕士, 毕业, 于, 厦门, 大学, 厦门大学, ，, 后, 在, 日本, 东京, 大学, 日本东京大学, 深造


In [12]:
"""
Social language tokenize
"""
import re
from nltk.tokenize import word_tokenize

# Baseline: run the generic word tokenizer over the sample tweet so its
# output can be compared with the tweet-aware tokenizer defined below.
tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'
baseline_tokens = word_tokenize(tweet)
print(baseline_tokens)

# Emoticon pattern. It is compiled with re.VERBOSE, so the whitespace and the
# "# ..." notes inside the string are ignored by the regex engine.
# The mouth class accepts both bracket directions, so ":[" is recognised as
# well as ":]".
emotions_str = r"""
(?:
    [:=;]  # eyes
    [oO\-]?  # nose
    [D\)\]\(\[/\\OpP]  # mouth
)"""

# Alternatives are tried left to right and the first one that matches wins,
# so the specific patterns must come before the catch-all ones at the end.
# Every entry is compiled under re.VERBOSE: a literal '#' outside a character
# class has to be escaped, and unescaped whitespace would be dropped.
regex_str = [
    emotions_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @somebody
    # Topic tag: one or more word characters, optionally continued with ' or -
    # as long as the tag still ends on a word character. The continuation is
    # optional so that single-character tags such as "#a" stay in one piece.
    r"(?:\#+[\w_]+(?:[\w\'_\-]*[\w_]+)?)",
    # URLs. "$-_" is a character RANGE (ASCII 0x24-0x5F); that range is what
    # lets '/', ':', '?' and '=' through, so do not "simplify" it.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    # Number: digit groups joined by commas, with an optional decimal part.
    # A comma is only consumed when more digits follow, so the comma in
    # "2015, ..." is left as a separate token.
    r'(?:\d+(?:,\d+)*(?:\.\d+)?)',
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words containing - or '
    r'(?:[\w_]+)',  # any other run of word characters
    r'(?:\S)'  # any other single non-whitespace character
]

# One capturing group around the whole alternation, so findall() returns a
# flat list of matched strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant used to test whether a whole token is an emoticon.
emotion_re = re.compile(r'^'+emotions_str+'$', re.VERBOSE | re.IGNORECASE)


def preprocess(s, lowercase=False):
    """Tokenize ``s`` with ``tokens_re`` and optionally lower-case the result.

    Args:
        s: Text to tokenize.
        lowercase: When true, every token is lower-cased except tokens that
            ``emotion_re`` matches in full, which are returned unchanged.

    Returns:
        A list with the matched tokens, in the order they occur in ``s``.
    """
    tokens = tokens_re.findall(s)
    if not lowercase:
        return tokens

    normalized = []
    for token in tokens:
        if emotion_re.search(token):
            # Emoticons keep their original case.
            normalized.append(token)
        else:
            normalized.append(token.lower())
    return normalized


print(preprocess(tweet))

['RT', '@', 'angelababy', ':', 'love', 'you', 'baby', '!', ':', 'D', 'http', ':', '//ah.love', '#', '168cm']
['RT', '@angelababy', ':', 'love', 'you', 'baby', '!', ':D', 'http://ah.love', '#168cm']


In [None]:
"""
Parts of speech
"""
