In [1]:
"""
English tokenize
"""
import nltk


sentence = 'hello, world'
tokens = nltk.word_tokenize(sentence)
print(tokens)

['hello', ',', 'world']


In [2]:
"""
Chinese tokenize
"""
import jieba


seg_list = jieba.cut("我来到福建福州大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list))  # 全模式
seg_list = jieba.cut("我来到福建福州大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了望京大厦")  # 默认是精确模式
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("刘涛硕士毕业于厦门大学，后在日本东京大学深造")
# 搜索引擎模式
print(", ".join(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.200 seconds.
Prefix dict has been built succesfully.


Full Mode: 我/ 来到/ 福建/ 福州/ 福州大学/ 大学
Default Mode: 我/ 来到/ 福建/ 福州大学
他, 来到, 了, 望京, 大厦
刘涛, 硕士, 毕业, 于, 厦门, 大学, 厦门大学, ，, 后, 在, 日本, 东京, 大学, 日本东京大学, 深造


In [3]:
"""
Social language tokenize
"""
import re
from nltk.tokenize import word_tokenize


tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'
print(word_tokenize(tweet))

emotions_str = r"""
(?:
    [:=;]  # eyes
    [oO\-]?  # nose
    [D\)\]\(\]/\\OpP]  # mouth
)"""
regex_str = [
    emotions_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @somebody
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # topic tag
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # number
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and‘
    r'(?:[\w_]+)', # others
    r'(?:\S)' # others
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emotion_re = re.compile(r'^'+emotions_str+'$', re.VERBOSE | re.IGNORECASE)


def preprocess(s, lowercase=False):
    tokens = tokens_re.findall(s)
    if lowercase:
        # emotion can not be lower
        tokens = [token if emotion_re.search(token) else token.lower() for token in tokens]
    return tokens
print(preprocess(tweet))

['RT', '@', 'angelababy', ':', 'love', 'you', 'baby', '!', ':', 'D', 'http', ':', '//ah.love', '#', '168cm']
['RT', '@angelababy', ':', 'love', 'you', 'baby', '!', ':D', 'http://ah.love', '#168cm']


In [4]:
"""
Parts of speech
词性归一
Stemming 词干提取 ---> 直接砍尾巴 
Lemmatization 词形归⼀ ---> 各种类型词性归为一个形式
"""
print("------- PorterStemmer -------")
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('maximum'))
print(porter_stemmer.stem('presumably'))
print(porter_stemmer.stem('multiply'))
print(porter_stemmer.stem('provision'))
print(porter_stemmer.stem('went'))
print(porter_stemmer.stem('wenting'))

print("------- SnowballStemmer -------")
from nltk.stem import SnowballStemmer
# 偷懒用Snowball
snowball_stemmer = SnowballStemmer("english")
print(snowball_stemmer.stem('maximum'))
print(snowball_stemmer.stem('presumably'))

print("------- LancasterStemmer -------")
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
print(lancaster_stemmer.stem('maximum'))
print(lancaster_stemmer.stem('presumably'))

------- PorterStemmer -------
maximum
presum
multipli
provis


NameError: name 'p' is not defined

In [None]:
"""Lema
语言学家弄出的英语语料网络
坏处：永远需要更新，新词无法detail出来
好处：预处理会得到便利，复数单一化
"""
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('dogs'))
print(wordnet_lemmatizer.lemmatize('churches'))
print(wordnet_lemmatizer.lemmatize('aardwolves'))
print(wordnet_lemmatizer.lemmatize('abaci'))
print(wordnet_lemmatizer.lemmatize('hardrock'))

# Lema：how to recognize Went？ v.go or n.Went
print(wordnet_lemmatizer.lemmatize('are'))
print(wordnet_lemmatizer.lemmatize('is'))
print(wordnet_lemmatizer.lemmatize('are', pos='v'))
print(wordnet_lemmatizer.lemmatize('is', pos='v'))

In [None]:
"""
NLTK POS Tag
This is import in English
"""
import nltk
text = nltk.word_tokenize('This quiet chant shall relieve your wasted heart')
print(nltk.pos_tag(text))

In [None]:
"""
Stopwords
对于注重理解文本“意思”的应用场景来说歧义太多
英文停止词列表：https://www.ranks.nl/stopwords
"""
from nltk.corpus import stopwords
# tokenize get a word list
word_list = nltk.word_tokenize('you are the apple of my eyes')
# filter
filtered_words = [word for word in word_list if word not in stopwords.words('english')]
print(filtered_words)

In [14]:
"""
Sentiment analysis
最简单 sentiment dictionary
like 1
good 2
bad -2
terrible -3
类似于关键词打分机制
http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
"""
words = "What a terrible trip".split(' ')
sentiment_dictionary = {}
for line in open('AFINN/AFINN-111.txt', 'r'):
    word, score = line.split('\t')
    sentiment_dictionary[word] = int(score)
total_score = sum(sentiment_dictionary.get(word, 0) for word in words)
print(total_score)

-3


In [4]:
"""
Sentiment analysis with Machine Learning
"""
from nltk.classify import NaiveBayesClassifier
# 随⼿手造点训练集
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'
def preprocess(s):
    """
    :param s: str
    :return: dict
    """
# Func: 句子处理
# 这⾥简单的用了split(),把句子中每个单词分开
# 显然还有更多的processing method可以用
    return {word: True for word in s.lower().split()}
# return: {'this': True, 'is':True, 'a':True, 'good':True, 'book':True}
# 这里我们用最简单的True,来表示,这个词『出现在当前的句子中』的意义。
training_data = [[preprocess(s1), 'pos'],
                 [preprocess(s2), 'pos'],
                 [preprocess(s3), 'neg'],
                 [preprocess(s4), 'neg']]
model = NaiveBayesClassifier.train(training_data)
print(model.classify(preprocess('this is a good book')))

pos


In [5]:
"""
Application: Text Similarity
Frequency 频率统计
"""
import nltk
from nltk import FreqDist


corpus = 'this is my sentence ' \
         'this is my life ' \
         'this is the day'

token = nltk.word_tokenize(corpus)
print(token)
# us FreqDist to count the word's frequency
fdist = FreqDist(token)
print(fdist['is'])

# 拿出最常用的50个单词
standard_freq_vector = fdist.most_common(50)
size = len(standard_freq_vector)
print(standard_freq_vector)

def position_lookup(v):
    res = {word[0]: k for k, word in enumerate(v)}
    return res

standard_position_dict = position_lookup(standard_freq_vector)

print(standard_position_dict)

# 新句子
sentence = "this is cool"
# 建立一个和我们标准vector同样大小的向量
freq_vector = [0] * size
tokens = nltk.word_tokenize(sentence)
for word in tokens:
    try:
        # 如果在我们的词库⾥里里出现过
        # 那么就在"标准位置"上+1
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        # 如果是个新词
        # 就pass掉
        continue
print(freq_vector)

['this', 'is', 'my', 'sentence', 'this', 'is', 'my', 'life', 'this', 'is', 'the', 'day']
3
[('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]
{'this': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6}
[1, 1, 0, 0, 0, 0, 0]


In [13]:
"""
Application: Text Categorization
TF-IDF
Term Frequency, 衡量⼀个term在⽂档中出现得有多频繁。
IDF: Inverse Document Frequency, 衡量⼀个term有多重要。
TF-IDF = TF * IDF
"""
from nltk.text import TextCollection
# 首先,把所有的文档放到TextCollection类中。
# 这个类会自动帮你断句,做统计,做计算
corpus = TextCollection(['this is sentence one',
                         'this is sentence two',
                         'this is sentence three'])
# 直接就能算出tfidf
# (term:一句话中的某个term, text:这句话)
print(corpus.tf_idf('this', 'this is sentence four'))
# 0.0
# 同理,怎么得到⼀一个标准大小的vector来表示所有的句子?
# 对于每个新句子
new_sentence = 'this is sentence five'
# 遍历一遍所有的vocabulary中的词:
for word in token:
    print(corpus.tf_idf(word, new_sentence))
# 我们会得到⼀一个巨⻓长(=所有vocab⻓长度)的向量量


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
