In [None]:
#### 8.1 文本数据分析工具

In [None]:
import jieba
import nltk
# nltk.download()

In [None]:
from nltk.corpus import brown
# Display the words of the Brown corpus as the cell output.
brown.words()

In [None]:
# Display the category labels available in the Brown corpus.
brown.categories()

In [None]:
# Report the number of sentences in the Brown corpus.
'brown中一共有{}个句子'.format(len(brown.sents()))

In [None]:
# Report the number of words in the Brown corpus.
'brown中一共有{}个单词'.format(len(brown.words()))

In [None]:
#### 8.2 文本预处理

In [None]:
# Split an English sentence into tokens with NLTK and display the token list.
sentence = 'Python is a structured and powerful object-oriented programming language.'
words = nltk.word_tokenize(sentence)
words

In [None]:
import jieba
sentence = '传智专修学院推出颠覆式办学模式'

# Full mode (cut_all=True): segment the sentence and print the terms joined by '/'.
terms_list = jieba.cut(sentence, cut_all=True)
print('【全模式】: ', '/'.join(terms_list), sep='')

# Precise mode (cut_all=False): same sentence, same output layout.
terms_list = jieba.cut(sentence, cut_all=False)
print('【精确模式】: ', '/'.join(terms_list), sep='')

In [None]:
# Tokenize the sentence and tag each token with its part of speech.
# The tokens are bound to `words` (the original bound them to `word` but
# tagged `words`, so the cell only worked through state left by an earlier cell).
words = nltk.word_tokenize('Python is a structured and powerful object-oriented programming language.')
nltk.pos_tag(words)

In [None]:
from nltk.stem.porter import PorterStemmer
# Create a Porter stemmer and display the stem of 'watched'.
porter_stem = PorterStemmer()
porter_stem.stem('watched')

In [None]:
# Same stemmer applied to another inflection of the verb, for comparison.
porter_stem.stem('watching')

In [None]:
from nltk.stem.lancaster import LancasterStemmer
# Stem both inflections with the Lancaster stemmer.
# A notebook cell only displays its last expression, so the results are
# collected into one list instead of silently discarding the first call.
lancaster_stem = LancasterStemmer()
[lancaster_stem.stem(word) for word in ('jumped', 'jumping')]

In [None]:
# SnowballStemmer lives in nltk.stem.snowball, not nltk.stem.lancaster.
from nltk.stem.snowball import SnowballStemmer

# Stem both inflections with the English Snowball stemmer and show both
# results (a cell only displays its last expression).
snowball_stem = SnowballStemmer('english')
[snowball_stem.stem(word) for word in ('jumped', 'jumping')]

In [None]:
# WordNetLemmatizer is exported by nltk.stem, not nltk.stem.lancaster.
from nltk.stem import WordNetLemmatizer

# Lemmatize with the default part of speech and show all three results
# (a cell only displays its last expression).
wordnet_lem = WordNetLemmatizer()
[wordnet_lem.lemmatize(word) for word in ('book', 'went', 'did')]

In [None]:
# Lemmatize again, this time telling the lemmatizer the words are verbs.
# Both results are shown together; previously the first call's result was lost.
[wordnet_lem.lemmatize(word, pos='v') for word in ('went', 'did')]

In [None]:
from nltk.corpus import stopwords
# Tokenize the sample sentence; the resulting `words` list is filtered
# against the stop-word list in the following cells.
sentence = 'Python is a structured and powerful object-oriented programming language.'
words = nltk.word_tokenize(sentence)
words

In [None]:
# English stop-word list, plus the accumulator for tokens that survive filtering.
# The accumulator was misspelled `remian_words`; the filtering cell uses `remain_words`.
stop_words = stopwords.words('english')
remain_words = []

In [None]:
# Keep only the tokens that are not stop words.
# Rebuilding the list from scratch keeps the cell idempotent: re-running it
# no longer appends duplicates, and it does not depend on a pre-existing list.
remain_words = [word for word in words if word not in stop_words]
remain_words

In [None]:
#### 8.3 文本情感分析

In [None]:
# Sample sentences used as training material for the sentiment classifier.
# In the training cell the first three are labelled 1 and the last two -1.
text_one = 'This is a wonderful book'
text_two = 'I like reading this book very much.'
text_thr = 'This book reads well'
text_fou = 'This book is not good'
text_fiv = 'This is a very bad book'

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
def pret_text(text):
    """Convert raw text into a bag-of-words feature dict.

    The text is tokenized, every token is lemmatized with WordNet, and tokens
    found in NLTK's English stop-word list are dropped.

    Args:
        text: The sentence to preprocess.

    Returns:
        A dict mapping each remaining token to ``True``, the feature-set
        format passed to ``NaiveBayesClassifier``.
    """
    # Load the stop words once; the original re-read the list for every token.
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    return {lemma: True for lemma in lemmas if lemma not in english_stop_words}

In [None]:
# Labelled training set of (feature dict, label) pairs: 1 for the three
# favourable sentences, -1 for the two unfavourable ones.
# The closing bracket of the list was missing, which made the cell a SyntaxError.
train_data = [(pret_text(text_one), 1),
              (pret_text(text_two), 1),
              (pret_text(text_thr), 1),
              (pret_text(text_fou), -1),
              (pret_text(text_fiv), -1)]
demo_model = NaiveBayesClassifier.train(train_data)

In [None]:
# Classify an unseen sentence (closing parenthesis of the call was missing).
test_text1 = 'I like this movie very much'
demo_model.classify(pret_text(test_text1))

In [None]:
# Classify a second unseen sentence (closing parenthesis of the call was missing).
test_text2 = 'The film is very much'
demo_model.classify(pret_text(test_text2))

In [None]:
# Classify a third unseen sentence (closing parenthesis of the call was missing).
test_text3 = 'The film is terrible'
demo_model.classify(pret_text(test_text3))

In [None]:
#### 8.4 文本相似度

In [None]:
import nltk 
from nltk import FreqDist
text1 = 'John likes to watch movies'
text2 = 'John also likes to watch football games'

# Merge both sentences into one string so a single frequency
# distribution covers the vocabulary of both texts.
all_text = " ".join((text1, text2))
words = nltk.word_tokenize(all_text)
freq_dist = FreqDist(words)
freq_dist

In [None]:
# Look up the count recorded for the token 'John'.
freq_dist['John']

In [None]:
# Number of top-ranked words to keep; text_to_vector also reads `n` as the
# length of the vectors it builds.
n = 5
most_common_words = freq_dist.most_common(n)
most_common_words

In [None]:
def find_position(common_words):
    """Map the first element of each entry to the entry's index.

    Args:
        common_words: Sequence of indexable entries, e.g. ``(word, count)``
            pairs; only element ``0`` of each entry is used.

    Returns:
        A dict from each entry's first element to its 0-based position in
        ``common_words``. If a key repeats, the later position wins.
    """
    return {entry[0]: index for index, entry in enumerate(common_words)}
# Word -> vector index lookup for the most common words; read by text_to_vector.
pos_dict = find_position(most_common_words)
pos_dict

In [None]:
def text_to_vector(words, positions=None, size=None):
    """Turn a token list into a frequency vector over the tracked words.

    Args:
        words: Iterable of tokens to count.
        positions: Optional mapping of token -> vector index. Defaults to the
            notebook-level ``pos_dict``.
        size: Optional length of the returned vector. Defaults to the
            notebook-level ``n``.

    Returns:
        A list of ``size`` integers where slot ``positions[token]`` holds the
        number of times ``token`` occurs in ``words``. Tokens that are not in
        ``positions`` are ignored; empty input yields an all-zero vector.
    """
    if positions is None:
        positions = pos_dict
    if size is None:
        size = n
    freq_vec = [0] * size
    for word in words:
        if word in positions:
            freq_vec[positions[word]] += 1
    # Return after the loop: the original returned from inside it, so only the
    # first token was ever counted and empty input returned None.
    return freq_vec

In [None]:
# Frequency vector of the first sentence over the tracked words.
vector1 = text_to_vector(nltk.word_tokenize(text1))
vector1

In [None]:
# Frequency vector of the second sentence over the same tracked words.
vector2 = text_to_vector(nltk.word_tokenize(text2))
vector2

In [None]:
from nltk.cluster.util import cosine_distance
# Turn the cosine distance between the two vectors into a similarity score
# (similarity = 1 - distance).
1 - cosine_distance(vector1, vector2)

In [None]:
#### 8.5 文本分类

In [None]:
import nltk
from nltk.corpus import names
import random
# Seed the RNG so the shuffle, and with it the train/test split and the
# reported accuracy, is reproducible across kernel restarts.
random.seed(42)

# Build (name, gender) pairs from both corpus files. Parentheses replace the
# backslash continuation, which was followed by a stray space and was
# therefore a SyntaxError.
# NOTE: from here on `names` is the labelled list, no longer the corpus reader
# imported above.
names = ([(name, 'male') for name in names.words('male.txt')]
         + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

# Preview a few entries instead of dumping the whole list.
names[:10]

In [None]:
def gender_features(word):
    """Extract the gender-classification features of a name.

    Args:
        word: The name to featurize; it needs at least two characters,
            otherwise indexing raises ``IndexError``.

    Returns:
        A dict holding the last character under '最后一个字母' and the
        second-to-last character under '倒数第二个字母'.
    """
    last_letter = word[-1]
    second_to_last_letter = word[-2]
    return {'最后一个字母': last_letter, '倒数第二个字母': second_to_last_letter}
# Pair each name's feature dict with its gender label. Descriptive loop names
# avoid confusion with the notebook-level `n` used by the similarity section.
features = [(gender_features(name), gender) for (name, gender) in names]

# Preview a few entries instead of dumping the whole list.
features[:10]

In [None]:
# Hold out the first 500 samples for evaluation and train on the remainder.
test = features[:500]
train = features[500:]
classifier = nltk.NaiveBayesClassifier.train(train)

In [None]:
# Evaluate the trained classifier on the held-out test samples.
nltk.classify.accuracy(classifier, test)

In [None]:
# Classify a new name using the same features the model was trained on.
# The original passed {'last_letter': 'Ella'}, a key that never appears in the
# training features, so the name's letters did not influence the prediction.
classifier.classify(gender_features('Ella'))

In [None]:
# Print the 5 features the classifier found most informative.
classifier.show_most_informative_features(5)

In [None]:
import nltk.corpus
from nltk.text import TextCollection
# TextCollection expects each text as a sequence of tokens. With raw strings,
# term counts use substring matching and text length is measured in
# characters, so every sentence is split into words first.
corpus = TextCollection([sentence.split() for sentence in (
    'this is sentence one',
    'this is sentence two',
    'this is sentence three',
)])

# TF-IDF of the term 'this' in a new, likewise tokenized, sentence.
corpus.tf_idf('this', 'this is sentence four'.split())