## 1.2 本小节学习使用情感词典辅助情感分类

In [1]:
from nltk.corpus import opinion_lexicon

In [2]:
# Load the sentiment lexicon: the positive and negative word lists from
# NLTK's opinion_lexicon corpus, stored as sets for membership tests.
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# Report how many distinct words each polarity set contains.
print('positive:', len(positive_words))
print('negative:', len(negative_words))

positive: 2006
negative: 4783


In [3]:
from nltk.corpus import movie_reviews
import random
random.seed(42)

# Load the movie_reviews corpus and split it into training and test sets
def load_movie_reviews(train_size=1600):
    """Load the labelled movie reviews and split them into train/test lists.

    Every file id in the 'pos' and 'neg' categories of ``movie_reviews`` is
    read as raw text and paired with the label 'positive' or 'negative'.
    The combined list is shuffled with the global ``random`` generator (so
    the result depends on the current seed) and then cut at ``train_size``.

    Args:
        train_size: Number of shuffled reviews placed in the training list.
            Defaults to 1600; the remaining reviews form the test list.

    Returns:
        A ``(train_reviews, test_reviews)`` tuple of lists whose items are
        ``(raw_text, label)`` pairs.

    Raises:
        ValueError: If ``train_size`` is negative.
    """
    if train_size < 0:
        raise ValueError(f'train_size must be non-negative, got {train_size}')

    # Positives first, then negatives, so the pre-shuffle order (and hence
    # the seeded shuffle result) is stable.
    labelled_ids = (
        [(fid, 'positive') for fid in movie_reviews.fileids('pos')]
        + [(fid, 'negative') for fid in movie_reviews.fileids('neg')]
    )
    all_reviews = [(movie_reviews.raw(fid), label) for fid, label in labelled_ids]

    random.shuffle(all_reviews)

    return all_reviews[:train_size], all_reviews[train_size:]

# Build the train/test split once; later cells reuse these two lists.
train_reviews, test_reviews = load_movie_reviews()
# Print the size of each split as a quick sanity check.
print('train:', len(train_reviews))
print('test:', len(test_reviews))

train: 1600
test: 400


In [4]:
from nltk import NaiveBayesClassifier


def train_and_test(extract_feature, train_data, test_data):
    """Train a Naive Bayes classifier and print its accuracy on the test data.

    Args:
        extract_feature: Callable that maps one text to a feature dict; it is
            applied to both datasets through ``apply_features``.
        train_data: Examples used for training (in this notebook, a list of
            ``(text, label)`` pairs).
        test_data: Examples used to measure accuracy, in the same format as
            ``train_data``.

    Returns:
        The trained ``NaiveBayesClassifier``.
    """
    # Imported here so the function does not rely on a module-level
    # ``import nltk`` having been executed by some other cell beforehand.
    from nltk.classify.util import accuracy, apply_features

    training_set = apply_features(extract_feature, train_data)
    test_set = apply_features(extract_feature, test_data)

    classifier = NaiveBayesClassifier.train(training_set)
    test_accuracy = accuracy(classifier, test_set)
    print(f'accuracy is {test_accuracy:.4f}')

    return classifier

In [5]:
# 只把情感词当作特征进行提取
from nltk import word_tokenize
import nltk
def extract_feature(text):
    """Turn a text into a feature dict keyed by the sentiment words it contains.

    The text is lower-cased and tokenized; each token found in
    ``positive_words`` or ``negative_words`` contributes one entry
    ``'contain: <token>' -> True``. All other tokens are ignored.

    Args:
        text: The raw text to featurize.

    Returns:
        A dict mapping ``'contain: <word>'`` to ``True`` for every sentiment
        word that occurs in the text.
    """
    tokens = word_tokenize(text.lower())
    return {
        f'contain: {token}': True
        for token in tokens
        if token in positive_words or token in negative_words
    }

In [6]:
# Train on the sentiment-word features and print the test accuracy; the
# returned classifier is the cell's last expression, so its repr is shown.
train_and_test(extract_feature, train_reviews, test_reviews)

accuracy is 0.8125


<nltk.classify.naivebayes.NaiveBayesClassifier at 0x157d7159c70>

#### 仅根据情感词的数量进行情感判断（最简单的分类器）

In [7]:

def count_based_classifier(text):
    """Classify a text by counting its positive and negative lexicon words.

    Args:
        text: The raw text to classify.

    Returns:
        ``'positive'`` when the text has at least as many positive as
        negative sentiment words (ties and texts with no sentiment words
        count as positive), otherwise ``'negative'``.
    """
    pos_num = 0
    neg_num = 0
    # Lower-case before tokenizing, the same normalisation extract_feature
    # applies, so capitalised sentiment words are still counted.
    for word in word_tokenize(text.lower()):
        if word in positive_words:
            pos_num += 1
        if word in negative_words:
            neg_num += 1

    return 'positive' if pos_num >= neg_num else 'negative'

In [8]:
# Evaluate the count-based classifier on the held-out test reviews.
correct_num = 0
all_num = 0

for review, polarity in test_reviews:
    predicted = count_based_classifier(review)
    if predicted == polarity:
        correct_num += 1
    all_num += 1

# Report 0.0 instead of raising ZeroDivisionError when the test set is empty.
count_based_accuracy = correct_num / all_num if all_num else 0.0
print(f'accuracy is {count_based_accuracy:.4f}')

accracy is 0.6600


更重要的问题

- 如何得到情感词典
    - 从语料库中挖掘情感词典 / 自定义种子词
    - 使用半监督的方法扩展情感词典
