In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

import json
import requests
import time
import re
import jieba
from collections import Counter
import numpy as np

In [2]:
epochs = 10
learning_rate = 0.01

In [None]:
# 从京东下载评论，这里已经下过一次，如果需要重新下载，可以修改URL并重新执行该代码段
url_templates = [
    "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100007218425&score={}&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1"
]

def get_comments(url):
    comments = []
    
    rsp = requests.get(url)
    rsp.encoding = "gbk"
    
    if rsp.status_code != 200:
        return []
    
    content = rsp.text
    if content:
        ind = content.find("(")
        s1 = content[ind+1:-2]
        #print(s1)
        try:
            js = json.loads(s1)
            comment_infos = js["comments"]
        except:
            print("Error")
            return ([])
        
        for comment_info in comment_infos:
            comment_content = comment_info["content"]
            str1 = comment_content + "\n"
            comments.append(str1)
            
    return comments


good_comments = []
j = 0

for template in url_templates:
    for i in range(100):
        url = template.format(3, i)
        good_comments += get_comments(url)
        time.sleep(1)
        print("第{}条记录，文本总长度{}".format(j, len(good_comments)))
        j +=1

fw = open("good.txt", "w", encoding="utf-8")
fw.writelines(good_comments)


bad_comments = []
j = 0

for template in url_templates:
    for i in range(100):
        url = template.format(1, i)
        bad_comments += get_comments(url)
        time.sleep(1)
        print("第{}条记录，文本总长度{}".format(j, len(bad_comments)))
        j +=1

fw = open("bad.txt", "w", encoding="utf-8")
fw.writelines(bad_comments)

In [3]:
def filter_punc(sentence):
    sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\'“”《》?“]+|[+——！，。？、~@#￥%……&*（）：]+", "", sentence)  
    return(sentence)

In [4]:
def prepare_data(good_file, bad_file, is_filter = True):
    all_words = []
    pos_sentences = []
    neg_sentences = []
    with open(good_file, "r", encoding="utf-8") as fr:
        for idx, line in enumerate(fr):
            if is_filter:
                line = filter_punc(line)
            words = jieba.lcut(line)
            if len(words) > 0:
                all_words += words
                pos_sentences.append(words)
    print("{0} 包含 {1} 行, {2} 个词.".format(good_file, idx+1, len(all_words)))

    count = len(all_words)
    with open(bad_file, "r", encoding="utf-8") as fr:
        for idx, line in enumerate(fr):
            if is_filter:
                line = filter_punc(line)
            words = jieba.lcut(line)
            if len(words) > 0:
                all_words += words
                neg_sentences.append(words)
    print("{0} 包含 {1} 行, {2} 个词.".format(bad_file, idx+1, len(all_words)-count))

    dic = {}
    cnt = Counter(all_words)
    for word, freq in cnt.items():
        dic[word] = [len(dic), freq]
    print('字典大小：{}'.format(len(dic)))
    return(pos_sentences, neg_sentences, dic)

In [5]:
def word2index(word, diction):
    if word in diction:
        value = diction[word][0]
    else:
        value = -1
    return(value)

In [6]:
def index2word(index, diction):
    for w,v in diction.items():
        if v[0] == index:
            return(w)
    return(None)

In [7]:
good_file = "good.txt"
bad_file  = "bad.txt"

pos_sentences, neg_sentences, diction = prepare_data(good_file, bad_file, True)
st = sorted([(v[1], w) for w, v in diction.items()])
#print(st)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 0.669 seconds.
Prefix dict has been built succesfully.


good.txt 包含 3309 行, 61497 个词.
bad.txt 包含 1354 行, 18912 个词.
字典大小：7213


In [8]:
def sentence2vec(sentence, dictionary):
    vector = np.zeros(len(dictionary))
    for l in sentence:
        vector[l] += 1
    return(1.0 * vector / len(sentence))

In [9]:
dataset = []
labels = []
sentences = []

In [10]:
# 处理正向评论
for sentence in pos_sentences:
    new_sentence = []
    for l in sentence:
        if l in diction:
            new_sentence.append(word2index(l, diction))
    dataset.append(sentence2vec(new_sentence, diction))
    labels.append(0) #正标签为0
    sentences.append(sentence)

In [11]:
# 处理负向评论
for sentence in neg_sentences:
    new_sentence = []
    for l in sentence:
        if l in diction:
            new_sentence.append(word2index(l, diction))
    dataset.append(sentence2vec(new_sentence, diction))
    labels.append(1) #负标签为1
    sentences.append(sentence)

In [12]:
# 打乱评论顺序
indices = np.random.permutation(len(dataset))

dataset = [dataset[i] for i in indices]
labels = [labels[i] for i in indices]
sentences = [sentences[i] for i in indices]

In [13]:
# 数据切割
test_size = len(dataset) // 10
print(test_size)

train_data = dataset[2 * test_size :]
train_label = labels[2 * test_size :]
print(len(train_data))

valid_data = dataset[: test_size]
valid_label = labels[: test_size]
print(len(valid_data))

test_data = dataset[test_size : 2 * test_size]
test_label = labels[test_size : 2 * test_size]
print(len(test_data))

465
3724
465
465


In [14]:
model = nn.Sequential(
    nn.Linear(len(diction), 10),
    nn.ReLU(),
    nn.Linear(10, 2),
    nn.LogSoftmax(dim=1),
)

In [15]:
def rightness(predictions, labels):
    pred = torch.max(predictions.data, 1)[1] 
    rights = pred.eq(labels.data.view_as(pred)).sum() 
    return rights, len(labels)

In [16]:
cost = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [17]:
losses = []

def trainModel(data, label):
    x = Variable(torch.FloatTensor(data).view(1,-1))
    y = Variable(torch.LongTensor(np.array([label])))

    optimizer.zero_grad()
    predict = model(x)
    loss = cost(predict, y)
    losses.append(loss.data.numpy())
    loss.backward()
    optimizer.step()

In [18]:
def evaluateModel(data, label):
    x = Variable(torch.FloatTensor(data).view(1, -1))
    y = Variable(torch.LongTensor(np.array([label])))

    predict = model(x)
    right = rightness(predict, y)
    loss = cost(predict, y)

    return predict, right, loss

In [19]:
for epoch in range(epochs):
    for i, data in enumerate(zip(train_data, train_label)):
        x, y = data
        trainModel(x, y)
    
    val_losses = []
    rights = []
    
    for j, val in enumerate(zip(valid_data, valid_label)):
        x, y = val
        predict, right, loss = evaluateModel(x, y)
        rights.append(right)
        val_losses.append(loss.data.numpy())
        
    
    right_ratio = 1.0 * np.sum([i[0] for i in rights]) / np.sum([i[1] for i in rights])
    print('第{}轮，训练损失：{:.2f}, 校验损失：{:.2f}, 校验准确率: {:.2f}'.format(epoch+1, np.mean(losses),np.mean(val_losses), right_ratio))


第1轮，训练损失：0.31, 校验损失：0.35, 校验准确率: 0.87
第2轮，训练损失：0.23, 校验损失：0.31, 校验准确率: 0.90
第3轮，训练损失：0.19, 校验损失：0.37, 校验准确率: 0.91
第4轮，训练损失：0.16, 校验损失：0.41, 校验准确率: 0.91
第5轮，训练损失：0.14, 校验损失：0.48, 校验准确率: 0.91
第6轮，训练损失：0.13, 校验损失：0.54, 校验准确率: 0.90
第7轮，训练损失：0.12, 校验损失：0.58, 校验准确率: 0.91
第8轮，训练损失：0.11, 校验损失：0.63, 校验准确率: 0.91
第9轮，训练损失：0.10, 校验损失：0.69, 校验准确率: 0.90
第10轮，训练损失：0.10, 校验损失：0.73, 校验准确率: 0.91


In [20]:
sentences = [sentences[i] for i in np.random.permutation(len(sentences))]
flags = ["好", "坏"]

for sentence in sentences[:100]:
    new_sentence = []
    for l in sentence:
        if l in diction:
            new_sentence.append(word2index(l, diction))
    test_sentence = sentence2vec(new_sentence, diction)
    output = model(Variable(torch.FloatTensor(test_sentence).view(1,-1)))
    print( "评论：", flags[torch.max(output.data, 1)[1].item()], "---", sentence)


评论： 好 --- ['终于', '到手', '了', '性能', '强悍', '的', '小', '新', '158', '核', 'R7', '-', '4800U16GC', '面稍', '有', '塑料', '感', '做工', '还', '不错', '；', '高色域', '的', '雾化', '屏幕', '素质', '良好', '画面', '清晰', '；', '跑', '分', '高速度', '快', '同时', '打开', '多个', '页面', '不卡顿', '静音', '效果', '好', '运行', '时', '几乎', '听不见', '风扇', '声音', '；', '全', '尺寸', '键盘', '手感', '好', '舒服', '小键盘', '用', '起来', '很', '方便', '；', '接口', '够用', '圆孔', '的', '充电', '口', '2', '个', 'USB', '口全', '尺寸', 'SD', '读卡器', '耳麦', 'HDMI', '据说', 'type', '-', 'c', '也', '可充电', '还', '没试', '过', '办公', '追剧', '做', '够用', '重量轻', '外出', '携带方便']
评论： 好 --- ['轻薄', '程度', '很', '轻薄', '的', '漂亮']
评论： 坏 --- ['散热', '性能', '差']
评论： 好 --- ['外形', '外观', '全', '铝合金', '加上', '处理', '过', '的', '边边角角', '拿', '在', '手里', '舒服', '不搁', '手', '轻薄', '不用说', '了', '这', '是', '轻薄', '本', '的', '必备条件', '联想', '的', '充电器', '慢慢', '的', '越来越', '小', '现在', '的', '这样', '体积', '我', '完全', '能', '接受', '外带', '没', '感觉', '走', '压力']
评论： 好 --- ['轻薄', '程度', '轻薄', '本', '相比', '我', '以前', '的', '老年', '机', '方便', '很多', '携带方便']
评论： 坏 --- ['外形', '外观', 