-
Notifications
You must be signed in to change notification settings - Fork 169
/
Sentiment_svm.py
131 lines (102 loc) · 4.02 KB
/
Sentiment_svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 5 10:05:30 2016
@author: ldy
"""
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
from sklearn.externals import joblib
from sklearn.svm import SVC
import sys
# Python 2 idiom: reload(sys) makes sys.setdefaultencoding callable again so
# the process-wide default string encoding can be switched to UTF-8 (this
# script handles Chinese text as byte strings).
# NOTE(review): neither the reload() builtin nor sys.setdefaultencoding exists
# on Python 3 -- these two lines must be dropped if the script is ported.
reload(sys)
sys.setdefaultencoding('utf8')
# Load the spreadsheets, tokenize the text and split it into train/test sets
def loadfile():
    """Read the labelled reviews, tokenize them and split them 80/20.

    Reads ``data/neg.xls`` and ``data/pos.xls`` (no header row), tokenizes
    column 0 of each sheet with ``jieba.cut`` and labels positive rows 1 and
    negative rows 0.  The label arrays of the split are written to
    ``svm_data/y_train.npy`` and ``svm_data/y_test.npy``.

    Returns:
        tuple: ``(x_train, x_test)`` -- the tokenized sentences of the
        training and test portions.
    """
    def tokenize(sentence):
        return list(jieba.cut(sentence))

    # NOTE(review): ``index`` is not a documented read_excel parameter; it is
    # kept only to mirror the original call -- confirm it is accepted by the
    # pandas version in use.
    negative = pd.read_excel('data/neg.xls', header=None, index=None)
    positive = pd.read_excel('data/pos.xls', header=None, index=None)
    positive['words'] = positive[0].apply(tokenize)
    negative['words'] = negative[0].apply(tokenize)

    # Positives first, then negatives, so the labels line up with the
    # concatenated sentences: 1 = positive sentiment, 0 = negative.
    labels = np.concatenate((np.ones(len(positive)), np.zeros(len(negative))))
    sentences = np.concatenate((positive['words'], negative['words']))

    x_train, x_test, y_train, y_test = train_test_split(
        sentences, labels, test_size=0.2)

    np.save('svm_data/y_train.npy', y_train)
    np.save('svm_data/y_test.npy', y_test)
    return x_train, x_test
# Average the word vectors of every word in a sentence
def buildWordVector(text, size, imdb_w2v):
    """Return the mean vector of the words in ``text`` known to ``imdb_w2v``.

    Args:
        text: iterable of words (one tokenized sentence).
        size: dimensionality of the word vectors.
        imdb_w2v: object indexable by word that raises ``KeyError`` for
            unknown words and otherwise yields an array reshapeable to
            ``(1, size)``.

    Returns:
        numpy.ndarray: array of shape ``(1, size)``; all zeros when none of
        the words is found.
    """
    total = np.zeros((1, size))
    hits = 0
    for token in text:
        try:
            embedding = imdb_w2v[token]
        except KeyError:
            # Unknown word: it contributes nothing to the average.
            continue
        total += embedding.reshape((1, size))
        hits += 1

    if hits:
        return total / hits
    return total
# Train word2vec and compute the sentence vectors
def get_train_vecs(x_train, x_test):
    """Train a Word2Vec model and save averaged sentence vectors to disk.

    Builds the vocabulary from ``x_train``, trains on it, and writes the
    training sentence vectors to ``svm_data/train_vecs.npy``.  The model is
    then trained further on ``x_test``, saved to
    ``svm_data/w2v_model/w2v_model.pkl``, and used to write the test sentence
    vectors to ``svm_data/test_vecs.npy``.  The shape of each saved array is
    printed.  Vectors are saved unscaled.

    Args:
        x_train: tokenized training sentences.
        x_test: tokenized test sentences.
    """
    n_dim = 300

    # Initialize the model and build its vocabulary from the training set.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    # Training may take several minutes.
    imdb_w2v.train(x_train)

    def embed(sentences):
        # One (1, n_dim) row per sentence, stacked along axis 0.
        rows = [buildWordVector(sentence, n_dim, imdb_w2v)
                for sentence in sentences]
        return np.concatenate(rows)

    train_vecs = embed(x_train)
    np.save('svm_data/train_vecs.npy', train_vecs)
    print(train_vecs.shape)

    # NOTE(review): the model keeps training on x_test *after* train_vecs was
    # computed, so the train and test vectors come from different model
    # states and the saved model has seen the test text -- confirm intended.
    imdb_w2v.train(x_test)
    imdb_w2v.save('svm_data/w2v_model/w2v_model.pkl')

    test_vecs = embed(x_test)
    np.save('svm_data/test_vecs.npy', test_vecs)
    print(test_vecs.shape)
def get_data():
    """Load the saved feature and label arrays from ``svm_data/``.

    Returns:
        tuple: ``(train_vecs, y_train, test_vecs, y_test)`` as loaded by
        ``np.load`` from the corresponding ``.npy`` files.
    """
    return (
        np.load('svm_data/train_vecs.npy'),
        np.load('svm_data/y_train.npy'),
        np.load('svm_data/test_vecs.npy'),
        np.load('svm_data/y_test.npy'),
    )
# Train the SVM model
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVC, save it, and print its score on the test set.

    Args:
        train_vecs: training feature matrix.
        y_train: training labels.
        test_vecs: test feature matrix.
        y_test: test labels.

    The fitted classifier is dumped to ``svm_data/svm_model/model.pkl``.
    """
    classifier = SVC(kernel='rbf', verbose=True)
    classifier.fit(train_vecs, y_train)
    joblib.dump(classifier, 'svm_data/svm_model/model.pkl')

    test_score = classifier.score(test_vecs, y_test)
    print(test_score)
# Build the word vector of a single sentence to be classified
def get_predict_vecs(words):
    """Return the averaged word vector for one tokenized sentence.

    Loads the Word2Vec model saved at ``svm_data/w2v_model/w2v_model.pkl``
    and averages the vectors of ``words`` with ``buildWordVector``.

    Args:
        words: list of tokens of the sentence.

    Returns:
        The ``(1, 300)`` array produced by ``buildWordVector``.
    """
    # Same dimensionality that get_train_vecs uses when training the model.
    vector_size = 300
    w2v_model = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    return buildWordVector(words, vector_size, w2v_model)
# Classify the sentiment of a single sentence
def svm_predict(string):
    """Print ``string`` followed by its predicted sentiment.

    The sentence is tokenized with ``jieba.lcut``, turned into a vector by
    ``get_predict_vecs`` and classified by the SVM loaded from
    ``svm_data/svm_model/model.pkl``.  A predicted label of 1 prints
    ``positive``; anything else prints ``negative``.

    Args:
        string: the raw sentence to classify.
    """
    tokens = jieba.lcut(string)
    features = get_predict_vecs(tokens)
    classifier = joblib.load('svm_data/svm_model/model.pkl')
    predicted = classifier.predict(features)

    verdict = ' positive' if int(predicted[0]) == 1 else ' negative'
    # Sentence and verdict separated by one space, as a two-item print would.
    print('%s %s' % (string, verdict))
if __name__ == '__main__':
    # Training pipeline: load the files, then process and save them as
    # vectors.  Uncomment to rebuild the saved artifacts.
    # x_train,x_test=loadfile()  # tokenized sentences; labels are saved as y_train.npy / y_test.npy
    # get_train_vecs(x_train,x_test)  # compute sentence vectors, saved as train_vecs.npy / test_vecs.npy
    # train_vecs,y_train,test_vecs,y_test=get_data()  # load the training and test data
    # svm_train(train_vecs,y_train,test_vecs,y_test)  # train the SVM and save the model

    # Classify the sentiment of an input sentence.
    sample_review = '电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    # Alternative sample sentence:
    # sample_review = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    svm_predict(sample_review)