# 半监督词分类算法
相关链接：https://blog.csdn.net/Yellow_python/article/details/100940617

In [1]:
from jieba import cut
from re import fullmatch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import Counter

读语料

In [2]:
def _read_lines(path):
    """Return the content of the UTF-8 text file at ``path`` split on newlines.

    Leading and trailing whitespace of the whole file is stripped before
    splitting, so a trailing newline does not produce an empty last entry.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read().strip().split("\n")


# One entry per line; the x_* and y_* files are read in the same order as before.
x_train = _read_lines('x_train.txt')
y_train = _read_lines('y_train.txt')
x_test = _read_lines('x_test.txt')
y_test = _read_lines('y_test.txt')

分词器

In [3]:
def match(word):
    """Return a match object when ``word`` consists solely of ASCII letters
    and/or characters in the range U+4E00-U+9FA5, otherwise ``None``.

    An empty string never matches because the pattern requires at least one
    character.
    """
    return fullmatch('[a-zA-Z\u4e00-\u9fa5]+', word)


# Iterating the string yields one set entry per character, so every stop word
# here is a single character.
stopwords = set('的了很买是都还也我就在这那又里哦和')


def tokenizer(text):
    """Segment ``text`` with jieba (HMM disabled) and keep the usable tokens.

    A token is kept when it passes ``match`` and is not in ``stopwords``.

    Returns a list rather than a one-shot generator, so the result can be
    iterated more than once and handed to code that needs ``len()``.
    """
    return [word for word in cut(text, HMM=False) if match(word) and word not in stopwords]

向量化

In [4]:
# Fit the TF-IDF vocabulary and weights on the training texts only, then map
# the test texts into that same feature space.
# token_pattern=None: the custom tokenizer replaces scikit-learn's regex
# tokenisation, and leaving the default pattern set makes recent scikit-learn
# versions warn that it is unused.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HONGJI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.764 seconds.
Prefix dict has been built succesfully.


分类模型

In [5]:
# Train a multinomial naive Bayes model on the TF-IDF training features
# (fit() returns the estimator itself), then report its score on the test split.
clf = MultinomialNB().fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.9138333333333334


可视化函数

In [6]:
def yellow(w):
    """Wrap ``w`` in the ANSI escape sequence for colour code 33, followed by a reset."""
    return '\033[033m{}\033[0m'.format(w)


def hot(i):
    """Format ``i`` with two decimals inside an ANSI colour sequence.

    Values below 0.5 use colour code 34; all other values use colour code 31.
    """
    if i < .5:
        return '\033[034m%.2f\033[0m' % i
    return '\033[031m%.2f\033[0m' % i

### 全监督词分类

In [7]:
# Count how often each retained token occurs across the training texts.
words = Counter(word for text in x_train for word in tokenizer(text))

# Score the 100 most frequent tokens in one batch instead of running one
# transform + predict_proba round trip per word. Each word is vectorised as
# its own single-word document, so the per-word probabilities are unchanged.
top_words = words.most_common(100)
if top_words:  # nothing to score when the corpus produced no tokens
    # Column 1 corresponds to clf.classes_[1] - presumably the positive label; confirm.
    top_probs = clf.predict_proba(vectorizer.transform([word for word, _ in top_words]))[:, 1]
    for (word, freq), pred in zip(top_words, top_probs):
        print(word, yellow(freq), hot(pred))

苹果 [033m1789[0m [031m0.61[0m
好 [033m1555[0m [031m0.81[0m
不 [033m1424[0m [034m0.25[0m
京东 [033m1088[0m [031m0.63[0m
吃 [033m867[0m [031m0.66[0m
有 [033m769[0m [034m0.39[0m
不错 [033m719[0m [031m0.94[0m
小 [033m623[0m [034m0.31[0m
新鲜 [033m595[0m [031m0.60[0m
水果 [033m549[0m [034m0.42[0m
一个 [033m546[0m [034m0.42[0m
好吃 [033m540[0m [031m0.75[0m
个 [033m509[0m [034m0.34[0m
差 [033m504[0m [034m0.06[0m
甜 [033m476[0m [031m0.72[0m
脆 [033m476[0m [031m0.90[0m
没有 [033m473[0m [031m0.54[0m
没 [033m454[0m [034m0.46[0m
烂 [033m450[0m [034m0.12[0m
坏 [033m442[0m [034m0.34[0m
给 [033m442[0m [034m0.41[0m
太 [033m439[0m [034m0.23[0m
大 [033m435[0m [031m0.60[0m
味道 [033m402[0m [031m0.70[0m
非常 [033m343[0m [031m0.70[0m
包装 [033m341[0m [031m0.83[0m
快递 [033m338[0m [031m0.71[0m
就是 [033m331[0m [031m0.63[0m
到 [033m320[0m [031m0.54[0m
说 [033m319[0m [034m0.29[0m
个头 [033m311[0m [031m0.67[0m
快 [033m306[0m [031

### 半监督词分类

In [8]:
# Class probabilities for every test text, one row per text.
y_pred = clf.predict_proba(X_test)

# Per-word tallies over test-set words that are absent from `words`:
#   c1 - number of occurrences
#   c2 - sum of the column-1 probability of the texts the word occurs in
#   c3 - occurrences inside texts where column 0 does not strictly beat column 1
c1, c2, c3 = Counter(), Counter(), Counter()
for doc, proba in zip(x_test, y_pred):
    vote = 0 if proba[0] > proba[1] else 1  # same for every word of this text
    # NOTE(review): cut() runs with its default HMM setting here, whereas
    # `tokenizer` passes HMM=False - presumably deliberate so that words missing
    # from the training vocabulary surface; confirm.
    for token in cut(doc):
        if token in words or token in stopwords or not match(token):
            continue
        c1[token] += 1
        c2[token] += proba[1]
        c3[token] += vote

# Show the 100 most frequent new words with their mean probability and vote share.
for token, count in c1.most_common(100):
    mean_proba = c2[token] / count
    vote_share = c3[token] / count
    print(token, yellow(count), hot(mean_proba), hot(vote_share))

差评 [033m216[0m [034m0.08[0m [034m0.00[0m
很脆 [033m113[0m [031m0.89[0m [031m0.96[0m
还会 [033m106[0m [031m0.90[0m [031m0.98[0m
不甜 [033m96[0m [034m0.29[0m [034m0.15[0m
脆甜 [033m96[0m [031m0.88[0m [031m0.95[0m
这是 [033m81[0m [034m0.28[0m [034m0.19[0m
太差 [033m78[0m [034m0.12[0m [034m0.00[0m
买过 [033m64[0m [031m0.58[0m [031m0.58[0m
不买 [033m64[0m [034m0.16[0m [034m0.03[0m
全是 [033m60[0m [034m0.15[0m [034m0.03[0m
不脆 [033m60[0m [034m0.33[0m [034m0.23[0m
一看 [033m56[0m [034m0.20[0m [034m0.11[0m
很差 [033m55[0m [034m0.16[0m [034m0.02[0m
给力 [033m55[0m [031m0.89[0m [031m0.96[0m
脆脆 [033m54[0m [031m0.90[0m [031m0.98[0m
坏果 [033m51[0m [031m0.53[0m [031m0.55[0m
一星 [033m48[0m [034m0.14[0m [034m0.02[0m
网购 [033m47[0m [034m0.30[0m [034m0.23[0m
好用 [033m47[0m [031m0.53[0m [031m0.98[0m
还行 [033m40[0m [031m0.70[0m [031m0.82[0m
甜脆 [033m40[0m [031m0.89[0m [031m0.95[0m
中果 [033m36[0m [034m0.16[0m 