In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import loaddata
import numpy as np
import sklearn
import jieba
import jieba.posseg as pseg
import sklearn.feature_extraction.text

In [3]:
messages = loaddata.load_message()
content = np.array([m[0] for m in messages])
target = np.array([m[1] for m in messages])

In [4]:
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_count = MessageCountVectorizer(min_df=50,max_df=0.8)
data_count = vec_count.fit_transform(content)
vec_count.get_feature_names()
print(data_count.shape)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.747 seconds.
DEBUG:jieba:Loading model cost 0.747 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


(23734, 1179)


In [5]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
def find_best_svm(data, target, cv):
    clf = SVC()
    C = [0.1, 0.5, 0.75, 1, 2, 3, 5, 10, 20]
    kernel = ['linear']#, 'poly', 'rbf']
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    grid_search.fit(data, target)
    grid_search.cls_name = 'SVM'
    return grid_search
grid_svm = find_best_svm(data_count, target, cv=10)
grid_svm.grid_scores_
grid_svm.best_score_, grid_svm.best_estimator_, grid_svm.best_params_

(0.94282463975731023, SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 0.5, 'kernel': 'linear'})

In [8]:
from sklearn.cross_validation import train_test_split
def get_classes_accury(data, target, test_times = 10, test_size=0.1):
    scores = np.zeros((test_times,len(set(target))))
    for t in range(test_times):
        clf =SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)
        Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size=test_size,
                                                    random_state=t)
        clf.fit(Xtrain, ytrain)
        print(t, clf.score(Xtest, ytest))
        pre = clf.predict(Xtest)
        for i,c in enumerate(list(set(target))):
            s = np.logical_and(pre==c, ytest==c).sum()/ (ytest==c).sum()
            scores[t, i] = s

    ##### 生成表格
    print('|'+'class'+'|'+'|'.join([str(i) for i  in list(set(target))])+'|')
    print('|'+'-'+'|')
    for i,score in enumerate(scores):
        print( '|'+str(i)+'|'+ '|'.join(['{:.4f}'.format(_) for _ in score])+ '|' )
    print( '|'+'max'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.max(axis=0)])+ '|' )
    print( '|'+'min'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.min(axis=0)])+ '|' )
    print( '|'+'mean'+'|'+  '|'.join(['{:.4f}'.format(_) for _ in scores.mean(axis=0)])+ '|' )

    return scores
scores = get_classes_accury(data_count, target)

0 0.94650379107
1 0.943133951137
2 0.946082561078
3 0.954928390901
4 0.960404380792
5 0.946925021061
6 0.93850042123
7 0.950294860994
8 0.949452401011
9 0.940185341196
|class|1|3|6|5|2|4|
|-|
|0|0.9848|0.6667|0.7705|0.9373|0.9498|0.8438|
|1|0.9777|0.5882|0.8333|0.9251|0.9476|0.8415|
|2|0.9909|0.5263|0.7931|0.9112|0.9460|0.8821|
|3|0.9860|0.7917|0.8060|0.9499|0.9459|0.8939|
|4|0.9925|0.8095|0.8116|0.9483|0.9510|0.9096|
|5|0.9866|0.7000|0.7018|0.9358|0.9351|0.8579|
|6|0.9825|0.5714|0.7848|0.9481|0.9415|0.7662|
|7|0.9831|0.5455|0.7302|0.9417|0.9574|0.8500|
|8|0.9853|0.7333|0.7353|0.9489|0.9449|0.8592|
|9|0.9813|0.7857|0.7872|0.9186|0.9474|0.8061|
|max|0.9925|0.8095|0.8333|0.9499|0.9574|0.9096|
|min|0.9777|0.5263|0.7018|0.9112|0.9351|0.7662|
|mean|0.9851|0.6718|0.7754|0.9365|0.9467|0.8510|


In [None]:
s ='''
|class|3|2|6|1|4|5|
|mean|0.6878|0.9624|0.8616|0.9923|0.9004|0.9541|
|class|6|3|5|4|2|1|
|mean|0.8648|0.6938|0.9545|0.8993|0.9612|0.9900|
|class|4|5|3|6|1|2|
|mean|0.8789|0.9621|0.6537|0.8097|0.9915|0.9567|
|class|1|3|6|5|2|4|
|mean|0.9851|0.6718|0.7754|0.9365|0.9467|0.8510|
'''

In [9]:
import jieba

In [10]:
jieba.cut('尊 敬的用 户:您2009年10月份的积 分奖 品尚未领 取,请拨I259O7612按 2键领 取(48小时内有效)咨 询95105526')

<generator object Tokenizer.cut at 0x7f1ead6175c8>

In [11]:
[i for i in jieba.cut('尊 敬的用 户:您2009年10月份的积 分奖 品尚未领 取,请拨I259O7612按 2键领 取(48小时内有效)咨 询95105526')]

['尊',
 ' ',
 '敬',
 '的',
 '用',
 ' ',
 '户',
 ':',
 '您',
 '2009',
 '年',
 '10',
 '月份',
 '的',
 '积',
 ' ',
 '分奖',
 ' ',
 '品',
 '尚未',
 '领',
 ' ',
 '取',
 ',',
 '请拨',
 'I259O7612',
 '按',
 ' ',
 '2',
 '键领',
 ' ',
 '取',
 '(',
 '48',
 '小时',
 '内',
 '有效',
 ')',
 '咨',
 ' ',
 '询',
 '95105526']

In [13]:
class TfidfVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_analyzer(self):
        #analyzer = super(TfidfVectorizer, self).build_analyzer()
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_tfidf = TfidfVectorizer(min_df=50,max_df=0.8)
data_tfidf = vec_tfidf.fit_transform(content)
print(data_tfidf.shape)

(23734, 1179)


In [14]:
def test_SVM(data, target):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    clf = SVC()

    C = [0.1, 0.5, 0.75, 1, 2, 3, 5, 10, 30]
    kernel = ['linear'] # 'poly',
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
    grid_search.fit(data, target)

    return grid_search
grid_search = test_SVM(data_tfidf, target)
print(grid_search.grid_scores_)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_

[mean: 0.92011, std: 0.01803, params: {'C': 0.1, 'kernel': 'linear'}, mean: 0.93975, std: 0.01423, params: {'C': 0.5, 'kernel': 'linear'}, mean: 0.94198, std: 0.01328, params: {'C': 0.75, 'kernel': 'linear'}, mean: 0.94278, std: 0.01305, params: {'C': 1, 'kernel': 'linear'}, mean: 0.94379, std: 0.01294, params: {'C': 2, 'kernel': 'linear'}, mean: 0.94367, std: 0.01332, params: {'C': 3, 'kernel': 'linear'}, mean: 0.94266, std: 0.01378, params: {'C': 5, 'kernel': 'linear'}, mean: 0.94084, std: 0.01359, params: {'C': 10, 'kernel': 'linear'}, mean: 0.93524, std: 0.01245, params: {'C': 30, 'kernel': 'linear'}]


(0.94379371365972864, SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 2, 'kernel': 'linear'})