In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import loaddata
import numpy as np
import sklearn
import jieba
import jieba.posseg as pseg
import sklearn.feature_extraction.text

In [3]:
messages = loaddata.load_message()
content = np.array([m[0] for m in messages])
target = np.array([m[1] for m in messages])

In [4]:
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_count = MessageCountVectorizer(min_df=10,max_df=0.8)
data_count = vec_count.fit_transform(content)
vec_count.get_feature_names()
print(data_count.shape)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.732 seconds.
DEBUG:jieba:Loading model cost 0.732 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


(23734, 4869)


In [5]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
def find_best_svm(data, target, cv):
    clf = SVC()
    C = [0.1, 0.5, 0.75, 1, 2, 3, 5, 10, 20]
    kernel = ['linear']#, 'poly', 'rbf']
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    grid_search.fit(data, target)
    grid_search.cls_name = 'SVM'
    return grid_search
grid_svm = find_best_svm(data_count, target, cv=10)
grid_svm.grid_scores_
grid_svm.best_score_, grid_svm.best_estimator_, grid_svm.best_params_

(0.95761355018117467, SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 0.1, 'kernel': 'linear'})

In [7]:
from sklearn.cross_validation import train_test_split
def get_classes_accury(data, target, test_times = 10, test_size=0.1):
    scores = np.zeros((test_times,len(set(target))))
    for t in range(test_times):
        clf =SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)
        Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size=test_size,
                                                    random_state=t)
        clf.fit(Xtrain, ytrain)
        print(t, clf.score(Xtest, ytest))
        pre = clf.predict(Xtest)
        for i,c in enumerate(list(set(target))):
            s = np.logical_and(pre==c, ytest==c).sum()/ (ytest==c).sum()
            scores[t, i] = s

    ##### 生成表格
    print('|'+'class'+'|'+'|'.join([str(i) for i  in list(set(target))])+'|')
    print('|'+'-'+'|')
    for i,score in enumerate(scores):
        print( '|'+str(i)+'|'+ '|'.join(['{:.4f}'.format(_) for _ in score])+ '|' )
    print( '|'+'max'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.max(axis=0)])+ '|' )
    print( '|'+'min'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.min(axis=0)])+ '|' )
    print( '|'+'mean'+'|'+  '|'.join(['{:.4f}'.format(_) for _ in scores.mean(axis=0)])+ '|' )

    return scores
scores = get_classes_accury(data_count, target)

0 0.956613310868
1 0.964616680708
2 0.962931760741
3 0.964616680708
4 0.969250210615
5 0.961668070767
6 0.952401010952
7 0.960404380792
8 0.960825610783
9 0.9599831508
|class|4|5|3|6|1|2|
|-|
|0|0.8906|0.9563|0.6667|0.7869|0.9896|0.9479|
|1|0.8962|0.9682|0.7647|0.8500|0.9857|0.9585|
|2|0.8962|0.9584|0.5789|0.7931|0.9945|0.9611|
|3|0.8939|0.9699|0.6667|0.8060|0.9953|0.9575|
|4|0.9096|0.9693|0.7619|0.8696|0.9953|0.9588|
|5|0.8579|0.9679|0.6000|0.7544|0.9928|0.9540|
|6|0.8159|0.9573|0.6429|0.7975|0.9883|0.9610|
|7|0.8833|0.9521|0.5455|0.7302|0.9929|0.9613|
|8|0.8981|0.9666|0.6667|0.7941|0.9917|0.9449|
|9|0.8469|0.9548|0.6429|0.9149|0.9888|0.9615|
|max|0.9096|0.9699|0.7647|0.9149|0.9953|0.9615|
|min|0.8159|0.9521|0.5455|0.7302|0.9857|0.9449|
|mean|0.8789|0.9621|0.6537|0.8097|0.9915|0.9567|


In [9]:
class TfidfVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_analyzer(self):
        #analyzer = super(TfidfVectorizer, self).build_analyzer()
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_tfidf = TfidfVectorizer(min_df=10,max_df=0.8)
data_tfidf = vec_tfidf.fit_transform(content)
print(data_tfidf.shape)

(23734, 4869)


In [10]:
def test_SVM(data, target):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    clf = SVC()

    C = [0.1, 0.5, 0.75, 1, 2, 3, 4, 5, 10, 30]
    kernel = ['linear'] # 'poly',
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
    grid_search.fit(data, target)

    return grid_search
grid_search = test_SVM(data_tfidf, target)
print(grid_search.grid_scores_)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_

[mean: 0.92146, std: 0.02057, params: {'C': 0.1, 'kernel': 'linear'}, mean: 0.95348, std: 0.01235, params: {'C': 0.5, 'kernel': 'linear'}, mean: 0.95698, std: 0.01227, params: {'C': 0.75, 'kernel': 'linear'}, mean: 0.95816, std: 0.01185, params: {'C': 1, 'kernel': 'linear'}, mean: 0.95943, std: 0.01222, params: {'C': 2, 'kernel': 'linear'}, mean: 0.95947, std: 0.01135, params: {'C': 3, 'kernel': 'linear'}, mean: 0.95892, std: 0.01137, params: {'C': 4, 'kernel': 'linear'}, mean: 0.95770, std: 0.01176, params: {'C': 5, 'kernel': 'linear'}, mean: 0.95546, std: 0.01171, params: {'C': 10, 'kernel': 'linear'}, mean: 0.95365, std: 0.01191, params: {'C': 30, 'kernel': 'linear'}]


(0.95946743069014917, SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 3, 'kernel': 'linear'})