In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import loaddata
import numpy as np
import sklearn
import jieba
import jieba.posseg as pseg
import sklearn.feature_extraction.text

In [3]:
messages = loaddata.load_message()
content = np.array([m[0] for m in messages])
target = np.array([m[1] for m in messages])

In [4]:
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_count = MessageCountVectorizer(min_df=5,max_df=0.8)
data_count = vec_count.fit_transform(content)
vec_count.get_feature_names()
print(data_count.shape)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.732 seconds.
DEBUG:jieba:Loading model cost 0.732 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


(23734, 8895)


In [5]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
def find_best_svm(data, target, cv):
    clf = SVC()
    C = [0.1, 0.5, 0.75, 1, 2, 3, 5, 10, 20]
    kernel = ['linear']#, 'poly', 'rbf']
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    grid_search.fit(data, target)
    grid_search.cls_name = 'SVM'
    return grid_search
grid_svm = find_best_svm(data_count, target, cv=10)
grid_svm.grid_scores_
grid_svm.best_score_, grid_svm.best_estimator_, grid_svm.best_params_

(0.96081570742394873, SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 0.5, 'kernel': 'linear'})

In [7]:
from sklearn.cross_validation import train_test_split
def get_classes_accury(data, target, test_times = 10, test_size=0.1):
    scores = np.zeros((test_times,len(set(target))))
    for t in range(test_times):
        clf =SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)
        Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size=test_size,
                                                    random_state=t)
        clf.fit(Xtrain, ytrain)
        print(t, clf.score(Xtest, ytest))
        pre = clf.predict(Xtest)
        for i,c in enumerate(list(set(target))):
            s = np.logical_and(pre==c, ytest==c).sum()/ (ytest==c).sum()
            scores[t, i] = s

    ##### 生成表格
    print('|'+'class'+'|'+'|'.join([str(i) for i  in list(set(target))])+'|')
    print('|'+'-'+'|')
    for i,score in enumerate(scores):
        print( '|'+str(i)+'|'+ '|'.join(['{:.4f}'.format(_) for _ in score])+ '|' )
    print( '|'+'max'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.max(axis=0)])+ '|' )
    print( '|'+'min'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.min(axis=0)])+ '|' )
    print( '|'+'mean'+'|'+  '|'.join(['{:.4f}'.format(_) for _ in scores.mean(axis=0)])+ '|' )

    return scores
scores = get_classes_accury(data_count, target)

0 0.958719460826
1 0.965037910699
2 0.963352990733
3 0.966722830666
4 0.973041280539
5 0.96251053075
6 0.955349620893
7 0.965037910699
8 0.96251053075
9 0.961668070767
|class|6|3|5|4|2|1|
|-|
|0|0.8361|0.7083|0.9506|0.9167|0.9440|0.9905|
|1|0.8833|0.7647|0.9588|0.9071|0.9629|0.9857|
|2|0.8621|0.5789|0.9471|0.9245|0.9546|0.9945|
|3|0.9104|0.7500|0.9679|0.8939|0.9595|0.9916|
|4|0.9275|0.8095|0.9655|0.9149|0.9725|0.9934|
|5|0.8596|0.6000|0.9566|0.8579|0.9582|0.9928|
|6|0.8608|0.6429|0.9536|0.8458|0.9610|0.9864|
|7|0.7778|0.6364|0.9500|0.9167|0.9729|0.9893|
|8|0.7941|0.7333|0.9548|0.9223|0.9592|0.9890|
|9|0.9362|0.7143|0.9403|0.8929|0.9676|0.9869|
|max|0.9362|0.8095|0.9679|0.9245|0.9729|0.9945|
|min|0.7778|0.5789|0.9403|0.8458|0.9440|0.9857|
|mean|0.8648|0.6938|0.9545|0.8993|0.9612|0.9900|


In [9]:
class TfidfVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_analyzer(self):
        #analyzer = super(TfidfVectorizer, self).build_analyzer()
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_tfidf = TfidfVectorizer(min_df=5,max_df=0.8)
data_tfidf = vec_tfidf.fit_transform(content)
print(data_tfidf.shape)

(23734, 8895)


In [10]:
def test_SVM(data, target):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    clf = SVC()

    C = [0.1, 0.5, 0.75, 1, 2, 3, 4, 5, 10, 30]
    kernel = ['linear'] # 'poly',
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
    grid_search.fit(data, target)

    return grid_search
grid_search = test_SVM(data_tfidf, target)
print(grid_search.grid_scores_)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_

[mean: 0.91860, std: 0.02108, params: {'C': 0.1, 'kernel': 'linear'}, mean: 0.95450, std: 0.01219, params: {'C': 0.5, 'kernel': 'linear'}, mean: 0.95770, std: 0.01158, params: {'C': 0.75, 'kernel': 'linear'}, mean: 0.95926, std: 0.01140, params: {'C': 1, 'kernel': 'linear'}, mean: 0.96229, std: 0.01092, params: {'C': 2, 'kernel': 'linear'}, mean: 0.96208, std: 0.01102, params: {'C': 3, 'kernel': 'linear'}, mean: 0.96162, std: 0.01125, params: {'C': 4, 'kernel': 'linear'}, mean: 0.96166, std: 0.01153, params: {'C': 5, 'kernel': 'linear'}, mean: 0.96027, std: 0.01113, params: {'C': 10, 'kernel': 'linear'}, mean: 0.95858, std: 0.01112, params: {'C': 30, 'kernel': 'linear'}]


(0.96229038510154208, SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 2, 'kernel': 'linear'})