In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import loaddata
import numpy as np
import sklearn
import jieba
import jieba.posseg as pseg
import sklearn.feature_extraction.text

In [3]:
messages = loaddata.load_message()
content = np.array([m[0] for m in messages])
target = np.array([m[1] for m in messages])

In [4]:
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_count = MessageCountVectorizer(min_df=2,max_df=0.8)
data_count = vec_count.fit_transform(content)
vec_count.get_feature_names()
print(data_count.shape)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.845 seconds.
DEBUG:jieba:Loading model cost 0.845 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


(23734, 21959)


In [5]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
def find_best_svm(data, target, cv):
    clf = SVC()
    C = [0.1, 0.5, 0.75, 1, 2, 3, 5, 10, 20]
    kernel = ['linear']#, 'poly', 'rbf']
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    grid_search.fit(data, target)
    grid_search.cls_name = 'SVM'
    return grid_search
grid_svm = find_best_svm(data_count, target, cv=10)
grid_svm.grid_scores_
grid_svm.best_score_, grid_svm.best_estimator_, grid_svm.best_params_

(0.96089997471981126, SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 0.5, 'kernel': 'linear'})

In [8]:
from sklearn.cross_validation import train_test_split
def get_classes_accury(data, target, test_times = 10, test_size=0.1):
    scores = np.zeros((test_times,len(set(target))))
    for t in range(test_times):
        clf = SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False)
        Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size=test_size,
                                                    random_state=t)
        clf.fit(Xtrain, ytrain)
        print(t, clf.score(Xtest, ytest))
        pre = clf.predict(Xtest)
        for i,c in enumerate(list(set(target))):
            s = np.logical_and(pre==c, ytest==c).sum()/ (ytest==c).sum()
            scores[t, i] = s

    ##### 生成表格
    print('|'+'class'+'|'+'|'.join([str(i) for i  in list(set(target))])+'|')
    print('|'+'-'+'|')
    for i,score in enumerate(scores):
        print( '|'+str(i)+'|'+ '|'.join(['{:.4f}'.format(_) for _ in score])+ '|' )
    print( '|'+'max'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.max(axis=0)])+ '|' )
    print( '|'+'min'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.min(axis=0)])+ '|' )
    print( '|'+'mean'+'|'+  '|'.join(['{:.4f}'.format(_) for _ in scores.mean(axis=0)])+ '|' )

    return scores
scores = get_classes_accury(data_count, target)

0 0.963774220725
1 0.965459140691
2 0.963774220725
3 0.96798652064
4 0.973041280539
5 0.961668070767
6 0.958719460826
7 0.964616680708
8 0.960825610783
9 0.965037910699
|class|3|2|6|1|4|5|
|-|
|0|0.6667|0.9614|0.8689|0.9915|0.9062|0.9563|
|1|0.7647|0.9629|0.8833|0.9857|0.9126|0.9588|
|2|0.5789|0.9590|0.8621|0.9954|0.9245|0.9433|
|3|0.7500|0.9595|0.8806|0.9963|0.9091|0.9619|
|4|0.7619|0.9686|0.9130|0.9934|0.9202|0.9713|
|5|0.7000|0.9623|0.8421|0.9937|0.8579|0.9472|
|6|0.6429|0.9610|0.8734|0.9912|0.8557|0.9536|
|7|0.6364|0.9671|0.7619|0.9911|0.9222|0.9500|
|8|0.7333|0.9531|0.7941|0.9908|0.9029|0.9568|
|9|0.6429|0.9696|0.9362|0.9935|0.8929|0.9421|
|max|0.7647|0.9696|0.9362|0.9963|0.9245|0.9713|
|min|0.5789|0.9531|0.7619|0.9857|0.8557|0.9421|
|mean|0.6878|0.9624|0.8616|0.9923|0.9004|0.9541|


In [11]:
#直接采用TFID生成对应的
class TfidfVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_analyzer(self):
        #analyzer = super(TfidfVectorizer, self).build_analyzer()
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_tfidf = TfidfVectorizer(min_df=2,max_df=0.8)
data_tfidf = vec_tfidf.fit_transform(content)
print(data_tfidf.shape)

(23734, 21959)


In [12]:
def test_SVM(data, target):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    clf = SVC()

    C = [0.1, 0.5, 0.75, 1, 2, 3, 4, 5, 10, 30]
    kernel = ['linear'] # 'poly',
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
    grid_search.fit(data, target)

    return grid_search
grid_search = test_SVM(data_tfidf, target)
print(grid_search.grid_scores_)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_

[mean: 0.91278, std: 0.02150, params: {'C': 0.1, 'kernel': 'linear'}, mean: 0.95285, std: 0.01301, params: {'C': 0.5, 'kernel': 'linear'}, mean: 0.95761, std: 0.01146, params: {'C': 0.75, 'kernel': 'linear'}, mean: 0.96023, std: 0.01070, params: {'C': 1, 'kernel': 'linear'}, mean: 0.96292, std: 0.01068, params: {'C': 2, 'kernel': 'linear'}, mean: 0.96259, std: 0.01109, params: {'C': 3, 'kernel': 'linear'}, mean: 0.96263, std: 0.01101, params: {'C': 4, 'kernel': 'linear'}, mean: 0.96212, std: 0.01083, params: {'C': 5, 'kernel': 'linear'}, mean: 0.96136, std: 0.01085, params: {'C': 10, 'kernel': 'linear'}, mean: 0.96010, std: 0.01074, params: {'C': 30, 'kernel': 'linear'}]


(0.9629223898205107, SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False), {'C': 2, 'kernel': 'linear'})

In [1]:
import loaddata
import numpy as np
import sklearn
import jieba
import jieba.posseg as pseg
import sklearn.feature_extraction.text

讠斤 



In [2]:
messages = loaddata.load_message()
content = np.array([m[0] for m in messages])
target = np.array([m[1] for m in messages])

In [3]:
fantizi = loaddata.load_fantizi()
content_after_fantizi = []
processed = set()
for i in content:
    new_words=''
    for k in i:
        if k in fantizi:
            new_words += fantizi[k]
            processed.add((k, fantizi[k]))
        else:
            new_words += k
    content_after_fantizi.append(new_words)

In [4]:
chaifenzi = loaddata.load_chaifenzi()
content_after_chaifenzi = []
found_chaifenzi = set()
for line in content_after_fantizi:
    result = line
    for k,v in chaifenzi.items():
        if k in line:
            found_chaifenzi.add((k,v))
            result = result.replace(k,v)
    content_after_chaifenzi.append(result)

讠斤 



In [5]:
digits_features = np.zeros((len(content),16))
import re
for i,line in enumerate(content):
    for digits in re.findall(r'\d+', line):
        length = len(digits)
        if 0 < length <= 15:
            digits_features[i, length-1] += 1
        elif length > 15:
            digits_features[i, 15] += 1

In [6]:
class TfidfVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    def build_analyzer(self):
        #analyzer = super(TfidfVectorizer, self).build_analyzer()
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer

vec_tfidf = TfidfVectorizer(min_df=2,max_df=0.8)
data_tfidf = vec_tfidf.fit_transform(content_after_chaifenzi)
print(data_tfidf.shape)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.873 seconds.
DEBUG:jieba:Loading model cost 0.873 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


(23734, 21948)


In [7]:
data_tfidf_digits = np.concatenate((data_tfidf.A, digits_features),axis=1)

In [None]:
def test_SVM(data, target):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    clf = SVC()

    C = [2]
    kernel = ['linear'] # 'poly',
    param_grid = [{'C': C, 'kernel':kernel}]
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
    grid_search.fit(data, target)

    return grid_search
grid_search = test_SVM(data_tfidf_digits, target)
print(grid_search.grid_scores_)
grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_