### 从文件中读取词数据，并将词数据生成集合返回

In [3]:
def make_word_set(words_file_path):
    '''
    从文件中读取词数据，并将词数据生成集合返回
    :words_file_path 保存词数据的文件路径
    :return 词集合
    '''
    words_set = set()
    with open(words_file_path, 'r') as fp:
        for line in fp.readlines():
            word = line.strip().decode("utf-8")
            # 去掉重复的词
            if len(word)>0 and word not in words_set:
                words_set.add(word)
    return words_set

### 使用jieba分词将句子切分，并将切分好的句子保存到文件中

In [22]:
def cut_files(corpus_path, write_path, stopwords):
    '''
    切分文件中的句子，并把切好的句子保存到文件中
    :corpus_path 未切词的原始句子输入文件的根目录路径
    :write_path 切词后的句子的写入路径
    '''
    import jieba
    # 闭包，切分单个句子
    def cut_sentence(sentence, stopwords):
        """
        闭包，使用jieba分词将句子切分
        :sentence 要切分的句子
        :stopwords 停用词集合
        """
        sentence_cut = jieba.cut(sentence.strip())
        # 最终以字符串形式返回
        result = ''
        for word in sentence_cut:
            word = word.strip()
            if word not in stopwords:
                if len(word) > 0:
                    result += word
                    result += " "
        return result

    # 打开写入的文件
    with open(write_path, 'w+') as w:
        # 遍历并打开要读取的文件
        import os
        class_dir_names = os.listdir(corpus_path)
        for class_dir_name in class_dir_names:
            file_names = os.listdir(corpus_path + '/' + class_dir_name)
            for file_name in file_names:
                with open(corpus_path + '/' + class_dir_name + '/' + file_name) as f:
                    # 去除内容两侧空白符，替换换行符、制表符为空格，并编码为utf-8
                    content = f.read().decode('utf-8').strip().replace('\n', ' ').replace('\t', ' ')
                    # 切分句子
                    token_sen = cut_sentence(content, stopwords)
                    # 将切分好的句子打上标签写入文件
                    w.write(class_dir_name.encode('utf-8') + '\t' + token_sen.encode('utf-8') + '\n')

### 从原始文件中读取文本，进行预处理并打上标签（仅在需要的时候执行）

In [31]:
%%time
stopwords = make_word_set('/home/beanyon/Desktop/logistic_regression/tools/tokenizer/stopwords.txt')
cut_files('/home/beanyon/Desktop/logistic_regression/Sample', '/home/beanyon/Desktop/logistic_regression/corpus.txt', stopwords)

CPU times: user 4min 38s, sys: 1.17 s, total: 4min 39s
Wall time: 4min 39s


### 从文件中获取并组装样本-13个类别各6500个文件

In [1]:
def get_samples_from_file_6500(file_path):
    """
    从文件中获取并组装样本
    :file_path 存放样本的文件路径
    :return 标签列表,语料列表
    """
    label_list = []
    corpus_list = []
    with open(file_path, 'r') as p:
        for line in p.readlines():
            sample = line.split('\t')
            label_list.append(sample[0])
            corpus_list.append(sample[1])
    
    # 将样本乱序
    sample_list = zip(label_list, corpus_list)
    import random
    random.shuffle(sample_list)
    label_list, corpus_list = zip(*sample_list)
    
    # 分割训练集、验证集、测试集
    train_label = label_list[:65000]
    val_label = label_list[65000:71500]
    test_label = label_list[71500:]
    train_corpus = corpus_list[:65000]
    val_corpus = corpus_list[65000:71500]
    test_corpus = corpus_list[71500:]
    
    return train_label, train_corpus, val_label, val_corpus, test_label, test_corpus

### 从文件中获取并组装样本-11个类别各20000个文件

In [10]:
def get_samples_from_file_20000(file_path):
    """
    从文件中获取并组装样本
    :file_path 存放样本的文件路径
    :return 标签列表,语料列表
    """
    label_list = []
    corpus_list = []
    with open(file_path, 'r') as p:
        for line in p.readlines():
            sample = line.split('\t')
            label_list.append(sample[0])
            corpus_list.append(sample[1])
    
    # 将样本乱序
    sample_list = zip(label_list, corpus_list)
    import random
    random.shuffle(sample_list)
    label_list, corpus_list = zip(*sample_list)
    
    # 分割训练集、验证集、测试集
    train_label = label_list[:165000]
    val_label = label_list[165000:176000]
    test_label = label_list[176000:]
    train_corpus = corpus_list[:165000]
    val_corpus = corpus_list[165000:176000]
    test_corpus = corpus_list[176000:]
    
    return train_label, train_corpus, val_label, val_corpus, test_label, test_corpus

### 将训练集、验证集、测试集合并，获取全部样本

In [2]:
def get_all_corpus(train_set, val_set, test_set):
    '''
    将训练集、验证集、测试集合并，获取全部样本
    :train_set 训练集样本列表
    :val_set 验证集样本列表
    :test_set 测试集样本列表
    :return 合并后的全体样本列表
    '''
    return train_set + val_set + test_set

### 计算TF-IDF值

In [3]:
def calc_tf_idf(corpus_set):
    '''
    计算TF-IDF值
    :train_corpus 训练集语料库
    :return TF-IDF值集合
    '''
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    # drop df < 1e-5,去低频词
    vectorizer = CountVectorizer(min_df=1e-5)
    transformer = TfidfTransformer()
    vectors = vectorizer.fit_transform(corpus_set)
    tf_idf = transformer.fit_transform(vectors)
    words = vectorizer.get_feature_names()
    print "how many words: {0}".format(len(words))
    print "tf-idf shape: ({0},{1})".format(tf_idf.shape[0], tf_idf.shape[1])
    print "vectors shape: ({0},{1})".format(vectors.shape[0], vectors.shape[1])
    return tf_idf

### 使用逻辑回归进行分类

In [4]:
def logistic_classifier(train_set_features, train_label, 
                        val_set_features, val_label, 
                        test_set_features, test_label):
    '''
    使用逻辑回归进行分类
    :train_set_features 训练集特征
    :train_label 训练集标签
    :val_set_features 验证集特征
    :val_label 验证集标签
    :test_set_features 测试集特征
    :test_label 测试集标签
    '''
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report
    
    lr_model = LogisticRegression() 
    lr_model.fit(train_set_features, train_label) 
    print "val mean accuracy: {0}".format(lr_model.score(val_set_features, val_label))
    y_pred = lr_model.predict(test_set_features)
    print classification_report(test_label, y_pred)

### 使用随机森林进行分类

In [5]:
def random_forest_classifier(train_set_features, train_label, 
                             val_set_features, val_label, 
                             test_set_features, test_label, 
                             n_est=200):
    '''
    使用随机森林进行分类
    :train_set_features 训练集特征
    :train_label 训练集标签
    :val_set_features 验证集特征
    :val_label 验证集标签
    :test_set_features 测试集特征
    :test_label 测试集标签
    :n_est 构建多少棵树，默认200棵
    '''
    from sklearn.ensemble import RandomForestClassifier 
    from sklearn.metrics import classification_report
    
    rf_model = RandomForestClassifier(n_estimators=n_est, random_state=1080, n_jobs=-1) 
    rf_model.fit(train_set_features, train_label) 
    print "val mean accuracy: {0}".format(rf_model.score(val_set_features, val_label)) 
    y_pred = rf_model.predict(test_set_features) 
    print classification_report(test_label, y_pred)

### 特征提取

In [13]:
%%time
## 从磁盘中读取样本标签和语料库
# 训练集，65000, 验证集，6500, 测试集，16578
# train_label, train_corpus, val_label, val_corpus, test_label, test_corpus = get_samples_from_file_6500('/home/beanyon/Desktop/logistic_regression/corpus.txt')
train_label, train_corpus, val_label, val_corpus, test_label, test_corpus = get_samples_from_file_20000('/home/beanyon/Desktop/preprogress/corpus.txt')
# 将语料库合并，用于计算TF-IDF值
all_corpus = get_all_corpus(train_corpus, val_corpus, test_corpus)
# 全量计算TF-IDF值
all_features = calc_tf_idf(all_corpus)
# 分割样本集为训练集、验证集、测试集
# train_features_set = all_features[:65000]
# val_features_set = all_features[65000:71500]
# test_features_set = all_features[71500:]
train_features_set = all_features[:165000]
val_features_set = all_features[165000:176000]
test_features_set = all_features[176000:]

how many words: 179774
tf-idf shape: (220000,179774)
vectors shape: (220000,179774)
CPU times: user 27 s, sys: 567 ms, total: 27.5 s
Wall time: 27.5 s


### 使用逻辑回归进行分类

In [14]:
%%time
# 使用逻辑回归进行分类
logistic_classifier(train_features_set, train_label, 
                    val_features_set, val_label, 
                    test_features_set, test_label)

val mean accuracy: 0.951272727273


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 0: ordinal not in range(128)

### 使用随机森林进行分类

In [15]:
%%time
# 使用随机森林进行分类
random_forest_classifier(train_features_set, train_label,
                         val_features_set, val_label,
                         test_features_set, test_label,
                         n_est=200)

  from numpy.core.umath_tests import inner1d


val mean accuracy: 0.942545454545


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 0: ordinal not in range(128)