In [1]:
from scipy import sparse
import numpy as np 
import re
import os

# 用 VSM 提取文本特征及权重

In [2]:
def build_dictionary(directory):
    """
    Collect the vocabulary of every ``.txt`` file found under ``directory``.

    The tree is walked recursively. Only the first line of each file is read;
    it is decoded as GBK and the comma-separated tokens that precede the
    ``,<<::=::>>`` marker are added to the vocabulary.

    Returns the set of all tokens seen.
    """
    vocabulary = set()

    for root, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith(".txt"):
                continue
            with open(os.path.join(root, name), "rb") as handle:
                first_line = handle.readline().decode('gbk')
            # Everything before the marker is the word list of the document.
            header = first_line.split(',<<::=::>>')[0]
            vocabulary.update(header.split(","))
    return vocabulary
    
def reduce_dict(dict_set):
    """
    Drop very short words and assign a column index to each remaining word.

    Words shorter than two characters are discarded. The remaining words are
    numbered 0..n-1 in sorted order, so the mapping is the same on every call
    and in every interpreter session (set iteration order is not, because it
    depends on the string hash seed).

    Parameters
    ----------
    dict_set : iterable of str
        The raw vocabulary, typically the set returned by build_dictionary.

    Returns
    -------
    dict
        Maps word -> column index.
    """
    kept_words = sorted(word for word in dict_set if len(word) >= 2)
    return {word: idx for idx, word in enumerate(kept_words)}
    
    
def transform_data(dir, dictionary):
    """
    Turn every ``.txt`` file under ``dir`` into a row of term frequencies.

    Only the first line of each file is read. It is decoded as GBK and is
    expected to look like ``w1,w2,...,<<::=::>>f1,f2,...,``: the words come
    before the ``<<::=::>>`` marker and their integer frequencies after it.
    The label of a document is the last path component of the directory that
    contains it. Each visited directory is printed as a progress indicator.

    The matrix is assembled directly from (row, column, value) triplets, so
    no dense documents-by-vocabulary array is ever allocated.

    Parameters
    ----------
    dir : str
        Root directory, walked recursively. (The name shadows the builtin but
        is kept so existing keyword callers keep working.)
    dictionary : dict
        Maps word -> column index; indices must lie in
        ``range(len(dictionary))``. Words missing from it are ignored.

    Returns
    -------
    (scipy.sparse.csr_matrix, numpy.ndarray)
        A float64 matrix with one row per file and ``len(dictionary)``
        columns, and the array of labels in the same row order. An empty
        tree yields a matrix with zero rows.

    Raises
    ------
    IndexError
        If the first line of a file does not contain the marker.
    ValueError
        If a frequency is not an integer literal.
    """
    row_idx, col_idx, values = [], [], []
    target = []

    for (dirname, dirs, files) in os.walk(dir):
        print(dirname)
        # The label only depends on the directory, not on the file.
        tag = re.split('[/\\\\]', dirname)[-1]

        for file in files:
            if not file.endswith(".txt"):
                continue

            with open(os.path.join(dirname, file), "rb") as f:
                content = f.readline().decode('gbk')

            words = content.split(',<<::=::>>')[0].split(",")
            freq_list = content.split('<<::=::>>')[1].strip(",").split(",")
            # Built once per file; zip() silently drops words that have no
            # matching frequency, and those words are skipped below.
            word_freq = dict(zip(words, [int(freq) for freq in freq_list]))

            row = len(target)
            for word in words:
                # A word listed k times contributes k times the frequency
                # paired with its last occurrence (duplicate triplets are
                # summed when the CSR matrix is built).
                if word in dictionary and word in word_freq:
                    row_idx.append(row)
                    col_idx.append(dictionary[word])
                    values.append(word_freq[word])

            target.append(tag)

    features = sparse.csr_matrix(
        (
            np.asarray(values, dtype=np.float64),
            (np.asarray(row_idx, dtype=np.intp), np.asarray(col_idx, dtype=np.intp)),
        ),
        shape=(len(target), len(dictionary)),
        dtype=np.float64,
    )
    # Do not store explicit zeros coming from zero frequencies.
    features.eliminate_zeros()
    return features, np.asarray(target)

In [6]:
# Build the vocabulary once, from the training set only, and reuse the very
# same mapping for both splits so that train and test features are guaranteed
# to share one column layout (and the training tree is not scanned twice).
train_directory = "/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/"
vocabulary = reduce_dict(build_dictionary(train_directory))

# This yields the VSM feature vectors of the 9431 training articles.
train_feature, train_target = transform_data(train_directory, vocabulary)

# The test set must be encoded with the training vocabulary.
test_directory = "/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/test_SegRes"
test_feature, test_target = transform_data(test_directory, vocabulary)


/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/体育_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/娱乐_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/人才_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/房产_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/艺术_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/科技_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/财经_SegRes
/workspace/mnt/group/customization/pengyuyan/dataMining/TanCorp-12-Txt_SegRes_2_1_53907f/train_SegRes/地域_SegRes
/

# 模型训练

In [17]:
# Fit a logistic-regression classifier on the term-frequency features
# produced by transform_data. The fit() call is the last expression of the
# cell, so the notebook displays the fitted estimator.
# NOTE(review): this import is repeated in a later cell; it could live with
# the other imports at the top of the notebook.
from sklearn import linear_model
logreg = linear_model.LogisticRegression()
logreg.fit(train_feature, train_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 评估预测精度

In [18]:
# Predict the label of every test document and compare element-wise with the
# true labels; accuracy is the fraction of matching positions.
test_predict = logreg.predict(test_feature)
true_false = (test_predict == test_target)
acc = np.count_nonzero(true_false)/float(len(test_target))

print("The Accuracy is {}".format(acc))

The Accuracy is 0.9502013138376775


# 用 TF-IDF 提取文本特征及权重

In [410]:
def compute_df_by_file(directory=None):
    """
    List the term frequency of every word in every ``.txt`` file.

    Only the first line of each file is read. It is decoded as GBK and is
    expected to look like ``w1,w2,...,<<::=::>>f1,f2,...,``: words before the
    ``<<::=::>>`` marker, their integer frequencies after it.

    Parameters
    ----------
    directory : str, optional
        Root directory, walked recursively. Defaults to the module-level
        ``train_directory``.

    Returns
    -------
    list
        One ``[word, file_name, term_frequency]`` entry per distinct word of
        each file, accumulated over ALL directories of the tree. The file is
        identified by its base name only.
        NOTE(review): files with the same base name in different
        sub-directories are therefore indistinguishable downstream — confirm
        that names are unique across categories.

    Raises
    ------
    IndexError
        If the first line of a file does not contain the marker.
    ValueError
        If a frequency is not an integer literal.
    """
    if directory is None:
        directory = train_directory

    # Must live outside the walk loop, otherwise every new directory would
    # throw away what was collected so far.
    word_docif_tf = []

    for (dirname, dirs, files) in os.walk(directory):
        for file in files:
            if not file.endswith(".txt"):
                continue

            with open(os.path.join(dirname, file), "rb") as f:
                content = f.readline().decode('gbk')

            words = content.split(',<<::=::>>')[0].split(",")
            freq_list = content.split('<<::=::>>')[1].strip(",").split(",")
            # zip() drops words without a matching frequency; for a repeated
            # word the frequency of its last occurrence wins.
            tf_dict = dict(zip(words, [int(freq) for freq in freq_list]))

            word_docif_tf += [[word, file, tf] for word, tf in tf_dict.items()]

    return word_docif_tf

In [418]:
import operator, itertools

def compute_dfidf(word_docif_tf=None):
    """
    Build per-word term-frequency and document-frequency tables.

    Parameters
    ----------
    word_docif_tf : list, optional
        ``[word, file_name, term_frequency]`` records. When omitted they are
        obtained from ``compute_df_by_file()``. The list is not modified.

    Returns
    -------
    (dict, dict)
        ``term_freq`` maps word -> {file_name: term_frequency} and
        ``doc_freq`` maps word -> number of records seen for that word.
        Both are empty when there are no records.
    """
    if word_docif_tf is None:
        word_docif_tf = compute_df_by_file()

    # groupby only merges adjacent items, so the records must be sorted by
    # word first.
    records = sorted(word_docif_tf)

    term_freq = dict()
    doc_freq = dict()

    for current_word, group in itertools.groupby(records, key=operator.itemgetter(0)):
        doclist = [[filename, tf] for _, filename, tf in group]
        term_freq[current_word] = dict(doclist)
        doc_freq[current_word] = len(doclist)

    # Return only after every word has been processed.
    return term_freq, doc_freq

In [79]:
import numpy as np

def tfidf(feature):
    """
    Weight a term-frequency matrix by inverse document frequency.

    For column (word) j with document frequency ``df_j`` — the number of rows
    holding a positive count — the weight is

        idf_j = log((N + 1) / (df_j + 1))

    where ``N`` is the number of rows. Rare words get large weights and a
    word present in every document gets weight 0; all weights are >= 0.

    NOTE: the IDF is derived from ``feature`` itself, so calling this
    separately on a training and a test matrix weights them differently.

    Parameters
    ----------
    feature : scipy.sparse matrix
        Documents-by-words term-frequency matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of the same shape holding ``tf * idf``.
    """
    n_docs = feature.shape[0]

    # Sparse .sum(axis=0) returns a 1 x V matrix; flatten it to a plain 1-D
    # float array so the arithmetic below is ordinary element-wise numpy.
    doc_freq = np.asarray((feature > 0).sum(axis=0), dtype=np.float64).ravel()
    idf = np.log((n_docs + 1.0) / (doc_freq + 1.0))

    # Broadcasting a 1 x V row scales each column by its IDF.
    return feature.multiply(idf.reshape(1, -1)).tocsr()


def get_target(directory):
    """
    Collect the label of every ``.txt`` file found under ``directory``.

    The label of a file is the last path component of the directory that
    contains it. Every visited directory is printed. Returns the labels as a
    list, in the order the files are encountered by ``os.walk``.
    """
    labels = []
    for folder, _subdirs, names in os.walk(directory):
        print(folder)
        n_txt = sum(1 for name in names if name.endswith(".txt"))
        if n_txt:
            # All files of one folder share the same label.
            labels.extend([re.split('[/\\\\]', folder)[-1]] * n_txt)

    return labels

In [None]:
# Compute the TF-IDF features and the targets for the training and test sets
# separately.
# NOTE(review): tfidf() derives its document frequencies from the matrix it
# is given, so the test matrix is weighted with test-set statistics rather
# than the training-set ones — confirm this is intended.
# NOTE(review): get_target() re-walks the directory; its labels line up with
# the feature rows only if os.walk yields the files in the same order as it
# did in transform_data — presumably true for an unchanged tree; verify.
train_feature_tfidf = tfidf(train_feature)
train_feature_target_tfidf = get_target(train_directory)

test_feature_tfidf = tfidf(test_feature)
test_feature_target_tfidf = get_target(test_directory)

In [None]:
# Train a logistic-regression classifier on the TF-IDF features.

from sklearn import linear_model
logreg_tfidf = linear_model.LogisticRegression()
logreg_tfidf.fit(train_feature_tfidf, train_feature_target_tfidf)

In [82]:
# Evaluate the TF-IDF model on the test set: accuracy is the fraction of
# predictions that equal the corresponding true label.

test_predict_tfidf = logreg_tfidf.predict(test_feature_tfidf)
true_false_tfidf = (test_predict_tfidf == test_feature_target_tfidf)
acc = np.count_nonzero(true_false_tfidf)/float(len(test_feature_target_tfidf))

print("The Accuracy is {}".format(acc))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)