In [3]:
# 天池新闻文本分类比赛
# 初始baseline
import logging
from threading import Lock
from threading import Thread

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold

# Base directory for everything this notebook reads and writes: the competition CSVs,
# the cached TF-IDF model, and the LGBM model / result files are all addressed as
# data_path + <relative name>.
data_path = "./download/"

In [4]:
# Route every record (DEBUG and above) from the root logger into LGBM.log.
# Each line carries: timestamp - source path[line:N] - level: message.
logging.basicConfig(
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
    filename='LGBM.log',
    filemode='a',  # 'a' appends across runs; 'w' would overwrite the log on every run
    level=logging.DEBUG,
)

In [6]:
# Labelled training set (tab-separated).
train_df = pd.read_csv(
    data_path + 'train_set.csv',
    sep='\t',
    nrows=None,  # None reads every row
)
# Test set A (tab-separated), the set predictions are produced for.
test_df = pd.read_csv(
    data_path + 'test_a.csv',
    sep='\t',
    nrows=None,
)
# Test set B is currently not loaded.
# test_b_df = pd.read_csv(data_path + 'test_b.csv',sep='\t',nrows=None)

In [7]:
def define_tfidf(ngram,max_feature):
    """Create an unfitted word-level TF-IDF vectorizer.

    Args:
        ngram: largest n-gram size; n-grams from 1 up to this size are extracted.
        max_feature: value passed as ``max_features`` (vocabulary size cap).

    Returns:
        A ``TfidfVectorizer`` configured with the settings below; it still has to be fitted.
    """
    settings = {
        'sublinear_tf': True,
        'strip_accents': 'unicode',
        'analyzer': 'word',
        # Any run of one or more word characters is a token.
        'token_pattern': r'\w{1,}',
        'stop_words': 'english',
        'ngram_range': (1, ngram),
        'max_features': max_feature,
    }
    return TfidfVectorizer(**settings)

# k 折交叉验证
def k_evaluate(clf,x_train,y_train,x_test,k=10,n_est=0):
    """Run stratified k-fold cross-validation and collect per-fold test predictions.

    For every fold the classifier is refitted on the training part, scored on the
    validation part (macro F1, written to the log), used to predict ``x_test``, and
    dumped to ``data_path + 'LGBM/model/est<n_est>_KF_index<fold>'``.

    Args:
        clf: estimator exposing ``fit`` and ``predict``; it is refitted in place per fold.
        x_train: training features, indexable by an integer index array.
        y_train: training labels, one per row of ``x_train`` (Series or array-like).
        x_test: test features passed to ``clf.predict`` after every fold.
        k: number of folds.
        n_est: number used only in the saved model file names and log lines.

    Returns:
        Array of shape (n_test, k): column j holds the test predictions of fold j.
        There is no leading all-zero column any more, so every column is a real
        vote when the rows are majority-voted.
    """
    # Positional array view of the labels: indexing a pandas Series with the fold
    # indices would be label-based and only correct for a default RangeIndex.
    labels = np.asarray(y_train)
    skf = StratifiedKFold(n_splits=k, random_state=7,shuffle=True)
    fold_votes = []
    # Stratify on the labels that were passed in, not on the global train_df.
    for KF_index,(train_index,valid_index) in enumerate(skf.split(x_train, labels)):
        logging.info('第%d折交叉验证开始...'%(KF_index + 1))
        # Split this fold into training and validation parts.
        x_train_,x_valid_ = x_train[train_index],x_train[valid_index]
        y_train_,y_valid_ = labels[train_index],labels[valid_index]
        clf.fit(x_train_,y_train_)
        # Score on the held-out part; the logged number is the macro-averaged F1.
        val_pred = clf.predict(x_valid_)
        logging.info('准确率为：%.7f'%f1_score(y_valid_,val_pred,average='macro'))
        fold_votes.append(clf.predict(x_test))
        logging.info('保存模型est%d_KF_index%d'%(n_est,KF_index + 1))
        joblib.dump(clf,data_path + 'LGBM/model/est%d_KF_index%d'%(n_est,KF_index + 1),compress=3)
    return np.column_stack(fold_votes)

def save_pred2file(saved_path,test_preds):
    """Majority-vote each row of ``test_preds`` and write the winners to a CSV.

    Args:
        saved_path: destination CSV path.
        test_preds: 2-D array of non-negative integer labels, one row per sample
            and one column per vote.

    The CSV has a single ``label`` column and no index column. When several labels
    tie for the most votes, the smallest label wins (``argmax`` returns the first maximum).
    """
    winners = np.array([np.bincount(votes).argmax() for votes in test_preds])
    pd.DataFrame({'label': winners}).to_csv(saved_path,index=False)
    print("保存完毕")

In [4]:
# Kept commented out: fits a TF-IDF vectorizer (n-grams up to 3, 7000 features) on the
# train + test_a + test_b texts and saves it to data_path + 'tfidf.model'.
# As written it needs test_b_df, whose read_csv line is commented out as well.
# tfidf = define_tfidf(3,7000)
# tfidf.fit(pd.concat([train_df['text'],test_df['text'],test_b_df['text']]))
# joblib.dump(tfidf,data_path+'tfidf.model')

In [None]:
# Load the cached TF-IDF vectorizer. If the cache file does not exist yet (fresh
# checkout / Restart & Run All), fit one with the same settings as the commented-out
# fitting step (n-grams up to 3, 7000 features) and cache it, so this cell no longer
# depends on a file that only a disabled cell could create.
# NOTE: the fallback fits on train + test_a only, because test_b is not loaded here;
# a vectorizer cached earlier with test_b included may give slightly different features.
tfidf_path = data_path+'tfidf.model'
try:
    tfidf = joblib.load(tfidf_path)
except FileNotFoundError:
    logging.warning('%s not found; fitting a new TF-IDF vectorizer', tfidf_path)
    tfidf = define_tfidf(3,7000)
    tfidf.fit(pd.concat([train_df['text'],test_df['text']]))
    joblib.dump(tfidf,tfidf_path)

train_features = tfidf.transform(train_df['text'])
test_features = tfidf.transform(test_df['text'])
# test_b_features = tfidf.transform(test_b_df['text'])

# Names used by the training cells below.
x_train = train_features
y_train = train_df['label']
x_test = test_features

## LGBM 调参

## 多线程运行 k 折交叉验证

In [None]:
class TFIDF_based:
    """Stratified k-fold training with one thread per fold and majority-vote output.

    ``run`` builds the folds, trains every fold in its own thread, and writes the
    per-row majority vote of the fold predictions for ``X_test`` to a CSV file.
    Each fitted fold model is also dumped under ``./download/LGBM/model/``.

    Args:
        X_train: training features, indexable by an integer index array.
        Y_train: training labels, one per row of ``X_train`` (Series or array-like).
        X_test: test features; must expose ``shape``.
        varified_parament: name of the hyper-parameter being tuned (used in file names).
        param_value: value of that hyper-parameter (formatted with ``%.3f`` in file names).
    """

    def __init__(self, X_train, Y_train, X_test, varified_parament, param_value):
        self.x_train = X_train
        self.y_train = Y_train
        self.x_test = X_test
        self.varified_param = varified_parament
        self.param_value = param_value
        # One column per finished fold. It starts with zero columns: a seed column of
        # zeros would be counted as an extra vote for class 0 by the majority vote.
        self.test_preds = np.zeros((X_test.shape[0], 0), int)
        self.threads = []
        # Guards the read-modify-write of test_preds done by the fold threads.
        self._lock = Lock()

    def k_evaluate(self, clf, k=5):
        """Create (but do not start) one training thread per stratified fold.

        Any threads left over from an earlier call are discarded, because a thread
        object can only be started once.
        """
        skf = StratifiedKFold(n_splits=k, random_state=7, shuffle=True)
        # Positional array view of the labels: indexing a pandas Series with the fold
        # indices would be label-based and only correct for a default RangeIndex.
        labels = np.asarray(self.y_train)
        self.threads = []

        for KF_index, (train_index, valid_index) in enumerate(skf.split(self.x_train, labels)):
            logging.info('第%d折交叉验证开始...' % (KF_index + 1))
            # Split this fold into training and validation parts.
            x_train_, x_valid_ = self.x_train[train_index], self.x_train[valid_index]
            y_train_, y_valid_ = labels[train_index], labels[valid_index]
            # Every fold gets its own unfitted copy of the estimator. Fitting one shared
            # object from several threads lets one fold overwrite another fold's model
            # before that fold has predicted or been saved.
            fold_clf = clone(clf)
            self.threads.append(Thread(target=self.train, args=(fold_clf, x_train_, y_train_, x_valid_, y_valid_, KF_index)))

    def train(self, clf, x_train, y_train, x_valid, y_valid, KF_index):
        """Fit one fold, log its validation score, record its test votes, save the model."""
        clf.fit(x_train, y_train)
        # Score on the validation part; the logged number is the macro-averaged F1.
        val_pred = clf.predict(x_valid)
        logging.info('准确率为：%.7f' % f1_score(y_valid, val_pred, average='macro'))

        self._add_fold_votes(clf.predict(self.x_test))

        model_name = '%s%.3f_KF_index%d' % (self.varified_param, self.param_value, KF_index + 1)
        logging.info('保存模型:'+ model_name)
        joblib.dump(clf, './download/LGBM/model/' + model_name + '.model', compress=3)

    def _add_fold_votes(self, fold_pred):
        """Append one fold's test predictions as a new column of ``test_preds``."""
        # The prediction is computed by the caller, outside the lock; only the short
        # append is serialised so that no thread works on a stale copy of test_preds.
        with self._lock:
            self.test_preds = np.column_stack((self.test_preds, fold_pred))

    def save_pred2file(self, saved_path):
        """Write the per-row majority vote of ``test_preds`` to ``saved_path``.

        The CSV has a single ``label`` column and no index column.

        Raises:
            ValueError: if no fold predictions have been recorded.
        """
        if self.test_preds.shape[1] == 0:
            raise ValueError('no fold predictions recorded; nothing to save')
        # Most frequent label per row; on a tie the smallest label wins.
        preds = np.array([np.argmax(np.bincount(votes)) for votes in self.test_preds])

        # Write the submission CSV.
        submission = pd.DataFrame({'label': preds})
        submission.to_csv(saved_path, index=False)
        logging.info('预测数据保存到csv文件')
        logging.info('*' * 20)

    def run(self, clf, saved_path):
        """Train all folds in parallel threads and save the voted predictions.

        ``clf`` is used as a template only: each fold trains its own clone, so the
        object passed in is left unfitted. The per-fold votes stay available in
        ``test_preds`` afterwards and are cleared at the start of the next run.
        """
        # Start from an empty vote matrix so a repeated run never mixes in old votes.
        self.test_preds = np.zeros((self.x_test.shape[0], 0), int)
        self.k_evaluate(clf)
        for t in self.threads:
            t.daemon = True  # attribute form; Thread.setDaemon() is deprecated
            t.start()
        for t in self.threads:
            t.join()
        # A fold whose thread died with an exception leaves no column behind.
        finished = self.test_preds.shape[1]
        if finished != len(self.threads):
            logging.warning('only %d of %d folds produced predictions', finished, len(self.threads))
        self.save_pred2file(saved_path)

In [None]:
# Fixed LightGBM settings; the parameter under test is added further down.
params = {
    # 'bagging_fraction': 0.8,
    # 'learning_rate': 0.1,
    'objective': 'multiclass',
    'n_estimators': 500,
    'num_classes': 14,
    'reg_alpha': 0.001,
    'reg_lambda': 0.01,
}
# Tuning notes (scores as recorded by the author):
#   baseline, default configuration: about 0.943
#   1. n_estimators = 500 (number of trees): 0.946
#   2. learning_rate: no score recorded yet
# for param_value in range(550, 601, 50):
param_value = 0.01
varified_param = 'learning_rate'
information = '%s:%.3f 验证中...'%(varified_param,param_value)
logging.info(information)
# Fixed: this printed the misspelled name `infomation`, which raised NameError
# and stopped the cell before any training started.
print(information)
params[varified_param] = param_value
clf = lgb.LGBMClassifier(**params)
lgbm = TFIDF_based(x_train, y_train, x_test, varified_param, param_value)
lgbm.run(clf, data_path + 'LGBM/Test_a_results/%s_%.3f.csv' % (varified_param, param_value))

### 日志：经过测试，将训练和预测改为多线程的方式可以提升效率：n_estimators 为 250 时，耗时减少了约 50 分钟；为 400 以上时，可以减少一个小时以上。综合考量后，n_estimators 选择了一个相对折中的值——500。