In [1]:
import sys
from collections import Counter, namedtuple
from functools import reduce

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import coo_matrix, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_validate
from tqdm import tqdm


In [2]:
def _word_ngrams(tokens, ngram_range, stop_words=None):
    """Turn tokens into a sequence of n-grams after stop words filtering
    copy from https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L148
    """
    # handle stop words
    if stop_words is not None:
        tokens = [w for w in tokens if w not in stop_words]

    # handle token n-grams
    min_n, max_n = ngram_range
    if max_n != 1:
        original_tokens = tokens
        if min_n == 1:
            # no need to do any slicing for unigrams
            # just iterate through the original tokens
            tokens = list(original_tokens)
            min_n += 1
        else:
            tokens = []

        n_original_tokens = len(original_tokens)

        # bind method outside of loop to reduce overhead
        tokens_append = tokens.append
        space_join = " ".join

        for n in range(min_n,
                        min(max_n + 1, n_original_tokens + 1)):
            for i in range(n_original_tokens - n + 1):
                tokens_append(space_join(original_tokens[i: i + n]))

    return tokens

def line_generator(filename):
    """Yield the lines of a UTF-8 text file, or of standard input.

    Parameters
    ----------
    filename : str or None
        Path of the file to read. When ``None``, lines are read from
        ``sys.stdin`` instead.

    Yields
    ------
    str
        Each line, including its trailing newline if present.

    Notes
    -----
    The file is opened in a ``with`` block so the handle is released even
    when the consumer stops iterating early or an exception propagates.
    ``sys.stdin`` is read but never closed here.
    """
    if filename is None:
        yield from sys.stdin
        return
    with open(filename, encoding='utf-8') as f:
        yield from f


def row_generator(g):
    """Turn an iterator of comma-separated lines into named-tuple rows.

    The first line supplies the field names; every later line is split on
    commas and wrapped in a ``Row`` namedtuple with those fields. Values are
    kept as strings. Splitting is a plain ``str.split(',')``, so quoted
    fields containing commas are not supported.

    Parameters
    ----------
    g : iterator of str
        Lines of the file, header first.

    Yields
    ------
    Row
        One namedtuple per data line.
    """
    header = next(g, None)
    if header is None:
        # Empty input: finish cleanly. Letting StopIteration escape from
        # next() inside a generator becomes a RuntimeError (PEP 479).
        return
    Row = namedtuple('Row', header.strip().split(','))
    for line in g:
        yield Row(*line.strip().split(','))


def group_generator(g, group_keys):
    """Batch consecutive rows that share the same key attributes.

    Only adjacent rows are grouped; the input is not sorted, so rows with
    the same key that are separated by other rows end up in separate groups.

    Parameters
    ----------
    g : iterable
        Rows exposing the key fields as attributes (e.g. namedtuples).
    group_keys : str or iterable of str
        Attribute name, or names, that together identify a group.

    Yields
    ------
    list
        The rows of one group, in input order. Nothing is yielded for an
        empty input.
    """
    if isinstance(group_keys, str):
        group_keys = [group_keys]
    else:
        group_keys = list(group_keys)

    buffer = []
    last_gid = None
    for row in g:
        cur_gid = tuple(getattr(row, k) for k in group_keys)
        if buffer and cur_gid != last_gid:
            yield buffer
            buffer = []
        last_gid = cur_gid
        buffer.append(row)
    # Flush the trailing group, but never emit an empty one: consumers
    # index group[0] and would fail on [].
    if buffer:
        yield buffer

def ngram_generator(g, ngram_range):
    """Count API-call n-grams for each group of rows.

    Parameters
    ----------
    g : iterable of list
        Non-empty groups of rows; each row exposes ``file_id`` and ``api``
        attributes and optionally ``label``.
    ngram_range : (min_n, max_n)
        Forwarded to ``_word_ngrams``.

    Yields
    ------
    ThreadSample
        ``(file_id, ngrams, label)`` where ``file_id`` and ``label`` come
        from the first row of the group (``label`` is ``-1`` when the row has
        no such attribute) and ``ngrams`` is a ``Counter`` of n-gram
        occurrences over the group's ``api`` sequence.
    """
    ThreadSample = namedtuple('ThreadSample', ['file_id', 'ngrams', 'label'])
    for thread_rows in g:
        head = thread_rows[0]
        sample_id = head.file_id
        sample_label = getattr(head, 'label', -1)
        call_sequence = [r.api for r in thread_rows]
        gram_counts = Counter()
        # Counter.update on an iterable adds one per element seen.
        gram_counts.update(_word_ngrams(call_sequence, ngram_range))
        yield ThreadSample(sample_id, gram_counts, sample_label)

def file_sample_generator(g):
    """Merge the per-thread n-gram counts of each file into one sample.

    Parameters
    ----------
    g : iterable of list
        Non-empty groups of samples belonging to one file; each sample
        exposes ``file_id`` and ``ngrams`` (a ``Counter``) and optionally
        ``label``.

    Yields
    ------
    FileSample
        ``(file_id, ngrams, label)`` where ``file_id`` and ``label`` come
        from the first sample of the group (``label`` is ``-1`` when absent)
        and ``ngrams`` is the element-wise sum of the group's counters.
    """
    FileSample = namedtuple('FileSample', ['file_id', 'ngrams', 'label'])
    for group in g:
        file_id = group[0].file_id
        label = getattr(group[0], 'label', -1)
        # Accumulate in place into a fresh Counter: chaining Counter.__add__
        # rebuilds the whole running total for every thread, and for a
        # single-thread file it would hand back the input Counter itself.
        # Unlike __add__, update() does not drop non-positive counts.
        ngrams = Counter()
        for thread_sample in group:
            ngrams.update(thread_sample.ngrams)
        yield FileSample(file_id, ngrams, label)

class ValueEncoder(object):
    """Assign consecutive integer codes to values in first-seen order."""

    def __init__(self):
        # Maps each value seen so far to its integer code.
        self.val_dict = {}

    def encode(self, val):
        """Return the code for ``val``, allocating the next free one if new."""
        # The next free code equals the number of values already registered;
        # setdefault only stores it when ``val`` is not present yet.
        return self.val_dict.setdefault(val, len(self.val_dict))

def extract_bow_features_from_file(path, value_encoder, ngram_range=(1, 3), total=None):
    """Build a sparse bag-of-n-grams matrix from a CSV of API-call rows.

    The file is streamed through the generator pipeline: lines -> rows ->
    groups of rows sharing ``(file_id, tid)`` -> n-gram counts per group ->
    groups sharing ``file_id`` -> one merged count per file.

    Parameters
    ----------
    path : str or None
        CSV path handed to ``line_generator``. The header must provide at
        least ``file_id``, ``tid`` and ``api`` columns; ``label`` is optional.
    value_encoder : ValueEncoder
        Maps each n-gram to a column index. Reuse the same encoder across
        files to keep columns aligned.
    ngram_range : (min_n, max_n), default (1, 3)
        Inclusive n-gram sizes to extract. Earlier versions ignored this
        argument and always used ``(1, 3)``; that is now the default.
    total : int, optional
        Expected number of files, used only for the progress bar.

    Returns
    -------
    bow_feats : scipy.sparse.coo_matrix
        Count matrix with one row per file, in order of appearance.
    labels : list of int
        ``labels[i]`` is the label for row ``i`` of ``bow_feats`` (``-1``
        when the file has no ``label`` column).
    """
    g = line_generator(path)
    g = row_generator(g)
    g = group_generator(g, ['file_id', 'tid'])
    g = ngram_generator(g, ngram_range=ngram_range)
    g = group_generator(g, 'file_id')
    g = file_sample_generator(g)
    rows, cols, values = [], [], []
    labels = []
    for file_sample in tqdm(g, total=total, ncols=80):
        # Index rows by position rather than by the raw file_id so that the
        # matrix stays aligned with ``labels`` even when ids are not exactly
        # 0..n-1 (for 0..n-1 ids in order, the two are identical).
        row_idx = len(labels)
        for ngram, cnt in file_sample.ngrams.items():
            rows.append(row_idx)
            cols.append(value_encoder.encode(ngram))
            values.append(cnt)
        labels.append(int(file_sample.label))
    # Give the shape explicitly: inference from the indices fails on empty
    # input and silently drops trailing files that produced no n-grams.
    n_cols = max(cols) + 1 if cols else 0
    bow_feats = coo_matrix((values, (rows, cols)), shape=(len(labels), n_cols))
    return bow_feats, labels

In [3]:
# NOTE(review): the variable is named train_file but points at 'test.csv';
# confirm this is the intended training data.
train_file = 'test.csv'
ve = ValueEncoder()
print('Extracting (1,3)-grams from train file ...')
tr_bow_feats, tr_labels = extract_bow_features_from_file(train_file, ve,
                                                             ngram_range=(1, 3), total=116624)
# Keep the raw (pre-TF-IDF) counts and labels under separate names; the
# XGBoost and random-forest cells further down read train_X / train_y.
train_y=np.array(tr_labels)
train_X=tr_bow_feats
# Test-set extraction is disabled, so its progress message is disabled too
# (it used to print even though nothing was extracted).
#print('Extracting (1,3)-grams from test file ...')
#te_bow_feats, _ = extract_bow_features_from_file(test_file, ve,
                                                     #ngram_range=(1, 3), total=53093)
print('Calculating TFIDF Value ...')
#num_feats = te_bow_feats.shape[1]
#tr_bow_feats.resize((tr_bow_feats.shape[0], num_feats))

tfidf_tsfm = TfidfTransformer()
tr_bow_feats = tfidf_tsfm.fit_transform(tr_bow_feats)
#te_bow_feats = tfidf_tsfm.transform(te_bow_feats)
# Seeded so the cross-validation scores are reproducible between runs.
rfc = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=0)

print('Cross validation')
cv_res = cross_validate(rfc, tr_bow_feats, tr_labels, scoring='neg_log_loss', return_train_score=True)
print(cv_res)

  0%|                                       | 12/116624 [00:00<25:55, 74.98it/s]

Extracting (1,3)-grams from train file ...


100%|█████████████████████████████████| 116624/116624 [1:16:09<00:00, 25.52it/s]


Extracting (1,3)-grams from test file ...
(116624, 345903) (116624,)


In [4]:
def runXGB(train_X,train_y,test_X,test_y=None,feature_names=None,seed_val=0,num_rounds=1000,num_class=6):
    """Train a multi-class XGBoost model and predict class probabilities.

    Parameters
    ----------
    train_X, train_y
        Training features and integer class labels.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is used as a validation
        set and training stops early once its mlogloss has not improved for
        20 rounds.
    feature_names : optional
        Accepted for interface compatibility; currently unused.
    seed_val : int
        Random seed passed to XGBoost.
    num_rounds : int
        Maximum number of boosting rounds.
    num_class : int, default 6
        Number of target classes.

    Returns
    -------
    pred_test_y
        Output of ``model.predict`` on ``test_X``.
    model
        The trained booster returned by ``xgb.train``.
    """
    param = {
        'objective': 'multi:softprob',  # multi-class, outputs per-class probabilities
        'eta': 0.1,                     # learning rate
        'max_depth': 6,                 # maximum tree depth; larger overfits more easily
        # NOTE(review): silent=1 asks for quiet training; newer xgboost
        # releases use `verbosity` instead - confirm against the installed version.
        'silent': 1,
        'num_class': num_class,         # number of target classes
        'eval_metric': 'mlogloss',      # multi-class log loss
        # Minimum sum of instance weight (hessian) needed in a child;
        # smaller values make overfitting easier.
        'min_child_weight': 1,
        'subsample': 0.7,               # row subsampling per tree
        'colsample_bytree': 0.7,        # column subsampling per tree
        'seed': seed_val,               # random seed
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X,label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X,label=test_y)
        watchlist = [(xgtrain,'train'),(xgtest,'test')]
        # The last watchlist entry ('test') drives early stopping.
        model = xgb.train(plst,xgtrain,num_rounds,watchlist,early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst,xgtrain,num_rounds)
    # NOTE(review): whether predict() uses the best iteration or every
    # trained round after early stopping depends on the xgboost version - confirm.
    pred_test_y = model.predict(xgtest)
    return pred_test_y,model

In [5]:
#single model example1:xgboost
import xgboost as xgb
from sklearn import model_selection,preprocessing,ensemble
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
cv_scores = []
# Hold out 20% of the raw-count features as a validation set for early stopping.
x_train, x_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.2, random_state=0)
pred_test_y,model=runXGB(x_train,y_train,x_valid,y_valid)



[0]	train-mlogloss:1.50833	test-mlogloss:1.50841
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.29936	test-mlogloss:1.29937
[2]	train-mlogloss:1.13418	test-mlogloss:1.13421
[3]	train-mlogloss:0.999324	test-mlogloss:0.999398
[4]	train-mlogloss:0.885761	test-mlogloss:0.886069
[5]	train-mlogloss:0.788353	test-mlogloss:0.788831
[6]	train-mlogloss:0.704186	test-mlogloss:0.704819
[7]	train-mlogloss:0.631084	test-mlogloss:0.63187
[8]	train-mlogloss:0.566549	test-mlogloss:0.567496
[9]	train-mlogloss:0.510023	test-mlogloss:0.511081
[10]	train-mlogloss:0.460195	test-mlogloss:0.461377
[11]	train-mlogloss:0.415059	test-mlogloss:0.416472
[12]	train-mlogloss:0.375168	test-mlogloss:0.376751
[13]	train-mlogloss:0.339361	test-mlogloss:0.341116
[14]	train-mlogloss:0.307289	test-mlogloss:0.309229
[15]	train-mlogloss:0.278888	test-mlogloss:0.28099
[16]	train-mlogloss:0.253223	test-m

[155]	train-mlogloss:0.002352	test-mlogloss:0.016313
[156]	train-mlogloss:0.002333	test-mlogloss:0.01631
[157]	train-mlogloss:0.00232	test-mlogloss:0.01634
[158]	train-mlogloss:0.002305	test-mlogloss:0.016358
[159]	train-mlogloss:0.002289	test-mlogloss:0.016372
[160]	train-mlogloss:0.002273	test-mlogloss:0.016381
[161]	train-mlogloss:0.002257	test-mlogloss:0.016381
[162]	train-mlogloss:0.002238	test-mlogloss:0.016399
[163]	train-mlogloss:0.002225	test-mlogloss:0.016408
Stopping. Best iteration:
[143]	train-mlogloss:0.002592	test-mlogloss:0.016278



In [6]:
# Single-model example 2: random forest baseline.
from sklearn.ensemble import RandomForestClassifier
# Fit once on the hold-out training split; cross_validate below is then run
# on the full train_X / train_y.
rf = RandomForestClassifier(random_state=0, n_jobs=-1).fit(x_train, y_train)
print('Cross validation')
cv_res = cross_validate(
    estimator=rf,
    X=train_X,
    y=train_y,
    scoring='neg_log_loss',
    return_train_score=True,
)
print(cv_res)

Cross validation
{'fit_time': array([7.6434083 , 7.86441493, 7.27798533]), 'score_time': array([0.95841122, 0.97256565, 1.01796508]), 'test_score': array([-0.06938313, -0.06782988, -0.07709167]), 'train_score': array([-0.00638233, -0.00646153, -0.00619394])}
