In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer#,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import os
import time

In [None]:
def train_fit(train, clf):
    """Build the sparse training matrix from ``train`` and fit ``clf`` on it.

    Rows whose ``label`` equals -1 are dropped. The feature matrix is the raw
    ad id columns, followed by one one-hot block per categorical column,
    followed by one 1-2-gram token-count block per multi-value column.

    Args:
        train: DataFrame holding a ``label`` column plus every column named in
            the feature lists below. The caller's frame is not modified.
        clf: Estimator exposing the LightGBM scikit-learn ``fit`` signature.

    Returns:
        The fitted ``clf``. The transformers fitted here are attached to it as
        ``clf.fitted_encoders_`` (a dict with ``'label'``, ``'one_hot'`` and
        ``'vector'`` sub-dicts keyed by column name) so that prediction-time
        data can be projected into the same feature space.
    """
    one_hot_feature=['LBS','age','carrier','consumptionAbility','education','gender','house']
    vector_feature=['appIdAction','appIdInstall','marriageStatus','interest1','interest2','interest3','interest4','interest5','kw1','kw2','kw3','topic1','topic2','topic3']
    id_feature=['advertiserId','campaignId', 'creativeId','creativeSize','adCategoryId', 'productId','productType']

    # Keep labelled rows only; .copy() so the column rewrites below never
    # touch the frame owned by the caller.
    train = train[train.label != -1].copy()
    train_y = train.pop('label')

    fitted = {'label': {}, 'one_hot': {}, 'vector': {}}

    # LabelEncoder maps each category to a consecutive integer id.
    for feature in one_hot_feature:
        try:
            values = train[feature].apply(int)
        except (ValueError, TypeError):
            # Column holds values that are not integer-like; encode them as-is.
            values = train[feature]
        label_encoder = LabelEncoder()
        train[feature] = label_encoder.fit_transform(values)
        fitted['label'][feature] = label_encoder

    train_x = train[id_feature]

    # One-hot block per categorical column. handle_unknown='ignore' lets the
    # stored encoder emit an all-zero row for categories unseen during training.
    for feature in one_hot_feature:
        column = train[feature].values.reshape(-1, 1)
        one_hot = OneHotEncoder(handle_unknown='ignore')
        train_x = sparse.hstack((train_x, one_hot.fit_transform(column)))
        fitted['one_hot'][feature] = one_hot
    print('one-hot prepared !')

    # Token-count block per multi-value column (one vectorizer per column so
    # each fitted vocabulary can be kept).
    for feature in vector_feature:
        vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern='(?u)\\b\\w+\\b')
        train_x = sparse.hstack((train_x, vectorizer.fit_transform(train[feature])))
        fitted['vector'][feature] = vectorizer
    print('cv prepared !')

    # NOTE(review): the evaluation set is the training set itself, so the
    # early-stopping criterion is measured on data the model has already seen.
    # NOTE(review): newer LightGBM releases may expect early stopping to be
    # passed as a callback instead of `early_stopping_rounds` - confirm
    # against the installed version.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc',early_stopping_rounds=100)

    clf.fitted_encoders_ = fitted

    return clf

In [None]:
def predict_test(test, clf):
    """Encode one test chunk into the sparse matrix layout used for training.

    Args:
        test: DataFrame with a ``label`` column (rows with ``label == -1`` are
            kept and the column is dropped) plus every column named in the
            feature lists below. The caller's frame is not modified.
        clf: The classifier that will score the result. If it carries a
            ``fitted_encoders_`` dict (sub-dicts ``'label'``, ``'one_hot'``,
            ``'vector'`` keyed by column name), those fitted transformers are
            reused so the columns line up with the training matrix. For any
            column without a stored transformer, a fresh one is fitted on this
            chunk, which does not guarantee the same column layout as training.

    Returns:
        A scipy sparse matrix: raw ad id columns, then one-hot blocks, then
        token-count blocks.
    """
    one_hot_feature=['LBS','age','carrier','consumptionAbility','education','gender','house']
    vector_feature=['appIdAction','appIdInstall','marriageStatus','interest1','interest2','interest3','interest4','interest5','kw1','kw2','kw3','topic1','topic2','topic3']

    stored = getattr(clf, 'fitted_encoders_', None) or {}
    label_encoders = stored.get('label', {})
    one_hot_encoders = stored.get('one_hot', {})
    vectorizers = stored.get('vector', {})

    # drop() returns a new frame, so the column rewrites below stay local.
    test = test[test.label == -1].drop('label', axis=1)

    # Map each category to an integer id.
    for feature in one_hot_feature:
        try:
            values = test[feature].apply(int)
        except (ValueError, TypeError):
            # Column holds values that are not integer-like; encode them as-is.
            values = test[feature]
        label_encoder = label_encoders.get(feature)
        if label_encoder is None:
            test[feature] = LabelEncoder().fit_transform(values)
        else:
            # Reuse the training-time ids; categories not seen in training
            # become -1 instead of raising.
            code_of = {value: code for code, value in enumerate(label_encoder.classes_)}
            test[feature] = values.map(code_of).fillna(-1).astype(int)

    test_x = test[['advertiserId','campaignId', 'creativeId','creativeSize','adCategoryId', 'productId', 'productType']]

    # One-hot block per categorical column.
    for feature in one_hot_feature:
        column = test[feature].values.reshape(-1, 1)
        one_hot = one_hot_encoders.get(feature)
        if one_hot is None:
            one_hot = OneHotEncoder().fit(column)
        test_x = sparse.hstack((test_x, one_hot.transform(column)))
    print('one-hot prepared !')

    # Token-count block per multi-value column.
    for feature in vector_feature:
        vectorizer = vectorizers.get(feature)
        if vectorizer is None:
            vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern='(?u)\\b\\w+\\b').fit(test[feature])
        test_x = sparse.hstack((test_x, vectorizer.transform(test[feature])))
    print('cv prepared !')

    return test_x

In [None]:
def combineFile(filename, num, pattern='data1/submission/submission_%d.csv'):
    """Concatenate numbered chunk CSVs into a single CSV at ``filename``.

    Chunks ``pattern % 1`` through ``pattern % num`` are read in order. The
    first line of chunk 1 is written as the header; the first line of every
    later chunk is skipped.

    Args:
        filename: Output path. It is overwritten if it already exists, so
            running the merge again does not duplicate rows.
        num: Number of chunk files to merge.
        pattern: %-format template with a single integer slot that yields the
            path of chunk ``i``.

    Raises:
        OSError: If the output or any chunk cannot be opened (for example
            ``FileNotFoundError`` for a missing chunk).
    """
    with open(filename, 'w') as fout:
        for i in range(1, num + 1):
            with open(pattern % i, 'r') as fin:
                head = fin.readline()
                if i == 1:
                    fout.write(head)
                # Stream the remaining rows instead of buffering the chunk.
                fout.writelines(fin)

In [None]:
# Train the LightGBM baseline on train.csv, then score each test chunk and
# write one submission file per chunk.
path = 'data1/'
# combineFile reads the per-chunk files from data1/submission/, so the chunk
# predictions are written there.
submission_dir = os.path.join(path, 'submission')
os.makedirs(submission_dir, exist_ok=True)

clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,zero_as_missing=True,
    max_depth=-1, n_estimators=2000, objective='binary',
    subsample=0.9, colsample_bytree=0.8, subsample_freq=1,
    learning_rate=0.2, min_child_weight=50, random_state=2018, n_jobs=100
)

s_1 = time.time()

train=pd.read_csv(path+'train.csv')
train=train[train.label!=-1]
train=train.fillna('-1')

clf=train_fit(train, clf)

s_2 = time.time()
print("LGB: train fit OK!")
print('train use time : %d'%(s_2-s_1))

beg = 1
end = 12      # number of chunks the test set was split into

for i in range(beg,end+1):
    s_1 = time.time()
    test=pd.read_csv(os.path.join(path, 'test', 'test_%d.csv'%i))
    test['label'] = -1
    test=test.fillna('-1')

    # Identifier columns of the submission; .copy() so adding the score
    # column does not write into a slice of `test`.
    res = test[['aid','uid']].copy()

    test_x=predict_test(test, clf)

    res['score'] = clf.predict_proba(test_x)[:,1]
    # Keep six decimal places in the written score.
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv(os.path.join(submission_dir, 'submission_%d.csv'%i), index=False)
    print("LGB: test_%d is OK!"%i)

    s_2 = time.time()
    print('test_%d use time : %d'%(i,(s_2-s_1)))

In [None]:
# Merge the per-chunk submission files into the final submission file.
N = 12  # number of chunk files to merge
combineFile(filename='data1/submission.csv', num=N)
print('LGB baseline is OK!')