In [3]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
import datetime, time, gc 
# from utils import distance, haversine, standard, pad_seq 
from scipy.stats import skew, kurtosis
from zipfile import ZipFile
from collections import Counter 
from sklearn.metrics import roc_auc_score as auc
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score as auc
# import tables

# Let pandas render up to 500 rows / 500 columns before truncating the display.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [13]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    For every column backed by a NumPy signed-integer, unsigned-integer or
    floating dtype, the column's min and max are compared (inclusively) with
    the limits of each narrower dtype of the same family, and the column is
    cast to the first one that can hold the whole range. Columns are never
    widened. Every other column (object, bool, datetime, categorical,
    pandas extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are replaced on the frame itself.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        in float16 are cast to float16. float16 keeps only ~3 significant
        decimal digits, so pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Side effects
    ------------
    Shows a tqdm progress bar over the columns and prints the final memory
    footprint together with the relative reduction.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2

    # Candidate dtypes per NumPy kind code, ordered from narrowest to widest.
    float_types = (np.float16, np.float32, np.float64)
    if not use_float16:
        float_types = float_types[1:]
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': (float_types, np.finfo),
    }

    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        # Only plain NumPy int/uint/float columns are downcast; the original
        # prefix test on str(dtype) pushed uint/bool columns into the float
        # branch and silently turned them into float16.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()
        for target in candidates:
            # Never widen: stop as soon as the candidate is not strictly smaller.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            # Inclusive bounds: the extreme values of a dtype are representable.
            # A NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such a column keeps its dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-byte starting footprint.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem,
                                                                          reduction))
    return df

In [4]:
# !pip install lightgbm

In [7]:
# Read the base feature table and the target-encoding table, then glue them
# side by side (column-wise) into a single frame named `data`.
data = pd.read_pickle('data/train_test_fea.pkl')
data_tarenc = pd.read_pickle("data/train_test_targetenc.pkl")
print(data.shape, data_tarenc.shape)

data = pd.concat((data, data_tarenc), axis=1)

# The target-encoding frame is now part of `data`; drop the name and collect.
del data_tarenc
gc.collect()
print(data.shape)

(8000000, 148) (8000000, 14)
(8000000, 162)


In [12]:
# Columns treated as categorical: each is label-encoded below and later
# passed to LightGBM as a categorical feature.
cate_cols = ['uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
       'app_first_class', 'app_second_class', 'city', 'age', 
       'device_name', 'career', 'gender', 'net_type',
       'residence', 'his_on_shelf_time', "communication_onlinerate", 
       'up_membership_grade', 'consume_purchase', 'indu_name',
]

# Replace every categorical column by integer codes; a fresh encoder is
# fitted per column on all rows of `data`.
for feat in tqdm(cate_cols):
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Columns that must not be fed to the model.
drop_cols = {'label', 'cnt', 'random_sector',
        'uid_task_id_ctr', 'uid_adv_id_ctr', 'pt_d', 'id',
}

# Keep the frame's own column order (de-duplicated) instead of going through
# a set difference: set iteration order for strings depends on hash
# randomisation, so the feature order -- and with it LightGBM's seeded
# feature sampling and the importance table -- changed between kernel restarts.
use_cols = [col for col in dict.fromkeys(data.columns) if col not in drop_cols]

100%|██████████| 24/24 [00:16<00:00,  1.43it/s]


In [15]:
# Downcast the numeric columns of `data` to narrower dtypes to cut memory use.
data = reduce_mem_usage(data)

100%|██████████| 162/162 [00:16<00:00,  9.79it/s]

Mem. usage decreased to 2250.24 Mb (35.3% reduction)





In [24]:
# !pip install keras==2.2.4

# Log file for this run: "<MMDD>_deep_base.log", resolved once when the cell runs.
file_name = "".join([datetime.date.today().strftime('%m%d'), "_{}.log".format("deep_base")])


def write_log(w):
    """Print a timestamped message and append it to the run's log file.

    The entry has the form "<HH:MM:SS> : <w>". It is printed to stdout and
    appended to `file_name`, where it is followed by a line of 80 dashes.
    """
    stamp = datetime.datetime.now().strftime('%H:%M:%S')
    info = "{} : {}\n".format(stamp, w)
    print(info)
    separator = "-" * 80 + "\n"
    with open(file_name, 'a') as log_file:
        log_file.write(info + separator)


# ---------------------------------------------------------------------------
# Leave-one-day-out LightGBM training.
# For each value of `pt_d` in VALID_DAYS the rows with that value form the
# validation fold and the remaining non-test rows form the training fold.
# Test predictions of all folds are averaged into `test_pred`.
# ---------------------------------------------------------------------------
N_TEST_ROWS = 1000000        # the last N_TEST_ROWS rows of `data` are used as the test set
VALID_DAYS = range(1, 8)     # `pt_d` values held out in turn

# Running average of the per-fold test predictions.
test_pred = np.zeros((N_TEST_ROWS, ))

# params
params = {
        'learning_rate': 0.05,
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc'], 
        'boosting_type': 'gbdt',
        'num_leaves': 256,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 3,
        'seed': 8,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': 8,
        'verbose': 1,
    }

imp = pd.DataFrame()  # feature importance, one gain/split column pair per fold
imp['feat'] = use_cols

# Split off the test rows once. The fold masks below are computed on
# `train_all` itself, so the boolean mask and the frame it filters share the
# same index (the original masked the sliced frame with a mask built from the
# full `data`, forcing pandas to reindex a misaligned mask on every fold).
train_all = data.iloc[:-N_TEST_ROWS]
test = data.iloc[-N_TEST_ROWS:].reset_index(drop=True)

for i in VALID_DAYS:
    write_log('pt_d = {}'.format(i))

    is_valid = train_all["pt_d"] == i
    train = train_all[~is_valid].reset_index(drop=True)
    valid = train_all[is_valid].reset_index(drop=True)
    print(train.shape, valid.shape, test.shape)

    # Wrap both folds as LightGBM datasets.
    n_train = lgb.Dataset(train[use_cols], label=train['label'].values)
    n_valid = lgb.Dataset(valid[use_cols], label=valid['label'].values)

    gc.collect()

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect the
    # `lgb.early_stopping` / `lgb.log_evaluation` callbacks instead --
    # confirm against the installed version before upgrading.
    clf = lgb.train(
            params=params,
            train_set=n_train,
            categorical_feature=cate_cols,
            num_boost_round=5000,
            valid_sets=[n_train, n_valid],
            early_stopping_rounds=30,
            verbose_eval=20,
        )

    # Despite its name, `train_pred` holds the predictions for the held-out
    # validation fold (the name is kept because later cells reuse it).
    train_pred = clf.predict(valid[use_cols], num_iteration=clf.best_iteration)

    write_log("val_auc = {}".format(auc(valid['label'].values, train_pred)))

    # Each fold contributes an equal share to the averaged test prediction.
    test_pred += clf.predict(test[use_cols], num_iteration=clf.best_iteration) / len(VALID_DAYS)

    imp['gain' + str(i)] = clf.feature_importance(importance_type='gain')
    imp['split' + str(i)] = clf.feature_importance(importance_type='split')


11:12:39 : pt_d = 1





(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 203653, number of negative: 5796347
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31700
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033942 -> initscore=-3.348566
[LightGBM] [Info] Start training from score -3.348566
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.121436	training's auc: 0.828707	valid_1's binary_logloss: 0.133738	valid_1's auc: 0.812736
[40]	training's binary_logloss: 0.116223	training's auc: 0.844083	valid_1's binary_logloss: 0.130744	valid_1's auc: 0.814979
[60]	training's binary_logloss: 0.113205	training's auc: 0.857171	valid_1's binary_logloss: 0.129957	valid_1's auc: 0.81672
[80]	training's binary_logloss: 0.110739	training's auc: 0.869089	valid_1's binary_logloss: 0.129667	valid_1's auc: 0.818191
[100]	training's binary_logloss: 0.108572	training's auc: 0.87907	valid_1's binary_logloss: 0.129619	valid_1's auc: 0.818472
[120]	training's binary_logloss: 0.106684	training's auc: 0.88737	valid_1's binary_logloss: 0.129605	valid_1's auc: 0.818622
[140]	training's binary_logloss: 0.10501	training's auc:



(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 206202, number of negative: 5793798
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31924
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034367 -> initscore=-3.335687
[LightGBM] [Info] Start training from score -3.335687
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.122829	training's auc: 0.828354	valid_1's binary_logloss: 0.125391	valid_1's auc: 0.817566
[40]	training's binary_logloss: 0.117558	training's auc: 0.843483	valid_1's binary_logloss: 0.122247	valid_1's auc: 0.819981
[60]	training's binary_logloss: 0.114471	training's auc: 0.856918	valid_1's binary_logloss: 0.121421	valid_1's auc: 0.821726
[80]	training's binary_logloss: 0.111979	training's auc: 0.869032	valid_1's binary_logloss: 0.121156	valid_1's auc: 0.822849
[100]	training's binary_logloss: 0.109775	training's auc: 0.878943	valid_1's binary_logloss: 0.121071	valid_1's auc: 0.823234
[120]	training's binary_logloss: 0.10785	training's auc: 0.887264	valid_1's binary_logloss: 0.121078	valid_1's auc: 0.823305
Early stopping, best iteration is:
[103]	training's bi



(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 208265, number of negative: 5791735
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31974
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034711 -> initscore=-3.325376
[LightGBM] [Info] Start training from score -3.325376
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.123427	training's auc: 0.829818	valid_1's binary_logloss: 0.121856	valid_1's auc: 0.807521
[40]	training's binary_logloss: 0.118079	training's auc: 0.844836	valid_1's binary_logloss: 0.118921	valid_1's auc: 0.810149
[60]	training's binary_logloss: 0.114967	training's auc: 0.858177	valid_1's binary_logloss: 0.118162	valid_1's auc: 0.811885
[80]	training's binary_logloss: 0.112469	training's auc: 0.869939	valid_1's binary_logloss: 0.117918	valid_1's auc: 0.813038
[100]	training's binary_logloss: 0.110267	training's auc: 0.879688	valid_1's binary_logloss: 0.117891	valid_1's auc: 0.813151
Early stopping, best iteration is:
[89]	training's binary_logloss: 0.111447	training's auc: 0.874456	valid_1's binary_logloss: 0.117874	valid_1's auc: 0.813298
12:03:04 : val_auc 



(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 207746, number of negative: 5792254
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32032
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034624 -> initscore=-3.327961
[LightGBM] [Info] Start training from score -3.327961
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.123325	training's auc: 0.829092	valid_1's binary_logloss: 0.12258	valid_1's auc: 0.80983
[40]	training's binary_logloss: 0.117978	training's auc: 0.844497	valid_1's binary_logloss: 0.11961	valid_1's auc: 0.812561
[60]	training's binary_logloss: 0.114893	training's auc: 0.857927	valid_1's binary_logloss: 0.118904	valid_1's auc: 0.813901
[80]	training's binary_logloss: 0.112372	training's auc: 0.869744	valid_1's binary_logloss: 0.118663	valid_1's auc: 0.815006
[100]	training's binary_logloss: 0.110125	training's auc: 0.880173	valid_1's binary_logloss: 0.118619	valid_1's auc: 0.815285
[120]	training's binary_logloss: 0.108187	training's auc: 0.888398	valid_1's binary_logloss: 0.11866	valid_1's auc: 0.815137
Early stopping, best iteration is:
[103]	training's binar



(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 209848, number of negative: 5790152
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31959
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034975 -> initscore=-3.317530
[LightGBM] [Info] Start training from score -3.317530
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.124003	training's auc: 0.830534	valid_1's binary_logloss: 0.118331	valid_1's auc: 0.802156
[40]	training's binary_logloss: 0.118636	training's auc: 0.845311	valid_1's binary_logloss: 0.115586	valid_1's auc: 0.804526
[60]	training's binary_logloss: 0.115529	training's auc: 0.858633	valid_1's binary_logloss: 0.114931	valid_1's auc: 0.80585
[80]	training's binary_logloss: 0.11302	training's auc: 0.870553	valid_1's binary_logloss: 0.114783	valid_1's auc: 0.806397
[100]	training's binary_logloss: 0.110787	training's auc: 0.880495	valid_1's binary_logloss: 0.114776	valid_1's auc: 0.806533
[120]	training's binary_logloss: 0.108878	training's auc: 0.888521	valid_1's binary_logloss: 0.114816	valid_1's auc: 0.806442
Early stopping, best iteration is:
[95]	training's bina



(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 206517, number of negative: 5793483
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32216
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034419 -> initscore=-3.334106
[LightGBM] [Info] Start training from score -3.334106
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.122288	training's auc: 0.831652	valid_1's binary_logloss: 0.127937	valid_1's auc: 0.798217
[40]	training's binary_logloss: 0.116915	training's auc: 0.846837	valid_1's binary_logloss: 0.12545	valid_1's auc: 0.800173
[60]	training's binary_logloss: 0.113791	training's auc: 0.86025	valid_1's binary_logloss: 0.125013	valid_1's auc: 0.801123
[80]	training's binary_logloss: 0.111266	training's auc: 0.871871	valid_1's binary_logloss: 0.124991	valid_1's auc: 0.801283
Early stopping, best iteration is:
[67]	training's binary_logloss: 0.112816	training's auc: 0.864706	valid_1's binary_logloss: 0.124982	valid_1's auc: 0.801258
12:45:48 : val_auc = 0.8012576891274322

12:45:52 : pt_d = 7





(6000000, 162) (1000000, 162) (1000000, 162)


New categorical_feature is ['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_second_class', 'career', 'city', 'communication_onlinerate', 'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name', 'gender', 'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'net_type', 'residence', 'slot_id', 'spread_app_id', 'tags', 'task_id', 'uid', 'up_membership_grade']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Number of positive: 206115, number of negative: 5793885
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31984
[LightGBM] [Info] Number of data points in the train set: 6000000, number of used features: 160




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034353 -> initscore=-3.336124
[LightGBM] [Info] Start training from score -3.336124
Training until validation scores don't improve for 30 rounds
[20]	training's binary_logloss: 0.122151	training's auc: 0.831813	valid_1's binary_logloss: 0.128624	valid_1's auc: 0.797906
[40]	training's binary_logloss: 0.116758	training's auc: 0.847164	valid_1's binary_logloss: 0.126271	valid_1's auc: 0.799847
[60]	training's binary_logloss: 0.113625	training's auc: 0.860233	valid_1's binary_logloss: 0.125943	valid_1's auc: 0.800386
[80]	training's binary_logloss: 0.111073	training's auc: 0.872031	valid_1's binary_logloss: 0.126045	valid_1's auc: 0.800183
Early stopping, best iteration is:
[60]	training's binary_logloss: 0.113625	training's auc: 0.860233	valid_1's binary_logloss: 0.125943	valid_1's auc: 0.800386
12:55:53 : val_auc = 0.8003862523455544



In [19]:
# Re-score the validation fold with the most recently trained booster.
# NOTE(review): depends on `clf`, `valid` and `use_cols` left in the kernel by
# the training loop; this cell does nothing useful on its own.
train_pred = clf.predict(valid[use_cols], num_iteration=clf.best_iteration)

# write_log("val_auc = {}".format(auc(valid['label'].values, train_pred)))
    

In [21]:
# Show test-set predictions from the last trained booster only; the
# fold-averaged predictions live in `test_pred`.
clf.predict(test[use_cols], num_iteration=clf.best_iteration)

array([0.01792428, 0.08202912, 0.09722099, ..., 0.03920117, 0.01852832,
       0.02759261])

In [26]:
# Assemble the submission table: test ids loaded from disk next to the
# fold-averaged probabilities, then write it out without the index column.
result = pd.DataFrame({'id': np.load("data/test_id.npy")})
result['probability'] = test_pred.reshape(-1)

result.to_csv("submission.csv", index=False)