# Use LGBM Algorithm to Classify Fraudulent Clicks on Ads

In [1]:
import numpy as np
import pandas as pd
import time
# Wall-clock timestamp taken when the notebook starts; the last cell subtracts it
# from the current time to report the total running hours.
start_time = time.time()

In [2]:
# Column name -> compact unsigned dtype, handed to pd.read_csv so the CSVs are
# parsed straight into small integer types instead of the int64 default.
# The key order below is also the order of `features`.
dtypes_new = {
    'app': 'uint16',
    'app_channel_unicount': 'uint32',
    'appfreq_ip': 'uint32',
    'appfreq_ip_dow_channel': 'uint16',
    'channel': 'uint16',
    'channelfreq_ip': 'uint32',
    'channelfreq_ip_app': 'uint32',
    'channelfreq_ip_device': 'uint32',
    'channelfreq_ip_device_app': 'uint32',
    'channelfreq_ip_device_os': 'uint32',
    'channelfreq_ip_dow': 'uint32',
    'channelfreq_ip_dow_app': 'uint16',
    'channelfreq_ip_dow_device': 'uint32',
    'channelfreq_ip_dow_hour': 'uint16',
    'channelfreq_ip_dow_hour_minute': 'uint16',
    'channelfreq_ip_dow_hour_os_app': 'uint16',
    'channelfreq_ip_dow_os': 'uint16',
    'channelfreq_ip_dow_os_app': 'uint16',
    'channelfreq_ip_hour_app': 'uint16',
    'channelfreq_ip_hour_device': 'uint16',
    'channelfreq_ip_hour_os': 'uint16',
    'channelfreq_ip_os': 'uint32',
    'channelfreq_ip_os_app': 'uint16',
    'device': 'uint16',
    'devicefreq_ip': 'uint32',
    'devicefreq_ip_app': 'uint32',
    'devicefreq_ip_dow_channel': 'uint16',
    'devicefreq_ip_hour': 'uint32',
    'devicefreq_ip_os_channel': 'uint16',
    'dow': 'uint8',
    'dowfreq_ip': 'uint32',
    'duration_in_hours': 'uint16',
    'hour': 'uint8',
    'hour_working': 'uint8',
    'ip_app_channel_unicount': 'uint16',
    'ip_app_unicount': 'uint32',
    'ip_channel_unicount': 'uint16',
    'ip_device_unicount': 'uint32',
    'ip_dow_unicount': 'uint32',
    'os': 'uint16',
    'osfreq_ip': 'uint32',
    'osfreq_ip_channel': 'uint16',
    'osfreq_ip_hour_channel': 'uint16',
}

# Feature names, in the same order as the dtype mapping above.
features = [*dtypes_new]
print(len(features))
features

43


['app',
 'app_channel_unicount',
 'appfreq_ip',
 'appfreq_ip_dow_channel',
 'channel',
 'channelfreq_ip',
 'channelfreq_ip_app',
 'channelfreq_ip_device',
 'channelfreq_ip_device_app',
 'channelfreq_ip_device_os',
 'channelfreq_ip_dow',
 'channelfreq_ip_dow_app',
 'channelfreq_ip_dow_device',
 'channelfreq_ip_dow_hour',
 'channelfreq_ip_dow_hour_minute',
 'channelfreq_ip_dow_hour_os_app',
 'channelfreq_ip_dow_os',
 'channelfreq_ip_dow_os_app',
 'channelfreq_ip_hour_app',
 'channelfreq_ip_hour_device',
 'channelfreq_ip_hour_os',
 'channelfreq_ip_os',
 'channelfreq_ip_os_app',
 'device',
 'devicefreq_ip',
 'devicefreq_ip_app',
 'devicefreq_ip_dow_channel',
 'devicefreq_ip_hour',
 'devicefreq_ip_os_channel',
 'dow',
 'dowfreq_ip',
 'duration_in_hours',
 'hour',
 'hour_working',
 'ip_app_channel_unicount',
 'ip_app_unicount',
 'ip_channel_unicount',
 'ip_device_unicount',
 'ip_dow_unicount',
 'os',
 'osfreq_ip',
 'osfreq_ip_channel',
 'osfreq_ip_hour_channel']

In [3]:
# Names of the columns presumably meant to be handled as categorical by the model.
# NOTE(review): no other visible cell references this list (it is never passed to
# lgb.Dataset or lgb.train) — confirm whether leaving it unused is intentional.
categorical_features = ['dow']

In [4]:
# Substrings that mark the count / frequency style columns to be log-scaled later.
ids = ['_unicount', 'channelfreq_', 'devicefreq_', 'osfreq_', 'appfreq_']

# Collect every feature whose name contains one of the substrings, grouped in the
# order of `ids`. dict.fromkeys de-duplicates while preserving that order, so a
# feature matching more than one substring is listed (and later transformed) only
# once. Neutral variable names are used so the builtins `id` and `str` are not
# shadowed in the notebook namespace.
features_selected = list(dict.fromkeys(
    name
    for marker in ids
    for name in features
    if marker in name
))

In [5]:
%%time
import lightgbm as lgb
import gc
gc.collect()

X_train = pd.read_csv('X_train.csv', dtype=dtypes_new, engine='c')
# normalize features by taking log values
for feature in features_selected:
    X_train[feature] = np.log2(1 + X_train[feature].values).astype(int)
lgb_train = lgb.Dataset(X_train.values, \
                        label=pd.read_csv('y_train.csv', dtype=dtypes_new, header = -1, engine='c').values.flatten(), \
                        feature_name=features,
                       )
del X_train
gc.collect()

X_test = pd.read_csv('X_test.csv', dtype=dtypes_new, engine='c')
# normalize features by taking log values
for feature in features_selected:
    X_test[feature] = np.log2(1 + X_test[feature].values).astype(int)
    
lgb_eval = lgb.Dataset(X_test.values, \
                        label=pd.read_csv('y_test.csv', dtype=dtypes_new, header = -1, engine='c').values.flatten(), \
                        reference=lgb_train, \
                        feature_name=features,
                      )
del X_test
gc.collect()

CPU times: user 7min 4s, sys: 1min 43s, total: 8min 48s
Wall time: 5min 57s


In [None]:
%%time
params = {
            'boosting_type': 'gbdt',
            'tree_learner': 'data_parallel',
            'objective': 'binary',
            'num_leaves': 100,
            'learning_rate': 0.1,
            'lambda_l2': 0,  
            'min_child_weight': 0,
            'max_bin': 300, # max number of bins that feature values will be bucketed in
            'is_unbalance': True,
            'metric': 'auc',
            'num_threads': 16,
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=[lgb_train, lgb_eval], 
                valid_names=['train', 'valid'], 
                learning_rates=lambda iter: round(0.1 * pow(0.995, iter), 6),
                early_stopping_rounds = 50
               )

In [None]:
# best train-auc so far: 0.996759
# best valid-auc so far: 0.980228

In [None]:
%%time
test_data_processed = pd.read_csv('test_data_processed.csv', dtype=dtypes_new)
prediction = gbm.predict(test_data_processed, num_iteration=gbm.best_iteration)

# del test_data_processed
# gc.collect()

prediction

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1, 1, figsize=(15,5), dpi=150)
lgb.plot_importance(gbm, ax=ax)
fig.savefig('feature_importance_large_training_data.png')

In [None]:
# Histogram of the predicted probabilities (50 bins). The explicit figure/axes
# pair gets its own names so the `fig`/`ax` of the feature-importance plot are
# not overwritten.
hist_fig, hist_ax = plt.subplots(figsize=(6, 4), dpi=100)

n, bins, patches = hist_ax.hist(prediction, 50, facecolor='g', alpha=0.75)

hist_ax.set_xlabel('is_attributed Probability')
hist_ax.set_ylabel('Occurrence')  # label typo fixed (was 'Occurance')
hist_ax.grid(True)
plt.show()

In [None]:
%%time
click_id = pd.read_csv('click_id.csv', dtype=dtypes_new, header = -1, engine='c')
submission = pd.DataFrame({'click_id': click_id[0], 'is_attributed': prediction})
submission.to_csv('submission.gz', compression='gzip', index=False)

# del prediction
# gc.collect()

In [None]:
# Preview the first rows of the submission built from the LightGBM predictions
submission.head(5)

In [None]:
# Report the wall-clock time elapsed since the first cell ran, in hours.
elapsed_hours = (time.time() - start_time) / 3600
print('total running hours: {0:.3f}'.format(elapsed_hours))

In [None]:
# !kaggle competitions submit -c talkingdata-adtracking-fraud-detection -f submission.gz -m "AWS LGBM with improved features"