In [18]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')
import os
path = '../../../DEVELOPMENT/Fraud Detection/input/'

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

In [19]:
train_sample = pd.read_csv(path+'train_sample.csv', dtype=dtypes)

In [20]:
train_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [21]:
def prep_data(d):
    d['hour'] = pd.to_datetime(d.click_time).dt.hour.astype('uint8')
    d['day'] = pd.to_datetime(d.click_time).dt.day.astype('uint8')
    d['wday']  = pd.to_datetime(d.click_time).dt.dayofweek.astype('uint8')
    
    print('grouping by ip-day-hour combination')
    gp = d[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip-day-hour'})
    d = d.merge(gp, on=['ip','day','hour'], how='left')
    del gp; gc.collect()
    
    print('group by ip-app combination')
    gp = d[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
    d = d.merge(gp, on=['ip','app'], how='left')
    del gp; gc.collect()
    
    print('group by ip-app-os combination')
    gp = d[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
    d = d.merge(gp, on=['ip','app', 'os'], how='left')
    del gp; gc.collect()
    
    print("vars and data type")
    d['ip-day-hour'] = d['ip-day-hour'].astype('uint16')
    d['ip_app_count'] = d['ip_app_count'].astype('uint16')
    d['ip_app_os_count'] = d['ip_app_os_count'].astype('uint16')
    
    print("label encoding....")
    from sklearn.preprocessing import LabelEncoder
    d[['app','device','os', 'channel', 'hour', 'day', 'wday']].apply(LabelEncoder().fit_transform)
    print('dropping')
    d.drop(['click_time', 'ip'], 1, inplace=True)
    
    return d

In [22]:
train_df1 = prep_data(train_sample)

grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping


In [23]:
train_df1.head(30)

Unnamed: 0,app,device,os,channel,attributed_time,is_attributed,hour,day,wday,ip-day-hour,ip_app_count,ip_app_os_count
0,12,1,13,497,,0,9,7,1,1,3,2
1,25,1,17,259,,0,13,7,1,4,4,1
2,12,1,19,212,,0,18,7,1,1,1,1
3,13,1,13,477,,0,4,7,1,1,1,1
4,12,1,1,178,,0,9,9,3,1,2,1
5,3,1,17,115,,0,1,9,3,1,1,1
6,1,1,17,135,,0,1,9,3,1,1,1
7,9,1,25,442,,0,10,7,1,1,2,1
8,2,2,22,364,,0,9,8,2,1,5,1
9,3,1,19,135,,0,12,8,2,1,1,1


In [24]:
from sklearn.model_selection import train_test_split

predictors = train_df1.drop(['attributed_time', 'is_attributed'], axis=1)
target = train_df1["is_attributed"]

x_train, x_test, y_train, test = train_test_split(predictors, target, test_size = 0.1, random_state = 1)

In [25]:
x_train.head(30)

Unnamed: 0,app,device,os,channel,hour,day,wday,ip-day-hour,ip_app_count,ip_app_os_count
61010,6,1,30,459,11,7,1,1,1,1
77750,9,1,14,466,15,9,3,1,1,1
91401,26,1,16,121,15,9,3,1,1,1
42871,2,1,13,219,16,7,1,2,2,2
16411,18,1,19,121,2,9,3,1,6,1
87482,15,1,6,265,22,8,2,1,1,1
61007,18,1,22,107,13,9,3,1,1,1
95105,18,2,27,449,13,9,3,6,9,1
94181,1,1,13,17,14,8,2,1,1,1
10346,9,1,19,134,1,8,2,1,1,1


In [47]:
x_train, x_test, y_train, test = train_test_split(predictors, target, test_size = 0.1, random_state = 1)

In [48]:
print("훈련 세트 정확도: {:.3f}".format(gbk.score(x_train, y_train)))
print("테스트 세트 정확도: {:.3f}".format(gbk.score(x_test, test)))

훈련 세트 정확도: 0.998
테스트 세트 정확도: 0.998


In [49]:
print("훈련 세트 정확도: {:.3f}".format(gbk.score(x_train, y_train)))
print("테스트 세트 정확도: {:.3f}".format(gbk.score(x_test, test)))

훈련 세트 정확도: 0.998
테스트 세트 정확도: 0.998


In [50]:
plot_feature_importances_cancer(gbk)

NameError: name 'plot_feature_importances_cancer' is not defined

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [31]:
gbk_roc = roc_auc_score(test, gbk.predict_proba(x_test)[:,1])
print('ROC : ',gbk_roc)

ROC :  0.948601348319


In [32]:
predictions = gbk.predict_proba(x_test)
predictions

array([[  9.99545556e-01,   4.54444107e-04],
       [  9.99672115e-01,   3.27885149e-04],
       [  9.99523163e-01,   4.76836667e-04],
       ..., 
       [  9.99706818e-01,   2.93181506e-04],
       [  9.99523163e-01,   4.76836667e-04],
       [  9.99655956e-01,   3.44043711e-04]])

In [33]:
test = pd.read_csv(path+'test.csv', dtype=dtypes)
test.tail()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
18790464,18790464,99442,9,1,13,127,2017-11-10 15:00:00
18790465,18790465,88046,23,1,37,153,2017-11-10 15:00:00
18790466,18790467,81398,18,1,17,265,2017-11-10 15:00:00
18790467,18790466,123236,27,1,13,122,2017-11-10 15:00:00
18790468,18790468,73516,12,2,27,265,2017-11-10 15:00:00


In [34]:
test_df = prep_data(test)

grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping


In [35]:
test_df.head()

Unnamed: 0,click_id,app,device,os,channel,hour,day,wday,ip-day-hour,ip_app_count,ip_app_os_count
0,0,9,1,3,107,4,10,4,34,28,1
1,1,9,1,3,466,4,10,4,403,289,5
2,2,21,1,19,128,4,10,4,229,312,24
3,3,15,1,13,111,4,10,4,239,42,23
4,4,12,1,13,328,4,10,4,60,24,7


In [36]:
test_df = test_df.drop('click_id', axis = 1)

In [37]:
predictions = gbk.predict_proba(test_df)

In [38]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id'].astype('int')
test.drop(['click_id'], axis=1, inplace=True)
gc.collect()

53

In [39]:
sub2 = pd.DataFrame()
sub['click_id'] = sub['click_id'].astype('int')
gc.collect()

0

In [40]:
def convert_preds(raw_preds):
    preds = []
    for p in raw_preds:
        preds.append(1 - p[0])
    return preds

In [41]:
val_preds = convert_preds(predictions)

In [42]:
sub['is_attributed'] = val_preds
sub.head()

Unnamed: 0,click_id,is_attributed
0,0,0.000293
1,1,0.000328
2,2,0.000108
3,3,0.002578
4,4,0.000328


In [43]:
max(val_preds)

1.0

In [44]:
sub.to_csv('GBK.csv', float_format='%.8f', index=False)