In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import time
from datetime import datetime
import os
import gc
import random
path = '../../../DEVELOPMENT/Fraud Detection/input/'

In [2]:
train_sample = pd.read_csv(path+'train_sample.csv')
test = pd.read_csv(path+'test.csv')
np.random.seed(0)

In [3]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id'].astype('int')
test.drop(['click_id'], axis=1, inplace=True)
gc.collect()

28

In [4]:
def prep_data(d):
    d['hour'] = pd.to_datetime(d.click_time).dt.hour.astype('uint8')
    d['day'] = pd.to_datetime(d.click_time).dt.day.astype('uint8')
    d['wday']  = pd.to_datetime(d.click_time).dt.dayofweek.astype('uint8')
    
    print('grouping by ip-day-hour combination')
    gp = d[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'qty'})
    d = d.merge(gp, on=['ip','day','hour'], how='left')
    del gp; gc.collect()
    
    print('group by ip-app combination')
    gp = d[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
    d = d.merge(gp, on=['ip','app'], how='left')
    del gp; gc.collect()
    
    print('group by ip-app-os combination')
    gp = d[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
    d = d.merge(gp, on=['ip','app', 'os'], how='left')
    del gp; gc.collect()
    
    print("vars and data type")
    d['qty'] = d['qty'].astype('uint16')
    d['ip_app_count'] = d['ip_app_count'].astype('uint16')
    d['ip_app_os_count'] = d['ip_app_os_count'].astype('uint16')
    
    print("label encoding....")
    from sklearn.preprocessing import LabelEncoder
    d[['app','device','os', 'channel', 'hour', 'day', 'wday']].apply(LabelEncoder().fit_transform)
    print('dropping')
    d.drop(['click_time', 'ip'], 1, inplace=True)
    
    return d

In [5]:
train_df = prep_data(train_sample)
test = prep_data(test)

grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping
grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping


In [6]:
train_df.head()

Unnamed: 0,app,device,os,channel,attributed_time,is_attributed,hour,day,wday,qty,ip_app_count,ip_app_os_count
0,12,1,13,497,,0,9,7,1,1,3,2
1,25,1,17,259,,0,13,7,1,4,4,1
2,12,1,19,212,,0,18,7,1,1,1,1
3,13,1,13,477,,0,4,7,1,1,1,1
4,12,1,1,178,,0,9,9,3,1,2,1


In [7]:
test.tail()

Unnamed: 0,app,device,os,channel,hour,day,wday,qty,ip_app_count,ip_app_os_count
18790464,9,1,13,127,15,10,4,2,16,3
18790465,23,1,37,153,15,10,4,1,30,2
18790466,18,1,17,265,15,10,4,2,190,7
18790467,27,1,13,122,15,10,4,1,1,1
18790468,12,2,27,265,15,10,4,4,14704,94


In [8]:
y = train_df['is_attributed']
x_train = train_df.drop(['is_attributed','attributed_time'],axis=1)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

In [10]:
my_pipeline = make_pipeline(Imputer(), RandomForestClassifier())

In [12]:
scores = cross_val_score(my_pipeline, x_train, y, scoring='roc_auc', cv=10)
print(scores)

[ 0.90906734  0.82251824  0.77989621  0.75682112  0.82242201  0.80074388
  0.77936646  0.79237929  0.88343189  0.86038115]


In [13]:
my_pipeline.fit(x_train, y)
prediction = my_pipeline.predict(test)
len(prediction)

18790469

In [14]:
sub['is_attributed'] = prediction
sub.head()

Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [15]:
sub.to_csv('RF4_Pipeline.csv', float_format='%.8f', index=False)