In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
sns.set(font_scale=1)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import matplotlib.pyplot as plt
%matplotlib inline
import time
from subprocess import check_output
path = '../../../DEVELOPMENT/Fraud Detection/input/'
import gc


dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

from sklearn.linear_model import LogisticRegression
from scipy.special import expit, logit
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix

In [4]:
train_sample = pd.read_csv(path+'train_sample.csv', dtype=dtypes)
test = pd.read_csv(path+'test.csv', dtype=dtypes)

In [5]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id'].astype('int')
test.drop(['click_id'], axis=1, inplace=True)
gc.collect()

33

In [6]:
def prep_data(d):
    d['hour'] = pd.to_datetime(d.click_time).dt.hour.astype('uint8')
    d['day'] = pd.to_datetime(d.click_time).dt.day.astype('uint8')
    d['wday']  = pd.to_datetime(d.click_time).dt.dayofweek.astype('uint8')

    print('hour/download combination')
    gp = d.groupby('hour', as_index = False).count()
    gp = d[['hour', 'app', 'attributed_time']].groupby(by='hour').count().reset_index()
    gp = gp.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
    gp['h_download_rate'] = gp['download_count'] / gp['click_count'] * 100
    d = d.merge(gp[['hour','h_download_rate']], on='hour' ,how='left')
    del gp; gc.collect()
    
    print('ip/download combination')
    gp = d.groupby('ip', as_index = False).count()
    gp = d[['ip', 'app', 'attributed_time']].groupby(by='ip').count().reset_index()
    gp = gp.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
    gp['i_download_rate'] = gp['download_count'] / gp['click_count'] * 100
    d = d.merge(gp[['ip','i_download_rate']], on='ip' ,how='left')
    del gp; gc.collect()
    
    print('device/download combination')
    gp = d.groupby('device', as_index = False).count()
    gp = d[['device', 'app', 'attributed_time']].groupby(by='device').count().reset_index()
    gp = gp.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
    gp['d_download_rate'] = gp['download_count'] / gp['click_count'] * 100
    d = d.merge(gp[['device','d_download_rate']], on='device' ,how='left')
    del gp; gc.collect()

    print('os/download combination')
    gp = d.groupby('os', as_index = False).count()
    gp = d[['os', 'app', 'attributed_time']].groupby(by='os').count().reset_index()
    gp = gp.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
    gp['o_download_rate'] = gp['download_count'] / gp['click_count'] * 100
    d = d.merge(gp[['os','o_download_rate']], on='os' ,how='left')
    del gp; gc.collect()
    
    print('channel/download combination')
    gp = d.groupby('channel', as_index = False).count()
    gp = d[['channel', 'app', 'attributed_time']].groupby(by='channel').count().reset_index()
    gp = gp.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
    gp['c_download_rate'] = gp['download_count'] / gp['click_count'] * 100
    d = d.merge(gp[['channel','c_download_rate']], on='channel' ,how='left')
    del gp; gc.collect()

    print("vars and data type")
    d['h_download_rate'] = d['h_download_rate'].astype('float64')
    d['i_download_rate'] = d['i_download_rate'].astype('float64')
    d['d_download_rate'] = d['d_download_rate'].astype('float64')
    d['o_download_rate'] = d['o_download_rate'].astype('float64')
    d['c_download_rate'] = d['c_download_rate'].astype('float64')
    
    print("label encoding....")
    from sklearn.preprocessing import LabelEncoder
    d[['app','device','os', 'channel', 'hour', 'day', 'wday']].apply(LabelEncoder().fit_transform)
    print('dropping')
    d.drop(['click_time', 'attributed_time'], 1, inplace=True)
        
    return d

In [7]:
train_df = prep_data(train_sample)

hour/download combination
ip/download combination
device/download combination
os/download combination
channel/download combination
vars and data type
label encoding....
dropping


In [13]:
train_df.tail(10)

Unnamed: 0,ip,app,device,os,channel,is_attributed,hour,day,wday,h_download_rate,i_download_rate,d_download_rate,o_download_rate,c_download_rate
99990,84388,9,1,22,107,0,4,9,3,0.166,0.0,0.155,0.149,0.022
99991,43374,1,1,18,134,0,12,9,3,0.263,0.0,0.155,0.083,0.031
99992,125061,12,1,13,328,0,9,7,1,0.205,0.0,0.155,0.118,0.0
99993,146258,3,1,13,280,0,3,9,3,0.285,0.0,0.155,0.118,0.025
99994,147153,13,1,13,477,0,7,7,1,0.255,0.0,0.155,0.118,0.0
99995,124883,11,1,19,122,0,13,9,3,0.178,0.0,0.155,0.172,0.0
99996,85150,9,1,13,244,0,11,7,1,0.187,0.0,0.155,0.118,0.0
99997,18839,3,1,13,19,0,11,8,2,0.187,0.0,0.155,0.118,0.0
99998,114276,15,1,12,245,0,17,8,2,0.092,0.0,0.155,0.091,0.0
99999,119349,14,1,15,401,0,14,7,1,0.306,0.0,0.155,0.167,0.0


In [12]:
test.tail(10)

Unnamed: 0,ip,app,device,os,channel,click_time
18790459,102467,15,1,17,140,2017-11-10 15:00:00
18790460,80537,9,1,19,445,2017-11-10 15:00:00
18790461,101214,2,2,16,237,2017-11-10 15:00:00
18790462,113418,17,1,17,128,2017-11-10 15:00:00
18790463,69245,12,1,13,135,2017-11-10 15:00:00
18790464,99442,9,1,13,127,2017-11-10 15:00:00
18790465,88046,23,1,37,153,2017-11-10 15:00:00
18790466,81398,18,1,17,265,2017-11-10 15:00:00
18790467,123236,27,1,13,122,2017-11-10 15:00:00
18790468,73516,12,2,27,265,2017-11-10 15:00:00


In [40]:
a = train_df[['ip','app','os','channel','hour','h_download_rate']].groupby(by=['ip','app','os','channel','hour'])[['h_download_rate']].mean().reset_index()
a.head()

Unnamed: 0,ip,app,os,channel,hour,h_download_rate
0,9,9,13,244,16,0.0
1,10,11,22,319,1,0.294
2,10,12,19,140,7,0.255
3,10,18,13,107,11,0.187
4,19,14,16,379,9,0.205


In [35]:
def prep_data1(d):
    
    print('grouping by ip-day-hour combination')
    gp = d[['ip','app','os','channel','hour','h_download_rate']].groupby(by=['ip','app','os','channel','hour'])[['h_download_rate']].mean().reset_index()
    d = d.merge(gp, on=['ip','app','os','channel','hour'], how='left')
    del gp; gc.collect()
    
    return d

In [26]:
train_sample[['ip','day','hour','channel']].head()
train_sample[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,channel
ip,day,hour,Unnamed: 3_level_1
9,7,16,1
10,7,1,1
10,7,7,1
10,8,11,1
19,8,9,1


In [9]:
# Change time
data['click_time_dt']= pd.to_datetime(data['click_time'])
dt= data['click_time_dt'].dt
data['day'] = dt.day.astype('uint8')
data['hour'] = dt.hour.astype('uint8')
data['minute'] = dt.minute.astype('uint8')

NameError: name 'data' is not defined

In [None]:
count_per_hour = data.groupby('hour', as_index = False).count()
count_per_hour.head()

In [None]:
count_per_hour = count_per_hour[['hour', 'app', 'attributed_time']]
count_per_hour.head()

In [None]:
count_per_hour = count_per_hour.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
count_per_hour['download_rate'] = count_per_hour['download_count'] / count_per_hour['click_count'] * 100
count_per_hour.head()

In [None]:
data.head()

In [None]:
#What I cannot find meaningful result
data = data.drop('click_time_dt', axis = 1)

In [None]:
count_per_hour = data.groupby('hour', as_index = False).count()
count_per_hour.head()

In [None]:
gp = data[['hour', 'app', 'attributed_time']].groupby(by='hour').count().reset_index()
gp.head()

In [None]:
gp = gp.rename(columns = {'app': 'click_count', 'attributed_time': 'download_count'})
gp['h_download_rate'] = gp['download_count'] / gp['click_count'] * 100
gp.head()

In [None]:
click_ip_count = data.groupby('ip', as_index= False)['device'].aggregate('count').sort_values('device', ascending=False)
click_ip_count = click_ip_count.rename(columns={'device':'count'})
click_ip_count.head()

In [None]:
ip_click_ranking = data.groupby('ip', as_index = False).count().sort_values(by = 'app', ascending=False)
ip_click_ranking = ip_click_ranking[['ip', 'app']]
ip_click_ranking = ip_click_ranking.rename(columns={'app':'click_count'})
ip_click_ranking.head(10)

In [None]:
ip_click_download = data.groupby('ip', as_index = False).sum().sort_values(by='app', ascending = False).reset_index()
ip_click_download = ip_click_download[['ip', 'app', 'is_attributed']]
ip_click_download = ip_click_download.rename(columns={'is_attributed': 'download_count', 'app': 'click_count'})

ip_click_download.head()

In [None]:
ip_click_download['i_download_rate'] = ip_click_download['download_count'] / ip_click_download['click_count'] * 100
ip_click_download.head()

In [None]:
data = data.merge(ip_click_download[['ip','i_download_rate']], on='ip' ,how='left')

In [None]:
dll_cnt_device = data.groupby('device', as_index=False).sum().sort_values(by='is_attributed', ascending=False)
dll_cnt_device = dll_cnt_device[['device', 'is_attributed']]
dll_cnt_device = dll_cnt_device.rename(columns={'is_attributed': 'download_count'})
dll_cnt_device.head()

In [None]:
click_cnt_device = data.groupby('device', as_index=False).count().sort_values(by='app', ascending=False)
click_cnt_device = click_cnt_device[['device', 'app']]
click_cnt_device = click_cnt_device.rename(columns={'app': 'click_count'})
click_cnt_device.head()

In [None]:
device_click_download_df = pd.merge(click_cnt_device, dll_cnt_device, on='device') 
device_click_download_df['d_download_rate'] = device_click_download_df['download_count']/ device_click_download_df['click_count'] * 100 
device_click_download_df.head()

In [None]:
data = data.merge(device_click_download_df[['device','d_download_rate']], on='device' ,how='left')

In [None]:
dll_cnt_os = data.groupby('os', as_index=False).sum().sort_values(by='is_attributed', ascending=False)
dll_cnt_os = dll_cnt_os[['os', 'is_attributed']]
dll_cnt_os = dll_cnt_os.rename(columns={'is_attributed': 'download_count'})
dll_cnt_os.head()

In [None]:
click_cnt_os = data.groupby('os', as_index=False).count().sort_values(by='app', ascending=False)
click_cnt_os = click_cnt_os[['os', 'app']]
click_cnt_os = click_cnt_os.rename(columns={'app': 'click_count'})
click_cnt_os.head()

In [None]:
os_click_download_df = pd.merge(click_cnt_os, dll_cnt_os, on='os') 
os_click_download_df['o_download_rate'] = os_click_download_df['download_count']/ os_click_download_df['click_count'] * 100 
os_click_download_df.head()

In [None]:
data = data.merge(os_click_download_df[['os','o_download_rate']], on='os' ,how='left')

In [None]:
click_ip_count = data.groupby('app', as_index= False)['device'].aggregate('count').sort_values('device', ascending=False)
click_ip_count = click_ip_count.rename(columns={'device':'click_count'})
click_ip_count = click_ip_count[click_ip_count['click_count'] > 100]
click_ip_count.head()

In [None]:
click_cnt_channel = data.groupby('channel', as_index=False).count().sort_values(by='app', ascending=False)
click_cnt_channel = click_cnt_channel[['channel', 'app']]
click_cnt_channel = click_cnt_channel.rename(columns={'app': 'click_count'})

click_cnt_channel.head()

In [None]:
channel_click_download_df = pd.merge(click_cnt_channel, dll_cnt_channel, on='channel') 
channel_click_download_df['c_download_rate'] = channel_click_download_df['download_count']/ channel_click_download_df['click_count'] * 100 
channel_click_download_df.head(10)

In [None]:
data = data.merge(channel_click_download_df[['channel','c_download_rate']], on='channel' ,how='left')

In [None]:
data.head()

In [None]:
data = data.drop(['click_time','attributed_time'], axis=1)

In [None]:
data.head(30)

In [None]:
RANDOM_SEED = 1
import random
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

X_train, X_test = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)

y_train = X_train['is_attributed']
X_train = X_train.drop(['is_attributed'], axis=1)
y_test = X_test['is_attributed']
X_test = X_test.drop(['is_attributed'], axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=9, random_state=0)
rf.fit(X_train, y_train)

In [None]:
predictions = rf.predict_proba(X_test)
predictions

In [None]:
def convert_preds(raw_preds):
    preds = []
    for p in raw_preds:
        preds.append(1 - p[0])
    return preds

In [None]:
val_preds = convert_preds(predictions)

In [None]:
max(val_preds)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

fpr, tpr, thresholds = roc_curve(y_test, val_preds)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show();

In [None]:
test.tail()