In [2]:
# Standard library
import gc
import os
import time
import warnings

# Third-party
import numpy as np
import pandas as pd
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Silence library warnings so that cell outputs stay readable.
warnings.filterwarnings('ignore')
# Directory (relative to the notebook) that holds the input CSV files.
path = '../../../DEVELOPMENT/Fraud Detection/input/'

# Column -> dtype mapping handed to pd.read_csv so the id-like columns are
# stored as small unsigned integers instead of the default int64.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)



In [3]:
# Load the training sample; columns named in `dtypes` are read with those compact dtypes.
train_sample = pd.read_csv(path+'train_sample.csv', dtype=dtypes)

In [4]:
# Preview the first rows of the raw training sample.
train_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [18]:
def _add_group_count(frame, keys, name):
    """Add column ``name``: for each row, how many rows share its ``keys`` values."""
    # Counting the non-null 'channel' values per group and broadcasting the
    # result back with transform() replaces the groupby -> reset_index -> merge
    # round trip while keeping row order.
    frame[name] = frame.groupby(keys)['channel'].transform('count')
    return frame


def prep_data(d):
    """Derive time-of-click and click-count features from a click log.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain the columns ``ip``, ``app``, ``os``, ``channel`` and
        ``click_time`` (anything ``pd.to_datetime`` can parse). The frame is
        not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        A frame with a fresh 0..n-1 index holding every input column except
        ``click_time`` and ``ip``, plus:

        * ``hour``, ``day``, ``wday`` (uint8), taken from ``click_time``;
        * ``ip-day-hour``, ``ip_app_count``, ``ip_app_os_count`` (uint32),
          the number of rows sharing the same ip/day/hour, ip/app and
          ip/app/os combination respectively.

    Notes
    -----
    * Counts are stored as uint32: casting to uint16 wraps silently once a
      group exceeds 65,535 rows.
    * The previous ``LabelEncoder`` step was removed. Its result was never
      assigned, so it had no effect on the output. NOTE(review): simply
      assigning it would fit a separate encoder on every call, so train and
      test would presumably end up with incompatible codes; if encoding is
      wanted, fit the encoders once and pass them in.
    """
    # reset_index returns a new frame, so the caller's data is left untouched
    # and the result carries the same 0..n-1 index a merge would produce.
    d = d.reset_index(drop=True)

    # Parse the timestamps once and reuse them for all three time features.
    click_time = pd.to_datetime(d['click_time'])
    d['hour'] = click_time.dt.hour.astype('uint8')
    d['day'] = click_time.dt.day.astype('uint8')
    d['wday'] = click_time.dt.dayofweek.astype('uint8')

    print('grouping by ip-day-hour combination')
    _add_group_count(d, ['ip', 'day', 'hour'], 'ip-day-hour')

    print('group by ip-app combination')
    _add_group_count(d, ['ip', 'app'], 'ip_app_count')

    print('group by ip-app-os combination')
    _add_group_count(d, ['ip', 'app', 'os'], 'ip_app_os_count')

    print("vars and data type")
    for count_col in ('ip-day-hour', 'ip_app_count', 'ip_app_os_count'):
        d[count_col] = d[count_col].astype('uint32')

    print('dropping')
    # Keyword form: the positional `axis` argument was removed in pandas 2.0.
    return d.drop(columns=['click_time', 'ip'])

In [19]:
# Derive the time and click-count features for the training sample.
train_df1 = prep_data(train_sample)

grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping


In [20]:
# Preview the engineered training frame.
train_df1.head()

Unnamed: 0,app,device,os,channel,attributed_time,is_attributed,hour,day,wday,ip-day-hour,ip_app_count,ip_app_os_count
0,12,1,13,497,,0,9,7,1,1,3,2
1,25,1,17,259,,0,13,7,1,4,4,1
2,12,1,19,212,,0,18,7,1,1,1,1
3,13,1,13,477,,0,4,7,1,1,1,1
4,12,1,1,178,,0,9,9,3,1,2,1


In [21]:
from sklearn.model_selection import train_test_split

# Label column; `attributed_time` is removed from the features along with it.
target = train_df1["is_attributed"]
predictors = train_df1.drop(columns=['attributed_time', 'is_attributed'])

# Hold out 22% of the rows for validation, with a fixed seed for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    predictors,
    target,
    test_size=0.22,
    random_state=0,
)

#### Testing Different Models

- Gaussian Naive Bayes
- Logistic Regression
- Perceptron
- Decision Tree Classifier
- Random Forest Classifier
- KNN or K-Nearest Neighbors
- Stochastic Gradient Descent
- Gradient Boosting Classifier
- Support Vector Classifier (SVC)

In [22]:
# Gaussian Naive Bayes, scored by validation accuracy in percent.
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
# accuracy_score takes (y_true, y_pred); the metric is symmetric, so the value is unchanged.
acc_gaussian = round(100 * accuracy_score(y_val, y_pred), 2)
print(acc_gaussian)

98.64


In [23]:
# Logistic regression, scored by validation accuracy in percent.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(x_train, y_train)
logreg_y_pred = logreg.predict(x_val)
# accuracy_score takes (y_true, y_pred); the metric is symmetric, so the value is unchanged.
acc_logreg = round(100 * accuracy_score(y_val, logreg_y_pred), 2)
print(acc_logreg)

99.74


In [24]:
# Perceptron, scored by validation accuracy in percent.
from sklearn.linear_model import Perceptron

perceptron = Perceptron().fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
# accuracy_score takes (y_true, y_pred); the metric is symmetric, so the value is unchanged.
acc_perceptron = round(100 * accuracy_score(y_val, y_pred), 2)
print(acc_perceptron)

99.56


In [25]:
# Decision tree, scored by validation accuracy in percent.
from sklearn.tree import DecisionTreeClassifier

# Fixed seed (same value as the train/validation split) so that a fresh
# Restart & Run All reproduces the same tree and score.
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)
acc_decisiontree = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_decisiontree)

99.62


In [26]:
# Random forest, scored by validation accuracy in percent.
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the train/validation split) so that the
# bootstrap samples and feature draws, and therefore the score, are repeatable.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_val)
acc_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_randomforest)

99.73


In [27]:
# K-nearest neighbours, scored by validation accuracy in percent.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(x_train, y_train)
y_pred = knn.predict(x_val)
# accuracy_score takes (y_true, y_pred); the metric is symmetric, so the value is unchanged.
acc_knn = round(100 * accuracy_score(y_val, y_pred), 2)
print(acc_knn)

99.75


In [28]:
# Linear model trained with stochastic gradient descent, scored by validation accuracy in percent.
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data; a fixed seed (same value as the
# train/validation split) makes the fitted model and score repeatable.
sgd = SGDClassifier(random_state=0)
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
acc_sgd = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_sgd)

99.75


In [29]:
# Gradient boosting, scored by validation accuracy in percent.
from sklearn.ensemble import GradientBoostingClassifier

# This model later produces the submission, so pin its seed (same value as the
# train/validation split) to make the predictions reproducible.
gbk = GradientBoostingClassifier(random_state=0)
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
acc_gbk = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_gbk)

99.75


In [30]:
# Support vector classifier, scored by validation accuracy in percent.
from sklearn.svm import SVC

svc = SVC().fit(x_train, y_train)
y_pred = svc.predict(x_val)
# accuracy_score takes (y_true, y_pred); the metric is symmetric, so the value is unchanged.
acc_svc = round(100 * accuracy_score(y_val, y_pred), 2)
print(acc_svc)

99.75


In [31]:
# Validation accuracy (percent) per model, best first.
# NOTE(review): the scores are nearly identical across very different models,
# presumably because one class dominates the target - compare the ROC AUC
# table before drawing conclusions from accuracy.
models = pd.DataFrame({
    'Model': ['SVM', 'KNN', 'Logreg', 'RF', 'Naive Bayes', 'Perceptron',
              'Decision Tree', 'Stochastic GD', 'Gradient BoostingClassifier'],
    'Score': [acc_svc, acc_knn, acc_logreg, acc_randomforest, acc_gaussian,
              acc_perceptron, acc_decisiontree, acc_sgd, acc_gbk]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,SVM,99.75
1,KNN,99.75
7,Stocastic GD,99.75
8,Gradient BoostingClassifier,99.75
2,Logreg,99.74
3,RF,99.73
6,Decisino Tree,99.62
5,Perceptron,99.56
4,Naive Bayes,98.64


In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Validation ROC AUC for logistic regression, scored on the second
# predict_proba column (the positive class for a 0/1 target).
logreg_roc = roc_auc_score(
    y_val,
    logreg.predict_proba(x_val)[:, 1],
)
print('ROC : ', logreg_roc)

ROC :  0.742700575491


In [33]:
# Validation ROC AUC for Gaussian Naive Bayes, using the second predict_proba column as the score.
gaussian_roc = roc_auc_score(y_val, gaussian.predict_proba(x_val)[:,1])
print('ROC : ',gaussian_roc)

ROC :  0.785636164783


In [34]:
# Validation ROC AUC for the decision tree, using the second predict_proba column as the score.
decisiontree_roc = roc_auc_score(y_val, decisiontree.predict_proba(x_val)[:,1])
print('ROC : ',decisiontree_roc)

ROC :  0.659401691318


In [35]:
# Validation ROC AUC for the random forest, using the second predict_proba column as the score.
randomforest_roc = roc_auc_score(y_val, randomforest.predict_proba(x_val)[:,1])
print('ROC : ',randomforest_roc)

ROC :  0.791417113692


In [36]:
# Validation ROC AUC for k-nearest neighbours, using the second predict_proba column as the score.
knn_roc = roc_auc_score(y_val, knn.predict_proba(x_val)[:,1])
print('ROC : ',knn_roc)

ROC :  0.782692389068


In [37]:
# Validation ROC AUC for gradient boosting, using the second predict_proba column as the score.
gbk_roc = roc_auc_score(y_val, gbk.predict_proba(x_val)[:,1])
print('ROC : ',gbk_roc)

ROC :  0.834874729832


In [38]:
# Validation ROC AUC per model, best first. Labels follow the accuracy table
# ('RF', 'KNN') instead of mixing in variable names.
models = pd.DataFrame({
    'Model': ['Logreg', 'Naive Bayes', 'Decision Tree', 'RF', 'KNN',
              'Gradient BoostingClassifier'],
    'ROC': [logreg_roc, gaussian_roc, decisiontree_roc, randomforest_roc,
            knn_roc, gbk_roc]})
models.sort_values(by='ROC', ascending=False)

Unnamed: 0,Model,ROC
5,Gradient BoostingClassifier,0.834875
3,randomforest_roc,0.791417
1,Naive Bayes,0.785636
4,knn,0.782692
0,Logreg,0.742701
2,Decisino Tree,0.659402


In [59]:
# Load the test set with the same dtype mapping, then show its last rows.
test = pd.read_csv(path+'test.csv', dtype=dtypes)
test.tail()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
18790464,18790464,99442,9,1,13,127,2017-11-10 15:00:00
18790465,18790465,88046,23,1,37,153,2017-11-10 15:00:00
18790466,18790467,81398,18,1,17,265,2017-11-10 15:00:00
18790467,18790466,123236,27,1,13,122,2017-11-10 15:00:00
18790468,18790468,73516,12,2,27,265,2017-11-10 15:00:00


In [40]:
# Derive the same time and click-count features for the test set.
test_df = prep_data(test)

grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping


In [50]:
# Preview the engineered test frame.
test_df.head()

Unnamed: 0,click_id,app,device,os,channel,hour,day,wday,ip-day-hour,ip_app_count,ip_app_os_count
0,0,9,1,3,107,4,10,4,34,28,1
1,1,9,1,3,466,4,10,4,403,289,5
2,2,21,1,19,128,4,10,4,229,312,24
3,3,15,1,13,111,4,10,4,239,42,23
4,4,12,1,13,328,4,10,4,60,24,7


In [52]:
# `click_id` is an identifier, not a feature the models were trained on.
# errors='ignore' keeps the cell re-runnable: a second run would otherwise
# raise KeyError because the column is already gone.
test_df = test_df.drop(columns='click_id', errors='ignore')

In [53]:
# Class probabilities from the gradient boosting model for every test row.
predictions = gbk.predict_proba(test_df)

In [60]:
# Start the submission frame from the test set's click ids.
sub = pd.DataFrame({'click_id': test['click_id'].astype('int')})
# The ids are no longer needed on `test`: drop them in place and reclaim memory.
test.drop(columns=['click_id'], inplace=True)
gc.collect()

298

In [61]:
# NOTE(review): `sub2` is not used by any other cell visible here - confirm it can be removed.
sub2 = pd.DataFrame()
# `click_id` was already cast to int when `sub` was created, so this cast looks redundant.
sub['click_id'] = sub['click_id'].astype('int')
gc.collect()

0

In [67]:
def convert_preds(raw_preds):
    """Collapse per-row probability pairs into a single score per row.

    For every row in ``raw_preds`` the score is one minus the row's first
    entry; for two-class probabilities that is the probability of the
    second class. Returns a plain list in the same order as the input.
    """
    return [1 - row[0] for row in raw_preds]

In [68]:
# One score per row from the class-probability array.
# NOTE(review): despite the name, these come from the test-set predictions, not the validation split.
val_preds = convert_preds(predictions)

In [70]:
# Attach the scores to the click ids. A list is assigned by position, so this
# assumes the rows of `test_df` are still in the same order as `test` - TODO confirm.
sub['is_attributed'] = val_preds
sub.head()

Unnamed: 0,click_id,is_attributed
0,0,0.000384
1,1,0.000334
2,2,0.000334
3,3,0.000384
4,4,0.000334


In [71]:
# Write the submission file: floats formatted to 8 decimal places, no index column.
sub.to_csv('GBK.csv', float_format='%.8f', index=False)