In [3]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')
import os
path = '../../../DEVELOPMENT/Fraud Detection/input/'

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

In [4]:
train_sample = pd.read_csv(path+'train_sample.csv', dtype=dtypes)

In [28]:
train_sample.head(10)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,wday
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,9,7,1
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,13,7,1
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,18,7,1
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,4,7,1
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,9,9,3
5,93663,3,1,17,115,2017-11-09 01:22:13,,0,1,9,3
6,17059,1,1,17,135,2017-11-09 01:17:58,,0,1,9,3
7,121505,9,1,25,442,2017-11-07 10:01:53,,0,10,7,1
8,192967,2,2,22,364,2017-11-08 09:35:17,,0,9,8,2
9,143636,3,1,19,135,2017-11-08 12:35:26,,0,12,8,2


In [6]:
def prep_data(d):
    d['hour'] = pd.to_datetime(d.click_time).dt.hour.astype('uint8')
    d['day'] = pd.to_datetime(d.click_time).dt.day.astype('uint8')
    d['wday']  = pd.to_datetime(d.click_time).dt.dayofweek.astype('uint8')
    
    print('grouping by ip-day-hour combination')
    gp = d[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip-day-hour'})
    d = d.merge(gp, on=['ip','day','hour'], how='left')
    del gp; gc.collect()
    
    print('group by ip-app combination')
    gp = d[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
    d = d.merge(gp, on=['ip','app'], how='left')
    del gp; gc.collect()
    
    print('group by ip-app-os combination')
    gp = d[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
    d = d.merge(gp, on=['ip','app', 'os'], how='left')
    del gp; gc.collect()
    
    print("vars and data type")
    d['ip-day-hour'] = d['ip-day-hour'].astype('uint16')
    d['ip_app_count'] = d['ip_app_count'].astype('uint16')
    d['ip_app_os_count'] = d['ip_app_os_count'].astype('uint16')
    
    print("label encoding....")
    from sklearn.preprocessing import LabelEncoder
    d[['app','device','os', 'channel', 'hour', 'day', 'wday']].apply(LabelEncoder().fit_transform)
    print('dropping')
    d.drop(['click_time', 'ip'], 1, inplace=True)
    
    return d

In [7]:
train_df1 = prep_data(train_sample)

grouping by ip-day-hour combination
group by ip-app combination
group by ip-app-os combination
vars and data type
label encoding....
dropping


In [8]:
train_df1.head()

Unnamed: 0,app,device,os,channel,attributed_time,is_attributed,hour,day,wday,ip-day-hour,ip_app_count,ip_app_os_count
0,12,1,13,497,,0,9,7,1,1,3,2
1,25,1,17,259,,0,13,7,1,4,4,1
2,12,1,19,212,,0,18,7,1,1,1,1
3,13,1,13,477,,0,4,7,1,1,1,1
4,12,1,1,178,,0,9,9,3,1,2,1


In [9]:
train_df2 = train_df1.drop(['app','device','os','channel','attributed_time','hour','day','wday'], axis =1)

In [10]:
from sklearn.model_selection import train_test_split

predictors = train_df2.drop(['is_attributed'], axis=1)
target = train_df1["is_attributed"]
x_train, x_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.22, random_state = 0)

#### Testing Different Models

- Gaussian Naive Bayes
- Logistic Regression
- Perceptron
- Deicision Tree Classifier
- KNN or K-Nearest Neighbors
- Stochastic Gradient Decent
- Gradient Boosting Classifer

In [11]:
#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
acc_gaussian = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_gaussian)

99.75


In [12]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_y_pred = logreg.predict(x_val)
acc_logreg = round(accuracy_score(logreg_y_pred, y_val) *100, 2)
print(acc_logreg)

99.75


In [13]:
from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
acc_perceptron = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_perceptron)

99.75


In [14]:
from sklearn.tree import DecisionTreeClassifier

decisiontree = DecisionTreeClassifier()
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)
acc_decisiontree = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_decisiontree)

99.75


In [15]:
from sklearn.ensemble import RandomForestClassifier

randomforest = RandomForestClassifier()
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_val)
acc_randomforest = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_randomforest)

99.75


In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_val)
acc_knn = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_knn)

99.75


In [17]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
acc_sgd = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_sgd)

99.75


In [18]:
from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier()
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
acc_gbk = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_gbk)

99.75


In [19]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_val)
acc_svc = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_svc)

99.75


In [20]:
models = pd.DataFrame({
    'Model':['SVM','KNN','Logreg','RF','Naive Bayes', 'Perceptron', 'Decisino Tree', 'Stocastic GD', 'Gradient BoostingClassifier'], 
    'Score' : [acc_svc,acc_knn, acc_logreg, acc_randomforest, acc_gaussian, acc_perceptron, acc_decisiontree, acc_sgd, acc_gbk]})
models.sort_values(by='Score',ascending=False)

Unnamed: 0,Model,Score
0,SVM,99.75
1,KNN,99.75
2,Logreg,99.75
3,RF,99.75
4,Naive Bayes,99.75
5,Perceptron,99.75
6,Decisino Tree,99.75
7,Stocastic GD,99.75
8,Gradient BoostingClassifier,99.75


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

logreg_roc = roc_auc_score(y_val, logreg.predict_proba(x_val)[:,1])
print('ROC : ',logreg_roc)

ROC :  0.644407761966


In [22]:
gaussian_roc = roc_auc_score(y_val, gaussian.predict_proba(x_val)[:,1])
print('ROC : ',gaussian_roc)

ROC :  0.636396297719


In [23]:
decisiontree_roc = roc_auc_score(y_val, decisiontree.predict_proba(x_val)[:,1])
print('ROC : ',decisiontree_roc)

ROC :  0.640660805427


In [24]:
randomforest_roc = roc_auc_score(y_val, randomforest.predict_proba(x_val)[:,1])
print('ROC : ',randomforest_roc)

ROC :  0.637039574762


In [25]:
knn_roc = roc_auc_score(y_val, knn.predict_proba(x_val)[:,1])
print('ROC : ',knn_roc)

ROC :  0.499931644185


In [26]:
gbk_roc = roc_auc_score(y_val, gbk.predict_proba(x_val)[:,1])
print('ROC : ',gbk_roc)

ROC :  0.641052630722


In [27]:
models = pd.DataFrame({
    'Model':['Logreg','Naive Bayes', 'Decisino Tree', 'randomforest_roc', 'knn','Gradient BoostingClassifier'], 
    'ROC' : [logreg_roc, gaussian_roc, decisiontree_roc, randomforest_roc, knn_roc, gbk_roc]})
models.sort_values(by='ROC',ascending=False)

Unnamed: 0,Model,ROC
0,Logreg,0.644408
5,Gradient BoostingClassifier,0.641053
2,Decisino Tree,0.640661
3,randomforest_roc,0.63704
1,Naive Bayes,0.636396
4,knn,0.499932
