In [164]:
import numpy as np
import pandas as pd

# Load the raw export. The first CSV column is consumed as the index and then
# thrown away by the reset, leaving a fresh 0..n-1 positional index.
input_samples = (
    pd.read_csv('data.csv', header=0, index_col=0, na_filter=True)
    .reset_index(drop=True)
)

# Basic data research

In [165]:
# shape and values
print input_samples.shape
print input_samples.loc[1]

(100000, 54)
idvisit                                10948664
idvisitor                      34ace250e6f0d5dd
visitor_localtime                      09:15:48
visitor_returning                             t
visitor_count_visits                         46
visitor_days_since_last                       3
visitor_days_since_first                     89
visit_first_action_time     2014-02-03 08:16:02
visit_last_action_time      2014-02-03 08:33:17
visit_exit_idaction_url                  205296
visit_entry_idaction_url                 205296
visit_total_actions                           2
visit_total_time                        1035682
referer_name                        Lollipop_PL
referer_url                                 NaN
referer_keyword                             NaN
config_os                                   WNT
config_browser_name                          CH
config_browser_version                       32
config_resolution                      1360x768
config_pdf                 

In [143]:
# find amount of Nan values
for name in input_samples.keys():
    keys = pd.isnull(input_samples[name])
    if keys.any():
        print name
        print keys.value_counts()

referer_name
False    98339
True      1661
dtype: int64
referer_url
True     88826
False    11174
dtype: int64
referer_keyword
True     89804
False    10196
dtype: int64
config_browser_name
False    96106
True      3894
dtype: int64
config_browser_version
False    96106
True      3894
dtype: int64
location_city
False    78580
True     21420
dtype: int64
utm_source
False    97392
True      2608
dtype: int64
utm_medium
False    96862
True      3138
dtype: int64
utm_term
True     99044
False      956
dtype: int64
utm_content
True     98889
False     1111
dtype: int64
utm_campaign
False    96862
True      3138
dtype: int64
channel
True     97452
False     2548
dtype: int64
idaction_name_ref
False    93985
True      6015
dtype: int64
is_lead
True     99319
False      681
dtype: int64


In [166]:
# find variability
for name in input_samples.keys():
    print name, len(input_samples[name].value_counts(dropna=False).keys())

idvisit 3711
idvisitor 2036
visitor_localtime 3594
visitor_returning 3
visitor_count_visits 108
visitor_days_since_last 75
visitor_days_since_first 149
visit_first_action_time 3688
visit_last_action_time 3691
visit_exit_idaction_url 496
visit_entry_idaction_url 499
visit_total_actions 14
visit_total_time 700
referer_name 91
referer_url 507
referer_keyword 154
config_os 8
config_browser_name 7
config_browser_version 38
config_resolution 110
config_pdf 3
config_flash 3
config_java 3
config_director 3
config_quicktime 3
config_realplayer 3
config_windowsmedia 3
config_gears 2
config_silverlight 3
config_cookie 3
location_ip 2649
location_browser_lang 80
location_country 23
location_region 82
location_city 752
location_latitude 757
location_longitude 762
location_ip_raw 2646
referer_type 4
utm_source 54
utm_medium 44
utm_term 32
utm_content 22
utm_campaign 67
channel 17
idlink_va 26817
idvisitor.1 2036
server_time 26664
idvisit.1 24333
idaction_url 1328
idaction_url_ref 344
idaction_name_r

In [120]:
# target variable distribution
print input_samples["is_lead"].value_counts(dropna = False)

NaN     99319
lead      681
dtype: int64


In [167]:
# split sets by leads/not_leads 
leads = input_samples[pd.notnull(input_samples["is_lead"])]
not_leads = input_samples[pd.isnull(input_samples["is_lead"])]
print "Leads", "Not leads"
print len(leads),  len(not_leads)
for name in leads.keys():
    common = len(set(leads[name].value_counts(dropna=False).keys()) & set(not_leads[name].value_counts(dropna=False).keys()))
    print name, common, len(leads[name].value_counts(dropna=False).keys()),len(not_leads[name].value_counts(dropna=False).keys())

Leads Not leads
681 99319
idvisit 0 21 3690
idvisitor 11 21 2026
visitor_localtime 2 22 3574
visitor_returning 3 3 3
visitor_count_visits 6 6 108
visitor_days_since_last 5 5 75
visitor_days_since_first 8 8 149
visit_first_action_time 1 22 3667
visit_last_action_time 1 22 3670
visit_exit_idaction_url 8 16 488
visit_entry_idaction_url 8 18 489
visit_total_actions 7 8 13
visit_total_time 0 21 679
referer_name 12 16 87
referer_url 4 15 496
referer_keyword 6 8 152
config_os 4 4 8
config_browser_name 5 5 7
config_browser_version 10 11 38
config_resolution 9 10 109
config_pdf 3 3 3
config_flash 3 3 3
config_java 3 3 3
config_director 3 3 3
config_quicktime 3 3 3
config_realplayer 3 3 3
config_windowsmedia 3 3 3
config_gears 2 2 2
config_silverlight 3 3 3
config_cookie 3 3 3
location_ip 8 22 2635
location_browser_lang 11 11 80
location_country 6 6 23
location_region 12 12 82
location_city 10 14 748
location_latitude 12 17 753
location_longitude 12 17 758
location_ip_raw 8 22 2632
referer_type 

# Data preprocessing

In [168]:
# target vector
y = input_samples["is_lead"].copy()

y[pd.notnull(y)] = 1
y[pd.isnull(y)] = 0
print y.value_counts(dropna=False)

 0     99319
 1       681
NaN        0
dtype: int64


In [169]:
# Columns removed before feature engineering: the target itself, the
# identifier-like fields and the visit end timestamp.
useless_features = ["is_lead", "idvisit", "idvisitor","location_ip","location_ip_raw",
                    "idlink_va","idvisitor.1","idvisit.1","visit_last_action_time"]

# drop() returns a new frame, so input_samples itself is left untouched
clean_input = input_samples.drop(useless_features, axis=1)

In [170]:
import time
from datetime import datetime as dt

# Raw timestamp columns; each one is replaced by the coarse features below.
time_fields = ["visitor_localtime", "visit_first_action_time", "server_time"]

# Parse each timestamp column once, vectorised, instead of calling strptime
# row by row through DataFrame.apply (the first-action column was previously
# parsed twice). The explicit formats are the same ones strptime was given.
first_action_ts = pd.to_datetime(clean_input["visit_first_action_time"],
                                 format="%Y-%m-%d %H:%M:%S")
local_ts = pd.to_datetime(clean_input["visitor_localtime"], format="%H:%M:%S")
server_ts = pd.to_datetime(clean_input["server_time"], format="%Y-%m-%d %H:%M:%S")

# convert to day of week (Monday=0) and hour
clean_input["first_act_hour"] = first_action_ts.dt.hour
clean_input["first_act_dow"] = first_action_ts.dt.weekday

# convert to hour
clean_input["local_hour"] = local_ts.dt.hour
clean_input["server_hour"] = server_ts.dt.hour

clean_input = clean_input.drop(time_fields, axis=1)

In [None]:
# all features overview

#idvisit 3711 -> drop
#idvisitor 2036 -> drop
#visitor_localtime 3594 -> to hour
#visitor_returning 3 -> categorical
#visitor_count_visits 108 -> numerical
#visitor_days_since_last 75 -> numerical
#visitor_days_since_first 149 -> numerical
#visit_first_action_time 3688 -> hour and dayofweek
#visit_last_action_time 3691 -> drop
#visit_exit_idaction_url 496 -> categorical
#visit_entry_idaction_url 499 -> categorical
#visit_total_actions 14 -> numerical
#visit_total_time 700 -> numerical (check for log or conversion)
#referer_name 91 -> categorical
#referer_url 507 -> categorical
#referer_keyword 154 -> categorical
#config_os 8 -> categorical
#config_browser_name 7 -> categorical
#config_browser_version 38 -> categorical
#config_resolution 110 -> categorical
#config_pdf 3 -> categorical
#config_flash 3 -> categorical
#config_java 3 -> categorical
#config_director 3 -> categorical
#config_quicktime 3 -> categorical
#config_realplayer 3 -> categorical
#config_windowsmedia 3 -> categorical
#config_gears 2 -> categorical
#config_silverlight 3 -> categorical
#config_cookie 3 -> categorical
#location_ip 2649 -> drop
#location_browser_lang 80 -> categorical
#location_country 23 -> categorical
#location_region 82 -> categorical
#location_city 752 -> categorical
#location_latitude 757 -> numerical (maybe try to work with a map, or drop)
#location_longitude 762 -> numerical (maybe try to drop)
#location_ip_raw 2646 -> drop
#referer_type 4 -> categorical
#utm_source 54 -> categorical
#utm_medium 44 -> categorical
#utm_term 32 -> categorical
#utm_content 22 -> categorical
#utm_campaign 67 -> categorical
#channel 17 -> categorical
#idlink_va 26817 -> drop
#idvisitor.1 2036 -> drop
#server_time 26664 -> hour
#idvisit.1 24333 -> drop
#idaction_url 1328 -> categorical
#idaction_url_ref 344 -> categorical
#idaction_name_ref 2 -> categorical
#time_spent_ref_action 2467 -> numerical
#is_lead 2 -> target

In [171]:
# clean some noisy values
noisy_fields = ["visit_exit_idaction_url", "visit_entry_idaction_url", "referer_name", "referer_url", 
                "referer_keyword", "config_resolution", "config_browser_version", "utm_source", "utm_medium",
                "utm_term", "utm_content", "utm_campaign", "location_browser_lang", "location_region",
                "location_city", "idaction_url", "idaction_url_ref"]

for name in noisy_fields:
    print name, len(clean_input[name].value_counts(dropna=False).keys())

for name in noisy_fields:
    # fill NA
    clean_input.loc[pd.isnull(clean_input[name]),name] = "undefined"
    
    # get popular entries, that cover 95% of appearing values
    summary=0
    i=0
    valuable_keys = []
    val_counts = clean_input[name].value_counts(dropna=False)
    while summary<=0.95*len(clean_input):
        summary += val_counts.get_values()[i]
        valuable_keys.append(val_counts.keys()[i])
        i+=1
    
    # replace rare with undefined
    rare_keys = list(set(val_counts.keys()) - set(valuable_keys))
    print len(val_counts.keys()), len(valuable_keys), len(rare_keys)
    clean_input[name] = clean_input[name].replace(rare_keys,"undefined")
    
for name in noisy_fields:
    print name, len(clean_input[name].value_counts().keys())
    
    


visit_exit_idaction_url 496
visit_entry_idaction_url 499
referer_name 91
referer_url 507
referer_keyword 154
config_resolution 110
config_browser_version 38
utm_source 54
utm_medium 44
utm_term 32
utm_content 22
utm_campaign 67
location_browser_lang 80
location_region 82
location_city 752
idaction_url 1328
idaction_url_ref 344
496 16 480
499 13 486
92 5 87
508 2 506
155 2 153
110 15 95
39 9 30
55 4 51
45 4 41
33 1 32
23 1 22
68 4 64
80 9 71
82 21 61
753 253 500
1328 49 1279
344 4 340
visit_exit_idaction_url 17
visit_entry_idaction_url 14
referer_name 5
referer_url 2
referer_keyword 2
config_resolution 16
config_browser_version 9
utm_source 4
utm_medium 4
utm_term 1
utm_content 1
utm_campaign 4
location_browser_lang 10
location_region 22
location_city 253
idaction_url 50
idaction_url_ref 5


In [172]:
# these two fields are treated as constant and removed as well
more_fields = ["utm_term","utm_content"]
clean_input = clean_input.drop(labels=more_fields, axis=1)

In [173]:
# Log-transform the time-like columns listed below, in place.
# A zero cannot be log-transformed, so zeros stay 0 and are recorded in a
# companion "<column>_absence" indicator (1.0 where the value was 0, else 0.0).

time_names = ["visit_total_time", "time_spent_ref_action", "visitor_days_since_first", "visitor_days_since_last"]

for column in time_names:
    # the mask is taken before the column is overwritten with its logarithm
    is_zero = clean_input[column] == 0
    has_value = ~is_zero
    clean_input[column + "_absence"] = is_zero.astype(float)
    clean_input.loc[has_value, column] = np.log(clean_input.loc[has_value, column])
        

In [174]:
# research numerical values
numerical = ["visitor_count_visits", "visitor_days_since_last", "visitor_days_since_first", "visit_total_actions",
             "visit_total_time", "location_latitude", "location_longitude", "time_spent_ref_action"]
for name in numerical:
    print name
    print clean_input[name].describe()

visitor_count_visits
count    100000.000000
mean         36.958180
std          29.166124
min           1.000000
25%          13.000000
50%          33.000000
75%          52.000000
max         178.000000
Name: visitor_count_visits, dtype: float64
visitor_days_since_last
count    100000.000000
mean          0.370412
std           0.990572
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           7.971776
Name: visitor_days_since_last, dtype: float64
visitor_days_since_first
count    100000.000000
mean          3.534771
std           1.345666
min           0.000000
25%           2.944439
50%           3.871201
75%           4.510860
max           7.971776
Name: visitor_days_since_first, dtype: float64
visit_total_actions
count    100000.000000
mean          1.432540
std           1.218911
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          18.000000
Name: visit_total_actions, dtype: floa

In [175]:
# create dummy one-hot encoding variables from categorial
not_dummy = numerical
dummy_keys = clean_input.keys() - not_dummy

print dummy_keys
clean_dummies = clean_input.copy()

for name in dummy_keys:
    clean_dummies[name] = clean_input[name].astype('category')

clean_dummies = pd.get_dummies(clean_dummies)
print len(clean_dummies.keys())


Index([u'channel', u'config_browser_name', u'config_browser_version', u'config_cookie', u'config_director', u'config_flash', u'config_gears', u'config_java', u'config_os', u'config_pdf', u'config_quicktime', u'config_realplayer', u'config_resolution', u'config_silverlight', u'config_windowsmedia', u'first_act_dow', u'first_act_hour', u'idaction_name_ref', u'idaction_url', u'idaction_url_ref', u'local_hour', u'location_browser_lang', u'location_city', u'location_country', u'location_region', u'referer_keyword', u'referer_name', u'referer_type', u'referer_url', u'server_hour', u'time_spent_ref_action_absence', u'utm_campaign', u'utm_medium', u'utm_source', u'visit_entry_idaction_url', u'visit_exit_idaction_url', u'visit_total_time_absence', u'visitor_days_since_first_absence', u'visitor_days_since_last_absence', u'visitor_returning'], dtype='object')
589


# Model building and research

In [222]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
import random

# Fixed seed so the stochastic estimators give repeatable scores across runs
RANDOM_SEED = 42

# use stratified cross validation
# NOTE(review): idvisit has far fewer distinct values than there are rows, so
# the same visit may land in both train and test folds - consider grouping the
# folds by visit/visitor before trusting the scores. TODO confirm.
skf = cross_validation.StratifiedKFold(y, n_folds=10)


# Create various classifiers
params = {'n_estimators':100, 'max_features':None, 'max_depth':6}

gbm = GradientBoostingClassifier(n_estimators=400, max_depth = 8, min_samples_leaf = 3, max_features='auto',
                                 random_state=RANDOM_SEED)
# Baseline - Mode of target variable
dummy = DummyClassifier(strategy='most_frequent')
rf = RandomForestClassifier(random_state=RANDOM_SEED, **params)
ada = AdaBoostClassifier(n_estimators=100, random_state=RANDOM_SEED)
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
knn2 = KNeighborsClassifier(n_neighbors=100, weights='uniform')
sgd = SGDClassifier(loss = 'log', alpha=0.01, n_iter=200, random_state=RANDOM_SEED)
logregl1 = LogisticRegression(penalty='l1', tol=0.01, random_state=RANDOM_SEED)
logregl2 = LogisticRegression(penalty='l2', tol=0.01, random_state=RANDOM_SEED)


# (estimator, display name) pairs, evaluated in this order.
# The dummy baseline stays first: the report reads index 0 as the baseline.
all_class = [(dummy,'Dummy'),
             (logregl1, 'Logistic L1'),
             (logregl2,'Logistic L2'),
             (gbm,'GBM'),
             (rf, 'RF'),
             (ada, 'Ada'),
             (knn, 'KNN'),
             (knn2,'KNN2'),
             (sgd, 'SGD')]

# mean cross-validated score per classifier, in the same order as all_class
total_acc = []
total_auc = []
total_loss = []

# Train and evaluate performance according various metrics 
for multi_class, cl_name in all_class:
    print "**************************"
    print cl_name
    print "**************************"
    avg_auc = []
    avg_loss = []
    avg_acc = []
    for train_index, test_index in skf:
        multi_class.fit(clean_dummies.iloc[train_index], y[train_index])
        pred_prob = multi_class.predict_proba(clean_dummies.iloc[test_index])[:,1]
        pred = multi_class.predict(clean_dummies.iloc[test_index])
        
        print ("ROC AUC: %.6f" % roc_auc_score(y[test_index], pred_prob))
        print ("LogLoss: %.6f" % log_loss(y[test_index], pred_prob))
        print ("Accuracy: %.6f" % accuracy_score(y[test_index], pred))
        
        avg_auc.append(roc_auc_score(y[test_index], pred_prob))
        avg_loss.append(log_loss(y[test_index], pred_prob))
        avg_acc.append(accuracy_score(y[test_index], pred))
    
    print "Summary: ", cl_name
    print ("ROC AUC Avg: %.6f Min: %.6f Max: %.6f" % (np.mean(avg_auc), np.max(avg_auc), np.min(avg_auc)))
    print ("LogLoss Avg: %.6f Min: %.6f Max: %.6f" % (np.mean(avg_loss), np.max(avg_loss), np.min(avg_loss)))
    print ("Accuracy Avg: %.6f Min: %.6f Max: %.6f" % (np.mean(avg_acc), np.max(avg_acc), np.min(avg_acc)))
    total_acc.append(np.mean(avg_acc))
    total_auc.append(np.mean(avg_auc))
    total_loss.append(np.mean(avg_loss))

**************************
Dummy
**************************
ROC AUC: 0.500000
LogLoss: 0.238294
Accuracy: 0.993101
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234864
Accuracy: 0.993200
ROC AUC: 0.500000
LogLoss: 0.234887
Accuracy: 0.993199
Summary:  Dummy
ROC AUC Avg: 0.500000 Min: 0.500000 Max: 0.500000
LogLoss Avg: 0.235209 Min: 0.238294 Max: 0.234864
Accuracy Avg: 0.993190 Min: 0.993200 Max: 0.993101
**************************
Logistic L1
**************************
ROC AUC: 1.000000
LogLoss: 0.007489
Accuracy: 0.994301
ROC AUC: 0.997743
LogLoss: 0.018461
Accuracy: 0.994900
ROC AUC: 0.994154
LogLoss: 0.011529
Accuracy: 0

In [231]:
# Output for best model
print "Best accuracy model: ", all_class[np.argmax(total_acc)][1]
print "Score: ", np.max(total_acc), " Baseline: ", total_acc[0]
print "###########################"
print "Best logloss model: ", all_class[np.argmin(total_loss)][1]
print "Score: ", np.min(total_loss), " Baseline: ", total_loss[0]
print "###########################"
print "Best ROC AUC model: ", all_class[np.argmax(total_auc)][1]
print "Score: ", np.max(total_auc), " Baseline: ", total_auc[0]
print "###########################"

Best accuracy model:  GBM
Score:  0.996809959996  Baseline:  0.993190000986
###########################
Best logloss model:  Logistic L2
Score:  0.0131894872363  Baseline:  0.235209033184
###########################
Best ROC AUC model:  GBM
Score:  0.997913113573  Baseline:  0.5
###########################
