In [49]:
! ls client_data -l --block-size=M

total 465M
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh  60M Dec 15 17:51 2015-12-07.csv
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh 157M Dec 15 17:51 2015-12-14.csv
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh 250M Dec 15 17:52 2015-12-21.csv


In [50]:
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, metrics, feature_extraction
import numpy as np
import os
import json

pd.set_option('display.max_columns', 500)

In [51]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
import requests
import StringIO
import pandas as pd
import plotly

print __version__ # need 1.9.0 or greater

init_notebook_mode(connected = True)



def plotly_df(df, title = '', filename = None):
    """Draw every column of `df` as a line on one plotly chart.

    df       -- DataFrame; its index is used for x, each column for one line
                (the column label becomes the trace name).
    title    -- chart title passed to the layout.
    filename -- when truthy, the same figure is additionally written through
                plotly.offline.plot(..., filename=filename).

    The chart is always rendered inline via iplot; nothing is returned.
    """
    traces = [
        go.Scatter(x = df.index, y = df[col], mode = 'lines', name = col)
        for col in df.columns
    ]
    figure = dict(data = traces, layout = dict(title = title))

    iplot(figure, show_link = False)
    if filename:
        plotly.offline.plot(figure, filename=filename)

1.12.11


## Подготовка данных

In [52]:
%%time
# Build the modelling dataset: keep every positive row (target == 1) and a
# `sample` fraction of the negative rows from each CSV file, then split and scale.
sample = 0.01
data_dir = './client_data_new'

dfs_sampled = []

# sorted(): os.listdir returns names in arbitrary order; a fixed order plus a
# fixed random_state makes the sampled dataset identical on every re-run.
for filename in sorted(os.listdir(data_dir)):
    file_df = pd.read_csv(os.path.join(data_dir, filename)).fillna(0)
    pos_df = file_df[file_df.target == 1]
    neg_df = file_df[file_df.target == 0]

    neg_df_sampled = neg_df.sample(frac = sample, random_state = 0)
    dfs_sampled.append(pd.concat([pos_df, neg_df_sampled]))

df = pd.concat(dfs_sampled).drop_duplicates()
X = df.drop(['target', 'user_id'], axis = 1)
y = df.target

print(X.shape)

(X_train, X_test, y_train, y_test) = model_selection.train_test_split(X, y,
                                                              test_size = 0.3,
                                                              random_state = 0,
                                                              stratify = y)

# The scaler is fitted on the training part only and then applied to the test part.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(329387, 50)
CPU times: user 1min 53s, sys: 19.8 s, total: 2min 13s
Wall time: 2min 13s


## Логистическая регрессия
### Базовая модель

In [5]:
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

In [6]:
estimator = linear_model.LogisticRegression(class_weight = 'balanced')
clf_logit = model_selection.GridSearchCV(estimator, param_grid, cv = 5, scoring = 'roc_auc')
%time clf_logit.fit(X_train_scaled, y_train)

CPU times: user 3min 12s, sys: 19.5 s, total: 3min 32s
Wall time: 3min 14s


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [7]:
clf_logit.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [8]:
clf_logit.best_score_

0.84933781200171088

In [9]:
# Table of the fitted coefficients of the best logistic regression,
# ordered by absolute value (largest first).
coefs = clf_logit.best_estimator_.coef_.tolist()[0]
columns = X.columns.tolist()

logit_coefs = pd.DataFrame({'weight': coefs, 'feature': columns})
# Series.abs() instead of map(abs, ...): vectorised, and it does not depend on
# the builtin map returning a list.
logit_coefs['abs_weight'] = logit_coefs.weight.abs()
logit_coefs.sort_values('abs_weight', ascending = False).head(10)

Unnamed: 0,feature,weight,abs_weight
36,purchases,1.085225,1.085225
3,days_since_last_visit,-0.653453,0.653453
20,male,0.345543,0.345543
4,days_since_first_visit,0.34536,0.34536
32,avg_depth,0.301414,0.301414
35,not_bounce_visits,0.28648,0.28648
39,cart_products,0.268876,0.268876
21,female,0.220695,0.220695
0,is_mobile,-0.21614,0.21614
38,viewed_products,-0.149298,0.149298


In [46]:
def df_to_wiki(df):
    """Print `df` (index included) as a wiki-markup table.

    The frame is serialised with '|' as the CSV separator; separators become
    ' | ' cell delimiters, each line break becomes a '|| \\n ||' row boundary,
    and the whole thing is wrapped in '#|' ... '|#'.

    The text is written to stdout; the function returns None.
    """
    csv_text = df.to_csv(index = True, sep = '|')
    rows = csv_text.replace('|', ' | ').replace('\n', '|| \n ||')
    # The CSV ends with a line break, so `rows` ends with the '||' that would
    # open one more (empty) row -- drop those two characters before closing.
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('#|\n ||' + rows[:-2] + '|#')

In [11]:
y_logit = clf_logit.best_estimator_.predict_proba(X_test_scaled)[:, 1]
roc_auc_logit = metrics.roc_auc_score(y_test, y_logit)
roc_auc_logit

0.84657321847662237

In [12]:
y_logit_df = pd.DataFrame(y_logit)
y_logit_df.columns = ['probability']

In [13]:
# Probability as a percentage rounded to whole percent points.
# Series.map keeps the same per-element rounding as the builtin map did, but
# always yields a Series (the builtin map only yields a list on Python 2).
y_logit_df['probability_rounded'] = y_logit_df.probability.map(lambda x: 100*round(x, 2))

In [14]:
plotly_df(y_logit_df.groupby('probability_rounded').count())

### Отбор признаков с помощью Lasso

In [15]:
estimator = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lasso = model_selection.GridSearchCV(estimator, param_grid, cv = 5, scoring = 'roc_auc')
%time clf_lasso.fit(X_train_scaled, y_train)

CPU times: user 6min 29s, sys: 23.1 s, total: 6min 52s
Wall time: 6min 35s


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [16]:
clf_lasso.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
clf_lasso.best_score_

0.84938144375424207

In [18]:
y_lasso = clf_lasso.best_estimator_.predict_proba(X_test_scaled)[:, 1]
roc_auc_lasso = metrics.roc_auc_score(y_test, y_lasso)
roc_auc_lasso

0.84662297910137729

In [19]:
# Coefficients of the best L1 model; the tail of the ordering by absolute
# value shows the features with the smallest weights.
coefs = clf_lasso.best_estimator_.coef_.tolist()[0]
columns = X.columns.tolist()

lasso_coefs = pd.DataFrame({'weight': coefs, 'feature': columns})
# Series.abs() instead of map(abs, ...): vectorised, and it does not depend on
# the builtin map returning a list.
lasso_coefs['abs_weight'] = lasso_coefs.weight.abs()
lasso_coefs.sort_values('abs_weight', ascending = False).tail(20)

Unnamed: 0,feature,weight,abs_weight
47,saved_visits,-0.025485,0.025485
6,cooking,-0.025161,0.025161
16,healthy,0.012278,0.012278
27,first_source_undefined,0.009639,0.009639
19,cinema,0.008176,0.008176
48,social_visits,0.005648,0.005648
31,first_source_internal,0.004719,0.004719
8,cars,0.004422,0.004422
14,family,-0.003606,0.003606
45,organic_visits,-0.00352,0.00352


### Логистическая регрессия (по данным Logs API)

In [53]:
logs_api_cols = ['is_mobile',
 'is_bounce_last_visit',
 'days_since_last_visit',
 'days_since_first_visit',
 'is_yabrowser',
 'mobile',
 'first_source_direct',
 'first_source_referral',
 'first_source_organic',
 'first_source_ad',
 'first_source_saved',
 'first_source_undefined',
 'first_source_external',
 'first_source_email',
 'first_source_social',
 'first_source_internal',
 'avg_depth',
 'avg_duration',
 'visits',
 'not_bounce_visits',
 'purchases',
 'purchased_products',
 'viewed_products',
 'cart_products',
 'revenue',
 'ad_visits',
 'direct_visits',
 'email_visits',
 'internal_visits',
 'organic_visits',
 'referral_visits',
 'saved_visits',
 'social_visits',
 'undefined_visits']

In [54]:
# Same split as for the full feature set, restricted to the columns listed in
# logs_api_cols.
X_api = df[logs_api_cols]
y_api = df.target

# Stratify on this cell's own target (y_api) rather than on `y` left over from
# the data-preparation cell; both are df.target, so the split is unchanged.
(X_train_api, X_test_api, y_train_api, y_test_api) = model_selection.train_test_split(X_api, y_api,
                                                              test_size = 0.3,
                                                              random_state = 0,
                                                              stratify = y_api)

# NOTE: this rebinds `scaler`; from here on it is the scaler fitted on the
# Logs API columns, not the one fitted on the full feature set.
scaler = preprocessing.StandardScaler()
X_train_scaled_api = scaler.fit_transform(X_train_api)
X_test_scaled_api = scaler.transform(X_test_api)

In [22]:
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

estimator = linear_model.LogisticRegression(class_weight = 'balanced')
clf_logit_api = model_selection.GridSearchCV(estimator, param_grid, cv = 5, scoring = 'roc_auc')
%time clf_logit_api.fit(X_train_scaled_api, y_train_api)

CPU times: user 2min 23s, sys: 18.7 s, total: 2min 42s
Wall time: 2min 24s


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [23]:
clf_logit_api.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [24]:
clf_logit_api.best_score_

0.84466424283921471

In [25]:
# Hold-out ROC AUC of the Logs API logistic regression.
# Scored against y_test_api, the labels produced by the same split as
# X_test_scaled_api, matching the other Logs API evaluation cells.
y_logit_api = clf_logit_api.best_estimator_.predict_proba(X_test_scaled_api)[:, 1]
roc_auc_logit_api = metrics.roc_auc_score(y_test_api, y_logit_api)
roc_auc_logit_api

0.84194353899180507

## Random Forest
### Базовая модель

In [9]:
from sklearn import ensemble

In [12]:
forest_param_grid = {
    'n_estimators': [5, 7, 10, 25, 50, 75, 100, 250, 500, 750, 1000],
    'max_features': ['sqrt', 'log2', None]
}

In [138]:
estimator = ensemble.RandomForestClassifier(
    class_weight='balanced', 
    oob_score=True,
    random_state=42,
    n_jobs=4
)

In [139]:
clf_forest = model_selection.GridSearchCV(estimator, forest_param_grid, cv = 5, scoring = 'roc_auc')
%time clf_forest.fit(X_train_scaled, y_train)

CPU times: user 5h 3min 12s, sys: 1min 6s, total: 5h 4min 18s
Wall time: 1h 25min 28s


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=True, random_state=42, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 7, 10, 25, 50, 75, 100, 250, 500, 750, 1000], 'max_features': ['sqrt', 'log2', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [132]:
clf_forest.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [153]:
# Mean cross-validated score per n_estimators, one line per max_features value.
# fillna('max') replaces missing values with the label 'max' -- presumably so
# the max_features=None configuration gets its own column; verify.
plotly_df(
    pd.DataFrame(clf_forest.cv_results_)
        .fillna('max')
        .sort_values(['param_max_features', 'param_n_estimators'])
        [['mean_test_score', 'param_n_estimators', 'param_max_features']]
        .pivot_table(
            index = 'param_n_estimators',
            columns = 'param_max_features',
            values = 'mean_test_score'
        )
)

In [141]:
clf_forest.best_score_

0.84125170342746702

In [142]:
clf_forest.best_estimator_.oob_score_

0.91846352673669485

In [143]:
y_forest = clf_forest.best_estimator_.predict_proba(X_test_scaled)[:, 1]
roc_auc_forest = metrics.roc_auc_score(y_test, y_forest)
roc_auc_forest

0.84094468033764047

In [21]:
forest_estimator = ensemble.RandomForestClassifier(
    class_weight='balanced', 
    oob_score=True,
    random_state=42,
    n_jobs=4,
    n_estimators=5000,
    max_features='sqrt'
)

%time forest_estimator.fit(X_train_scaled, y_train)

CPU times: user 14min 58s, sys: 11.7 s, total: 15min 10s
Wall time: 5min 20s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=4,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [22]:
y_forest_big = forest_estimator.predict_proba(X_test_scaled)[:, 1]
roc_auc_forest_big = metrics.roc_auc_score(y_test, y_forest_big)
roc_auc_forest_big

0.8439160545000175

### По данным Logs API

In [10]:
estimator = ensemble.RandomForestClassifier(
    class_weight='balanced', 
    oob_score=True,
    random_state=42,
    n_jobs=4
)

In [13]:
clf_forest_api = model_selection.GridSearchCV(estimator, forest_param_grid, cv = 5, scoring = 'roc_auc')
%time clf_forest_api.fit(X_train_scaled_api, y_train_api)


Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.


invalid value encountered in true_divide



CPU times: user 3h 22min 3s, sys: 1min 16s, total: 3h 23min 19s
Wall time: 59min 2s


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=True, random_state=42, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 7, 10, 25, 50, 75, 100, 250, 500, 750, 1000], 'max_features': ['sqrt', 'log2', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [14]:
clf_forest_api.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=750, n_jobs=4,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [15]:
clf_forest_api.best_score_

0.8046127751653307

In [16]:
y_forest_api = clf_forest_api.best_estimator_.predict_proba(X_test_scaled_api)[:, 1]
roc_auc_forest_api = metrics.roc_auc_score(y_test_api, y_forest_api)
roc_auc_forest_api

0.80421980914119306

In [17]:
# Mean cross-validated score per n_estimators, one line per max_features value,
# for the forest trained on the Logs API columns.
# fillna('max') replaces missing values with the label 'max' -- presumably so
# the max_features=None configuration gets its own column; verify.
plotly_df(
    pd.DataFrame(clf_forest_api.cv_results_)
        .fillna('max')
        .sort_values(['param_max_features', 'param_n_estimators'])
        [['mean_test_score', 'param_n_estimators', 'param_max_features']]
        .pivot_table(
            index = 'param_n_estimators',
            columns = 'param_max_features',
            values = 'mean_test_score'
        )
)

## XGBoost

In [55]:
import xgboost as xgb

In [56]:
clf_xgb = xgb.XGBClassifier(
    n_estimators = 1000,
    colsample_bytree=1,
    learning_rate=0.025,
    max_depth=3,
    min_child_weight=10,
    subsample=0.5
)
%time clf_xgb.fit(X_train_scaled_api, y_train_api)

CPU times: user 5min 16s, sys: 292 ms, total: 5min 17s
Wall time: 44.3 s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.025, max_delta_step=0, max_depth=3,
       min_child_weight=10, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [57]:
# Feature importances of the fitted XGBoost model as percentages (two
# decimals), largest first, printed as a wiki-markup table.
xgb_features_df = pd.DataFrame()
xgb_features_df['weight'] = clf_xgb.feature_importances_
xgb_features_df['feature'] = logs_api_cols
# df_to_wiki prints the table itself and returns None, so it is called
# directly; wrapping it in print would append a stray "None" line.
df_to_wiki(xgb_features_df.sort_values('weight', ascending = False).set_index('feature').applymap(lambda x: round(100*x, 2)))

#|
 ||feature | weight|| 
 ||days_since_first_visit | 14.25|| 
 ||days_since_last_visit | 11.63|| 
 ||avg_duration | 10.86|| 
 ||avg_depth | 9.97|| 
 ||viewed_products | 6.85|| 
 ||revenue | 5.76|| 
 ||not_bounce_visits | 4.98|| 
 ||direct_visits | 3.84|| 
 ||visits | 3.73|| 
 ||cart_products | 3.23|| 
 ||email_visits | 3.2|| 
 ||ad_visits | 2.57|| 
 ||purchases | 2.46|| 
 ||organic_visits | 2.42|| 
 ||internal_visits | 2.37|| 
 ||is_mobile | 2.36|| 
 ||purchased_products | 1.83|| 
 ||mobile | 1.65|| 
 ||first_source_email | 1.16|| 
 ||is_bounce_last_visit | 1.0|| 
 ||first_source_direct | 0.96|| 
 ||first_source_organic | 0.66|| 
 ||is_yabrowser | 0.66|| 
 ||referral_visits | 0.51|| 
 ||first_source_ad | 0.45|| 
 ||first_source_referral | 0.37|| 
 ||first_source_internal | 0.14|| 
 ||social_visits | 0.09|| 
 ||first_source_social | 0.05|| 
 ||first_source_saved | 0.0|| 
 ||first_source_undefined | 0.0|| 
 ||first_source_external | 0.0|| 
 ||saved_visits | 0.0|| 
 ||undefined_visits | 

In [58]:
y_xgb = clf_xgb.predict_proba(X_test_scaled_api)[:, 1]
roc_auc_xgb = metrics.roc_auc_score(y_test_api, y_xgb)
roc_auc_xgb 

0.85956542223785803

In [59]:
# Distribution of predicted probabilities (as whole percent points) on the
# hold-out set: number of rows per rounded probability value.
y_xgb_df = pd.DataFrame(y_xgb)
y_xgb_df.columns = ['probability']
# Series.map keeps the same per-element rounding as the builtin map did, but
# always yields a Series (the builtin map only yields a list on Python 2).
y_xgb_df['probability_rounded'] = y_xgb_df.probability.map(lambda x: 100*round(x, 2))
plotly_df(y_xgb_df.groupby('probability_rounded').count())

## kNN

In [5]:
from sklearn import neighbors

In [21]:
knn_param_grid = {
    'n_neighbors': [1, 2, 5, 10, 25, 50, 75, 100],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]
}

In [None]:
knn_results = []

for n_neighbors in knn_param_grid['n_neighbors']:
    for weights in knn_param_grid['weights']:
        for p in knn_param_grid['p']:
            print 'n_neighbors={n_neighbors}, weights={weights}, p={p}'.format(
                n_neighbors=n_neighbors,
                weights=weights,
                p=p
            )
            estimator = neighbors.KNeighborsClassifier(
                n_jobs=4,
                n_neighbors=n_neighbors,
                weights=weights,
                p=p
            )
            %time scores = model_selection.cross_val_score(estimator, X_train_scaled_api, y_train_api, scoring = 'roc_auc', cv = 5)
            print '### score = {score}/n'.format(score = np.mean(scores))
            
            item = {
                'scores': scores,
                'score': np.mean(scores),
                'p': p,
                'n_neighbors': n_neighbors,
                'weights': weights
            }
            
            knn_results.append(item)

n_neighbors=1, weights=uniform, p=1
CPU times: user 25min 20s, sys: 584 ms, total: 25min 21s
Wall time: 8min 19s
### score = 0.613627661547/n
n_neighbors=1, weights=uniform, p=2
CPU times: user 23min 5s, sys: 812 ms, total: 23min 5s
Wall time: 7min 53s
### score = 0.613232800178/n
n_neighbors=1, weights=uniform, p=3
CPU times: user 1h 58min 30s, sys: 3.34 s, total: 1h 58min 34s
Wall time: 35min 48s
### score = 0.613360262126/n
n_neighbors=1, weights=distance, p=1
CPU times: user 23min 36s, sys: 648 ms, total: 23min 37s
Wall time: 7min 47s
### score = 0.613627661547/n
n_neighbors=1, weights=distance, p=2
CPU times: user 25min 37s, sys: 640 ms, total: 25min 37s
Wall time: 8min 25s
### score = 0.613232800178/n
n_neighbors=1, weights=distance, p=3
CPU times: user 30min 27s, sys: 724 ms, total: 30min 27s
Wall time: 9min 45s
### score = 0.823448475247/n
n_neighbors=75, weights=uniform, p=3


In [None]:
knn_results