# Проверка на прошлогодних данных

Обучим модель на данных с 21 декабря по 17 января и проверим ее предсказания на покупки в период с 18 по 24 января.

In [1]:
! ls train_data -l --block-size=M

total 1327M
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh 245M Dec 22 13:40 2015-12-21.csv
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh 336M Dec 22 13:41 2015-12-28.csv
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh 377M Dec 22 13:41 2016-01-04.csv
-rw-r--r-- 1 miptgirl dpt_yandex_monetize_metrica_dev_beh 370M Dec 22 13:42 2016-01-11.csv


In [2]:
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, metrics, feature_extraction
import numpy as np
import os
import json

# Let wide frames display up to 500 columns instead of being truncated.
pd.options.display.max_columns = 500

In [3]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
import requests
import StringIO
import pandas as pd
import plotly

# Show the installed plotly version; 1.9.0 or greater is needed.
print(__version__)

# Switch plotly into notebook mode so the iplot() calls below render inline.
init_notebook_mode(connected = True)



def plotly_df(df, title = '', filename = None):
    """Draw every column of ``df`` as a line trace on a single plotly figure.

    Args:
        df: DataFrame to plot; its index supplies x, each column supplies y
            and the column label becomes the trace name.
        title: Title placed in the figure layout.
        filename: When truthy, the same figure is additionally passed to
            ``plotly.offline.plot`` with this filename.
    """
    traces = [
        go.Scatter(x = df.index, y = df[col_name], mode = 'lines', name = col_name)
        for col_name in df.columns
    ]
    figure = {'data': traces, 'layout': {'title': title}}

    # Always render inline; writing to a file is opt-in via ``filename``.
    iplot(figure, show_link = False)
    if filename:
        plotly.offline.plot(figure, filename = filename)

1.12.11


## Подготовка данных

In [5]:
%%time
# Fraction of negative (target == 0) rows kept from every training file.
sample = 0.01

dfs_sampled = []

# os.listdir() returns names in arbitrary order; sort them so the row order
# (and therefore the seeded train/test split below) is the same on every run.
for filename in sorted(os.listdir('./train_data')):
    file_df = pd.read_csv('./train_data/' + filename).fillna(0)
    pos_df = file_df[file_df.target == 1]
    neg_df = file_df[file_df.target == 0]

    # Keep all positives and downsample negatives; the fixed random_state makes
    # the sampled subset reproducible across kernel restarts.
    neg_df_sampled = neg_df.sample(frac = sample, random_state = 0)
    dfs_sampled.append(pd.concat([pos_df, neg_df_sampled]))

# `df` is the combined training frame; later cells read it by this name.
df = pd.concat(dfs_sampled).drop_duplicates()
X = df.drop(['target', 'user_id'], axis = 1)
y = df.target

print(X.shape)

(X_train, X_test, y_train, y_test) = model_selection.train_test_split(X, y,
                                                              test_size = 0.3,
                                                              random_state = 0,
                                                              stratify = y)

# Fit the scaler on the training part only and reuse it for the test part.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(126410, 50)
CPU times: user 43.6 s, sys: 8 s, total: 51.6 s
Wall time: 1min


#### Выделение данных Logs API

In [6]:
# Feature columns used for the Logs API model. The order matters: the feature
# importance table pairs these names with the model's importances by position.
logs_api_cols = (
    # device / recency flags
    ['is_mobile', 'is_bounce_last_visit', 'days_since_last_visit',
     'days_since_first_visit', 'is_yabrowser', 'mobile']
    # first_source_* columns
    + ['first_source_direct', 'first_source_referral', 'first_source_organic',
       'first_source_ad', 'first_source_saved', 'first_source_undefined',
       'first_source_external', 'first_source_email', 'first_source_social',
       'first_source_internal']
    # visit and e-commerce aggregates
    + ['avg_depth', 'avg_duration', 'visits', 'not_bounce_visits',
       'purchases', 'purchased_products', 'viewed_products', 'cart_products',
       'revenue']
    # *_visits columns
    + ['ad_visits', 'direct_visits', 'email_visits', 'internal_visits',
       'organic_visits', 'referral_visits', 'saved_visits', 'social_visits',
       'undefined_visits']
)

In [7]:
# Restrict the training frame to the Logs API feature columns.
X_api = df[logs_api_cols]
y_api = df.target

# Stratify on this cell's own target (y_api) rather than on `y` from the
# full-feature cell, so the split does not depend on that cell's state.
(X_train_api, X_test_api, y_train_api, y_test_api) = model_selection.train_test_split(X_api, y_api,
                                                              test_size = 0.3,
                                                              random_state = 0,
                                                              stratify = y_api)

# NOTE: this rebinds `scaler` to one fitted on the Logs API columns only; the
# later holdout cell calls scaler.transform() on logs_api_cols and needs this fit.
scaler = preprocessing.StandardScaler()
X_train_scaled_api = scaler.fit_transform(X_train_api)
X_test_scaled_api = scaler.transform(X_test_api)

## Обучение (XGBoost)

In [9]:
import xgboost as xgb

In [10]:
clf_xgb = xgb.XGBClassifier(
    n_estimators = 1000,
    colsample_bytree=1,
    learning_rate=0.025,
    max_depth=3,
    min_child_weight=10,
    subsample=0.5
)
%time clf_xgb.fit(X_train_scaled_api, y_train_api)

CPU times: user 2min 9s, sys: 128 ms, total: 2min 10s
Wall time: 17 s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.025, max_delta_step=0, max_depth=3,
       min_child_weight=10, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [15]:
# Pair each importance with its feature name (both follow logs_api_cols order)
# and show the ten most important features.
xgb_features_df = pd.DataFrame(
    {'weight': clf_xgb.feature_importances_, 'feature': logs_api_cols},
    columns = ['weight', 'feature']
)
xgb_features_df.sort_values('weight', ascending = False).head(10)

Unnamed: 0,weight,feature
3,0.144658,days_since_first_visit
17,0.115033,avg_duration
2,0.107942,days_since_last_visit
16,0.098802,avg_depth
22,0.082257,viewed_products
24,0.06445,revenue
19,0.038134,not_bounce_visits
26,0.037346,direct_visits
18,0.036716,visits
25,0.035771,ad_visits


In [16]:
# Score the held-out split; column 1 of predict_proba is used as the score.
test_proba = clf_xgb.predict_proba(X_test_scaled_api)
y_xgb = test_proba[:, 1]
roc_auc_xgb = metrics.roc_auc_score(y_test_api, y_xgb)
roc_auc_xgb

0.86815156016934902

In [17]:
# Distribution of predicted probabilities on the test split, in whole percents.
y_xgb_df = pd.DataFrame(y_xgb)
y_xgb_df.columns = ['probability']
# Vectorised rounding replaces a per-element map(lambda ...): it does not rely
# on map() returning a list, and scaling before rounding yields clean bucket
# labels (29.0 rather than 28.999999999999996). The float64 cast guards against
# lower-precision scores — presumably predict_proba may return float32; verify.
y_xgb_df['probability_rounded'] = (100 * y_xgb_df['probability'].astype('float64')).round()
plotly_df(y_xgb_df.groupby('probability_rounded').count())

## Проверка на более поздних данных (18 января)

In [26]:
%%time

# Load every holdout file in full (no sampling here). Dedicated names are used
# so this cell no longer rebinds `df` / `dfs_sampled`, which belong to the
# training data and are read by earlier cells on re-run.
exp_frames = []

# Sorted for a deterministic row order, independent of os.listdir() ordering.
for filename in sorted(os.listdir('./test_data')):
    exp_frames.append(pd.read_csv('./test_data/' + filename).fillna(0))

df_exp = pd.concat(exp_frames).drop_duplicates()
print(df_exp.shape)

X_api_exp = df_exp[logs_api_cols]
y_api_exp = df_exp.target

# Uses the `scaler` fitted on the Logs API training columns, not refitted here.
X_scaled_api_exp = scaler.transform(X_api_exp)

(2905539, 52)
CPU times: user 23.2 s, sys: 5.22 s, total: 28.4 s
Wall time: 28.5 s


In [27]:
# Score the later-period data with the already trained model and report ROC AUC.
exp_proba = clf_xgb.predict_proba(X_scaled_api_exp)
y_xgb_exp = exp_proba[:, 1]
roc_auc_xgb_exp = metrics.roc_auc_score(y_api_exp, y_xgb_exp)
roc_auc_xgb_exp

0.87642550037193478

In [33]:
# One row per holdout observation: model score, actual target, and the score
# bucketed to two decimals.
cmp_df = pd.DataFrame()

cmp_df['prediction'] = y_xgb_exp
cmp_df['fact'] = y_api_exp.values
# Vectorised Series.round replaces a per-element map(lambda ...) over every row,
# and does not depend on map() returning a list. The float64 cast keeps bucket
# keys at full precision — presumably the scores may be float32; verify.
cmp_df['prediction_rounded'] = cmp_df['prediction'].astype('float64').round(2)

In [41]:
# Number of actual negatives (fact == 0) in each rounded-prediction bucket,
# as a one-column frame named 'count'.
negative_rows = cmp_df[cmp_df.fact == 0]
negative_df = negative_rows.groupby('prediction_rounded').size().to_frame('count')

In [42]:
# Number of actual positives (fact == 1) in each rounded-prediction bucket,
# as a one-column frame named 'count'.
positive_rows = cmp_df[cmp_df.fact == 1]
positive_df = positive_rows.groupby('prediction_rounded').size().to_frame('count')

In [44]:
# Give each single-column count frame a class-specific column label so the two
# can be joined side by side.
for counts_df, label in ((negative_df, 'negative'), (positive_df, 'positive')):
    counts_df.columns = [label]

In [64]:
# NOTE: `cmp_df` is rebound here from the per-row frame to per-bucket counts.
# Outer join: the default left join kept only buckets present in negative_df,
# silently dropping score buckets that contain positives but no negatives.
cmp_df = negative_df.join(positive_df, how = 'outer').fillna(0)
# Percentage of each class that falls into each bucket. Column sums are
# computed once and broadcast, instead of being recomputed for every row.
cmp_df_norm = 100. * cmp_df / cmp_df.sum()
plotly_df(cmp_df_norm)

In [65]:
# Running totals of the per-bucket percentages, one '<class>_cum' column each.
for outcome in ('negative', 'positive'):
    cmp_df_norm[outcome + '_cum'] = cmp_df_norm[outcome].cumsum()

In [66]:
# Plot only the cumulative columns (those whose name contains 'cum').
cum_columns = [name for name in cmp_df_norm.columns if 'cum' in name]
plotly_df(cmp_df_norm[cum_columns])

In [68]:
# Share of positives among all observations in each prediction bucket.
bucket_total = cmp_df['positive'] + cmp_df['negative']
cmp_df['positive_ratio'] = cmp_df['positive'].div(bucket_total)

In [70]:
# Observed positive share per bucket, passed as a one-column frame.
plotly_df(cmp_df.loc[:, ['positive_ratio']])