In [2]:
import pandas as pd, numpy as np, catboost, logging, os, sys, requests
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score

module_path = os.path.abspath(os.path.join('/home/ktereshin/yandex/arcadia/cloud/analytics/python/work'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances

In [3]:
client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')
cluster_yt = clusters.yt.Hahn(
    token = yt_creds['value']['token'],
    pool = yt_creds['value']['pool'],
    
)

In [4]:
features = cluster_yt.read('//home/cloud_analytics/scoring/learning_dataset').as_dataframe()
targets = cluster_yt.read('//home/cloud_analytics/scoring/targets').as_dataframe()

train_data = pd.merge(
    targets[targets['dataset_type'] == 'learning_set'][['puid', 'first_trial_consumption_datetime', 'start_paid_consumption']],
    features,
    on = 'puid',
    how = 'left'
).fillna('0')

to_predict = pd.merge(
    targets[targets['dataset_type'] != 'learning_set'][['puid', 'first_trial_consumption_datetime', 'start_paid_consumption']],
    features,
    on = 'puid',
    how = 'left'
).fillna('0')

train_data = shuffle(train_data).reset_index(drop=True)

In [5]:
X_train, X_test, y_train, y_test = train_test_split( train_data, train_data['start_paid_consumption'], test_size=0.33, random_state=42)
X_train, X_eval, y_train, y_eval = train_test_split( X_train, y_train, test_size=0.25, random_state=42)
positive = X_train[y_train == 1]
res = pd.DataFrame()
for i in range(3):
    X_train = pd.concat(
        [
            X_train,
            positive
        ]
    )
    y_train = pd.concat(
        [
            y_train,
            pd.Series([1]*positive.shape[0])
        ]
    )
    
cat_col = [
    'ba_payment_type',
    'ba_usage_status',
    'ba_state',
    'device_type',
    'ba_person_type',
    'age',
    'ba_payment_cycle_type',
    'ba_type',
    'channel',
    'country',
    'os',
    'promocode_source',
    'segment',
    'sex',
    'first_trial_consumption_datetime',
    'session_start_time',
    'search_phrase',
    'start_paid_consumption',
    'puid'
]
cat_indexes = []
for col in cat_col:
    cat_indexes.append(X_train.columns.get_loc(col))
    
ignore_col = [
'first_trial_consumption_datetime','session_start_time','search_phrase','start_paid_consumption', 'puid'
]
ignore_indexes = []
for col in ignore_col:
    ignore_indexes.append(X_train.columns.get_loc(col))

In [6]:
X_train['rand'] = np.random.rand(X_train.shape[0])
X_eval['rand'] = np.random.rand(X_eval.shape[0])
X_test['rand'] = np.random.rand(X_test.shape[0])
to_predict['rand'] = np.random.rand(to_predict.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
train_pool = catboost.Pool(X_train, y_train, cat_features = cat_indexes)
eval_pool = catboost.Pool(X_eval, y_eval, cat_features = cat_indexes)
test_pool = catboost.Pool(X_test, cat_features = cat_indexes)
predict_pool = catboost.Pool(to_predict, cat_features = cat_indexes)

In [8]:
grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'bootstrap_type': ['Bernoulli','Bayesian'],
    'subsample': [0.3, 0.5, 0.65, 0.8],
    'depth': [1,2,3]
}

In [9]:
res = pd.DataFrame()
for val in grid['learning_rate']:
    '''
    params = {
        "iterations": 300,
        "depth": grid['depth'][0],
        "learning_rate": val,
        "bootstrap_type": grid['bootstrap_type'][0],
        "subsample": grid['subsample'][0],
        "loss_function": "Logloss",
        "verbose": False,
        'eval_metric': 'AUC',
        'early_stopping_rounds': 20,
        #'ignored_features': ignore_indexes
    }

    scores = catboost.cv(
        train_pool,
        params,
        fold_count=3,
        shuffle = True,
        verbose = False,
        plot=False
    )
    '''
    #scores = scores[scores['test-AUC-mean'] == scores['test-AUC-mean'].max()]
    scores = pd.DataFrame([grid['depth'][0]]).rename(columns = {0: 'depth'})
    scores['bootstrap_type'] = grid['bootstrap_type'][0]
    scores['subsample'] = grid['subsample'][0]
    scores['learning_rate'] = val

    model = catboost.CatBoostClassifier(
        iterations=1000,
        depth=grid['depth'][0],
        learning_rate=val,
        bootstrap_type = grid['bootstrap_type'][0],
        subsample = grid['subsample'][0],
        loss_function='Logloss',
        ignored_features = ignore_indexes,
        verbose=False
    )
    model.fit(train_pool, eval_set = eval_pool, plot = False, early_stopping_rounds = 20, use_best_model = True)
    scores['recall'] = recall_score(y_test, model.predict(test_pool))
    scores['precision'] = precision_score(y_test, model.predict(test_pool))
    scores['roc_auc_score'] = roc_auc_score(y_test, model.predict(test_pool))
    res = pd.concat(
        [
            res,
            scores
        ]
    )
    print(scores)
    print('=============================\n\n')

   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3           0.01  0.376289   0.480263   

   roc_auc_score  
0       0.662055  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3           0.05  0.469072   0.448276   

   roc_auc_score  
0       0.697548  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3            0.1  0.458763   0.440594   

   roc_auc_score  
0       0.692063  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3            0.5  0.494845    0.40678   

   roc_auc_score  
0       0.701188  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3              1  0.427835   0.333333   

   roc_auc_score  
0       0.659096  




In [10]:
learning_rate = 0.5

In [12]:
res = pd.DataFrame()
for val in grid['subsample']:
    '''
    params = {
        "iterations": 300,
        "depth": grid['depth'][0],
        "learning_rate": val,
        "bootstrap_type": grid['bootstrap_type'][0],
        "subsample": grid['subsample'][0],
        "loss_function": "Logloss",
        "verbose": False,
        'eval_metric': 'AUC',
        'early_stopping_rounds': 20,
        #'ignored_features': ignore_indexes
    }

    scores = catboost.cv(
        train_pool,
        params,
        fold_count=3,
        shuffle = True,
        verbose = False,
        plot=False
    )
    '''
    #scores = scores[scores['test-AUC-mean'] == scores['test-AUC-mean'].max()]
    scores = pd.DataFrame([grid['depth'][0]]).rename(columns = {0: 'depth'})
    scores['bootstrap_type'] = grid['bootstrap_type'][0]
    scores['subsample'] = val
    scores['learning_rate'] = learning_rate

    model = catboost.CatBoostClassifier(
        iterations=1000,
        depth=grid['depth'][0],
        learning_rate=learning_rate,
        bootstrap_type = grid['bootstrap_type'][0],
        subsample = val,
        loss_function='Logloss',
        ignored_features = ignore_indexes,
        verbose=False
    )
    model.fit(train_pool, eval_set = eval_pool, plot = False, early_stopping_rounds = 20, use_best_model = True)
    scores['recall'] = recall_score(y_test, model.predict(test_pool))
    scores['precision'] = precision_score(y_test, model.predict(test_pool))
    scores['roc_auc_score'] = roc_auc_score(y_test, model.predict(test_pool))
    res = pd.concat(
        [
            res,
            scores
        ]
    )
    print(scores)
    print('=============================\n\n')

   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3            0.5  0.494845    0.40678   

   roc_auc_score  
0       0.701188  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.5            0.5  0.427835   0.342975   

   roc_auc_score  
0       0.661408  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli       0.65            0.5  0.443299   0.346774   

   roc_auc_score  
0       0.668149  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.8            0.5  0.463918   0.344828   

   roc_auc_score  
0       0.675486  




In [13]:
subsample = 0.3

In [14]:
res = pd.DataFrame()
for val in grid['bootstrap_type']:
    '''
    params = {
        "iterations": 300,
        "depth": grid['depth'][0],
        "learning_rate": val,
        "bootstrap_type": grid['bootstrap_type'][0],
        "subsample": grid['subsample'][0],
        "loss_function": "Logloss",
        "verbose": False,
        'eval_metric': 'AUC',
        'early_stopping_rounds': 20,
        #'ignored_features': ignore_indexes
    }

    scores = catboost.cv(
        train_pool,
        params,
        fold_count=3,
        shuffle = True,
        verbose = False,
        plot=False
    )
    '''
    #scores = scores[scores['test-AUC-mean'] == scores['test-AUC-mean'].max()]
    scores = pd.DataFrame([grid['depth'][0]]).rename(columns = {0: 'depth'})
    scores['bootstrap_type'] = val
    scores['subsample'] = subsample
    scores['learning_rate'] = learning_rate

    model = catboost.CatBoostClassifier(
        iterations=1000,
        depth=grid['depth'][0],
        learning_rate=learning_rate,
        bootstrap_type = val,
        #subsample = val,
        loss_function='Logloss',
        ignored_features = ignore_indexes,
        verbose=False
    )
    model.fit(train_pool, eval_set = eval_pool, plot = False, early_stopping_rounds = 20, use_best_model = True)
    scores['recall'] = recall_score(y_test, model.predict(test_pool))
    scores['precision'] = precision_score(y_test, model.predict(test_pool))
    scores['roc_auc_score'] = roc_auc_score(y_test, model.predict(test_pool))
    res = pd.concat(
        [
            res,
            scores
        ]
    )
    print(scores)
    print('=============================\n\n')

   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3            0.5  0.515464        0.4   

   roc_auc_score  
0       0.708194  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1       Bayesian        0.3            0.5  0.458763   0.366255   

   roc_auc_score  
0       0.678523  




In [15]:
bootstrap_type = 'Bernoulli'

In [16]:
res = pd.DataFrame()
for val in grid['depth']:
    '''
    params = {
        "iterations": 300,
        "depth": grid['depth'][0],
        "learning_rate": val,
        "bootstrap_type": grid['bootstrap_type'][0],
        "subsample": grid['subsample'][0],
        "loss_function": "Logloss",
        "verbose": False,
        'eval_metric': 'AUC',
        'early_stopping_rounds': 20,
        #'ignored_features': ignore_indexes
    }

    scores = catboost.cv(
        train_pool,
        params,
        fold_count=3,
        shuffle = True,
        verbose = False,
        plot=False
    )
    '''
    #scores = scores[scores['test-AUC-mean'] == scores['test-AUC-mean'].max()]
    scores = pd.DataFrame([val]).rename(columns = {0: 'depth'})
    scores['bootstrap_type'] = bootstrap_type
    scores['subsample'] = subsample
    scores['learning_rate'] = learning_rate

    model = catboost.CatBoostClassifier(
        iterations=1000,
        depth=val,
        learning_rate=learning_rate,
        bootstrap_type = bootstrap_type,
        subsample = subsample,
        loss_function='Logloss',
        ignored_features = ignore_indexes,
        verbose=False
    )
    model.fit(train_pool, eval_set = eval_pool, plot = False, early_stopping_rounds = 20, use_best_model = True)
    scores['recall'] = recall_score(y_test, model.predict(test_pool))
    scores['precision'] = precision_score(y_test, model.predict(test_pool))
    scores['roc_auc_score'] = roc_auc_score(y_test, model.predict(test_pool))
    res = pd.concat(
        [
            res,
            scores
        ]
    )
    print(scores)
    print('=============================\n\n')

   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      1      Bernoulli        0.3            0.5  0.494845    0.40678   

   roc_auc_score  
0       0.701188  


   depth bootstrap_type  subsample  learning_rate    recall  precision  \
0      2      Bernoulli        0.3            0.5  0.350515   0.371585   

   roc_auc_score  
0       0.637279  


   depth bootstrap_type  subsample  learning_rate   recall  precision  \
0      3      Bernoulli        0.3            0.5  0.28866   0.325581   

   roc_auc_score  
0       0.606021  




In [17]:
depth = 1

In [18]:
model = catboost.CatBoostClassifier(
    iterations=1000,
    depth=depth,
    learning_rate=learning_rate,
    bootstrap_type = bootstrap_type,
    subsample = subsample,
    loss_function='Logloss',
    ignored_features = ignore_indexes,
    verbose=False
)
model.fit(train_pool, eval_set = eval_pool, plot = False,early_stopping_rounds = 20,use_best_model = True)

<catboost.core.CatBoostClassifier at 0x7fc512e46250>

In [19]:
print('confusion_matrix = \n%s\n' % (confusion_matrix(y_test, model.predict(test_pool))))
print('recall = %s\n' % (recall_score(y_test, model.predict(test_pool))))
print('precision = %s' % (precision_score(y_test, model.predict(test_pool))))
print('roc_auc_score = %s' % (roc_auc_score(y_test, model.predict(test_pool))))

confusion_matrix = 
[[1374  140]
 [  98   96]]

recall = 0.4948453608247423

precision = 0.4067796610169492
roc_auc_score = 0.7011875417069551


In [20]:
pd.DataFrame([model.feature_names_,model.feature_importances_]).T.rename(columns = {0:'feature', 1:'weight'}).sort_values(by = 'weight', ascending = False)

Unnamed: 0,feature,weight
23,ba_person_type,19.2495
24,ba_state,14.1254
2157,count_v_https://console.cloud.yandex.ru/folder...,6.95909
2263,marketplace_trial_consumption_count,6.56764
2358,tfidf_/api/compute/createinstance,4.53955
19,all_trial_consumption_std,3.00656
2258,is_corporate_email,2.78128
14,all_trial_consumption_count,2.59237
20,all_trial_consumption_sum,2.54364
1437,count_v_https://console.cloud.yandex.ru,2.37391


In [None]:
metrics = [
    'all_trial_consumption_avg',
    'mdb_trial_consumption_avg',
    'ai_trial_consumption_avg',
    'storage_trial_consumption_avg',
    'network_trial_consumption_avg',
    'nlb_trial_consumption_avg',
    'marketplace_trial_consumption_avg',
    'snapshot_trial_consumption_avg',
    'image_trial_consumption_avg'
]
research = pd.concat(
    [
        X_test.reset_index(drop=True),
        pd.DataFrame(model.predict_proba(test_pool)).rename(columns={0:'not_paid_prob', 1: 'paid_prob'})
    ],
    axis = 1
)
research['paid_consumption'] = y_test
research['prob_cat'] = research['paid_prob'].apply(lambda x: str(int(x*10)*10) + '% - ' + str(int(x*10 + 1)*10) + '%' )

In [None]:
predict_research = pd.concat(
    [
        to_predict.reset_index(drop=True),
        pd.DataFrame(model.predict_proba(predict_pool)).rename(columns={0:'not_paid_prob', 1: 'paid_prob'})
    ],
    axis = 1
)
#predict_research['paid_consumption'] = y_test
predict_research['prob_cat'] = predict_research['paid_prob'].apply(lambda x: str(int(x*10)*10) + '% - ' + str(int(x*10 + 1)*10) + '%' )

In [None]:
research['prob_cat'].value_counts()

In [None]:
threshold = [0.0,0.1,0.2,0.3,0.4,0.5,0.6, 0.7, 0.8,0.9]
for thr in threshold:
    
    research['trh_prediction'] = research['paid_prob'].apply(lambda x: 1 if x >= thr else 0)
    print(thr, research['trh_prediction'].sum(), precision_score(y_test, research['trh_prediction']), recall_score(y_test, research['trh_prediction']))

In [None]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode()

In [None]:
def describe_cluster(research, col, col_by):
    to_vis = pd.merge(
        research.groupby([col_by, col])['promocode_source'].count().reset_index().rename(columns = {'promocode_source':'local'}),
        research.groupby(col_by)['promocode_source'].count().reset_index().rename(columns = {'promocode_source':'all'}),
        on = col_by,
        how = 'left'
    )
    to_vis['share'] = to_vis['local']*100/to_vis['all']
    data = []
    for cl in to_vis[col].unique():
        data.append({
            'x': list(to_vis[to_vis[col] == cl][col_by]),
            'y': list(to_vis[to_vis[col] == cl]['share']),
            'name': cl,
            'type': 'bar'
        })
    layout = {
      'xaxis': {'title': 'Prob Group'},
      'yaxis': {'title': 'Share'},
      'barmode': 'relative',
      'title': 'Prob Group by %s' % (col)
    }
    iplot({'data': data, 'layout': layout})

In [None]:
'''
for col in ['ba_payment_type','device_type','ba_person_type','age','channel','country','os','segment','sex']:
    describe_cluster(research, col, 'prob_cat')
    '''

In [None]:
'''
metric = 'all_trial_consumption_sum'
for metric in ['all_trial_consumption_sum']:
    data = []
    for prob_cat in sorted(research['prob_cat'].unique()):
        data.append(go.Box(
            y=np.log1p(research[(research['prob_cat'] == prob_cat) & (research[metric] > 0)][metric]),
            name = prob_cat,
            boxpoints=False
        ))
    layout = go.Layout(
        title = "%s By %s, %s" % (metric, 'prob_cat', research[research[metric] > 0].shape[0])
    )
    iplot(go.Figure(data=data,layout=layout))'''

In [None]:
predict_research['prob_cat'].value_counts()/predict_research.shape[0]

In [None]:
'''data = []
data.append(
    go.Scatter(
        x = predict_research.groupby(['first_trial_consumption_datetime'])['puid'].nunique().reset_index()['first_trial_consumption_datetime'],
        y = predict_research.groupby(['first_trial_consumption_datetime'])['puid'].nunique().reset_index()['puid']
    )
)
iplot(data)'''

In [None]:
'''data = []
data.append(
    go.Scatter(
        x = predict_research[predict_research['paid_prob']>0.20].groupby(['first_trial_consumption_datetime'])['puid'].nunique().reset_index()['first_trial_consumption_datetime'],
        y = predict_research[predict_research['paid_prob']>0.20].groupby(['first_trial_consumption_datetime'])['puid'].nunique().reset_index()['puid']
    )
)
iplot(data)'''

In [None]:
'''describe_cluster(research, 'prob_cat', 'first_trial_consumption_datetime')'''

In [None]:
'''describe_cluster(predict_research, 'prob_cat', 'first_trial_consumption_datetime')'''

In [None]:
def execute_query(query, cluster, alias, token, timeout=600):
    logger.info("Executing query: %s", query)
    proxy = "http://{}.yt.yandex.net".format(cluster)
    s = requests.Session()
    url = "{proxy}/query?database={alias}&password={token}".format(proxy=proxy, alias=alias, token=token)
    resp = s.post(url, data=query, timeout=timeout)
    if resp.status_code != 200:
        logger.error("Response status: %s", resp.status_code)
        logger.error("Response headers: %s", resp.headers)
        logger.error("Response content: %s", resp.content)
    resp.raise_for_status()
    rows = resp.content.strip().split('\n')
    logger.info("Time spent: %s seconds, rows returned: %s", resp.elapsed.total_seconds(), len(rows))
    return rows

logger = logging.getLogger(__name__)

In [None]:
cluster = 'hahn'
alias = "*ch_public"
token = '%s' % (yt_creds['value']['token'])

In [None]:
query = '''
SELECT
    t1.puid,
    t0.lead_source,
    t0.call_status,
    t1.state
FROM(
    SELECT
        DISTINCT
        billing_account_id,
        lead_source,
        call_status
    FROM "//home/cloud_analytics_test/cubes/crm_leads/cube"
    WHERE 
        event = 'call'
        AND billing_account_id != ''
) as t0
ANY LEFT JOIN (
    SELECT
        DISTINCT id as billing_account_id,
        owner_id as puid,
        state
    FROM "//home/logfeller/logs/yc-billing-export-billing-accounts/1h/2019-04-10T09:00:00"
) as t1 
ON t0.billing_account_id = t1.billing_account_id
WHERE 
    t1.puid NOT IN ('', '0')
'''

result = execute_query(query=query, cluster=cluster, alias=alias, token=token)

calls = pd.DataFrame([row.split('\t') for row in result], columns = ['puid', 'lead_source', 'call_status', 'state_calls_users'])


In [None]:
calls['puid'] = calls['puid'].astype(int)
temp = pd.merge(
    predict_research,
    calls,
    on = 'puid',
    how = 'left'
)
temp['lead_source'] = temp['lead_source'].fillna("doesn't lead")
temp['state_calls_users'] = temp['state_calls_users'].fillna("doesn't lead")
temp['call_status'] = temp['call_status'].fillna("doesn't lead")

In [None]:
predict_research['prob_cat'].value_counts()/predict_research.shape[0]

In [None]:
'''describe_cluster(temp, 'lead_source', 'prob_cat')'''

In [None]:
'''describe_cluster(temp, 'call_status', 'prob_cat')'''

In [None]:
query = '''
SELECT
    DISTINCT
    owner_id as puid,
    state
FROM "//home/logfeller/logs/yc-billing-export-billing-accounts/1h/2019-04-10T09:00:00"
WHERE 
    puid NOT IN ('', '0')
'''

result = execute_query(query=query, cluster=cluster, alias=alias, token=token)

ba = pd.DataFrame([row.split('\t') for row in result], columns = ['puid', 'state'])

In [None]:
ba['puid'] = ba['puid'].astype(int)
temp = pd.merge(
    temp,
    ba,
    on = 'puid',
    how = 'left'
)

In [None]:
'''describe_cluster(temp[temp['state'] != 'suspended'], 'lead_source', 'prob_cat')'''

In [None]:
'''describe_cluster(temp[(temp['state'] != 'suspended') & (temp['call_status'] != "doesn't lead")], 'call_status', 'prob_cat')'''

In [None]:
'''describe_cluster(temp, 'state', 'prob_cat')'''