In [5]:
import pandas as pd, numpy as np, catboost, logging, os, sys, requests, datetime
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score
import scipy.stats as stats 
module_path = os.path.abspath(os.path.join('/home/ktereshin/yandex/arcadia/cloud/analytics/python/work'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_loader import clickhouse
from global_variables import (
    metrika_clickhouse_param_dict,
    cloud_clickhouse_param_dict
)
from nile.api.v1 import (
    clusters,
    aggregators as na,
    extractors as ne,
    filters as nf,
    Record
)
from vault_client import instances

def execute_query(query, cluster, alias, token, timeout=600):
    proxy = "http://{}.yt.yandex.net".format(cluster)
    s = requests.Session()
    url = "{proxy}/query?database={alias}&password={token}".format(proxy=proxy, alias=alias, token=token)
    resp = s.post(url, data=query, timeout=timeout)
    resp.raise_for_status()
    rows = resp.content.strip().split('\n')
    return rows

def chyt_execute_query(query, cluster, alias, token, columns):
    i = 0
    while True:
        try:
            result = execute_query(query=query, cluster=cluster, alias=alias, token=token)
            users = pd.DataFrame([row.split('\t') for row in result], columns = columns)
            return users
        except Exception as err:
            print(err)
            i += 1
            if i > 10:
                print('Break Excecution')
                break

In [6]:
threshold = 0.2

client = instances.Production()
yt_creds = client.get_version('ver-01d33pgv8pzc7t99s3egm24x47')
cluster_yt = clusters.yt.Hahn(
    token = yt_creds['value']['token'],
    pool = yt_creds['value']['pool'],
    
)

In [7]:
cluster = 'hahn'
alias = "*ch_public"
token = '%s' % (yt_creds['value']['token'])

query = '''
SELECT
    DISTINCT
    billing_account_id,
    puid
FROM "//home/cloud_analytics_test/cubes/acquisition_cube/cube"
WHERE 
    event = 'ba_created'
    AND puid != ''
'''

columns = ['billing_account_id', 'puid']
puids = chyt_execute_query(query=query, cluster=cluster, alias=alias, token=token, columns = columns)

features = cluster_yt.read('//home/cloud_analytics/scoring/learning_dataset').as_dataframe()
targets = cluster_yt.read('//home/cloud_analytics/scoring/targets').as_dataframe()

train_data = pd.merge(
    targets[targets['dataset_type'] == 'learning_set'][['puid', 'first_trial_consumption_datetime', 'is_supended']],
    features,
    on = 'puid',
    how = 'left'
).fillna('0')

to_predict = pd.merge(
    targets[targets['dataset_type'] != 'learning_set'][['puid', 'first_trial_consumption_datetime', 'is_supended']],
    features,
    on = 'puid',
    how = 'left'
).fillna('0')

train_data = shuffle(train_data).reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split( train_data, train_data['is_supended'], test_size=0.33)
X_train, X_eval, y_train, y_eval = train_test_split( X_train, y_train, test_size=0.25)
positive = X_train[y_train == 1]
res = pd.DataFrame()
for i in range(3):
    X_train = pd.concat(
        [
            X_train,
            positive
        ]
    )
    y_train = pd.concat(
        [
            y_train,
            pd.Series([1]*positive.shape[0])
        ]
    )

cat_col = [
    'ba_payment_type',
    'ba_usage_status',
    'ba_state',
    'device_type',
    'ba_person_type',
    'age',
    'ba_payment_cycle_type',
    'ba_type',
    'channel',
    'country',
    'os',
    'promocode_source',
    'segment',
    'sex',
    'first_trial_consumption_datetime',
    'session_start_time',
    'search_phrase',
    'is_supended',
    'puid'
]
cat_indexes = []
for col in cat_col:
    cat_indexes.append(X_train.columns.get_loc(col))

ignore_col = [
'first_trial_consumption_datetime','session_start_time','search_phrase','is_supended', 'puid'
]
ignore_indexes = []
for col in ignore_col:
    ignore_indexes.append(X_train.columns.get_loc(col))

site_metrics = []
other_metrics = []
for col in X_test:
    if 'count_v' in col or 'tfidf_' in col:
        site_metrics.append(col)
    else:
        other_metrics.append(col)

features.shape[0]
site_metric_new = []
for col in site_metrics:
    temp = features[col].value_counts()[0]
    if features[col].value_counts()[0]/float(features.shape[0]) < 0.95:
        site_metric_new.append(col)

list_columns = other_metrics + site_metric_new


X_train['rand'] = np.random.rand(X_train.shape[0])
X_eval['rand'] = np.random.rand(X_eval.shape[0])
X_test['rand'] = np.random.rand(X_test.shape[0])
to_predict['rand'] = np.random.rand(to_predict.shape[0])

train_pool = catboost.Pool(X_train, y_train, cat_features = cat_indexes)
eval_pool = catboost.Pool(X_eval, y_eval, cat_features = cat_indexes)
test_pool = catboost.Pool(X_test, cat_features = cat_indexes)
predict_pool = catboost.Pool(to_predict, cat_features = cat_indexes)

learning_rate = 0.5
subsample = 0.3
bootstrap_type = 'Bernoulli'
depth = 1

model = catboost.CatBoostClassifier(
    iterations=1000,
    depth=depth,
    learning_rate=learning_rate,
    bootstrap_type = bootstrap_type,
    subsample = subsample,
    loss_function='Logloss',
    ignored_features = ignore_indexes,
    verbose=False
)
model.fit(train_pool, eval_set = eval_pool, plot = False, early_stopping_rounds = 20,use_best_model = True)

metris_dict = {
    'confusion_matrix': confusion_matrix(y_test, model.predict(test_pool)),
    'recall': recall_score(y_test, model.predict(test_pool)),
    'precision': precision_score(y_test, model.predict(test_pool)),
    'roc_auc': roc_auc_score(y_test, model.predict(test_pool))
}
print('confusion_matrix = \n%s\n' % (metris_dict['confusion_matrix']))
print('recall = %s\n' % (metris_dict['recall']))
print('precision = %s' % (metris_dict['precision']))
print('roc_auc = %s' % (metris_dict['roc_auc']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


confusion_matrix = 
[[1194  609]
 [ 147 1039]]

recall = 0.8760539629005059

precision = 0.6304611650485437
roc_auc = 0.7691417901024993


In [8]:
pd.DataFrame([model.feature_names_,model.feature_importances_]).T.rename(columns={0:'feature', 1: 'score'}).sort_values(by = 'score', ascending = False)

Unnamed: 0,feature,score
24,ba_state,48.5091
16,all_trial_consumption_max,11.0063
26,ba_usage_status,10.976
992,tfidf_/api/billing/setpaidaccount,5.18434
22,ba_payment_type,2.2186
35,count_v_/api/billing/setpaidaccount,2.19451
1025,tfidf_/api/iam/createcloudpublic,1.92939
44,count_v_/api/compute/createinstance,1.69728
1038,tfidf_/api/iam/updateusersettings,1.46623
960,os,1.00757
