In [None]:
from datetime import datetime
import pandas as pd
import random
import numpy as np
import joblib

In [1]:
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV



# MODEL TRAINING

In [57]:
# Separate the label column from the feature matrix.
y = all_data['target']
X = all_data.drop(columns=['target'])

In [58]:
# Preview the first five feature rows.
X.head(5)

Unnamed: 0,gender,age,country,city,exp_group,month,weekday,hour,minute,topic,text_len,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,os_iOS,source_organic
0,0.906544,-0.895064,-1.154062,-0.11882,-0.950865,1.251898,0.003896,-0.888751,-0.838438,-1.202894,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,-0.490357
1,0.906544,-0.415498,0.346883,0.853076,-0.643327,0.025755,0.003896,0.529871,0.665887,0.135868,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,-0.730583,2.03933
2,-1.10309,-1.278717,-0.245865,0.900497,0.971965,0.025755,-0.492353,-0.07811,0.145159,1.405127,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,2.03933
3,-1.10309,-1.278717,0.61108,0.843682,0.921768,1.251898,1.492642,1.137852,-1.127731,1.627725,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,1.36877,2.03933
4,-1.10309,-0.607324,0.229698,-0.747899,0.692993,1.251898,1.492642,1.340513,0.955181,2.343881,-0.142506,-0.391283,-0.355427,0.605451,-0.272002,-0.41785,-0.313876,-1.318065,-0.730583,2.03933


In [59]:
# Hold out 20% of the rows for evaluation; stratify on y so both parts keep
# the same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [65]:
# Hyper-parameter search spaces, one per estimator. The keys are used as
# estimator parameter names by the grid search in the training step.
pipe1_grid = dict(C=[0.001, 0.01, 0.5, 1, 2, 5, 10, 30])
pipe2_grid = dict(
    depth=[2, 4, 6],
    iterations=[100, 300],
    l2_leaf_reg=[5, 10, 15],
)

# Baseline estimators, all seeded with 42.
model1 = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
# NOTE(review): this baseline uses iterations=1000, which is outside the
# searched values [100, 300] in pipe2_grid -- confirm this is intended.
model2 = CatBoostClassifier(iterations=1000, depth=4, verbose=0, random_state=42)

# Parallel lists: position i of each list refers to the same estimator
# (model1 -> index 0, model2 -> index 1).
names = ['Logistic Regression', 'CatBoost']
pipes_grid = [pipe1_grid, pipe2_grid]

In [66]:
def model_train(model, X_train, y_train, X_test, y_test, grid=None):
    """Fit a baseline estimator, optionally tune it by grid search, and print ROC AUC.

    The estimator is first fitted as configured by the caller and scored on
    the test split. When a parameter grid is supplied, a 5-fold
    ``GridSearchCV`` (scoring ``'roc_auc'``) is run on the training split and
    the resulting ``best_estimator_`` is scored on both test and train data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
        Fitted in place on ``(X_train, y_train)``.
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels used for reporting.
    grid : dict or list of dicts, optional
        Parameter grid forwarded to ``GridSearchCV``. When ``None`` the search
        is skipped and the fitted baseline is returned as the best model.

    Returns
    -------
    list
        ``[model, best_model]`` -- the fitted baseline and the tuned estimator
        (the same object twice when ``grid`` is ``None``).
    """
    # Baseline: fit with the caller's configuration and score the positive class.
    model.fit(X_train, y_train)
    baseline_proba = model.predict_proba(X_test)[:, 1]
    print(f'Standart model ROC_AUC = {roc_auc_score(y_test, baseline_proba)}')

    # Nothing to search over: GridSearchCV cannot work with a None grid, so
    # the default call path would otherwise fail. Return the baseline as best.
    if grid is None:
        return [model, model]

    search = GridSearchCV(
        model,
        grid,
        scoring='roc_auc',
        cv=5,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    # Report train next to test so over-fitting of the tuned model is visible.
    test_proba = best_model.predict_proba(X_test)[:, 1]
    train_proba = best_model.predict_proba(X_train)[:, 1]
    print(f'Best model ROC_AUC = {roc_auc_score(y_test, test_proba)}')
    print(f'Best model ROC_AUC train = {roc_auc_score(y_train, train_proba)}')

    return [model, best_model]

In [67]:
all_default_models = []
all_best_models = []

# Walk the parallel lists together instead of indexing by position. This
# avoids shadowing the builtin `id` in the notebook namespace and rebinding
# the loop variable inside its own loop.
for model_name, estimator, param_grid in zip(names, [model1, model2], pipes_grid):
    print(f'\t{model_name}')
    default_model, best_model = model_train(
        estimator, X_train, y_train, X_test, y_test, param_grid
    )

    all_default_models.append(default_model)
    all_best_models.append(best_model)
    print()

	Logistic Regression
Standart model ROC_AUC = 0.6407371298139298
Best model ROC_AUC = 0.640752606271174
Best model ROC_AUC train = 0.6408918925955729

	CatBoost
Standart model ROC_AUC = 0.6760477532224514
Best model ROC_AUC = 0.6664723933755052
Best model ROC_AUC train = 0.6694857568562579



# SAVE MODEL

In [70]:
# Persist the two tuned estimators for A/B testing:
# index 0 of all_best_models is written as the control model, index 1 as the test model.

joblib.dump(value=all_best_models[0], filename='model_control.pkl')
joblib.dump(value=all_best_models[1], filename='model_test.pkl')

['CB_best.pkl']