<a href="https://colab.research.google.com/github/Dicere/WB_Internship/blob/main/0_4_roll_params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost
!pip install optuna

In [2]:
import optuna
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from catboost import CatBoostClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

In [3]:
def objective_catboost(trial, X, y, n_splits=5):
    """Optuna objective for the first CatBoost tuning round.

    Samples the core boosting hyperparameters (iterations, objective,
    bootstrap type, learning rate, depth and the bootstrap-specific knob),
    fits one ``CatBoostClassifier`` per fold of a stratified K-fold split and
    returns the mean validation ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels aligned with ``X``; also indexed with ``.iloc``.
        n_splits: Number of stratified folds. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        Mean ROC AUC over the ``n_splits`` validation folds.
    """
    params_grid = {
        'iterations': trial.suggest_int("iterations", 100, 2000, step=50),
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'learning_rate': trial.suggest_float("learning_rate", 0.005, 0.3, step=0.001),
        'max_depth': trial.suggest_int('max_depth', 1, 12),
        'verbose': 200,
        'eval_metric': 'AUC',
        # 'cat_features':list(collect_features)
    }

    # Each bootstrap type has its own extra parameter; MVS gets none here.
    if params_grid['bootstrap_type'] == 'Bayesian':
        params_grid['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif params_grid['bootstrap_type'] == 'Bernoulli':
        params_grid['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=22)

    fold_auc = np.empty(n_splits)
    banner = '--' * 10
    for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        print(f'{banner}TRIAL {trial.number} ~ FOLD {idx}{banner}')
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostClassifier(**params_grid)
        # NOTE(review): the validation fold drives early stopping / best-model
        # selection and is also the fold being scored, so the reported AUC is
        # likely somewhat optimistic.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=100,
            use_best_model=True,
        )
        # Score the probability of the second class (column 1).
        preds = model.predict_proba(X_val)[:, 1]
        fold_auc[idx] = roc_auc_score(y_val, preds)

    mean_auc = np.mean(fold_auc)
    print("ROC_AUC_MEAN_ON_FOLD-----" + str(mean_auc))
    return mean_auc

In [4]:
# Read the gzip-compressed task dataset (assumes Google Drive is mounted under
# /content/drive) and drop exact duplicate rows in the same expression.
df = pd.read_csv(
    '/content/drive/MyDrive/WB_стажировка/wb_school_task_2.csv.gzip',
    compression='gzip',
).drop_duplicates()

In [6]:
# Raw-text column; currently referenced only by a commented-out
# `cat_features` option in the first objective.
collect_features = ['text']

# Candidate columns: everything from the fourth column onwards.
feature = df.columns[3:]

# Model inputs are the candidate columns without the target and the raw text.
X = df.loc[:, feature].drop(columns=['label', 'text'])

# Target vector.
y = df['label']

In [7]:
# Hold out 20% of the rows as a stratified test set. X and y are rebound to the
# remaining 80%, so re-running this cell on its own splits that part again.
X, X_test, y, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
import sqlite3

In [9]:
from optuna.storages import RetryFailedTrialCallback

# SQLite-backed Optuna storage for the first tuning round. Heartbeat and grace
# period are configured together with a retry callback (at most 3 retries) so
# that trials which stop reporting can be picked up again.
storage = optuna.storages.RDBStorage(
    "sqlite:////content/drive/MyDrive/WB_стажировка/MAIN/roll_params_db/optuna_1_round.db",
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
    grace_period=120,
    heartbeat_interval=1,
)

  failed_trial_callback=RetryFailedTrialCallback(max_retry=3),


In [10]:
# Keyword arguments for creating the first-round study on `storage`.
optuna_paranms = dict(
    direction="maximize",
    study_name="cat_10000_iter_1_round",
    storage=storage,
)

In [None]:
# Initial run, kept for reference (the stored study is resumed in the next cell):
# study_catboost = optuna.create_study(**optuna_paranms)
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study_catboost.optimize(func, n_trials=10000)


def func(trial):
    """Bind the training split to the first-round objective for Optuna."""
    return objective_catboost(trial, X, y)


# Continue optimising the existing study

In [None]:
# Re-open the stored first-round study (creating it if it does not exist yet)
# and run another batch of trials on it.
continue_study = optuna.create_study(
    storage=storage,
    study_name="cat_10000_iter_1_round",
    direction="maximize",
    load_if_exists=True,
)
continue_study.optimize(func, n_trials=1000)

In [None]:
# Reload the first-round study for inspection. An absolute SQLite path takes
# four slashes in the URL; this now matches the URL used to create `storage`
# (the previous five-slash form pointed at "//content/...").
study_catboost1 = optuna.load_study(
    study_name="cat_10000_iter_1_round",
    storage="sqlite:////content/drive/MyDrive/WB_стажировка/MAIN/roll_params_db/optuna_1_round.db",
)

In [None]:
# Show the optimization history of the reloaded first-round study.
optuna.visualization.plot_optimization_history(study_catboost1)

In [None]:
# Show the optimization history of the study resumed in this session.
# `study_catboost` is only assigned in commented-out code, so referencing it
# fails on a fresh kernel; `continue_study` is the same stored study.
optuna.visualization.plot_optimization_history(continue_study)


In [None]:
# Hyperparameters of the best first-round trial.
param = study_catboost1.best_params

In [None]:
# Display the best first-round hyperparameters.
param

{'bootstrap_type': 'MVS',
 'iterations': 800,
 'learning_rate': 0.259,
 'max_depth': 3,
 'objective': 'Logloss'}

In [11]:
def objective_catboost2(trial, X, y, n_splits=5):
    """Optuna objective for the second CatBoost tuning round.

    The core boosting parameters are fixed to the best values recorded for the
    first round (MVS bootstrap, 800 iterations, learning rate 0.259, depth 3,
    Logloss). This round samples only the secondary parameters: column
    sampling, leaf size, border count, random strength and positive-class
    weight. One ``CatBoostClassifier`` is fitted per fold of a stratified
    K-fold split and the mean validation ROC AUC is returned.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels aligned with ``X``; also indexed with ``.iloc``.
        n_splits: Number of stratified folds. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        Mean ROC AUC over the ``n_splits`` validation folds.
    """
    # NOTE(review): the best trial recorded in this notebook sits at or near a
    # range edge for colsample_bylevel (~0.097 of 0.1), min_data_in_leaf (20)
    # and border_count (32) -- the ranges may be too narrow; confirm before
    # widening, since that changes the search space of the stored study.
    params_grid = {
        'bootstrap_type': 'MVS',
        'iterations': 800,
        'learning_rate': 0.259,
        'max_depth': 3,
        'objective': 'Logloss',
        'verbose': 200,
        'eval_metric': 'AUC',
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        'border_count': trial.suggest_int('border_count', 32, 512, step=32),
        # suggest_float replaces the deprecated suggest_uniform and matches the
        # call already used for colsample_bylevel above.
        'random_strength': trial.suggest_float('random_strength', 0, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10),
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=22)

    fold_auc = np.empty(n_splits)
    banner = '--' * 10
    for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        print(f'{banner}TRIAL {trial.number} ~ FOLD {idx}{banner}')
        # Named *_val so the fold is not confused with the hold-out test set
        # that the notebook keeps in X_test / y_test.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostClassifier(**params_grid)
        # No early stopping in this round; use_best_model still selects the
        # best iteration on the validation fold.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            # early_stopping_rounds=100,
            use_best_model=True,
        )
        # Score the probability of the second class (column 1).
        preds = model.predict_proba(X_val)[:, 1]
        fold_auc[idx] = roc_auc_score(y_val, preds)

    mean_auc = np.mean(fold_auc)
    print("ROC_AUC_MEAN_ON_FOLD-----" + str(mean_auc))
    return mean_auc

In [12]:
# SQLite-backed Optuna storage for the second tuning round, configured with
# the same heartbeat, grace period and retry callback as the first round.
storage2 = optuna.storages.RDBStorage(
    "sqlite:////content/drive/MyDrive/WB_стажировка/MAIN/roll_params_db/opt_2_round.db",
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
    grace_period=120,
    heartbeat_interval=1,
)

  failed_trial_callback=RetryFailedTrialCallback(max_retry=3),


In [13]:
# Keyword arguments for creating the second-round study on `storage2`
# (rebinds the name used for the first round).
optuna_paranms = dict(
    direction="maximize",
    study_name="cat_10000_iter_2_round",
    storage=storage2,
)

In [14]:
# study_catboost2 = optuna.create_study(**optuna_paranms)

# Raise Optuna's log level to WARNING for the second round.
optuna.logging.set_verbosity(optuna.logging.WARNING)


def func(trial):
    """Bind the training split to the second-round objective for Optuna."""
    return objective_catboost2(trial, X, y)


In [None]:
# Open the stored second-round study (creating it if it does not exist yet)
# and run a batch of trials on it.
study_catboost2 = optuna.create_study(
    storage=storage2,
    study_name="cat_10000_iter_2_round",
    direction="maximize",
    load_if_exists=True,
)
study_catboost2.optimize(func, n_trials=1000)

In [16]:
# Show the optimization history of the second-round study.
optuna.visualization.plot_optimization_history(study_catboost2)

In [17]:
# Hyperparameters of the best second-round trial.
param2 = study_catboost2.best_params

In [18]:
# Display the best second-round hyperparameters.
param2

{'border_count': 32,
 'colsample_bylevel': 0.09715577437674092,
 'min_data_in_leaf': 20,
 'random_strength': 0.36494249753665964,
 'scale_pos_weight': 0.4366210135974975}