In [75]:
!pip install catboost



In [76]:
import pandas as pd
import numpy as np
import catboost
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from typing import List, Dict
import matplotlib.pyplot as plt

In [66]:
# генератор метрик
def eval_metrics(y_test: pd.DataFrame, y_pred: pd.DataFrame, verbose: bool) -> List[float]:
    """
    :param y_test: test dataset
    :param y_pred: predicted dataset
    :param verbose: print scores flag
    :return: list of scores [Precision Score, Recall Score, F1 Score, Accuracy Score, ROC AUC]
    """

    print(type(y_test), type(y_pred))
    pr_sc = round(precision_score(y_val, y_pred), 3)
    re_sc = round(recall_score(y_val, y_pred), 3)
    f1_sc = round(f1_score(y_val, y_pred), 3)
    ac_sc = round(accuracy_score(y_val, y_pred), 3)
    ra_sc = round(roc_auc_score(y_val, y_pred), 3)

    print('Precision Score: ', pr_sc)
    print('Recall Score: ', re_sc)
    print('F1 Score: ', f1_sc)
    print('Accuracy Score: ', ac_sc)
    print('ROC AUC: ', ra_sc)

    return [pr_sc, re_sc, f1_sc, ac_sc, ra_sc]

In [67]:
# предобработчик наборов данных
def load_datasets(X_filename: str, y_filename: str, X_val_filename: str, y_val_filename: str) -> List[pd.DataFrame]:
    """
    :param X_filename: path to csx with X data
    :param y_filename: path to csx with y data
    :param X_val_filename: path to csx with X_val data
    :param y_val_filename: path to csx with y_val data
    :return: list of pandas dataframes [X, y, X_val, y_val]
    """
    X = pd.read_csv(X_filename)
    y = pd.read_csv(y_filename)
    X_val = pd.read_csv(X_val_filename)
    y_val = pd.read_csv(y_val_filename)

    x_col_names = list(X.columns)
    x_col_names[0] = 'user_id'
    X.columns = x_col_names
    X_val.columns = x_col_names
    y = y.drop(columns=[list(y.columns)[0]])
    y_val = y_val.drop(columns=[list(y_val.columns)[0]])

    return [X, y, X_val, y_val]

In [68]:
# визуализация зависимости метрик от гиперпараметра "iterations"
def visualize_it(catboost_scores: pd.DataFrame) -> None:
    """
    :param catboost_scores: dataframe with index = iterations param value,
             metrics columns = ['Precision Score', 'Recall Score', 'F1 Score', 'Accuracy Score', 'ROC AUC']
    :return:
    """
    plt.figure(figsize=[15, 10])
    plt.plot(catboost_scores['Precision Score'], label=catboost_scores.columns[0])
    plt.plot(catboost_scores['Recall Score'], label=catboost_scores.columns[1])
    plt.plot(catboost_scores['F1 Score'], label=catboost_scores.columns[2])
    plt.plot(catboost_scores['Accuracy Score'], label=catboost_scores.columns[3])
    plt.plot(catboost_scores['ROC AUC'], label=catboost_scores.columns[4])
    plt.xlabel('Количество итераций (параметр "iterations")')
    plt.ylabel('Значение метрики')
    plt.title('Зависимость метрик качества от гиперпараметра "iterations"')
    plt.legend()
    plt.show()

In [69]:
# непосредственно catboost
def catboost_it(X: pd.DataFrame, y: pd.DataFrame, X_val: pd.DataFrame, y_val: pd.DataFrame, min_iterations: int,
                max_iterations: int, iteration_step: int, verbose: bool, category_columns: List[str]) -> pd.DataFrame:
    """
    :param iteration_step:
    :param category_columns:
    :param X: training set of values
    :param y: training set of results
    :param X_val: test set of values
    :param y_val: test set of results
    :param min_iterations: minimum value of catboost's iterations parameter
    :param max_iterations: max value of catboost's iterations parameter
    :param verbose: is needed to show iteration info
    :return: dataframe with index = iterations param value,
             metrics columns = ['Precision Score', 'Recall Score', 'F1 Score', 'Accuracy Score', 'ROC AUC']
    """
    if category_columns is None:
        category_columns = []
    else:
        for cat in category_columns:
            X[cat] = X[cat].astype(str)
    catboost_boost_results = dict()
    for i in range(min_iterations, max_iterations, iteration_step):
        catboost_model = CatBoostClassifier(iterations=i + 1, cat_features=category_columns, learning_rate=0.1,
                                            custom_loss=['AUC'])
        print(catboost_model.feature_names_)
        catboost_model.fit(X, y, cat_features=category_columns, eval_set=(X_val, y_val), plot=True)
        prediction = catboost_model.predict(X_val)
        prediction = np.array(prediction)
        prediction = prediction.astype('bool')
        print(prediction)
        catboost_boost_results[i + 1] = eval_metrics(y_val, prediction, verbose)

    catboost_scores = pd.DataFrame(catboost_boost_results).T
    catboost_scores.columns = ['Precision Score', 'Recall Score', 'F1 Score', 'Accuracy Score', 'ROC AUC']

    return catboost_scores

## Набор данных "Первая итерация"

In [70]:
# загружаем данные
X, y, X_val, y_val = load_datasets('0_X.csv', '0_y.csv', '0_X_val.csv', '0_y_val.csv')

In [71]:
X

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,FREQ_TOP_PACK
0,957056,12.0,18.0,-0.638518,-0.718161,-0.718841,-0.718706,-0.747321,-0.252563,-0.315014,-0.377476,-0.332739,0.000000,0.000000,0.446632,0.000000
1,536962,8.0,24.0,-0.708945,-0.793489,-0.699894,-0.699898,-0.747321,-0.249717,-0.313873,-0.450555,0.000000,0.000000,-0.220128,0.581223,0.000000
2,1575854,0.0,24.0,0.002368,1.541680,0.190890,0.190749,1.702509,0.028720,-0.212311,-0.236189,0.454859,0.000000,0.000000,-0.091734,1.363902
3,1543057,9.0,24.0,0.699596,0.110447,0.698418,0.698556,0.613696,-0.252563,0.355982,-0.338500,-0.316987,-0.208879,0.000000,1.433635,-0.267198
4,1520724,14.0,24.0,-0.596262,-0.416849,-0.586770,-0.586634,-0.407067,-0.186736,-0.273933,-0.401836,-0.316987,0.000000,0.000000,0.715815,-0.593418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723233,73349,14.0,24.0,0.122094,-0.115537,0.206633,0.206631,0.001238,0.017487,-0.315014,-0.012078,0.785650,0.000000,0.000000,-0.271189,-0.348753
1723234,836489,11.0,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.251140,-0.292191,0.000000,0.000000,0.000000,0.000000,-1.123601,0.000000
1723235,491263,0.0,24.0,-0.004674,-0.040209,0.067875,0.067872,-0.134863,0.000000,-0.295615,-0.002334,0.360347,0.000000,0.000000,-0.226325,0.222132
1723236,491755,14.0,1.0,0.000000,0.000000,-0.767462,-0.767605,-0.883422,-0.252563,-0.313873,0.000000,0.000000,0.000000,0.000000,-0.989009,0.000000


In [72]:
y

Unnamed: 0,CHURN
0,0
1,0
2,0
3,0
4,0
...,...
1723233,1
1723234,0
1723235,0
1723236,1


In [78]:
X['REGION'] = X['REGION'].apply(int)
X_val['REGION'] = X_val['REGION'].apply(int)
X['TENURE'] = X['TENURE'].apply(int)
X_val['TENURE'] = X_val['TENURE'].apply(int)
catboost_model = catboost.CatBoostClassifier(n_estimators=10000, max_depth=6, eval_metric='AUC', reg_lambda = 370, cat_features=['REGION', 'TENURE'])
catboost_model.fit(X, y)
predict = catboost_model.predict(X_val)
print(f'ROC AUC:\n{round(roc_auc_score(y_val, predict), 3)}')


0:	total: 2.39s	remaining: 6h 38m 12s
1:	total: 3.21s	remaining: 4h 27m 32s
2:	total: 4.3s	remaining: 3h 58m 47s
3:	total: 5.44s	remaining: 3h 46m 32s
4:	total: 6.41s	remaining: 3h 33m 30s
5:	total: 7.14s	remaining: 3h 18m 18s
6:	total: 8.15s	remaining: 3h 13m 49s
7:	total: 8.93s	remaining: 3h 5m 58s
8:	total: 9.76s	remaining: 3h 30s
9:	total: 10.4s	remaining: 2h 53m 31s
10:	total: 11.2s	remaining: 2h 50m 9s
11:	total: 12.1s	remaining: 2h 47m 37s
12:	total: 13.1s	remaining: 2h 47m 16s
13:	total: 13.9s	remaining: 2h 45m 30s
14:	total: 14.6s	remaining: 2h 42m 19s
15:	total: 15.2s	remaining: 2h 38m 19s
16:	total: 15.8s	remaining: 2h 34m 13s
17:	total: 16.5s	remaining: 2h 32m 28s
18:	total: 17.3s	remaining: 2h 31m 3s
19:	total: 17.8s	remaining: 2h 28m 21s
20:	total: 18.5s	remaining: 2h 26m 31s
21:	total: 19.1s	remaining: 2h 24m 29s
22:	total: 20.5s	remaining: 2h 28m 9s
23:	total: 21.3s	remaining: 2h 27m 16s
24:	total: 21.9s	remaining: 2h 25m 56s
25:	total: 22.6s	remaining: 2h 24m 24s
26:	t

In [None]:
catboost_scores = catboost_it(X, y, X_val, y_val, 1, 100, 10, True, ['REGION', 'TENURE'])
catboost_scores

In [None]:
visualize_it(catboost_scores)

In [None]:
catboost_scores[catboost_scores['ROC AUC'] == catboost_scores['ROC AUC'].max()]

In [None]:
from catboost import cv

params = {
    'loss_function': 'Logloss',
    'iterations': 150,
    'custom_loss': 'AUC',
    'random_seed': 777,
    'learning_rate': 0.5
}

cv_data = cv(
    params=params,
    pool=Pool(X, label=y, cat_features=['REGION', 'TENURE']),
    fold_count=5, # Разбивка выборки на 5 кусочков
    shuffle=True, # Перемешаем наши данные
    partition_random_seed=0,
    plot=True, # Никуда без визуализатора
    stratified=True,
    verbose=False
)

NameError: name 'Pool' is not defined

## Набор данных "Вторая итерация"

In [None]:
full_df = pd.read_csv('2nd_iter.csv')
full_df

In [None]:
X1 = full_df[
    ['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE',
     'TIGO', 'ZONE1', 'ZONE2', 'REGULARITY', 'TOP_PACK', 'REGION', 'TENURE']]
X1

In [None]:
y1 = full_df['CHURN']
y1

In [None]:
X, X_val, y, y_val = train_test_split(X1, y1, test_size=0.3, random_state=777)
X

In [None]:
cat_col = ['TOP_PACK', 'REGION', 'TENURE']

In [None]:
catboost_it(X, y, X_val, y_val, 1, 100, 10, True, cat_col)