In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from sklearn.linear_model import LogisticRegression, Ridge
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, auc, recall_score, precision_score, f1_score, multilabel_confusion_matrix
from sklearn import metrics
import pickle
import warnings
warnings.filterwarnings("ignore")

# Data train load
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

bin_cols = ['dual_sim', 'blue', 'four_g', 'three_g', 'touch_screen', 'wifi', ]
num_cols = ['battery_power', 'mobile_wt', 'int_memory', 'px_height', 'px_width', 'ram']
cat_cols = ['clock_speed', 'fc', 'm_dep', 'n_cores', 'pc', 'sc_h', 'sc_w', 'talk_time']
target = ['price_range']

df_train, df_val, y_train, y_val = train_test_split(data[cat_cols + num_cols + bin_cols],
                                                    data['price_range'], test_size=0.2, random_state=1)
# Prepare

def prepare_data(df_train, df_val, test):
    dv = DictVectorizer(sparse=False)

    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    test_dict = test.to_dict(orient='records')
    X_test = dv.transform(test_dict)
    return dv, X_train, X_val, X_test

dv, X_train, X_val, X_test = prepare_data(df_train, df_val, test)

# training

tunned_model = CatBoostClassifier(
    random_seed=42,
    iterations=1000,
    learning_rate=0.03,
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_strength=1,
    one_hot_max_size=2,
    leaf_estimation_method='Newton'
)

tunned_model.fit(
    X_train, y_train,
    verbose=False,
    eval_set=(X_val, y_val),
    plot=True

)
y_pred = tunned_model.predict(X_val)
print(f'classification_report, {metrics.classification_report(y_pred, y_val)}')

# training final model
print('training the final model')
X = np.vstack([X_train, X_val])
y = np.hstack([y_train, y_val])

best_model = CatBoostClassifier(
    random_seed=42,
    iterations=int(tunned_model.tree_count_ * 1.2)
)

best_model.fit(
    X, y,
    verbose=100
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

classification_report,               precision    recall  f1-score   support

           0       0.96      0.98      0.97        90
           1       0.95      0.88      0.91       104
           2       0.91      0.90      0.90       107
           3       0.92      0.99      0.96        99

    accuracy                           0.93       400
   macro avg       0.93      0.93      0.93       400
weighted avg       0.93      0.93      0.93       400

training the final model
Learning rate set to 0.07027
0:	learn: 1.2729863	total: 15.4ms	remaining: 18.5s
100:	learn: 0.2053783	total: 1.1s	remaining: 11.9s
200:	learn: 0.1181517	total: 2.09s	remaining: 10.4s
300:	learn: 0.0867067	total: 3.15s	remaining: 9.4s
400:	learn: 0.0669843	total: 8.48s	remaining: 16.9s
500:	learn: 0.0529804	total: 12.6s	remaining: 17.6s
600:	learn: 0.0430583	total: 17.5s	remaining: 17.5s
700:	learn: 0.0354672	total: 21s	remaining: 15s
800:	learn: 0.0299757	total: 23.6s	remaining: 11.8s
900:	learn: 0.0257431	total

<catboost.core.CatBoostClassifier at 0x1fc9ccdfdc0>

In [3]:
import bentoml
bentoml.catboost.save_model("price_range", best_model, custom_objects={"DictVectorizer": dv})

Model(tag="price_range:zcvwch2z5kalt2bk", path="C:\Users\DS_PC\bentoml\models\price_range\zcvwch2z5kalt2bk\")