In [1]:
import os
import sys
from dotenv import load_dotenv

sys.path.append('../')
load_dotenv("../.env")

True

In [2]:
%cd ..

/home/cva/Desktop/ITMO/deep-learninging-practice/TableBC


## Preprocess data

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


class DataPreproc:
    def __init__(self):
        self.drop_columns = [
            "is_in_yandex",
            "dist_qty_oper_login_1",
            "total_qty_oper_login_1",
            "total_qty_oper_login_0",
            "total_qty_over_index_and_type",
            "is_privatecategory",
            "index_oper",
            "name_mfi",
            "mailrank",
        ]
        self.numeric_col = [
            "weight",
            "transport_pay",
            "weight_mfi",
            "price_mfi",
            "total_qty_over_index",
        ]
        # Scalers init
        self.scalers = dict()
        for col in self.numeric_col:
            self.scalers[col] = MinMaxScaler()

        # Encoder init
        self.encoder_type = LabelEncoder()

    def preprocess(self, df, dtype="train"):
        if dtype == "train":
            for col in self.numeric_col:
                self.scalers[col].fit(np.array(df[col]).reshape(-1, 1))
            self.encoder_type.fit(df["type"])


        df["oper_type"] = df["oper_type + oper_attr"].apply(lambda x: x.split("_")[0])
        df["oper_attr"] = df["oper_type + oper_attr"].apply(lambda x: x.split("_")[1])
        df["oper_type"] = df["oper_type"].astype(int)
        df["oper_attr"] = df["oper_attr"].astype(int)
        df.drop("oper_type + oper_attr", axis=1, inplace=True)

        
        df["priority"] = df["priority"].astype(int)
        df["is_privatecategory"].replace({"N": 1, "Y": 2, "0": 0}, inplace=True)
        df["class"] = df["class"].astype(int)
        df["is_return"].replace({"N": 1, "Y": 2}, inplace=True)
        df["mailtype"] = df["mailtype"].astype(int)
        df["mailctg"] = df["mailctg"].astype(int)
        df["directctg"] = df["directctg"].astype(int)
        df["postmark"] = df["postmark"].astype(int)
        df["total_qty_over_index"] = df["total_qty_over_index"].astype("int")
        df.drop(self.drop_columns, axis=1, inplace=True)
        
        for col in self.numeric_col:
            df[col] = self.scalers[col].transform(np.array(df[col]).reshape(-1, 1))
        df["type"] = self.encoder_type.transform(df["type"])
        
        return df


## Read data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
train_dataset_path = "data/train_dataset_train.csv"

preproc = DataPreproc()
train_df = pd.read_csv(train_dataset_path)

  train_df = pd.read_csv(train_dataset_path)


In [35]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000000 entries, 0 to 5999999
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   id                     int64  
 1   type                   int64  
 2   priority               int64  
 3   class                  int64  
 4   is_return              int64  
 5   weight                 float64
 6   mailtype               int64  
 7   mailctg                int64  
 8   directctg              int64  
 9   transport_pay          float64
 10  postmark               int64  
 11  weight_mfi             float64
 12  price_mfi              float64
 13  total_qty_over_index   float64
 14  is_wrong_sndr_name     int64  
 15  is_wrong_rcpn_name     int64  
 16  is_wrong_phone_number  int64  
 17  is_wrong_address       int64  
 18  label                  int64  
 19  oper_type              int64  
 20  oper_attr              int64  
dtypes: float64(5), int64(16)
memory usage: 961.3 MB


In [6]:
train_df.head()

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,6818780,1043_-1,628629.0,Участок,7503.0,N,0.0,Y,N,87.0,...,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0
1,9907176,1023_-1,102976.0,ММПО,7503.0,N,0.0,N,N,107.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0
2,3304275,1018_-1,620962.0,Цех,7503.0,N,0.0,Y,N,50.0,...,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0
3,9020937,1019_-1,344964.0,Цех,7503.0,N,0.0,Y,N,416.0,...,55.0,2060928.0,653280.0,2714208.0,19562334.0,0,0,0,0,0
4,3082311,1020_-1,629819.0,Участок,7503.0,N,0.0,Y,N,795.0,...,16.0,316919.0,27911.0,344830.0,4719186.0,0,0,0,0,0


In [7]:
train_df = preproc.preprocess(train_df, dtype="train")

In [8]:
train_df

Unnamed: 0,id,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,...,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label,oper_type,oper_attr
0,6818780,18,7503,0,1,0.000551,5,1,2,0.000000,...,0.002278,0.000939,0.016573,0,0,0,0,0,1043,-1
1,9907176,4,7503,0,1,0.000677,5,1,2,0.000000,...,0.003778,0.002505,0.273502,0,0,0,0,0,1023,-1
2,3304275,19,7503,0,1,0.000316,5,1,2,0.000000,...,0.003111,0.001365,0.105363,0,1,0,0,0,1018,-1
3,9020937,19,7503,0,1,0.002633,5,1,2,0.042553,...,0.001833,0.000626,0.039105,0,0,0,0,0,1019,-1
4,3082311,18,7503,0,1,0.005032,5,1,2,0.063239,...,0.039778,0.006262,0.009434,0,0,0,0,0,1020,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,9958614,4,7503,0,1,0.000158,5,0,2,0.000000,...,0.001333,0.000626,0.376626,0,0,1,0,0,1022,-1
5999996,2234489,19,7503,0,1,0.000525,5,1,2,0.000000,...,0.005556,0.011471,0.031148,0,0,0,0,0,1022,-1
5999997,4304572,19,7506,0,1,0.010759,5,1,2,0.113293,...,0.052889,0.005009,0.151109,0,0,0,0,0,1041,-1
5999998,6550634,19,7506,0,1,0.001703,5,1,2,0.000000,...,0.000000,0.000000,0.079937,0,1,0,0,0,1018,-1


In [37]:
X = train_df.drop(["label", "id"], axis=1)
y = train_df["label"]

In [39]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

## Train data

In [40]:
import os
import json
import mlflow
import datetime
from catboost import CatBoostClassifier

In [41]:
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("pochta-task")

<Experiment: artifact_location='s3://arts/3', creation_time=1667396965657, experiment_id='3', last_update_time=1667396965657, lifecycle_stage='active', name='pochta-task', tags={}>

In [42]:
cat_features = [
    "type",
    "priority",
    "class",
    "is_return",
    "mailtype",
    "mailctg",
    "directctg",
    "postmark",
    "is_wrong_sndr_name",
    "is_wrong_rcpn_name",
    "is_wrong_phone_number",
    "is_wrong_address",
    "oper_type",
    "oper_attr",
]

In [43]:
ITERATIONS = 50
RANDOM_SEED = 42
LEARNING_RATE = 0.5

MODEL_NAME = 'catbost'
TIMESTAMP = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
LOG_DIR = f'log/catbost-{TIMESTAMP}'

PLOT_FILE = f'{LOG_DIR}/plot'
SNAPSHOT_FILE = f'{LOG_DIR}/snapshot'
MODELS_PATH = f'{LOG_DIR}/models'

In [44]:
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)
if not os.path.exists(MODELS_PATH):
    os.makedirs(MODELS_PATH)

In [47]:
mlflow.start_run()
mlflow.set_tag("mlflow.runName", f"{MODEL_NAME}-{TIMESTAMP}")

In [48]:
params ={
    "iterations": ITERATIONS,
    "batch_size":RANDOM_SEED,
    "num_epochs": LEARNING_RATE,
    "random_seed": RANDOM_SEED,
}
mlflow.log_params(params)

In [84]:
from sklearn.metrics import confusion_matrix
from scipy.special import expit


class FinalMetric(object):
    @staticmethod
    def get_profit(y_true, y_pred):
        y_pred = expit(y_pred).astype(int)
        y_true = y_true.astype(int)
        #print("ACCURACY:",(y_pred==y_true).mean())
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        print(tn, fp, fn, tp)
        print(type(tn), type(fp), type(fn), type(tp))
        recall = tp/(tp+fn)
        tpr = tp/(tp+fn)
        fpr = fp/(fp+tp)
        auc = (1+tpr-fpr)/2

        loss = 0.1*recall + 0.9*auc
        return loss
    
    def is_max_optimal(self):
        return True # greater is better

    def evaluate(self, approxes, target, weight):            
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        y_true = np.array(target).astype(int)
        approx = approxes[0]
        score = self.get_profit(y_true, approx)
        return score, 1

    def get_final_error(self, error, weight):
        return error

In [85]:
clf = CatBoostClassifier(
    iterations=ITERATIONS,
    random_seed=RANDOM_SEED,
    learning_rate=LEARNING_RATE,
    snapshot_file=SNAPSHOT_FILE,
    train_dir=LOG_DIR,
    eval_metric=FinalMetric(),
    custom_loss=['AUC', 'Recall', 'Precision', 'Accuracy']
)

clf.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    verbose=True,
)

  _check_train_params(params)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
0:	learn: nan	test: nan	best: -inf (-1)	total: 2.74s	remaining: 2m 14s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
1:	learn: nan	test: nan	best: -inf (-1)	total: 4.8s	remaining: 1m 55s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
2:	learn: nan	test: nan	best: -inf (-1)	total: 6.83s	remaining: 1m 47s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
3:	learn: nan	test: nan	best: -inf (-1)	total: 8.88s	remaining: 1m 42s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
4:	learn: nan	test: nan	best: -inf (-1)	total: 11.2s	remaining: 1m 40s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
5:	learn: nan	test: nan	best: -inf (-1)	total: 13.7s	remaining: 1m 40s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
6:	learn: nan	test: nan	best: -inf (-1)	total: 15.7s	remaining: 1m 36s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
7:	learn: nan	test: nan	best: -inf (-1)	total: 17.7s	remaining: 1m 32s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
8:	learn: nan	test: nan	best: -inf (-1)	total: 19.5s	remaining: 1m 28s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
9:	learn: nan	test: nan	best: -inf (-1)	total: 21.5s	remaining: 1m 26s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
10:	learn: nan	test: nan	best: -inf (-1)	total: 23.5s	remaining: 1m 23s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
11:	learn: nan	test: nan	best: -inf (-1)	total: 25.5s	remaining: 1m 20s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
12:	learn: nan	test: nan	best: -inf (-1)	total: 27.6s	remaining: 1m 18s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
13:	learn: nan	test: nan	best: -inf (-1)	total: 29.6s	remaining: 1m 16s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
14:	learn: nan	test: nan	best: -inf (-1)	total: 31.7s	remaining: 1m 14s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
15:	learn: nan	test: nan	best: -inf (-1)	total: 33.8s	remaining: 1m 11s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
16:	learn: nan	test: nan	best: -inf (-1)	total: 36.3s	remaining: 1m 10s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
17:	learn: nan	test: nan	best: -inf (-1)	total: 38.4s	remaining: 1m 8s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
18:	learn: nan	test: nan	best: -inf (-1)	total: 40.6s	remaining: 1m 6s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
19:	learn: nan	test: nan	best: -inf (-1)	total: 42.6s	remaining: 1m 3s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
20:	learn: nan	test: nan	best: -inf (-1)	total: 44.6s	remaining: 1m 1s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
21:	learn: nan	test: nan	best: -inf (-1)	total: 46.5s	remaining: 59.2s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
22:	learn: nan	test: nan	best: -inf (-1)	total: 48.5s	remaining: 56.9s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
23:	learn: nan	test: nan	best: -inf (-1)	total: 50.7s	remaining: 55s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
24:	learn: nan	test: nan	best: -inf (-1)	total: 52.8s	remaining: 52.8s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
25:	learn: nan	test: nan	best: -inf (-1)	total: 54.7s	remaining: 50.5s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
26:	learn: nan	test: nan	best: -inf (-1)	total: 56.5s	remaining: 48.1s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
27:	learn: nan	test: nan	best: -inf (-1)	total: 58.5s	remaining: 45.9s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
28:	learn: nan	test: nan	best: -inf (-1)	total: 1m	remaining: 43.7s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
29:	learn: nan	test: nan	best: -inf (-1)	total: 1m 2s	remaining: 41.5s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
30:	learn: nan	test: nan	best: -inf (-1)	total: 1m 4s	remaining: 39.4s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
31:	learn: nan	test: nan	best: -inf (-1)	total: 1m 6s	remaining: 37.2s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
32:	learn: nan	test: nan	best: -inf (-1)	total: 1m 7s	remaining: 35s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
33:	learn: nan	test: nan	best: -inf (-1)	total: 1m 9s	remaining: 32.9s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
34:	learn: nan	test: nan	best: -inf (-1)	total: 1m 11s	remaining: 30.8s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
35:	learn: nan	test: nan	best: -inf (-1)	total: 1m 13s	remaining: 28.6s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
36:	learn: nan	test: nan	best: -inf (-1)	total: 1m 15s	remaining: 26.5s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
37:	learn: nan	test: nan	best: -inf (-1)	total: 1m 17s	remaining: 24.4s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
38:	learn: nan	test: nan	best: -inf (-1)	total: 1m 19s	remaining: 22.3s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
39:	learn: nan	test: nan	best: -inf (-1)	total: 1m 20s	remaining: 20.2s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
40:	learn: nan	test: nan	best: -inf (-1)	total: 1m 22s	remaining: 18.2s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
41:	learn: nan	test: nan	best: -inf (-1)	total: 1m 24s	remaining: 16.1s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
42:	learn: nan	test: nan	best: -inf (-1)	total: 1m 26s	remaining: 14.1s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
43:	learn: nan	test: nan	best: -inf (-1)	total: 1m 28s	remaining: 12.1s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
44:	learn: nan	test: nan	best: -inf (-1)	total: 1m 30s	remaining: 10s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
45:	learn: nan	test: nan	best: -inf (-1)	total: 1m 32s	remaining: 8.04s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
46:	learn: nan	test: nan	best: -inf (-1)	total: 1m 34s	remaining: 6.02s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
47:	learn: nan	test: nan	best: -inf (-1)	total: 1m 36s	remaining: 4.01s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
48:	learn: nan	test: nan	best: -inf (-1)	total: 1m 38s	remaining: 2.01s


  fpr = fp/(fp+tp)



4663067 0 136933 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>


  fpr = fp/(fp+tp)



1165953 0 34047 0
<class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'> <class 'numpy.int64'>
49:	learn: nan	test: nan	best: -inf (-1)	total: 1m 40s	remaining: 0us

bestTest = -inf
bestIteration = -1



  fpr = fp/(fp+tp)


<catboost.core.CatBoostClassifier at 0x7fbc0c559ac0>

In [50]:
best_score = clf.get_best_score()
logs = clf.get_evals_result()

In [51]:
with open(f'{LOG_DIR}/catboost_training.json') as f:
    catboost_training = json.load(f)

In [52]:
test_metric = [metric["name"] for metric in catboost_training["meta"]["test_metrics"]]
train_metric = [metric["name"] for metric in catboost_training["meta"]["learn_metrics"]]

In [53]:
for i, iteration in enumerate(catboost_training["iterations"]):
    log = dict()
    log["Epoch"] = i + 1
    for name, value in zip(train_metric, iteration["learn"]):
        log[f"Learn_{name}"] = value
    for name, value in zip(test_metric, iteration["test"]):
        log[f"Test_{name}"] = value
    log["Test_Result"] = 0.1*log["Test_Recall"]+0.9*log["Test_AUC"]
    mlflow.log_metrics(log)

In [54]:
mlflow.catboost.save_model(clf, MODELS_PATH)
mlflow.catboost.log_model(clf, MODELS_PATH)

<mlflow.models.model.ModelInfo at 0x7fbc1953e4c0>

## Predict

In [55]:
test_dataset = "data/test_dataset_test.csv"

test_df = pd.read_csv(test_dataset, low_memory=False)

In [56]:
test_df = preproc.preprocess(test_df, dtype="test")

In [57]:
test_df.head()

Unnamed: 0,id,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_type,oper_attr
0,7815282,4,7503,0,1,0.000538,5,1,2,0.0,0,0.004056,0.003062,0.329689,0,0,0,0,8,13
1,8443555,3,7503,4,1,0.000133,5,0,2,0.0,0,0.001056,0.001165,4.1e-05,0,1,1,0,8,2
2,6352559,3,7503,4,1,0.002456,5,1,2,0.040951,0,0.019333,0.003131,7.6e-05,0,0,0,0,1020,-1
3,4921420,15,7503,0,1,0.007614,5,1,2,0.100807,0,0.0,0.0,0.023507,0,1,0,0,1016,-1
4,1420440,19,7506,0,1,0.006051,5,1,2,0.018001,0,0.052556,0.007933,0.168136,0,1,0,0,1018,-1


In [58]:
cid = test_df["id"]
test_df = test_df.drop("id", axis=1)

In [59]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   type                   int64  
 1   priority               int64  
 2   class                  int64  
 3   is_return              int64  
 4   weight                 float64
 5   mailtype               int64  
 6   mailctg                int64  
 7   directctg              int64  
 8   transport_pay          float64
 9   postmark               int64  
 10  weight_mfi             float64
 11  price_mfi              float64
 12  total_qty_over_index   float64
 13  is_wrong_sndr_name     int64  
 14  is_wrong_rcpn_name     int64  
 15  is_wrong_phone_number  int64  
 16  is_wrong_address       int64  
 17  oper_type              int64  
 18  oper_attr              int64  
dtypes: float64(5), int64(14)
memory usage: 579.8 MB


In [60]:
test_df

Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_type,oper_attr
0,4,7503,0,1,0.000538,5,1,2,0.000000,0,0.004056,0.003062,0.329689,0,0,0,0,8,13
1,3,7503,4,1,0.000133,5,0,2,0.000000,0,0.001056,0.001165,0.000041,0,1,1,0,8,2
2,3,7503,4,1,0.002456,5,1,2,0.040951,0,0.019333,0.003131,0.000076,0,0,0,0,1020,-1
3,15,7503,0,1,0.007614,5,1,2,0.100807,0,0.000000,0.000000,0.023507,0,1,0,0,1016,-1
4,19,7506,0,1,0.006051,5,1,2,0.018001,0,0.052556,0.007933,0.168136,0,1,0,0,1018,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999995,3,7503,4,1,0.000335,5,1,2,0.000000,0,0.003389,0.001641,0.000129,0,0,0,0,1017,1001
3999996,4,7503,0,1,0.001810,5,1,2,0.038591,0,0.000000,0.000000,0.273502,0,1,0,0,1023,-1
3999997,15,7503,0,1,0.004639,5,1,2,0.060181,0,0.036778,0.001878,0.041621,0,1,0,0,1022,-1
3999998,18,7503,0,1,0.001677,5,1,2,0.000000,0,0.013722,0.003131,0.028068,0,0,0,0,39,-1


In [61]:
y = clf.predict(test_df)

In [62]:
res = pd.DataFrame({'id':cid.values, 'label': y}, index=None)

In [63]:
res.to_csv(f"{LOG_DIR}/output.csv", index=False)

## Finish

In [64]:
mlflow.log_artifact(LOG_DIR)

In [65]:
mlflow.end_run()

In [66]:
LOG_DIR

'log/catbost-11-05-2022-12-28-40'