In [1]:
import os
import sys
from dotenv import load_dotenv

sys.path.append('../')
load_dotenv("../.env")

True

In [2]:
%cd ..

/home/cva/Desktop/ITMO/deep-learninging-practice/TableBC


## Preprocess data

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


class DataPreproc:
    def __init__(self):
        self.drop_columns = [
            "is_in_yandex",
            "dist_qty_oper_login_1",
            "total_qty_oper_login_1",
            "total_qty_oper_login_0",
            "total_qty_over_index_and_type",
            "is_privatecategory",
            "index_oper",
            "name_mfi",
            "mailrank",
        ]
        self.numeric_col = [
            "weight",
            "transport_pay",
            "weight_mfi",
            "price_mfi",
            "total_qty_over_index",
        ]
        # Scalers init
        self.scalers = dict()
        for col in self.numeric_col:
            self.scalers[col] = MinMaxScaler()

        # Encoder init
        self.encoder_type = LabelEncoder()

    def preprocess(self, df, dtype="train"):
        if dtype == "train":
            for col in self.numeric_col:
                self.scalers[col].fit(np.array(df[col]).reshape(-1, 1))
            self.encoder_type.fit(df["type"])


        df["oper_type"] = df["oper_type + oper_attr"].apply(lambda x: x.split("_")[0])
        df["oper_attr"] = df["oper_type + oper_attr"].apply(lambda x: x.split("_")[1])
        df["oper_type"] = df["oper_type"].astype(int)
        df["oper_attr"] = df["oper_attr"].astype(int)
        df.drop("oper_type + oper_attr", axis=1, inplace=True)

        
        df["priority"] = df["priority"].astype(int)
        df["is_privatecategory"].replace({"N": 1, "Y": 2, "0": 0}, inplace=True)
        df["class"] = df["class"].astype(int)
        df["is_return"].replace({"N": 1, "Y": 2}, inplace=True)
        df["mailtype"] = df["mailtype"].astype(int)
        df["mailctg"] = df["mailctg"].astype(int)
        df["directctg"] = df["directctg"].astype(int)
        df["postmark"] = df["postmark"].astype(int)
        df["total_qty_over_index"] = df["total_qty_over_index"].astype("int")
        df.drop(self.drop_columns, axis=1, inplace=True)
        
        for col in self.numeric_col:
            df[col] = self.scalers[col].transform(np.array(df[col]).reshape(-1, 1))
        df["type"] = self.encoder_type.transform(df["type"])
        
        return df


## Read data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
train_dataset_path = "data/train_dataset_train.csv"

preproc = DataPreproc()
train_df = pd.read_csv(train_dataset_path)

  train_df = pd.read_csv(train_dataset_path)


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000000 entries, 0 to 5999999
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   id                             int64  
 1   oper_type + oper_attr          object 
 2   index_oper                     object 
 3   type                           object 
 4   priority                       float64
 5   is_privatecategory             object 
 6   class                          float64
 7   is_in_yandex                   object 
 8   is_return                      object 
 9   weight                         float64
 10  mailtype                       float64
 11  mailctg                        float64
 12  mailrank                       float64
 13  directctg                      float64
 14  transport_pay                  float64
 15  postmark                       float64
 16  name_mfi                       object 
 17  weight_mfi                     float64
 18  pr

In [7]:
train_df.head()

Unnamed: 0,id,oper_type + oper_attr,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,...,dist_qty_oper_login_1,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label
0,6818780,1043_-1,628629.0,Участок,7503.0,N,0.0,Y,N,87.0,...,42.0,720176.0,58950.0,779126.0,8290896.0,0,0,0,0,0
1,9907176,1023_-1,102976.0,ММПО,7503.0,N,0.0,N,N,107.0,...,914.0,48856658.0,83318932.0,132175590.0,136819803.0,0,0,0,0,0
2,3304275,1018_-1,620962.0,Цех,7503.0,N,0.0,Y,N,50.0,...,62.0,3246292.0,3233068.0,6479360.0,52708071.0,0,1,0,0,0
3,9020937,1019_-1,344964.0,Цех,7503.0,N,0.0,Y,N,416.0,...,55.0,2060928.0,653280.0,2714208.0,19562334.0,0,0,0,0,0
4,3082311,1020_-1,629819.0,Участок,7503.0,N,0.0,Y,N,795.0,...,16.0,316919.0,27911.0,344830.0,4719186.0,0,0,0,0,0


In [8]:
train_df = preproc.preprocess(train_df, dtype="train")

In [9]:
train_df

Unnamed: 0,id,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,...,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label,oper_type,oper_attr
0,6818780,18,7503,0,1,0.000551,5,1,2,0.000000,...,0.002278,0.000939,0.016573,0,0,0,0,0,1043,-1
1,9907176,4,7503,0,1,0.000677,5,1,2,0.000000,...,0.003778,0.002505,0.273502,0,0,0,0,0,1023,-1
2,3304275,19,7503,0,1,0.000316,5,1,2,0.000000,...,0.003111,0.001365,0.105363,0,1,0,0,0,1018,-1
3,9020937,19,7503,0,1,0.002633,5,1,2,0.042553,...,0.001833,0.000626,0.039105,0,0,0,0,0,1019,-1
4,3082311,18,7503,0,1,0.005032,5,1,2,0.063239,...,0.039778,0.006262,0.009434,0,0,0,0,0,1020,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999995,9958614,4,7503,0,1,0.000158,5,0,2,0.000000,...,0.001333,0.000626,0.376626,0,0,1,0,0,1022,-1
5999996,2234489,19,7503,0,1,0.000525,5,1,2,0.000000,...,0.005556,0.011471,0.031148,0,0,0,0,0,1022,-1
5999997,4304572,19,7506,0,1,0.010759,5,1,2,0.113293,...,0.052889,0.005009,0.151109,0,0,0,0,0,1041,-1
5999998,6550634,19,7506,0,1,0.001703,5,1,2,0.000000,...,0.000000,0.000000,0.079937,0,1,0,0,0,1018,-1


In [10]:
X = train_df.drop(["label", "id"], axis=1)
y = train_df["label"]

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

## Train data

In [12]:
import os
import uuid
import json
import mlflow
import datetime
from catboost import CatBoostClassifier

In [13]:
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("pochta-task")

<Experiment: artifact_location='s3://arts/3', creation_time=1667396965657, experiment_id='3', last_update_time=1667396965657, lifecycle_stage='active', name='pochta-task', tags={}>

In [14]:
cat_features = [
    "type",
    "priority",
    "class",
    "is_return",
    "mailtype",
    "mailctg",
    "directctg",
    "postmark",
    "is_wrong_sndr_name",
    "is_wrong_rcpn_name",
    "is_wrong_phone_number",
    "is_wrong_address",
    "oper_type",
    "oper_attr",
]

In [15]:
ITERATIONS = 300
RANDOM_SEED = uuid.uuid4().int & (1<<64)-1
LEARNING_RATE = 0.5

MODEL_NAME = 'catbost'
TIMESTAMP = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
LOG_DIR = f'log/catbost-{TIMESTAMP}'

PLOT_FILE = f'{LOG_DIR}/plot'
SNAPSHOT_FILE = f'{LOG_DIR}/snapshot'
MODELS_PATH = f'{LOG_DIR}/models'

In [16]:
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)
if not os.path.exists(MODELS_PATH):
    os.makedirs(MODELS_PATH)

In [17]:
mlflow.start_run()
mlflow.set_tag("mlflow.runName", f"{MODEL_NAME}-{TIMESTAMP}")

In [18]:
params ={
    "iterations": ITERATIONS,
    "batch_size":RANDOM_SEED,
    "num_epochs": LEARNING_RATE,
    "random_seed": RANDOM_SEED,
}
mlflow.log_params(params)

In [19]:
from sklearn.metrics import confusion_matrix
from scipy.special import expit


class FinalMetric(object):
    @staticmethod
    def get_profit(y_true, y_pred):
        y_pred = (expit(y_pred) > 0.5).astype(int)
        y_true = y_true.astype(int)

        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        tn, fp, fn, tp = tn.tolist(), fp.tolist(), fn.tolist(), tp.tolist()

        recall = tp / (tp + fn)
        if tp + fn != 0:
            tpr = tp / (tp + fn)
        else:
            tpr = 0
        if fp + tp != 0:
            fpr = fp / (fp + tp)
        else:
            fpr = 0
        auc = (1 + tpr - fpr) / 2

        loss = 0.1 * recall + 0.9 * auc
        return loss

    def is_max_optimal(self):
        return True  # greater is better

    def evaluate(self, approxes, target, weight):
        y_true = np.array(target).astype(int)
        approx = approxes[0]
        score = self.get_profit(y_true, approx)
        return score, 1

    def get_final_error(self, error, weight):
        return error


In [20]:
clf = CatBoostClassifier(
    iterations=ITERATIONS,
    random_seed=RANDOM_SEED,
    learning_rate=LEARNING_RATE,
    snapshot_file=SNAPSHOT_FILE,
    train_dir=LOG_DIR,
    eval_metric=FinalMetric(),
    custom_loss=['AUC', 'Recall', 'Precision', 'Accuracy']
)

clf.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    verbose=True,
)

  _check_train_params(params)


0:	learn: 0.4500000	test: 0.4500000	best: 0.4500000 (0)	total: 5.3s	remaining: 26m 23s
1:	learn: 0.4479491	test: 0.4479310	best: 0.4500000 (0)	total: 10.6s	remaining: 26m 21s
2:	learn: 0.5116411	test: 0.5115927	best: 0.5115927 (2)	total: 15.3s	remaining: 25m 11s
3:	learn: 0.5155573	test: 0.5143120	best: 0.5143120 (3)	total: 19.4s	remaining: 23m 55s
4:	learn: 0.5757977	test: 0.5764656	best: 0.5764656 (4)	total: 23.6s	remaining: 23m 11s
5:	learn: 0.5798557	test: 0.5800733	best: 0.5800733 (5)	total: 27.8s	remaining: 22m 41s
6:	learn: 0.5895453	test: 0.5897595	best: 0.5897595 (6)	total: 31.8s	remaining: 22m 10s
7:	learn: 0.5855825	test: 0.5849912	best: 0.5897595 (6)	total: 36.6s	remaining: 22m 15s
8:	learn: 0.5900633	test: 0.5892715	best: 0.5897595 (6)	total: 41.1s	remaining: 22m 8s
9:	learn: 0.5986929	test: 0.5972653	best: 0.5972653 (9)	total: 45.4s	remaining: 21m 55s
10:	learn: 0.6072539	test: 0.6064089	best: 0.6064089 (10)	total: 50s	remaining: 21m 54s
11:	learn: 0.6089328	test: 0.60798

<catboost.core.CatBoostClassifier at 0x7f40fabeaa00>

In [21]:
best_score = clf.get_best_score()
logs = clf.get_evals_result()

In [22]:
with open(f'{LOG_DIR}/catboost_training.json') as f:
    catboost_training = json.load(f)

In [23]:
test_metric = [metric["name"] for metric in catboost_training["meta"]["test_metrics"]]
train_metric = [metric["name"] for metric in catboost_training["meta"]["learn_metrics"]]

In [24]:
for i, iteration in enumerate(catboost_training["iterations"]):
    log = dict()
    log["Epoch"] = i + 1
    for name, value in zip(train_metric, iteration["learn"]):
        log[f"Learn_{name}"] = value
    for name, value in zip(test_metric, iteration["test"]):
        log[f"Test_{name}"] = value
    log["Test_Result"] = 0.1*log["Test_Recall"]+0.9*log["Test_AUC"]
    mlflow.log_metrics(log)

In [25]:
mlflow.catboost.save_model(clf, MODELS_PATH)
mlflow.catboost.log_model(clf, MODELS_PATH)

<mlflow.models.model.ModelInfo at 0x7f4132f51f40>

## Predict

In [26]:
test_dataset = "data/test_dataset_test.csv"

test_df = pd.read_csv(test_dataset, low_memory=False)

In [27]:
test_df = preproc.preprocess(test_df, dtype="test")

In [28]:
test_df.head()

Unnamed: 0,id,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_type,oper_attr
0,7815282,4,7503,0,1,0.000538,5,1,2,0.0,0,0.004056,0.003062,0.329689,0,0,0,0,8,13
1,8443555,3,7503,4,1,0.000133,5,0,2,0.0,0,0.001056,0.001165,4.1e-05,0,1,1,0,8,2
2,6352559,3,7503,4,1,0.002456,5,1,2,0.040951,0,0.019333,0.003131,7.6e-05,0,0,0,0,1020,-1
3,4921420,15,7503,0,1,0.007614,5,1,2,0.100807,0,0.0,0.0,0.023507,0,1,0,0,1016,-1
4,1420440,19,7506,0,1,0.006051,5,1,2,0.018001,0,0.052556,0.007933,0.168136,0,1,0,0,1018,-1


In [29]:
cid = test_df["id"]
test_df = test_df.drop("id", axis=1)

In [30]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   type                   int64  
 1   priority               int64  
 2   class                  int64  
 3   is_return              int64  
 4   weight                 float64
 5   mailtype               int64  
 6   mailctg                int64  
 7   directctg              int64  
 8   transport_pay          float64
 9   postmark               int64  
 10  weight_mfi             float64
 11  price_mfi              float64
 12  total_qty_over_index   float64
 13  is_wrong_sndr_name     int64  
 14  is_wrong_rcpn_name     int64  
 15  is_wrong_phone_number  int64  
 16  is_wrong_address       int64  
 17  oper_type              int64  
 18  oper_attr              int64  
dtypes: float64(5), int64(14)
memory usage: 579.8 MB


In [31]:
test_df

Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_type,oper_attr
0,4,7503,0,1,0.000538,5,1,2,0.000000,0,0.004056,0.003062,0.329689,0,0,0,0,8,13
1,3,7503,4,1,0.000133,5,0,2,0.000000,0,0.001056,0.001165,0.000041,0,1,1,0,8,2
2,3,7503,4,1,0.002456,5,1,2,0.040951,0,0.019333,0.003131,0.000076,0,0,0,0,1020,-1
3,15,7503,0,1,0.007614,5,1,2,0.100807,0,0.000000,0.000000,0.023507,0,1,0,0,1016,-1
4,19,7506,0,1,0.006051,5,1,2,0.018001,0,0.052556,0.007933,0.168136,0,1,0,0,1018,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999995,3,7503,4,1,0.000335,5,1,2,0.000000,0,0.003389,0.001641,0.000129,0,0,0,0,1017,1001
3999996,4,7503,0,1,0.001810,5,1,2,0.038591,0,0.000000,0.000000,0.273502,0,1,0,0,1023,-1
3999997,15,7503,0,1,0.004639,5,1,2,0.060181,0,0.036778,0.001878,0.041621,0,1,0,0,1022,-1
3999998,18,7503,0,1,0.001677,5,1,2,0.000000,0,0.013722,0.003131,0.028068,0,0,0,0,39,-1


In [32]:
y = clf.predict(test_df)

In [33]:
len(y), y.sum()

(4000000, 91630)

In [34]:
res = pd.DataFrame({'id':cid.values, 'label': y}, index=None)

In [35]:
res.to_csv(f"{LOG_DIR}/output.csv", index=False)

## Finish

In [36]:
mlflow.log_artifact(LOG_DIR)

In [37]:
mlflow.end_run()

In [38]:
LOG_DIR

'log/catbost-11-06-2022-00-12-40'