In [1]:
import os
import sys
from dotenv import load_dotenv

# Add the parent directory to the module search path.
sys.path.append('../')
# Load environment variables from the parent directory's .env file
# (MLFLOW_TRACKING_URI is read from the environment further down).
load_dotenv("../.env")

True

In [2]:
%cd ..

/home/cva/Desktop/ITMO/deep-learninging-practice/TableBC


## Read data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [34]:
# Relative path: resolved against the working directory set by the `%cd ..` cell.
train_dataset = "data/train_preprocess.csv"
train_df = pd.read_csv(train_dataset)

In [5]:
train_df.head()

Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label,oper_type,oper_attr
0,18,7503,0,1,0.000551,5,1,2,0.0,0,0.002278,0.000939,0.016573,0,0,0,0,0,1043,-1
1,4,7503,0,1,0.000677,5,1,2,0.0,0,0.003778,0.002505,0.273502,0,0,0,0,0,1023,-1
2,19,7503,0,1,0.000316,5,1,2,0.0,0,0.003111,0.001365,0.105363,0,1,0,0,0,1018,-1
3,19,7503,0,1,0.002633,5,1,2,0.042553,0,0.001833,0.000626,0.039105,0,0,0,0,0,1019,-1
4,18,7503,0,1,0.005032,5,1,2,0.063239,0,0.039778,0.006262,0.009434,0,0,0,0,0,1020,-1


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999846 entries, 0 to 5999845
Data columns (total 20 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   type                   int64  
 1   priority               int64  
 2   class                  int64  
 3   is_return              int64  
 4   weight                 float64
 5   mailtype               int64  
 6   mailctg                int64  
 7   directctg              int64  
 8   transport_pay          float64
 9   postmark               int64  
 10  weight_mfi             float64
 11  price_mfi              float64
 12  total_qty_over_index   float64
 13  is_wrong_sndr_name     int64  
 14  is_wrong_rcpn_name     int64  
 15  is_wrong_phone_number  int64  
 16  is_wrong_address       int64  
 17  label                  int64  
 18  oper_type              int64  
 19  oper_attr              int64  
dtypes: float64(5), int64(15)
memory usage: 915.5 MB


In [7]:
# Separate the target column from the feature matrix.
y = train_df["label"]
X = train_df.drop(columns=["label"])

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

## Train model

In [9]:
import os
import json
import mlflow
import datetime
from catboost import CatBoostClassifier

In [10]:
# Point MLflow at the tracking server named in the environment and select the experiment.
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")  # None when the variable is unset
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("pochta-task")

<Experiment: artifact_location='s3://arts/3', creation_time=1667396965657, experiment_id='3', last_update_time=1667396965657, lifecycle_stage='active', name='pochta-task', tags={}>

In [11]:
# Columns handed to CatBoost as categorical features (passed via `cat_features=` in the fit call).
# All of them are int64 columns in the training frame.
cat_features = [
    "type",
    "priority",
    "class",
    "is_return",
    "mailtype",
    "mailctg",
    "directctg",
    "postmark",
    "is_wrong_sndr_name",
    "is_wrong_rcpn_name",
    "is_wrong_phone_number",
    "is_wrong_address",
    "oper_type",
    "oper_attr",
]

In [12]:
# Training hyperparameters.
ITERATIONS = 200
RANDOM_SEED = 42
LEARNING_RATE = 0.5

# Run identity: the model name plus a launch timestamp (month-day-year-hour-minute-second).
# NOTE(review): 'catbost' looks like a typo for 'catboost'; the value is kept unchanged
# because it names the MLflow run and the log directory — confirm before renaming.
MODEL_NAME = 'catbost'
TIMESTAMP = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")

# Per-run output locations. LOG_DIR is derived from MODEL_NAME so the directory name
# and the MLflow run name cannot drift apart.
LOG_DIR = f'log/{MODEL_NAME}-{TIMESTAMP}'
PLOT_FILE = f'{LOG_DIR}/plot'
SNAPSHOT_FILE = f'{LOG_DIR}/snapshot'
MODELS_PATH = f'{LOG_DIR}/models'

In [13]:
# Create the run's log directory and its models directory.
# exist_ok=True replaces the check-then-create pattern: it has no race between the
# existence check and the creation, and the cell can be re-run safely.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

In [14]:
# Open an MLflow run; it stays active until the later `mlflow.end_run()` cell.
mlflow.start_run()
# Name the run after the model and the launch timestamp.
mlflow.set_tag("mlflow.runName", f"{MODEL_NAME}-{TIMESTAMP}")

In [15]:
# Hyperparameters recorded with the MLflow run. Each key names the CatBoostClassifier
# argument that receives the value, so the logged parameters describe the model that
# is actually trained (the learning rate is logged as "learning_rate", and the seed
# only as "random_seed").
params = {
    "iterations": ITERATIONS,
    "learning_rate": LEARNING_RATE,
    "random_seed": RANDOM_SEED,
}
mlflow.log_params(params)

In [16]:
# Configure the classifier. The metrics in custom_loss are tracked in addition to the
# training objective; training output goes to train_dir.
clf = CatBoostClassifier(
    iterations=ITERATIONS,
    learning_rate=LEARNING_RATE,
    random_seed=RANDOM_SEED,
    custom_loss=['AUC', 'Recall', 'Precision', 'Accuracy'],
    train_dir=LOG_DIR,
    snapshot_file=SNAPSHOT_FILE,
)

# Fit on the training split, using the validation split as the evaluation set.
clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val), verbose=True)

0:	learn: 0.1575808	test: 0.1574450	best: 0.1574450 (0)	total: 4.48s	remaining: 14m 52s
1:	learn: 0.0521484	test: 0.0522138	best: 0.0522138 (1)	total: 8.55s	remaining: 14m 6s
2:	learn: 0.0458858	test: 0.0459979	best: 0.0459979 (2)	total: 12.6s	remaining: 13m 45s
3:	learn: 0.0440737	test: 0.0442518	best: 0.0442518 (3)	total: 16.5s	remaining: 13m 30s
4:	learn: 0.0431872	test: 0.0433491	best: 0.0433491 (4)	total: 19.5s	remaining: 12m 41s
5:	learn: 0.0427559	test: 0.0429230	best: 0.0429230 (5)	total: 23.3s	remaining: 12m 33s
6:	learn: 0.0425538	test: 0.0427185	best: 0.0427185 (6)	total: 26.5s	remaining: 12m 11s
7:	learn: 0.0423846	test: 0.0425632	best: 0.0425632 (7)	total: 30.2s	remaining: 12m 3s
8:	learn: 0.0422421	test: 0.0424224	best: 0.0424224 (8)	total: 34.3s	remaining: 12m 7s
9:	learn: 0.0421462	test: 0.0423223	best: 0.0423223 (9)	total: 37.6s	remaining: 11m 54s
10:	learn: 0.0417739	test: 0.0419463	best: 0.0419463 (10)	total: 41.1s	remaining: 11m 46s
11:	learn: 0.0415051	test: 0.0416

<catboost.core.CatBoostClassifier at 0x7f61ad5ae610>

In [17]:
# Keep CatBoost's best-score summary and its evaluation results for inspection.
# Neither name is referenced again in the cells visible in this notebook.
best_score = clf.get_best_score()
logs = clf.get_evals_result()

In [18]:
# Load the JSON training log found in the run's log directory.
training_log_path = f'{LOG_DIR}/catboost_training.json'
with open(training_log_path) as log_file:
    catboost_training = json.load(log_file)

In [19]:
def _metric_names(section):
    """Return the "name" of every metric listed under catboost_training["meta"][section]."""
    return [entry["name"] for entry in catboost_training["meta"][section]]


# These lists are zipped positionally with each iteration's values when logging.
test_metric = _metric_names("test_metrics")
train_metric = _metric_names("learn_metrics")

In [20]:
# Replay the per-iteration metrics from the CatBoost training log into the MLflow run.
for i, iteration in enumerate(catboost_training["iterations"]):
    epoch = i + 1
    log = {"Epoch": epoch}
    # Metric names and values are paired by position.
    log.update((f"Learn_{name}", value) for name, value in zip(train_metric, iteration["learn"]))
    log.update((f"Test_{name}", value) for name, value in zip(test_metric, iteration["test"]))
    # Weighted blend of the test metrics: 0.1 * Recall + 0.9 * AUC.
    log["Test_Result"] = 0.1 * log["Test_Recall"] + 0.9 * log["Test_AUC"]
    # Pass the step explicitly; when it is omitted MLflow records every call at
    # step 0, so the metric history would have no usable x-axis.
    mlflow.log_metrics(log, step=epoch)

# Attach the run's log directory to the MLflow run.
mlflow.log_artifact(LOG_DIR)

In [21]:
# Save the fitted model under the run's local models directory.
mlflow.catboost.save_model(clf, MODELS_PATH)
# Log the same model to the active MLflow run.
# NOTE(review): MODELS_PATH (a local directory path) is reused as log_model's second
# positional argument, presumably the artifact path inside the run — confirm that is intended.
mlflow.catboost.log_model(clf, MODELS_PATH)

<mlflow.models.model.ModelInfo at 0x7f61ea3918e0>

In [22]:
mlflow.end_run()

## Predict

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


class DataPreproc:
    """Preprocess a raw dataset frame for the classifier.

    The pipeline splits the combined operation column into two integer columns,
    integer-encodes ``type`` and ``is_return``, casts the categorical columns to
    ``int``, drops the unused columns and min-max scales the numeric columns.
    """

    def __init__(self):
        # Raw columns removed from the result.
        self.drop_columns = [
            "is_in_yandex",
            "dist_qty_oper_login_1",
            "total_qty_oper_login_1",
            "total_qty_oper_login_0",
            "total_qty_over_index_and_type",
            "is_privatecategory",
            "index_oper",
            "name_mfi",
            "mailrank",
        ]
        # Continuous columns that are min-max scaled.
        self.numeric_col = [
            "weight",
            "transport_pay",
            "weight_mfi",
            "price_mfi",
            "total_qty_over_index",
        ]

    def preprocess(self, df, dtype="train"):
        """Return a preprocessed copy of ``df``; the input frame is left untouched.

        Args:
            df: Raw frame. It must contain ``"oper_type + oper_attr"`` (strings of
                the form ``"<int>_<int>"``), ``"type"``, ``"is_return"``, the
                integer-cast columns listed in the body, and every column named in
                ``self.drop_columns`` and ``self.numeric_col``.
            dtype: Currently unused; accepted so existing calls keep working.

        Returns:
            A new DataFrame with ``oper_type`` and ``oper_attr`` appended, the
            columns in ``self.drop_columns`` removed and the columns in
            ``self.numeric_col`` scaled.

        Raises:
            KeyError: If a required column is missing.

        Note:
            The label encoder and the scalers are fitted on the frame passed in, so
            the resulting codes and scales are specific to that frame.
            NOTE(review): if the training data was encoded/scaled with different
            fitted state, the values are not comparable — confirm.
        """
        # Work on a copy so the caller's frame is not mutated and the call can be repeated.
        df = df.copy()

        # "1043_-1" -> oper_type=1043, oper_attr=-1
        combined = df["oper_type + oper_attr"].str.split("_")
        df["oper_type"] = combined.str[0].astype(int)
        df["oper_attr"] = combined.str[1].astype(int)
        df = df.drop(columns="oper_type + oper_attr")

        df["type"] = LabelEncoder().fit_transform(df["type"])
        # Assign the result instead of an in-place replace on a column selection,
        # which pandas does not guarantee to write back to the frame.
        df["is_return"] = df["is_return"].replace({"N": 1, "Y": 2})

        # "is_privatecategory" is not transformed: it is removed with drop_columns below.
        int_columns = [
            "priority",
            "class",
            "mailtype",
            "mailctg",
            "directctg",
            "postmark",
            "total_qty_over_index",
        ]
        df = df.astype({col: int for col in int_columns})

        df = df.drop(columns=self.drop_columns)

        for col in self.numeric_col:
            values = df[col].to_numpy().reshape(-1, 1)
            # ravel() turns the (n, 1) scaler output back into a 1-D column.
            df[col] = MinMaxScaler().fit_transform(values).ravel()
        return df


In [29]:
# Raw test set, read relative to the current working directory.
# low_memory=False has pandas parse the file without chunked dtype inference.
test_dataset = "data/test_dataset_test.csv"
test_df = pd.read_csv(test_dataset, low_memory=False)

In [30]:
preproc_test_df = preprocess(test_df)

In [35]:
train_df.head()

Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label,oper_type,oper_attr
0,18,7503,0,1,0.000551,5,1,2,0.0,0,0.002278,0.000939,0.016573,0,0,0,0,0,1043,-1
1,4,7503,0,1,0.000677,5,1,2,0.0,0,0.003778,0.002505,0.273502,0,0,0,0,0,1023,-1
2,19,7503,0,1,0.000316,5,1,2,0.0,0,0.003111,0.001365,0.105363,0,1,0,0,0,1018,-1
3,19,7503,0,1,0.002633,5,1,2,0.042553,0,0.001833,0.000626,0.039105,0,0,0,0,0,1019,-1
4,18,7503,0,1,0.005032,5,1,2,0.063239,0,0.039778,0.006262,0.009434,0,0,0,0,0,1020,-1


In [39]:
preproc_test_df.head()

Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_type,oper_attr
0,4,7503,0,1,0.000538,5,1,2,0.0,0,0.004056,0.003062,0.329689,0,0,0,0,8,13
1,3,7503,4,1,0.000133,5,0,2,0.0,0,0.001056,0.001165,4.1e-05,0,1,1,0,8,2
2,3,7503,4,1,0.002456,5,1,2,0.040951,0,0.019333,0.003131,7.6e-05,0,0,0,0,1020,-1
3,15,7503,0,1,0.007614,5,1,2,0.100807,0,0.0,0.0,0.023507,0,1,0,0,1016,-1
4,19,7506,0,1,0.006051,5,1,2,0.018001,0,0.052556,0.007933,0.168136,0,1,0,0,1018,-1


In [38]:
# Keep the row identifiers for the submission file, then remove them from the features.
cid = preproc_test_df["id"]
preproc_test_df = preproc_test_df.drop(columns=["id"])

In [40]:
y = clf.predict(preproc_test_df)

In [49]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [45]:
cid.values

array([7815282, 8443555, 6352559, ..., 4701757,  611153, 7776936])

In [54]:
res = pd.DataFrame({'id':cid.values, 'label': y}, index=None)

In [55]:
res.to_csv("output.csv", index=False)