In [1]:
import os
import sys
from dotenv import load_dotenv

# Add the parent directory to the import path; presumably this is what lets the
# later `from src.utils.data_preparation import DataPrepare` resolve (confirm).
sys.path.append('../')
# Load environment variables from ../.env. MLFLOW_TRACKING_URI is read with
# os.getenv further down, so it is presumably defined in that file.
load_dotenv("../.env")

True

# Read data

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
# Location of the preprocessed training table, relative to the notebook's working directory.
train_dataset_path = "../data/train_dataset_train_preprocess.csv"

# Read the whole CSV into memory, print (rows, columns), and preview the first rows.
train_df = pd.read_csv(train_dataset_path)
print(train_df.shape)
train_df.head()

(5999846, 20)


Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,label,oper_type,oper_attr
0,18,7503,0,1,0.000551,5,1,2,0.0,0,0.002278,0.000939,0.016573,0,0,0,0,0,1043,-1
1,4,7503,0,1,0.000677,5,1,2,0.0,0,0.003778,0.002505,0.273502,0,0,0,0,0,1023,-1
2,19,7503,0,1,0.000316,5,1,2,0.0,0,0.003111,0.001365,0.105363,0,1,0,0,0,1018,-1
3,19,7503,0,1,0.002633,5,1,2,0.042553,0,0.001833,0.000626,0.039105,0,0,0,0,0,1019,-1
4,18,7503,0,1,0.005032,5,1,2,0.063239,0,0.039778,0.006262,0.009434,0,0,0,0,0,1020,-1


In [11]:
# Column dtypes and memory footprint of the loaded training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999846 entries, 0 to 5999845
Data columns (total 20 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   type                   int64  
 1   priority               int64  
 2   class                  int64  
 3   is_return              int64  
 4   weight                 float64
 5   mailtype               int64  
 6   mailctg                int64  
 7   directctg              int64  
 8   transport_pay          float64
 9   postmark               int64  
 10  weight_mfi             float64
 11  price_mfi              float64
 12  total_qty_over_index   float64
 13  is_wrong_sndr_name     int64  
 14  is_wrong_rcpn_name     int64  
 15  is_wrong_phone_number  int64  
 16  is_wrong_address       int64  
 17  label                  int64  
 18  oper_type              int64  
 19  oper_attr              int64  
dtypes: float64(5), int64(15)
memory usage: 915.5 MB


In [12]:
# Separate the target column from the features, then hold out 20% of the rows
# for validation. The fixed random_state keeps the split repeatable.
y = train_df["label"]
X = train_df.drop(columns=["label"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Sanity check: the training features and labels have the same number of rows.
X_train.shape[0], y_train.shape[0]

(4799876, 4799876)

In [14]:
# Sanity check: the validation features and labels have the same number of rows.
X_val.shape[0], y_val.shape[0]

(1199970, 1199970)

# Train

In [31]:
import os
import json
import mlflow
import datetime
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score

In [36]:
# Point MLflow at the tracking server named by the MLFLOW_TRACKING_URI environment
# variable and select the "pochta-task" experiment.
# NOTE(review): os.getenv returns None when the variable is unset; confirm how
# mlflow.set_tracking_uri behaves in that case.
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("pochta-task")

<Experiment: artifact_location='s3://arts/3', creation_time=1667396965657, experiment_id='3', last_update_time=1667396965657, lifecycle_stage='active', name='pochta-task', tags={}>

In [38]:
# Naming for this training run: every local output lives under
# log/<MODEL_NAME>-<TIMESTAMP>/.
MODEL_NAME = 'xgboost'
# Wall-clock time when this cell runs, formatted month-day-year-hour-minute-second.
TIMESTAMP = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
# Built from MODEL_NAME (not a repeated literal) so the directory name always
# matches the f"{MODEL_NAME}-{TIMESTAMP}" run name used for MLflow.
LOG_DIR = f'log/{MODEL_NAME}-{TIMESTAMP}'

# Paths inside the run directory.
PLOT_FILE = f'{LOG_DIR}/plot'
SNAPSHOT_FILE = f'{LOG_DIR}/snapshot'
MODELS_PATH = f'{LOG_DIR}/models'

In [39]:
# Create the run's local output directories. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

In [40]:
# Open an MLflow run and name it "<MODEL_NAME>-<TIMESTAMP>" via the mlflow.runName tag.
# The run stays active until mlflow.end_run() is called in a later cell.
mlflow.start_run()
mlflow.set_tag("mlflow.runName", f"{MODEL_NAME}-{TIMESTAMP}")

In [41]:
# Hyperparameters passed to XGBClassifier in the next cell.
params = {
    "n_estimators": 100,
    "max_depth": 6,
    "max_leaves": 0,
    "grow_policy": "depthwise",
    "learning_rate": 0.5,
    "booster": "gbtree",
    "random_state": 17,
    # This is the sklearn.metrics.log_loss function object, not a metric-name string.
    # NOTE(review): the fit call passes no eval_set, so confirm whether this metric
    # is ever evaluated during training.
    "eval_metric": log_loss
}

In [42]:
# Build the classifier from the hyperparameter dict. Every key in `params` is an
# XGBClassifier keyword argument, so the dict is unpacked directly.
model = XGBClassifier(**params)

# Fit on the training split only; the validation split is scored afterwards.
model.fit(X_train, y_train)

In [43]:
# Hard class predictions for both splits.
predict_train = model.predict(X_train)
predict_val = model.predict(X_val)

# Accuracy, precision and recall on the training split, in that order.
accuracy_score_train, precision_score_train, recall_score_train = (
    metric(y_train, predict_train)
    for metric in (accuracy_score, precision_score, recall_score)
)

# The same three metrics on the held-out validation split.
accuracy_score_val, precision_score_val, recall_score_val = (
    metric(y_val, predict_val)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [44]:
# Training-split metrics: (accuracy, precision, recall).
accuracy_score_train, precision_score_train, recall_score_train

(0.9823030844963495, 0.7324580650361362, 0.5976135500087684)

In [45]:
# Validation-split metrics: (accuracy, precision, recall).
accuracy_score_val, precision_score_val, recall_score_val

(0.9816328741551872, 0.7173381294964029, 0.5843980775993436)

In [46]:
# Metrics sent to the active MLflow run. The "Learn_*" keys hold training-split
# scores; the "Test_*" keys hold the scores computed on the validation split.
log = {
    "Learn_Accuracy": accuracy_score_train,
    "Learn_Precision": precision_score_train,
    "Learn_Recall": recall_score_train,
    "Test_Accuracy": accuracy_score_val,
    "Test_Precision": precision_score_val,
    "Test_Recall": recall_score_val,
}

mlflow.log_metrics(log)

In [48]:
# Save the fitted model to the local MODELS_PATH directory, then log it to the
# active MLflow run.
# NOTE(review): MODELS_PATH is a local filesystem path and is also passed as
# log_model's second argument; confirm that using it as the artifact path is intended.
mlflow.xgboost.save_model(model, MODELS_PATH)
mlflow.xgboost.log_model(model, MODELS_PATH)



<mlflow.models.model.ModelInfo at 0x7efc55be6fd0>

In [47]:
# Show the local model directory of this run; later sections hard-code this value.
MODELS_PATH

'log/xgboost-11-05-2022-22-42-20/models'

In [56]:
# Upload the local run directory to the active MLflow run.
# NOTE(review): MODELS_PATH lives inside LOG_DIR, so the saved model is presumably
# uploaded a second time here; confirm this duplication is wanted.
mlflow.log_artifact(LOG_DIR)

In [57]:
# Close the MLflow run opened by mlflow.start_run() above.
mlflow.end_run()

In [58]:
# Show the local run directory.
LOG_DIR

'log/xgboost-11-05-2022-22-42-20'

### Get optimal threshold

In [28]:
import mlflow
from sklearn.metrics import precision_recall_curve, roc_auc_score
import numpy as np

# Local directory of the model saved by the training run above. It is hard-coded,
# presumably so this section can be run without retraining; update it after a new run.
MODELS_PATH = 'log/xgboost-11-05-2022-22-42-20/models'

In [16]:
# Configure MLflow (tracking URI from the environment, "pochta-task" experiment).
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("pochta-task")

# Load the model from the local MODELS_PATH directory.
model = mlflow.xgboost.load_model(MODELS_PATH)

In [18]:
# Class probabilities for the validation split; column 1 is used below as the
# positive-class score.
proba = model.predict_proba(X_val)
proba

array([[2.7194691e-01, 7.2805309e-01],
       [9.9999964e-01, 3.7524737e-07],
       [9.9999392e-01, 6.0785437e-06],
       ...,
       [9.9981201e-01, 1.8800095e-04],
       [9.9999976e-01, 2.1281740e-07],
       [9.9996263e-01, 3.7391783e-05]], dtype=float32)

In [31]:
# Precision/recall at every candidate threshold, plus ROC AUC, both computed from
# the validation labels and the positive-class probabilities.
precision, recall, thresholds = precision_recall_curve(y_val, proba[:, 1])
auc = roc_auc_score(y_val, proba[:, 1])
precision, recall, thresholds

(array([0.02843738, 0.0284374 , 0.02843742, ..., 0.        , 0.        ,
        1.        ]),
 array([1., 1., 1., ..., 0., 0., 0.]),
 array([6.5822908e-10, 6.9017614e-10, 7.5454387e-10, ..., 9.9999607e-01,
        9.9999809e-01, 9.9999857e-01], dtype=float32))

In [35]:
# Indices of the precision-recall curve points whose recall exceeds 0.99.
# NOTE(review): the index used in the following cells (716794) lies outside this
# result; the query that produced it is not in the notebook.
np.where(recall > 0.99)

(array([     0,      1,      2, ..., 705347, 705348, 705349]),)

In [36]:
# Recall at the chosen operating point.
# NOTE(review): 716794 is a hand-picked index that is only valid for this exact
# model and validation split; it must be re-derived after any retraining.
recall[716794]

0.9001875512835541

In [38]:
# Probability threshold at the same hand-picked index. The displayed value is the
# one hard-coded when the test-set probabilities are thresholded further down.
thresholds[716794]

0.2063084

In [39]:
# Weighted score: 0.1 * recall at the chosen point + 0.9 * ROC AUC.
# NOTE(review): presumably the task's evaluation metric; confirm the weights.
0.1 * recall[716794] + 0.9 * auc

0.9818813841637188

In [40]:
# ROC AUC on the validation split.
auc

0.9909584767059593

# Test on private data

In [2]:
import mlflow

from src.utils.data_preparation import DataPrepare

# Local directory of the previously trained model. It is hard-coded and must match
# the training run whose threshold is applied below.
MODELS_PATH = 'log/xgboost-11-05-2022-22-42-20/models'

In [3]:
# Configure MLflow (tracking URI from the environment, "pochta-task" experiment)
# and load the model from the local MODELS_PATH directory.
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("pochta-task")
model = mlflow.xgboost.load_model(MODELS_PATH)

In [4]:
# Raw test CSV; it is read again later to recover the `id` column.
test_dataset = "../data/test_dataset_test.csv"

# Project helper that turns the raw CSV into model features.
# NOTE(review): presumably applies the same preprocessing as the training table — confirm.
prepare = DataPrepare(test_dataset)
df_test = prepare.preprocess()

In [5]:
# Preview the preprocessed test features.
df_test.head()

Unnamed: 0,type,priority,class,is_return,weight,mailtype,mailctg,directctg,transport_pay,postmark,weight_mfi,price_mfi,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_type,oper_attr
0,4,7503,0,1,0.000538,5,1,2,0.0,0,0.004056,0.003062,0.329689,0,0,0,0,8,13
1,3,7503,4,1,0.000133,5,0,2,0.0,0,0.001056,0.001165,4.1e-05,0,1,1,0,8,2
2,3,7503,4,1,0.002456,5,1,2,0.040951,0,0.019333,0.003131,7.6e-05,0,0,0,0,1020,-1
3,15,7503,0,1,0.007614,5,1,2,0.100807,0,0.0,0.0,0.023507,0,1,0,0,1016,-1
4,19,7506,0,1,0.006051,5,1,2,0.018001,0,0.052556,0.007933,0.168136,0,1,0,0,1018,-1


In [7]:
# (rows, columns) of the preprocessed test features.
df_test.shape

(4000000, 19)

In [11]:
# v1: labels straight from model.predict.
predict_v1 = model.predict(df_test)
# v2: class probabilities for now; the next cell rebinds this name to thresholded labels.
predict_v2 = model.predict_proba(df_test)
predict_v1.shape, predict_v2.shape

((4000000,), (4000000, 2))

In [12]:
# Positive-class probability (second column of the predict_proba output).
proba_positive = predict_v2[:, 1]

# Decision threshold read off the validation precision-recall curve
# (thresholds[716794] in the "Get optimal threshold" section).
decision_threshold = 0.2063084

# Vectorised thresholding: 1 where the probability is strictly greater than the
# threshold, otherwise 0. This replaces a per-element Python loop that also
# overwrote the `proba` variable. The result is still a plain list of ints.
predict_v2 = (proba_positive > decision_threshold).astype(int).tolist()

In [13]:
# Sum of the predicted labels under each rule, i.e. the number of predicted
# positives assuming the labels are 0/1.
sum(predict_v1), sum(predict_v2)

(93355, 200648)

In [20]:
import pandas as pd

# Only the `id` column of the raw test file is needed; usecols avoids parsing
# every other column of the CSV.
# NOTE(review): this assumes DataPrepare.preprocess() keeps the row order and row
# count of the raw file, so that ids line up with the predictions — confirm.
cid = pd.read_csv(test_dataset, usecols=['id'], low_memory=False)['id']

# One submission frame per prediction rule: model.predict labels (v1) and
# custom-threshold labels (v2).
res_v1 = pd.DataFrame({'id': cid.values, 'label': predict_v1})
res_v2 = pd.DataFrame({'id': cid.values, 'label': predict_v2})

In [21]:
# Preview the submission built from model.predict labels.
res_v1

Unnamed: 0,id,label
0,7815282,0
1,8443555,0
2,6352559,0
3,4921420,0
4,1420440,0
...,...,...
3999995,9030800,0
3999996,6275048,0
3999997,4701757,0
3999998,611153,0


In [22]:
# Preview the submission built from the custom-threshold labels.
res_v2

Unnamed: 0,id,label
0,7815282,0
1,8443555,0
2,6352559,0
3,4921420,0
4,1420440,0
...,...,...
3999995,9030800,0
3999996,6275048,0
3999997,4701757,0
3999998,611153,0


In [23]:
# Write both submissions into the run directory that holds the loaded model.
# The directory is derived from MODELS_PATH rather than repeated as a literal, so
# the outputs always land next to the model that produced them.
run_dir = os.path.dirname(MODELS_PATH)
res_v1.to_csv(os.path.join(run_dir, 'output_v1.csv'), index=False)
res_v2.to_csv(os.path.join(run_dir, 'output_v2.csv'), index=False)