In [17]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [1]:
# Set repository root as a Workding Directory 

# For virtual environment: uncomment the lines below
# %cd ..
# %env PYTHONPATH=.

# For Docker
%cd /home/tutorial-predict-device-change

/home/tutorial-predict-device-change


In [2]:
import pandas as pd

# Config

In [7]:
CONFIG = './src/config.yaml'

with open(CONFIG) as conf_file:
    print(conf_file.read())

Base:
  random_state: 42
  log_level: DEBUG


Data:
  user_features_raw: ./data/raw/user_features.feather # train_x
  target_raw: ./data/raw/target.feather # train_y
  scoring_user_features_raw: ./data/raw/scoring_user_features.feather # test_x
  scoring_target_raw: ./data/raw/scoring_target.feather # control for test_y


Features:
  features_path: ./data/processed/features.feather # train_x
  scoring_features_path: ./data/processed/scoring_features.feather # test_x
  predicted_target_path: ./data/processed/predicted_target.feather # prediction for test_y


Train:
  top_K_coef: 0.05
  model_path: ./models/model.joblib
  train_metrics: ./reports/train_metrics.json
  model_params:
    iterations: 10
    thread_count: 50
    has_time: True
    allow_writing_files: False
    cat_features:
      - feature_17
      - feature_21
      - feature_11
      - feature_11
      - feature_16
      - feature_22


# Initialize DVC project

In [10]:
%%bash

# Если DVC репозиторий уже инициализирован, будет сообщение об ошибке 

dvc init

ERROR: failed to initiate DVC - '.dvc' exists. Use `-f` to force.


CalledProcessError: Command 'b'\n# \xd0\x95\xd1\x81\xd0\xbb\xd0\xb8 DVC \xd1\x80\xd0\xb5\xd0\xbf\xd0\xbe\xd0\xb7\xd0\xb8\xd1\x82\xd0\xbe\xd1\x80\xd0\xb8\xd0\xb9 \xd1\x83\xd0\xb6\xd0\xb5 \xd0\xb8\xd0\xbd\xd0\xb8\xd1\x86\xd0\xb8\xd0\xb0\xd0\xbb\xd0\xb8\xd0\xb7\xd0\xb8\xd1\x80\xd0\xbe\xd0\xb2\xd0\xb0\xd0\xbd, \xd0\xb1\xd1\x83\xd0\xb4\xd0\xb5\xd1\x82 \xd1\x81\xd0\xbe\xd0\xbe\xd0\xb1\xd1\x89\xd0\xb5\xd0\xbd\xd0\xb8\xd0\xb5 \xd0\xbe\xd0\xb1 \xd0\xbe\xd1\x88\xd0\xb8\xd0\xb1\xd0\xba\xd0\xb5 \n\ndvc init\n'' returned non-zero exit status 1.

In [6]:
%%bash

git add .git && git commit -a -m "Init DVC REPO"

[step-3-dvc 430cf2d] Init DVC REPO
 2 files changed, 1 insertion(+), 5 deletions(-)


# Configure DVC project

In [11]:
%%bash

dvc remote add -f -d myremote /tmp/dvc/tutorial-predict-device-change

Setting 'myremote' as a default remote.


In [13]:
%%bash
git add .
git commit -m "Add dvc remote storage"

[automate-pipelines 6d67094] Add dvc remote storage
 1 file changed, 4 insertions(+)


# Create DVC pipeline

## Load & process data, Create features

In [41]:
# !dvc run \
#     -n data_load \
#     -d src/pipelines/data_load.py \
#     -d data/raw/target.feather \
#     -d data/raw/user_features.feather \
#     -o data/processed/target.feather \
#     -o data/processed/user_features.feather \
#     -p data_load \
#     python src/pipelines/data_load.py --config=params.yaml

In [58]:
! dvc status

Data and pipelines are up to date.                                    core[39m>
[0m

In [39]:
!dvc run \
    -n Features \
    -d src/featurize.py \
    -d data/raw/target.feather \
    -d data/raw/user_features.feather \
    -o data/processed/features.feather \
    -p Features \
    python src/featurize.py --config=params.yaml

Running stage 'Features':                                             core[39m>
> python src/featurize.py --config=params.yaml
2021-05-25 10:07:17,011 — DATA_LOAD — INFO — Load raw target for train..
2021-05-25 10:07:17,083 — DATA_LOAD — INFO — Load raw data for train..
2021-05-25 10:07:17,580 — DATA_LOAD — INFO — Train_XY loaded
2021-05-25 10:07:17,585 — FEATURES — INFO — Create train features...
2021-05-25 10:07:17,848 — FEATURES — INFO — Done
Creating 'dvc.yaml'                                                             
Adding stage 'Features' in 'dvc.yaml'
Generating lock file 'dvc.lock'
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.yaml dvc.lock
[0m

In [40]:
# ! dvc repro

In [44]:
!ls data/processed/

features.feather  predicted_target.feather  scoring_features.feather


In [46]:
features = pd.read_feather('data/processed/features.feather')
features.head()

Unnamed: 0,user_id,month,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,target
0,0,2020-04-30,0.993121,-15,2.274309,18,2868,-1.305588,-0.097643,0.617778,...,AOKOISPPQLWGKK,-3.146305,-1.655508,25,3.032537,-29960,-1.86466,-0.212668,-3.245333,1.0
1,0,2020-05-31,-1.416912,-145,-1.087891,-8,-1763,-1.322007,3.143865,-0.272231,...,OORLO7PMHCZFEMSR,2.365636,-2.632201,-11,-2.373654,-173398,0.45899,1.376687,-0.46015,0.0
2,0,2020-06-30,0.673564,23,0.016666,-7,-4092,-0.936663,-1.909813,0.715618,...,NZFU27MWLPZRTX4G5D,-2.220931,1.930994,-10,3.301401,-46619,0.26982,0.900846,0.315063,1.0
3,0,2020-07-31,-2.124908,-15,1.234815,18,9348,0.233297,-0.108647,-1.83146,...,KQBIGTVRDJZJLQRRPPY,1.64178,-5.166544,-23,0.531913,-131224,-0.341103,0.208832,1.869453,0.0
4,0,2020-08-31,-2.122264,-50,-0.584464,-8,1894,-0.092315,-0.54748,-0.065323,...,KQBIGTVRDJZJLQRRPPY,1.852084,-0.761511,-24,-1.080867,68577,-1.328331,-0.681723,0.431699,1.0


## Train

In [47]:
!dvc run \
    -n Train \
    -d src/train.py \
    -d data/processed/features.feather \
    -o models/model.joblib \
    -m reports/train_metrics.json \
    -p Train \
    python src/train.py --config=params.yaml

Running stage 'Train':                                                core[39m>
> python src/train.py --config=params.yaml
2021-05-25 10:13:30,943 — MODEL_TRAIN — INFO — Start training model..
2021-05-25 10:13:31,089 — MODEL_TRAIN — INFO — Fold 1:
2021-05-25 10:13:31,089 — MODEL_TRAIN — INFO — Train: 2020-04-30 00:00:00 - 2020-04-30 00:00:00
2021-05-25 10:13:31,089 — MODEL_TRAIN — INFO — Test: 2020-05-31 00:00:00 

2021-05-25 10:13:31,170 — MODEL_TRAIN — INFO — Train shapes: X - (150484, 30), y - (150484,)
2021-05-25 10:13:31,170 — MODEL_TRAIN — INFO — Test shapes: X - (150411, 30), y - (150411,)
Learning rate set to 0.5
0:	learn: 0.6136792	total: 132ms	remaining: 1.19s
1:	learn: 0.5580362	total: 207ms	remaining: 829ms
2:	learn: 0.5270051	total: 280ms	remaining: 654ms
3:	learn: 0.5080045	total: 373ms	remaining: 560ms
4:	learn: 0.4978499	total: 448ms	remaining: 448ms
5:	learn: 0.4870497	total: 517ms	remaining: 345ms
6:	learn: 0.4816503	total: 608ms	remaining: 261ms
7:	learn: 0.4764344	

In [52]:
!ls models

model.joblib


In [51]:
!cat reports/train_metrics.json

{
    "lift_max": 2.112150973456816,
    "lift_min": 2.0964878319828544,
    "lift_std": 0.007651517626364342,
    "lift_mean": 2.104739162031797,
    "precision_at_k_max": 0.824549274052013,
    "precision_at_k_min": 0.8159602191139712,
    "precision_at_k_std": 0.003609463695961109,
    "precision_at_k_mean": 0.8207466893580813,
    "recall_at_k_max": 1.0,
    "recall_at_k_min": 1.0,
    "recall_at_k_std": 0.0,
    "recall_at_k_mean": 1.0
}

In [54]:
# %%bash

# git add .
# git commit -m "Create dvc pipeline"

## List stages

In [55]:
!dvc stage list

Features  Outputs data/processed/features.feather
Train     Outputs models/model.joblib; Reports reports/train_metrics.json
[0m

In [56]:
!dvc repro


Stage 'Features' didn't change, skipping                              core[39m>
Stage 'Train' didn't change, skipping
Data and pipelines are up to date.
[0m

# Push DVC cache to remote

In [57]:
!dvc push

  0% Uploading|                                      |0/3 [00:00<?,     ?file/s]
![A
  0%|          |models/model.joblib            0.00/22.2k [00:00<?,       ?it/s][A

  0%|          |reports/train_metrics.json          0/446 [00:00<?,       ?it/s][A[A


  0%|          |data/processed/features.feather 0.00/132M [00:00<?,       ?it/s][A[A[A
                                                                                [A

                                                                                [A[A


3 files pushed                                                                  [A[A[A
[0m

In [59]:
! dvc status

Data and pipelines are up to date.                                    core[39m>
[0m