In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Substep parameters used when the notebook is run interactively.
# During a job run this cell is replaced with the parameters taken from the
# json file in the params subfolder, so keep it a plain dict literal.
substep_params = {
    "param1": "None1",
    "param2": "None2",
}

In [3]:
# load pipeline and step parameters - do not edit
# Both helpers are called with pprint=True; the "Pipeline params" / "Step params"
# listings in this cell's output come from these calls. The returned values are
# handed to NotebookSubstep together with substep_params in the interface cell.
from sinara.substep import get_pipeline_params, get_step_params
pipeline_params = get_pipeline_params(pprint=True)
step_params = get_step_params(pprint=True)

**Pipeline params:**


{'X': 'something',
 'env_name': 'user',
 'pipeline_name': 'pipeline',
 'zone_name': 'zone'}




**Step params:**


{'Y': 'something_else'}




In [4]:
#3 define substep interface
from sinara.substep import NotebookSubstep, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

substep = NotebookSubstep(pipeline_params, step_params, substep_params)

# Entities consumed from the "data_prep" step, in the order they are declared.
data_prep_entities = ["X_train", "X_val", "X_test", "y_train", "y_val", "y_test"]

substep.interface(
    # One input declaration per data_prep entity.
    inputs=[
        {STEP_NAME: "data_prep", ENTITY_NAME: entity}
        for entity in data_prep_entities
    ],
    # The single artifact this substep produces.
    outputs=[
        {ENTITY_NAME: "california_bento"},
    ],
)

# Show the resolved step name, inputs and outputs in the cell output.
substep.print_interface_info()

# Must stay after the interface declaration and before any real work.
substep.exit_in_visualize_mode()

**STEP NAME:**


'model_train'




**INPUTS:**


[{'user.pipeline.zone.data_prep.X_train': '/data/home/jovyan/pipeline/zone/data_prep/run-25-01-14-152101/X_train'},
 {'user.pipeline.zone.data_prep.X_val': '/data/home/jovyan/pipeline/zone/data_prep/run-25-01-14-152101/X_val'},
 {'user.pipeline.zone.data_prep.X_test': '/data/home/jovyan/pipeline/zone/data_prep/run-25-01-14-152101/X_test'},
 {'user.pipeline.zone.data_prep.y_train': '/data/home/jovyan/pipeline/zone/data_prep/run-25-01-14-152101/y_train'},
 {'user.pipeline.zone.data_prep.y_val': '/data/home/jovyan/pipeline/zone/data_prep/run-25-01-14-152101/y_val'},
 {'user.pipeline.zone.data_prep.y_test': '/data/home/jovyan/pipeline/zone/data_prep/run-25-01-14-152101/y_test'}]




**OUTPUTS:**


[{'user.pipeline.zone.model_train.california_bento': '/data/home/jovyan/pipeline/zone/model_train/run-25-01-15-050332/california_bento'}]




In [5]:
#4 run spark
from sinara.spark import SinaraSpark

# Start the Spark session; the `spark` handle is used by the next cell to read
# the parquet inputs.
# NOTE(review): the meaning of the positional argument 0 is not visible in this
# notebook - confirm against SinaraSpark.run_session.
spark = SinaraSpark.run_session(0)
# Last expression of the cell, so its return value (if any) is what gets displayed.
SinaraSpark.ui_url()

Session is run


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/15 05:03:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
#5 read inputs
def _read_pandas(entity_path):
    """Read the parquet entity at `entity_path` with Spark and return it as a pandas object."""
    return spark.read.parquet(entity_path).toPandas()


# Resolved locations of the entities produced by the data_prep step.
prev_step_inputs = substep.inputs(step_name="data_prep")

# NOTE(review): features and targets are read by independent Spark jobs; this
# assumes row order is preserved so X_* and y_* stay aligned - confirm.
X_train = _read_pandas(prev_step_inputs.X_train)
X_val = _read_pandas(prev_step_inputs.X_val)
X_test = _read_pandas(prev_step_inputs.X_test)

y_train = _read_pandas(prev_step_inputs.y_train)
y_val = _read_pandas(prev_step_inputs.y_val)
y_test = _read_pandas(prev_step_inputs.y_test)


                                                                                

In [11]:
#6 train_model
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_percentage_error,mean_absolute_error

# Wrap the training split in a CatBoost Pool.
train_pool = Pool(X_train, y_train)
# NOTE: despite its name, test_pool wraps the *validation* split (X_val / y_val);
# it is used only as the eval set for early stopping and best-model selection.
test_pool = Pool(X_val, y_val)

# Fixed seed so repeated runs build the same model.
cat = CatBoostRegressor(random_seed=42)

# Fit on the prepared training pool (previously train_pool was built but the raw
# X_train / y_train were passed instead, leaving the pool unused).
# Training stops once the eval metric has not improved for 10 iterations and
# the model is truncated to the best iteration seen on the eval set.
cat.fit(train_pool,
        eval_set=test_pool,
        use_best_model=True,
        early_stopping_rounds=10)



Learning rate set to 0.077444
0:	learn: 1.1448605	test: 1.1729223	best: 1.1729223 (0)	total: 6.82ms	remaining: 6.81s
1:	learn: 1.1445400	test: 1.1727717	best: 1.1727717 (1)	total: 10.4ms	remaining: 5.19s
2:	learn: 1.1442161	test: 1.1727156	best: 1.1727156 (2)	total: 13.8ms	remaining: 4.59s
3:	learn: 1.1436728	test: 1.1726374	best: 1.1726374 (3)	total: 17.2ms	remaining: 4.27s
4:	learn: 1.1432437	test: 1.1725687	best: 1.1725687 (4)	total: 20.8ms	remaining: 4.14s
5:	learn: 1.1429398	test: 1.1725029	best: 1.1725029 (5)	total: 23.5ms	remaining: 3.9s
6:	learn: 1.1426205	test: 1.1724558	best: 1.1724558 (6)	total: 25.7ms	remaining: 3.65s
7:	learn: 1.1421787	test: 1.1726028	best: 1.1724558 (6)	total: 28.3ms	remaining: 3.51s
8:	learn: 1.1416821	test: 1.1728915	best: 1.1724558 (6)	total: 30.6ms	remaining: 3.37s
9:	learn: 1.1414192	test: 1.1729396	best: 1.1724558 (6)	total: 33.9ms	remaining: 3.35s
10:	learn: 1.1411211	test: 1.1729788	best: 1.1724558 (6)	total: 36.2ms	remaining: 3.25s
11:	learn: 1.

<catboost.core.CatBoostRegressor at 0x7f6b54be94e0>

In [12]:
#7 check trained model quality on eval dataset using MAE and MAPE
# Predict once and reuse the result for both metrics.
y_val_pred = cat.predict(X_val)

mae = mean_absolute_error(y_val, y_val_pred)
mape = mean_absolute_percentage_error(y_val, y_val_pred)

# The labels now name the metrics actually computed; they previously both read
# "root mean squared error".
print("The mean absolute error (MAE) on eval set: {:.4f}".format(mae))
print("The mean absolute percentage error (MAPE) on eval set: {:.4f}".format(mape))

The root mean squared error (MAE) on eval set: 0.9237
The root mean squared error (MAPE) on eval set: 0.6188


In [13]:
#8 create and save a bentoservice
from model_service import ModelService
from sinara.bentoml import save_bentoservice

# Resolved output locations declared in the substep interface.
outputs = substep.outputs()

# Held-out test split, stored next to the model as lists of per-row records.
test_data = {
    'X': X_test.to_dict(orient='records'),
    'Y': y_test.to_dict(orient='records'),
}

# Pack the trained regressor and the test records into the service.
model = ModelService()
model.pack('model', cat)
model.pack('test_data', test_data)

# Write the packed service to the california_bento output entity.
save_bentoservice(model, path=outputs.california_bento, substep=substep)

[2025-01-15 05:08:33,780] INFO - BentoService bundle 'ModelService:user.pipeline.zone.california_bento.run-25-01-15-050332' created at: /home/sinarian/work/Fetch_California_Housing/california-model_train/tmp/run-25-01-15-050332/california_bento


In [14]:
#9 stop spark
# Counterpart of the SinaraSpark.run_session call made earlier in this notebook.
SinaraSpark.stop_session()