In [1]:
!pip install tensorflow_decision_forests
!pip install wurlitzer


Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow~=2.13.0 (from tensorflow_decision_forests)
  Downloading tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (524.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.1/524.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting keras<2.14,>=2.13.1 (from tensorflow~=2.13.0->tensorflow_decision_forests)
  Downloading keras-2.13.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.14,>=2.13 (from tensorflow~=2.13.0->tens

In [2]:
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
import tensorflow as tf

## Import data and split into train / validation / test sets

In [3]:
# Load the Dublin Port readings (weather variables plus no2 / so2 / pm10 / pm2.5)
# from a CSV that sits next to the notebook, then preview the first rows.
DATA_FILE = 'dublin_port.csv'
dublin_port = pd.read_csv(DATA_FILE)
dublin_port.head()

Unnamed: 0.1,Unnamed: 0,datetime,rain,temp,wetb,dewpt,vappr,rhum,msl,no2,so2,pm10,pm2.5
0,0,2022-01-01 00:00:00,0.0,13.1,11.4,9.8,12.1,80.0,1007.3,22.0,2.12,7.31,4.08
1,1,2022-01-01 01:00:00,0.0,13.9,11.2,8.5,11.1,70.0,1005.8,17.49,2.05,20.61,12.7
2,2,2022-01-01 02:00:00,0.0,14.0,11.0,8.1,10.8,67.0,1005.0,13.28,2.29,30.0,16.92
3,3,2022-01-01 03:00:00,0.0,14.6,11.4,8.4,11.0,66.0,1003.9,18.56,2.79,31.65,17.93
4,4,2022-01-01 04:00:00,0.0,14.9,12.1,9.5,11.9,70.0,1002.7,13.83,2.07,27.55,15.19


In [4]:
# Remove the leftover index column and the raw timestamp so only numeric
# features and the pm2.5 target remain.
dublin_port = dublin_port.drop(columns=['Unnamed: 0', 'datetime'], errors='ignore')

# Contiguous, unshuffled split by row position: the first 70% of rows are used
# for training, the next 15% for validation and whatever is left for testing.
# NOTE(review): this is a chronological split only if the CSV rows are sorted
# by datetime — confirm.
total_samples = len(dublin_port)
train_samples = int(0.7 * total_samples)
val_samples = int(0.15 * total_samples)
val_end = train_samples + val_samples

train_data = dublin_port.iloc[:train_samples]
val_data = dublin_port.iloc[train_samples:val_end]
test_data = dublin_port.iloc[val_end:]

# Report the size of each split.
for split_label, split_frame in (
    ("Training:", train_data),
    ("Validation:", val_data),
    ("Testing:", test_data),
):
    print(split_label, len(split_frame))

Training: 6115
Validation: 1310
Testing: 1312


In [5]:
def _as_regression_dataset(frame):
    """Wrap a pandas split as a TF-DF regression dataset with "pm2.5" as the label."""
    return tfdf.keras.pd_dataframe_to_tf_dataset(
        frame,
        label="pm2.5",
        task=tfdf.keras.Task.REGRESSION,
    )


# Convert the three splits in the same order they were created.
train_ds, val_ds, test_ds = [
    _as_regression_dataset(split_frame)
    for split_frame in (train_data, val_data, test_data)
]

## Base model

In [6]:
# Baseline: a gradient boosted trees regressor with no hyperparameters overridden
# (only the task is set), used as the reference point for the tuned models below.
model1 = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)

Use /tmp/tmp6n6801pw as temporary training directory


In [7]:
# Train the baseline on the training split only; the validation and test splits
# are kept aside for the evaluation cells that follow.
model1.fit(train_ds)

Reading training dataset...
Training dataset read in 0:00:10.525188. Found 6115 examples.
Training model...
Model trained in 0:00:06.621221
Compiling model...


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.


<keras.src.callbacks.History at 0x7f63557d8be0>

In [8]:
# Register MSE and MAE as evaluation metrics, then score the baseline on the
# validation split. return_dict=True gives a {metric_name: value} mapping.
# NOTE: in the recorded output the 'loss' entry is 0.0, so compare models on
# 'mse' / 'mae' rather than on 'loss'.
model1.compile(metrics=["mse", "mae"])
model1_val_result = model1.evaluate(val_ds, return_dict=True)



In [9]:
# Show the baseline's validation metrics (loss, mse, mae).
model1_val_result

{'loss': 0.0, 'mse': 5.636040210723877, 'mae': 1.6604976654052734}

In [10]:
# Score the baseline on the held-out test split, using the metrics compiled earlier.
model1_test_result = model1.evaluate(test_ds, return_dict=True)



In [11]:
# Show the baseline's test metrics (loss, mse, mae).
model1_test_result

{'loss': 0.0, 'mse': 5.933740615844727, 'mae': 1.7678805589675903}

## Manual hyperparameter tuning (hand-specified search space)

In [12]:
# Random search over a hand-specified hyperparameter space, limited to 50 trials.
tuner = tfdf.tuner.RandomSearch(num_trials=50)
tuner.choice("min_examples", [2, 5, 7, 10])
# NOTE(review): the recorded training log lists every column as NUMERICAL, so this
# categorical-split option probably has no effect on this dataset — confirm.
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])
# Conditional sub-space: "max_depth" is declared on the search space returned for
# growing_strategy="LOCAL", so it is only sampled together with that strategy
# (the tuning logs show max_depth filled only on LOCAL rows).
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])
# merge=True adds "BEST_FIRST_GLOBAL" as a further value of the already declared
# "growing_strategy" hyperparameter; "max_num_nodes" is its conditional child
# (filled only on BEST_FIRST_GLOBAL rows in the tuning logs).
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])
# Last expression of the cell: its return value (a SearchSpace) is what the cell displays.
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])

<tensorflow_decision_forests.component.tuner.tuner.SearchSpace at 0x7f6355713790>

In [13]:
# Same model family as the baseline, but trained through the hand-specified tuner.
# verbose=2 prints the full training log; the recorded run took about 2 minutes 11 seconds.
tuned_model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION, tuner=tuner)
tuned_model.fit(train_ds, verbose=2)

Use /tmp/tmp9sxy6g5u as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'rain': <tf.Tensor 'data:0' shape=(None,) dtype=float64>, 'temp': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'wetb': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'dewpt': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'vappr': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'rhum': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'msl': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'no2': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'so2': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'pm10': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>}
Label: Tensor("data_10:0", shape=(None,), dtype=float64)
Weights: None
Normalized tensor features:
 {'rain': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'temp': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.T

[INFO 23-07-12 10:30:02.9184 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-07-12 10:30:02.9185 UTC kernel.cc:774] Collect training examples
[INFO 23-07-12 10:30:02.9185 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-07-12 10:30:02.9187 UTC kernel.cc:393] Number of batches: 7
[INFO 23-07-12 10:30:02.9187 UTC kernel.cc:394] Number of examples: 6115
[INFO 23-07-12 10:30:02.9201 UTC kernel.cc:794] Training dataset:
Number of records: 6115
Number of columns: 11

Number of columns by type:
	NUMERICAL: 11 (100%)

Columns:

NUMERICAL: 11 (100%)
	0: "__LABEL" NUMERICAL mean:6.97927 min:0.51 max:19.2 sd:3.63809
	1: "dewpt" NUMERICAL mean:7.47856 min:-5 max:18.8 sd:4.52343
	2: "msl" NUMERICAL mean:1017.38 mi

Model trained in 0:02:11.449283
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7f6355712650>

In [14]:
# Register MSE and MAE as evaluation metrics, then score the manually tuned model
# on the validation split; the result is a {metric_name: value} mapping.
tuned_model.compile(metrics=["mse", "mae"])
tuned_model_val_result = tuned_model.evaluate(val_ds, return_dict=True)



In [15]:
# Show the manually tuned model's validation metrics (loss, mse, mae).
tuned_model_val_result

{'loss': 0.0, 'mse': 6.013693332672119, 'mae': 1.7160286903381348}

In [16]:
# Score the manually tuned model on the held-out test split.
tuned_model_test_result = tuned_model.evaluate(test_ds, return_dict=True)



In [17]:
# Show the manually tuned model's test metrics (loss, mse, mae).
tuned_model_test_result

{'loss': 0.0, 'mse': 6.736940860748291, 'mae': 1.8777081966400146}

In [18]:
# Per-trial tuning history: one row per trial with its score, evaluation time,
# a "best" flag and the hyperparameter values that were sampled.
tuning_logs = tuned_model.make_inspector().tuning_logs()
tuning_logs

Unnamed: 0,score,evaluation_time,best,min_examples,categorical_algorithm,growing_strategy,max_num_nodes,num_candidate_attributes_ratio,max_depth
0,-1.96009,2.360869,False,2,RANDOM,BEST_FIRST_GLOBAL,32.0,0.5,
1,-1.998669,5.267418,False,7,RANDOM,LOCAL,,0.5,6.0
2,-1.911054,8.094003,False,7,CART,BEST_FIRST_GLOBAL,256.0,1.0,
3,-1.947484,15.089781,False,10,CART,LOCAL,,0.5,6.0
4,-1.907538,15.899941,False,2,CART,BEST_FIRST_GLOBAL,256.0,0.9,
5,-2.128587,16.814596,False,7,CART,LOCAL,,0.5,3.0
6,-1.983955,20.492176,False,10,CART,BEST_FIRST_GLOBAL,16.0,1.0,
7,-1.958282,21.840014,False,10,CART,BEST_FIRST_GLOBAL,64.0,0.9,
8,-2.127177,22.196644,False,2,CART,LOCAL,,0.5,3.0
9,-1.978377,24.333203,False,7,CART,BEST_FIRST_GLOBAL,128.0,0.5,


## Automatic hyperparameter tuning (predefined search space)

In [19]:
# Random search with 50 trials over TF-DF's predefined hyperparameter space
# (use_predefined_hps=True) instead of hand-declared choices; the tuning logs show it
# also varies split_axis, the sparse-oblique options, subsample and shrinkage.
auto_tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)

In [20]:
# Train a gradient boosted trees regressor through the predefined-space tuner.
# NOTE: this is the expensive cell — the recorded run took about 35 minutes 42 seconds.
auto_tuned_model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION, tuner=auto_tuner)
auto_tuned_model.fit(train_ds, verbose=2)

Use /tmp/tmple0bh39v as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'rain': <tf.Tensor 'data:0' shape=(None,) dtype=float64>, 'temp': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'wetb': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'dewpt': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'vappr': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'rhum': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'msl': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'no2': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'so2': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'pm10': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>}
Label: Tensor("data_10:0", shape=(None,), dtype=float64)
Weights: None
Normalized tensor features:
 {'rain': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'temp': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.T

[INFO 23-07-12 10:32:15.3491 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-07-12 10:32:15.3491 UTC kernel.cc:774] Collect training examples
[INFO 23-07-12 10:32:15.3491 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-07-12 10:32:15.3492 UTC kernel.cc:393] Number of batches: 7
[INFO 23-07-12 10:32:15.3492 UTC kernel.cc:394] Number of examples: 6115
[INFO 23-07-12 10:32:15.3499 UTC kernel.cc:794] Training dataset:
Number of records: 6115
Number of columns: 11

Number of columns by type:
	NUMERICAL: 11 (100%)

Columns:

NUMERICAL: 11 (100%)
	0: "__LABEL" NUMERICAL mean:6.97927 min:0.51 max:19.2 sd:3.63809
	1: "dewpt" NUMERICAL mean:7.47856 min:-5 max:18.8 sd:4.52343
	2: "msl" NUMERICAL mean:1017.38 mi

Model trained in 0:35:42.488805
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7f6354790280>

In [21]:
# Register MSE and MAE as evaluation metrics, then score the automatically tuned
# model on the validation split; the result is a {metric_name: value} mapping.
auto_tuned_model.compile(metrics=["mse", "mae"])
auto_tuned_model_val_result = auto_tuned_model.evaluate(val_ds, return_dict=True)





In [22]:
# Show the automatically tuned model's validation metrics (loss, mse, mae).
auto_tuned_model_val_result

{'loss': 0.0, 'mse': 5.779103755950928, 'mae': 1.6832051277160645}

In [23]:
# Score the automatically tuned model on the held-out test split.
auto_tuned_model_test_result = auto_tuned_model.evaluate(test_ds, return_dict=True)





In [24]:
# Show the automatically tuned model's test metrics (loss, mse, mae).
auto_tuned_model_test_result

{'loss': 0.0, 'mse': 6.209792137145996, 'mae': 1.8106248378753662}

In [25]:
# Per-trial tuning history for the predefined search space: score, evaluation time,
# "best" flag and the sampled hyperparameter values for each trial.
auto_tuning_logs = auto_tuned_model.make_inspector().tuning_logs()
auto_tuning_logs

Unnamed: 0,score,evaluation_time,best,split_axis,sparse_oblique_projection_density_factor,sparse_oblique_normalization,sparse_oblique_weights,categorical_algorithm,growing_strategy,max_num_nodes,sampling_method,subsample,shrinkage,min_examples,num_candidate_attributes_ratio,max_depth
0,-1.917802,88.741703,False,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,CART,BEST_FIRST_GLOBAL,32.0,RANDOM,1.0,0.1,10,1.0,
1,-1.903895,120.215114,False,SPARSE_OBLIQUE,4.0,NONE,BINARY,CART,BEST_FIRST_GLOBAL,512.0,RANDOM,1.0,0.05,5,0.9,
2,-2.096056,152.792027,False,SPARSE_OBLIQUE,4.0,STANDARD_DEVIATION,CONTINUOUS,CART,LOCAL,,RANDOM,0.8,0.05,7,1.0,3.0
3,-1.896099,221.571514,False,SPARSE_OBLIQUE,4.0,STANDARD_DEVIATION,BINARY,CART,LOCAL,,RANDOM,1.0,0.02,20,1.0,8.0
4,-1.909813,259.283872,False,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,RANDOM,BEST_FIRST_GLOBAL,64.0,RANDOM,0.9,0.1,20,0.2,
5,-1.95513,283.602565,False,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,CART,BEST_FIRST_GLOBAL,16.0,RANDOM,0.6,0.1,20,0.5,
6,-1.915706,312.059277,False,SPARSE_OBLIQUE,1.0,NONE,CONTINUOUS,RANDOM,BEST_FIRST_GLOBAL,256.0,RANDOM,0.6,0.1,10,0.9,
7,-1.860066,381.189131,False,SPARSE_OBLIQUE,2.0,MIN_MAX,BINARY,RANDOM,BEST_FIRST_GLOBAL,512.0,RANDOM,0.6,0.05,7,0.9,
8,-1.854117,395.762887,False,SPARSE_OBLIQUE,4.0,STANDARD_DEVIATION,CONTINUOUS,CART,BEST_FIRST_GLOBAL,64.0,RANDOM,0.9,0.05,20,0.2,
9,-1.902129,468.67347,False,SPARSE_OBLIQUE,5.0,NONE,CONTINUOUS,CART,BEST_FIRST_GLOBAL,64.0,RANDOM,0.6,0.05,10,0.2,


In [32]:
# Side-by-side comparison of the three models: one row per model, with its
# validation and test MSE / MAE taken from the evaluation dictionaries above.
model_runs = [
    ("Base", model1_val_result, model1_test_result),
    ("Manual hyperparameter", tuned_model_val_result, tuned_model_test_result),
    ("Automatic hyperparameter", auto_tuned_model_val_result, auto_tuned_model_test_result),
]

summary_rows = [
    {
        "Model": "GradientBoostedTrees",
        "Model Type": run_label,
        "Validation MSE": val_metrics['mse'],
        "Validation MAE": val_metrics['mae'],
        "Testing MSE": test_metrics['mse'],
        "Testing MAE": test_metrics['mae'],
    }
    for run_label, val_metrics, test_metrics in model_runs
]

pd.DataFrame(summary_rows)

Unnamed: 0,Model,Model Type,Validation MSE,Validation MAE,Testing MSE,Testing MAE
0,GradientBoostedTrees,Base,5.63604,1.660498,5.933741,1.767881
1,GradientBoostedTrees,Manual hyperparameter,6.013693,1.716029,6.736941,1.877708
2,GradientBoostedTrees,Automatic hyperparameter,5.779104,1.683205,6.209792,1.810625
