In [1]:
!pip install tensorflow_decision_forests
!pip install wurlitzer


Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow~=2.13.0 (from tensorflow_decision_forests)
  Downloading tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (524.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.1/524.1 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting keras<2.14,>=2.13.1 (from tensorflow~=2.13.0->tensorflow_decision_forests)
  Downloading keras-2.13.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.14,>=2.13 (from tensorflow~=2.13.0->tens

In [2]:
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
import tensorflow as tf

## Import and split

In [4]:
dublin_port = pd.read_csv('dublin_port.csv')
dublin_port = dublin_port.drop(['Unnamed: 0'], axis=1)
dublin_port.head()

Unnamed: 0,datetime,rain,temp,wetb,dewpt,vappr,rhum,msl,no2,so2,pm10,pm2.5
0,2022-01-01 00:00:00,0.0,13.1,11.4,9.8,12.1,80.0,1007.3,22.0,2.12,7.31,4.08
1,2022-01-01 01:00:00,0.0,13.9,11.2,8.5,11.1,70.0,1005.8,17.49,2.05,20.61,12.7
2,2022-01-01 02:00:00,0.0,14.0,11.0,8.1,10.8,67.0,1005.0,13.28,2.29,30.0,16.92
3,2022-01-01 03:00:00,0.0,14.6,11.4,8.4,11.0,66.0,1003.9,18.56,2.79,31.65,17.93
4,2022-01-01 04:00:00,0.0,14.9,12.1,9.5,11.9,70.0,1002.7,13.83,2.07,27.55,15.19


In [5]:
total_samples = len(dublin_port)
train_samples = int(0.7 * total_samples)
val_samples = int(0.15 * total_samples)

train_data = dublin_port[:train_samples]
val_data = dublin_port[train_samples:train_samples + val_samples]
test_data = dublin_port[train_samples + val_samples:]

print("Training:", len(train_data))
print("Validation:", len(val_data))
print("Testing:", len(test_data))

Training: 6115
Validation: 1310
Testing: 1312


In [6]:
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_data, label="pm2.5", task=tfdf.keras.Task.REGRESSION)
val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(val_data, label="pm2.5", task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data, label="pm2.5", task=tfdf.keras.Task.REGRESSION)

## Base Model

In [7]:
model1 = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

Use /tmp/tmp9aw_67r7 as temporary training directory


In [8]:
model1.fit(train_ds)

Reading training dataset...
Training dataset read in 0:00:05.500145. Found 6115 examples.
Training model...
Model trained in 0:00:09.735704
Compiling model...


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.


<keras.src.callbacks.History at 0x7ff287b359f0>

In [9]:
model1.compile(metrics=["mse", "mae"])
model1_val_result = model1.evaluate(val_ds, return_dict=True)



In [10]:
model1_val_result

{'loss': 0.0, 'mse': 4.888377666473389, 'mae': 1.5456970930099487}

In [11]:
model1_test_result = model1.evaluate(test_ds, return_dict=True)



In [12]:
model1_test_result

{'loss': 0.0, 'mse': 6.026219367980957, 'mae': 1.7709946632385254}

## Manual hyperparameter testing

In [13]:
tuner = tfdf.tuner.RandomSearch(num_trials=50)
tuner.choice("min_examples", [2, 5, 7, 10])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])

<tensorflow_decision_forests.component.tuner.tuner.SearchSpace at 0x7ff2775504c0>

In [14]:
tuned_model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION, tuner=tuner)
tuned_model.fit(train_ds, verbose=2)

Use /tmp/tmpscr5fagy as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'datetime': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'rain': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'temp': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'wetb': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'dewpt': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'vappr': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'rhum': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'msl': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'no2': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'so2': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>, 'pm10': <tf.Tensor 'data_10:0' shape=(None,) dtype=float64>}
Label: Tensor("data_11:0", shape=(None,), dtype=float64)
Weights: None
Normalized tensor features:
 {'datetime': SemanticTensor(semantic=<Semantic.CATEGORICAL: 2>, tensor=<tf.Tensor 'data:0' shape=(None,) dtype=string>), 

[INFO 23-07-12 10:05:40.4496 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-07-12 10:05:40.4496 UTC kernel.cc:774] Collect training examples
[INFO 23-07-12 10:05:40.4496 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-07-12 10:05:40.4498 UTC kernel.cc:393] Number of batches: 7
[INFO 23-07-12 10:05:40.4499 UTC kernel.cc:394] Number of examples: 6115
[INFO 23-07-12 10:05:40.4584 UTC data_spec_inference.cc:305] 6115 item(s) have been pruned (i.e. they are considered out of dictionary) for the column datetime (0 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO 23-07-12 10:05:40.4664 UTC kernel.cc:794] Training dataset:
Number of records: 6115
Number of columns: 12

Numb

Model trained in 0:03:39.851767
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7ff2775535e0>

In [15]:
tuned_model.compile(metrics=["mse", "mae"])
tuned_model_val_result = tuned_model.evaluate(val_ds, return_dict=True)



In [16]:
tuned_model_val_result

{'loss': 0.0, 'mse': 4.909885406494141, 'mae': 1.5439581871032715}

In [17]:
tuned_model_test_result = tuned_model.evaluate(test_ds, return_dict=True)



In [18]:
tuned_model_test_result

{'loss': 0.0, 'mse': 6.052990913391113, 'mae': 1.7690787315368652}

In [19]:
tuning_logs = tuned_model.make_inspector().tuning_logs()
tuning_logs

Unnamed: 0,score,evaluation_time,best,min_examples,categorical_algorithm,growing_strategy,max_num_nodes,num_candidate_attributes_ratio,max_depth
0,-2.09633,6.380139,False,2,RANDOM,BEST_FIRST_GLOBAL,32.0,0.5,
1,-2.124673,12.778176,False,7,RANDOM,LOCAL,,0.5,6.0
2,-1.903441,16.059612,False,7,CART,BEST_FIRST_GLOBAL,256.0,1.0,
3,-2.123331,22.181479,False,10,CART,LOCAL,,0.5,6.0
4,-2.403478,24.759267,False,7,CART,LOCAL,,0.5,3.0
5,-1.905756,28.045575,False,2,CART,BEST_FIRST_GLOBAL,256.0,0.9,
6,-2.167794,34.454906,False,10,CART,BEST_FIRST_GLOBAL,16.0,1.0,
7,-2.403478,37.602498,False,2,CART,LOCAL,,0.5,3.0
8,-2.013989,40.783772,False,10,CART,BEST_FIRST_GLOBAL,64.0,0.9,
9,-1.94372,51.935463,False,7,CART,BEST_FIRST_GLOBAL,128.0,0.5,


## Automatic hyperparameter testing

In [20]:
auto_tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)

In [21]:
auto_tuned_model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION, tuner=auto_tuner)
auto_tuned_model.fit(train_ds, verbose=2)

Use /tmp/tmp8qsf_29t as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'datetime': <tf.Tensor 'data:0' shape=(None,) dtype=string>, 'rain': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'temp': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'wetb': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'dewpt': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'vappr': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'rhum': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'msl': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'no2': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'so2': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>, 'pm10': <tf.Tensor 'data_10:0' shape=(None,) dtype=float64>}
Label: Tensor("data_11:0", shape=(None,), dtype=float64)
Weights: None
Normalized tensor features:
 {'datetime': SemanticTensor(semantic=<Semantic.CATEGORICAL: 2>, tensor=<tf.Tensor 'data:0' shape=(None,) dtype=string>), 

[INFO 23-07-12 10:13:00.6742 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-07-12 10:13:00.6742 UTC kernel.cc:774] Collect training examples
[INFO 23-07-12 10:13:00.6743 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-07-12 10:13:00.6746 UTC kernel.cc:393] Number of batches: 7
[INFO 23-07-12 10:13:00.6746 UTC kernel.cc:394] Number of examples: 6115
[INFO 23-07-12 10:13:00.6835 UTC data_spec_inference.cc:305] 6115 item(s) have been pruned (i.e. they are considered out of dictionary) for the column datetime (0 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO 23-07-12 10:13:00.6935 UTC kernel.cc:794] Training dataset:
Number of records: 6115
Number of columns: 12

Numb

Model trained in 1:37:26.520875
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7ff277506d40>

In [22]:
auto_tuned_model.compile(metrics=["mse", "mae"])
auto_tuned_model_val_result = auto_tuned_model.evaluate(val_ds, return_dict=True)





In [23]:
auto_tuned_model_val_result

{'loss': 0.0, 'mse': 5.186985015869141, 'mae': 1.5876216888427734}

In [24]:
auto_tuned_model_test_result = auto_tuned_model.evaluate(test_ds, return_dict=True)





In [25]:
auto_tuned_model_test_result

{'loss': 0.0, 'mse': 5.893780708312988, 'mae': 1.7773699760437012}

In [26]:
auto_tuning_logs = auto_tuned_model.make_inspector().tuning_logs()
auto_tuning_logs

Unnamed: 0,score,evaluation_time,best,split_axis,sparse_oblique_projection_density_factor,sparse_oblique_normalization,sparse_oblique_weights,categorical_algorithm,winner_take_all,max_depth,min_examples
0,-1.994781,184.105175,False,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,CART,True,12,40
1,-1.868576,349.542549,False,SPARSE_OBLIQUE,5.0,MIN_MAX,CONTINUOUS,CART,True,16,1
2,-1.882709,412.070455,False,SPARSE_OBLIQUE,4.0,MIN_MAX,CONTINUOUS,RANDOM,True,16,10
3,-1.877557,588.531844,False,SPARSE_OBLIQUE,1.0,MIN_MAX,CONTINUOUS,CART,True,20,10
4,-1.851542,714.281896,False,SPARSE_OBLIQUE,4.0,STANDARD_DEVIATION,BINARY,CART,True,25,2
5,-1.884519,795.09455,False,SPARSE_OBLIQUE,2.0,NONE,BINARY,CART,True,20,10
6,-1.890554,966.297247,False,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,RANDOM,True,20,10
7,-1.872225,1090.722061,False,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,RANDOM,True,30,5
8,-1.988227,1226.577188,False,SPARSE_OBLIQUE,1.0,NONE,CONTINUOUS,RANDOM,True,25,40
9,-1.850238,1302.397339,False,SPARSE_OBLIQUE,3.0,NONE,CONTINUOUS,CART,True,30,2


In [27]:
pd.DataFrame({ "Model": ["RandomForest", "RandomForest", "RandomForest"],
               "Model Type": ["Base", "Manual hyperparameter", "Automatic hyperparameter"],
               "Validation MSE": [model1_val_result['mse'], tuned_model_val_result['mse'], auto_tuned_model_val_result['mse']],
               "Validation MAE": [model1_val_result['mae'], tuned_model_val_result['mae'], auto_tuned_model_val_result['mae']],
               "Testing MSE": [model1_test_result['mse'], tuned_model_test_result['mse'], auto_tuned_model_test_result['mse']],
               "Testing MAE": [model1_test_result['mae'], tuned_model_test_result['mae'], auto_tuned_model_test_result['mae']]})

Unnamed: 0,Model,Model Type,Validation MSE,Validation MAE,Testing MSE,Testing MAE
0,RandomForest,Base,4.888378,1.545697,6.026219,1.770995
1,RandomForest,Manual hyperparameter,4.909885,1.543958,6.052991,1.769079
2,RandomForest,Automatic hyperparameter,5.186985,1.587622,5.893781,1.77737
