This notebook demonstrates how to generate and prepare a dataset, and then build and tune a gradient-boosted decision tree model using XGBoost and Hyperopt.

It begins by generating a dataset with 10 numeric features over a period of 24 weeks. A date is assigned to each sample and a target variable is created with an 80/20 class distribution. These features are combined into a single DataFrame for model training.

The CoresetTreeServiceDTC is configured with specific parameters for building the Coreset tree chunked by week. The get_hyperparameter_tuning_data function is used to split the data into training and validation sets, with the first 20 weeks used for training and the last 4 weeks for validation.

Hyperparameter tuning is performed using the Hyperopt library’s fmin function, optimizing the XGBoost model based on the F1 score. After finding the best hyperparameters, the final model is trained on both the training and validation sets (retrieved by using the get_coreset function) using these parameters.

#### Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import datetime
import xgboost
from dataheroes import CoresetTreeServiceDTC, DataTuningParamsClassification
import warnings
warnings.filterwarnings('ignore')

#### Generate and prepare the data

In [24]:
# Parameters
SEED = 42  # fixed seed so that Restart & Run All regenerates the same dataset
np.random.seed(SEED)

weeks = 6 * 4  # 24 weeks
# Number of samples generated for each week (between 9500 and 10499)
samples_per_week = np.random.randint(9500, 10500, size=weeks)
total_samples = int(samples_per_week.sum())

# First DataFrame with 10 numeric features and random (standard-normal) samples
df_numeric = pd.DataFrame(np.random.randn(total_samples, 10), columns=[f'f{i}' for i in range(10)])

# Second DataFrame: sequence column holding the start date of the week each sample belongs to
start_date = datetime.datetime(2024, 4, 24)
week_starts = [(start_date + datetime.timedelta(weeks=week)).date() for week in range(weeks)]
# Repeat every week-start date once per sample of that week. The repeat counts
# sum to total_samples, so the column length matches the other frames exactly.
df_dates = pd.DataFrame({'seq': np.repeat(np.array(week_starts, dtype=object), samples_per_week)})

# Third DataFrame with target column (80% 0, 20% 1)
target_values = np.random.choice([0, 1], size=total_samples, p=[0.8, 0.2])
df_target = pd.DataFrame({'target': target_values})

In [25]:
# Preview the numeric feature frame (one column per feature f0..f9)
df_numeric

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,-0.582464,0.609310,-0.244867,-0.263557,0.023136,-0.380135,-0.412420,-0.034841,-0.083252,-0.074547
1,-0.947061,-0.800182,-1.523543,1.258308,-1.893976,1.019886,-0.996969,-1.236140,-0.290084,-2.398909
2,0.936893,-1.510016,0.041593,1.513228,-0.225873,-1.262586,0.307411,1.941631,-0.950935,-0.935260
3,0.442771,2.124550,1.444860,-0.346951,0.840326,-0.202314,-0.001161,0.099413,0.610096,-0.322072
4,-1.060175,0.817385,-0.126564,-0.080536,0.191594,-0.529459,-0.916929,0.179388,0.905792,-1.292681
...,...,...,...,...,...,...,...,...,...,...
240499,0.068844,-0.400247,1.096227,-0.390827,-0.441302,-0.933292,0.242286,0.316337,0.635870,0.561669
240500,0.524675,1.464806,-0.166618,0.907481,0.800533,-0.357963,-0.656992,-2.715094,0.006416,-1.471388
240501,-1.008480,0.065468,-1.185543,0.056485,-0.065472,-1.060652,0.102467,0.417393,0.268768,-0.450023
240502,0.559086,0.177855,-1.082523,-0.297011,1.716573,-1.509080,1.288531,-0.102917,-1.862996,0.505966


In [26]:
# Number of samples per week-start date, ordered chronologically
df_dates['seq'].value_counts().sort_index()

seq
2024-04-24     9690
2024-05-01    10245
2024-05-08    10194
2024-05-15    10390
2024-05-22     9812
2024-05-29    10450
2024-06-05    10295
2024-06-12    10432
2024-06-19     9634
2024-06-26     9502
2024-07-03    10385
2024-07-10     9979
2024-07-17    10231
2024-07-24    10290
2024-07-31    10184
2024-08-07    10273
2024-08-14     9748
2024-08-21     9903
2024-08-28     9719
2024-09-04     9870
2024-09-11     9997
2024-09-18     9566
2024-09-25     9744
2024-10-02     9971
Name: count, dtype: int64

In [27]:
# Preview the binary target column
df_target

Unnamed: 0,target
0,1
1,0
2,0
3,0
4,1
...,...
240499,0
240500,0
240501,0
240502,0


In [28]:
# Attach the sequence (date) column to the feature columns. Both frames carry
# the same default row index, so the rows line up one-to-one.
df = df_numeric.join(df_dates)
df

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,seq
0,-0.582464,0.609310,-0.244867,-0.263557,0.023136,-0.380135,-0.412420,-0.034841,-0.083252,-0.074547,2024-04-24
1,-0.947061,-0.800182,-1.523543,1.258308,-1.893976,1.019886,-0.996969,-1.236140,-0.290084,-2.398909,2024-04-24
2,0.936893,-1.510016,0.041593,1.513228,-0.225873,-1.262586,0.307411,1.941631,-0.950935,-0.935260,2024-04-24
3,0.442771,2.124550,1.444860,-0.346951,0.840326,-0.202314,-0.001161,0.099413,0.610096,-0.322072,2024-04-24
4,-1.060175,0.817385,-0.126564,-0.080536,0.191594,-0.529459,-0.916929,0.179388,0.905792,-1.292681,2024-04-24
...,...,...,...,...,...,...,...,...,...,...,...
240499,0.068844,-0.400247,1.096227,-0.390827,-0.441302,-0.933292,0.242286,0.316337,0.635870,0.561669,2024-10-02
240500,0.524675,1.464806,-0.166618,0.907481,0.800533,-0.357963,-0.656992,-2.715094,0.006416,-1.471388,2024-10-02
240501,-1.008480,0.065468,-1.185543,0.056485,-0.065472,-1.060652,0.102467,0.417393,0.268768,-0.450023,2024-10-02
240502,0.559086,0.177855,-1.082523,-0.297011,1.716573,-1.509080,1.288531,-0.102917,-1.862996,0.505966,2024-10-02


#### Prepare service and coreset parameters

In [None]:
# Sequence-column settings: the 'seq' column holds dates formatted as
# YYYY-MM-DD; with weekly granularity and chunk_by enabled, the Coreset tree
# is chunked by week.
data_params_dict = {
    'seq_column': {
        'name': 'seq',
        'granularity': 'W',
        'chunk_by': True,
        'datetime_format': '%Y-%m-%d',
    },
}

# Coreset parameters passed to the service constructor in the next section.
# This name was previously never defined, so building the service raised a
# NameError on a fresh kernel.
# NOTE(review): None is assumed to select the library's default coreset
# parameters - confirm against the dataheroes documentation, or replace it
# with the intended coreset settings.
coreset_params = None

#### Build the tree

In [None]:
# The 80/20 class split is not treated as highly imbalanced here, so fair
# sampling is switched off (fair=[False]).
tuning_params = DataTuningParamsClassification(coreset_size=[0.2], fair=[False])

service = CoresetTreeServiceDTC(
    model_cls=xgboost.XGBClassifier,
    optimized_for='training',
    data_params=data_params_dict,
    coreset_params=coreset_params,
    data_tuning_params=tuning_params,
)

In [31]:
# Build the Coreset tree from the feature frame (which includes 'seq') and its targets
service.build_from_df(datasets=[df], target_datasets=[df_target])

2024-10-10 13:52:33 Build Started.
2024-10-10 13:52:33 Completed chunk #1, [33m(chunk was built in 0.001 seconds)[00m
2024-10-10 13:52:33 Completed chunk #2, [33m(chunk was built in 0.002 seconds)[00m
2024-10-10 13:52:33 Completed chunk #3, [33m(chunk was built in 0.003 seconds)[00m
2024-10-10 13:52:33 Completed chunk #4, [33m(chunk was built in 0.002 seconds)[00m
2024-10-10 13:52:33 Completed chunk #5, [33m(chunk was built in 0.001 seconds)[00m
2024-10-10 13:52:33 Completed chunk #6, [33m(chunk was built in 0.010 seconds)[00m
2024-10-10 13:52:33 Completed chunk #7, [33m(chunk was built in 0.009 seconds)[00m
2024-10-10 13:52:33 Completed chunk #8, [33m(chunk was built in 0.001 seconds)[00m
2024-10-10 13:52:34 Completed chunk #9, [33m(chunk was built in 0.034 seconds)[00m
2024-10-10 13:52:34 Completed chunk #10, [33m(chunk was built in 0.006 seconds)[00m
2024-10-10 13:52:34 Completed chunk #11, [33m(chunk was built in 0.002 seconds)[00m
2024-10-10 13:52:34 Complete

<dataheroes.services.coreset_tree.dtc.CoresetTreeServiceDTC at 0x2bcd63610>

#### Get training and validation data

In [32]:
# Date windows for the split: weeks starting 2024-04-24 .. 2024-09-04 are used
# for training, weeks starting 2024-09-11 .. 2024-10-02 for validation.
split_window = {
    'seq_train_from': "2024-04-24",
    'seq_train_to': "2024-09-09",
    'seq_validate_from': "2024-09-10",
    'seq_validate_to': "2024-10-02",
}
result = service.get_hyperparameter_tuning_data(
    validation_method='seq-dependent-validation',
    **split_window,
)

# The splitter yields (train, validation) index pairs; take the first pair
train_indexes, validation_indexes = next(result['splitter'])

# Slice features, labels and sample weights with those indexes
features, labels, weights = result['X'], result['y'], result['w']
X_train, y_train, w_train = features.iloc[train_indexes], labels[train_indexes], weights[train_indexes]
X_validation, y_validation = features.iloc[validation_indexes], labels[validation_indexes]

#### Hyperparameter tuning

In [33]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score
# Objective function evaluated by Hyperopt for every sampled configuration
def objective(space):
    """Fit an XGBoost classifier for one sampled configuration and score it.

    `space` is a mapping holding one sampled value per hyperparameter read
    below. The model is fitted on the training split using the training sample
    weights and evaluated with the F1 score on the validation split.

    Returns the result dict Hyperopt expects; `loss` is the negated F1 score
    because Hyperopt minimizes the objective.
    """
    passthrough = ('learning_rate', 'gamma', 'min_child_weight', 'subsample', 'colsample_bytree')
    model_kwargs = {
        # The search space yields floats for these two, so cast them to int
        'n_estimators': int(space['n_estimators']),
        'max_depth': int(space['max_depth']),
        **{name: space[name] for name in passthrough},
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'verbosity': 0,
    }
    clf = xgboost.XGBClassifier(**model_kwargs)

    # Train with the sample weights of the training split
    clf.fit(X_train, y_train, sample_weight=w_train)

    # Score the validation predictions
    validation_f1 = f1_score(y_validation, clf.predict(X_validation))

    return {'loss': -validation_f1, 'status': STATUS_OK}

# Hyperparameter search space. The quniform entries are sampled on a fixed
# step (1 for max_depth and min_child_weight, 50 for n_estimators); the
# uniform entries are sampled from a continuous range.
space = dict(
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 100, 1000, 50),
    gamma=hp.uniform('gamma', 0, 0.5),
    min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
    subsample=hp.uniform('subsample', 0.5, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

# Trials object that stores the result of every evaluation
trials = Trials()

# Minimize the objective with the TPE algorithm, limited to 50 evaluations
search_settings = dict(algo=tpe.suggest, max_evals=50, trials=trials)
best_hyperparams = fmin(fn=objective, space=space, **search_settings)

# Report the best configuration found
print("The best hyperparameters are:", best_hyperparams)

# Build the final model from the best hyperparameters found by the search
# (n_estimators and max_depth are cast to int before use).
tuned_names = ('learning_rate', 'gamma', 'min_child_weight', 'subsample', 'colsample_bytree')
final_params = {
    'n_estimators': int(best_hyperparams['n_estimators']),
    'max_depth': int(best_hyperparams['max_depth']),
    **{name: best_hyperparams[name] for name in tuned_names},
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'verbosity': 0,
}
best_model = xgboost.XGBClassifier(**final_params)

# Retrieve the coreset covering the whole date range that was previously split
# into training and validation, so the final model is trained on both.
coreset = service.get_coreset(seq_from="2024-04-24", seq_to="2024-10-02")

# Fit on the coreset samples using their weights
best_model.fit(coreset['X'], coreset['y'], sample_weight=coreset['w'])

100%|██████████| 50/50 [00:20<00:00,  2.42trial/s, best loss: -0.14652649832014739]
The best hyperparameters are: {'colsample_bytree': 0.9971706601732324, 'gamma': 0.23319530172002323, 'learning_rate': 0.2978033535275081, 'max_depth': 3.0, 'min_child_weight': 3.0, 'n_estimators': 700.0, 'subsample': 0.5655554193275153}
