## Prepare train_data and test_data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from hypergbm import make_experiment
from hypernets.tabular.metrics import metric_to_scoring

In [2]:
# Load the Metro Interstate Traffic Volume dataset from a gzip-compressed CSV.
# The path is relative to the notebook's working directory.
data = pd.read_csv('datasets/Metro_Interstate_Traffic_Volume/data.csv.gz')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [4]:
# Summarize column dtypes and non-null counts; date_time is still loaded as
# `object` at this point and is converted to datetime in the next cell.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              48204 non-null  object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


In [5]:
# Convert the `date_time` column from string (object) dtype to datetime64 so
# it can be treated as a datetime feature downstream.
data['date_time'] = pd.to_datetime(data['date_time'])

In [6]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All, removing it as a source of
# run-to-run variation in the scores reported below.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=8888)

# Work on copies so train_data / test_data keep the target column
# (make_experiment below needs it); pop() removes 'traffic_volume' from the
# copy and returns it as the target series.
X_train = train_data.copy()
y_train = X_train.pop('traffic_volume')
X_test = test_data.copy()
y_test = X_test.pop('traffic_volume')

# Without Feature Generation


In [7]:
# Baseline: run a HyperGBM experiment on the training frame as-is, without
# enabling feature generation. A copy is passed so train_data stays untouched.
baseline_options = dict(target='traffic_volume', random_state=8888, max_trials=10)
experiment = make_experiment(train_data.copy(), **baseline_options)
estimator = experiment.run()

In [8]:
# Score the baseline estimator on the held-out split with RMSE.
scorer = metric_to_scoring('rmse')
signed_score = scorer(estimator, X_test, y_test)
# Multiply by the scorer's sign to display the metric as a plain RMSE
# (presumably the scorer returns a negated error so that "greater is better"
# -- TODO confirm against the hypernets scorer implementation).
score = signed_score * scorer._sign
score

1880.6791797381823

# About Feature Generation

See [FeatureTools](https://github.com/alteryx/featuretools) for more details.

In [9]:
from hypernets.tabular.feature_generators import FeatureGenerationTransformer

# Standalone demonstration of the transformer on the datetime column only:
# fit on the training rows, then derive the new columns for the test rows.
featureGenerationTransformer = FeatureGenerationTransformer(datetime_cols=['date_time'])
featureGenerationTransformer.fit(X_train['date_time'].to_frame())
X_test_new = featureGenerationTransformer.transform(X_test['date_time'].to_frame())

In [10]:
# List the columns the transformer produced from date_time, with their dtypes.
X_test_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9641 entries, 0 to 9640
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   date_time                9641 non-null   datetime64[ns]
 1   DAY__date_time__         9641 non-null   int64         
 2   HOUR__date_time__        9641 non-null   int64         
 3   IS_WEEKEND__date_time__  9641 non-null   int64         
 4   MINUTE__date_time__      9641 non-null   int64         
 5   MONTH__date_time__       9641 non-null   int64         
 6   SECOND__date_time__      9641 non-null   int64         
 7   WEEK__date_time__        9641 non-null   int64         
 8   WEEKDAY__date_time__     9641 non-null   int64         
dtypes: datetime64[ns](1), int64(8)
memory usage: 753.2 KB


In [11]:
# Preview a few rows of the generated datetime features next to the original column.
X_test_new.head()

Unnamed: 0_level_0,date_time,DAY__date_time__,HOUR__date_time__,IS_WEEKEND__date_time__,MINUTE__date_time__,MONTH__date_time__,SECOND__date_time__,WEEK__date_time__,WEEKDAY__date_time__
e_hypernets_ft_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2017-01-10 08:00:00,10,8,0,0,1,0,2,1
1,2018-07-17 18:00:00,17,18,0,0,7,0,29,1
2,2018-08-24 11:00:00,24,11,0,0,8,0,34,4
3,2012-11-23 04:00:00,23,4,0,0,11,0,47,4
4,2016-11-15 02:00:00,15,2,0,0,11,0,46,1


# Enable Feature Generation in HyperGBM

Set `feature_generation=True` to enable feature generation in a HyperGBM experiment. Available options:
* feature_generation : bool (default False), whether to enable feature generation.
* feature_generation_trans_primitives: list (default None), FeatureTools transform primitives list.
* feature_generation_categories_cols: list (default None), column name list to generate new features as FeatureTools Categorical variables.
* feature_generation_continuous_cols: list (default detected from X_train), column name list to generate new features as FeatureTools Numeric variables.
* feature_generation_datetime_cols: list (default detected from X_train), column name list to generate new features as FeatureTools Datetime variables.
* feature_generation_latlong_cols: list (default None), column name list to generate new features as FeatureTools LatLong variables.
* feature_generation_text_cols: list (default None), column name list to generate new features as FeatureTools Text(NaturalLanguage) variables.

In [12]:
# Same target, seed and trial budget as the experiment without feature
# generation, with feature_generation switched on so the two runs are comparable.
fg_options = dict(
    target='traffic_volume',
    random_state=8888,
    max_trials=10,
    feature_generation=True,
    # Datetime columns are detected from the data by default; to pin them
    # explicitly instead, add:
    # feature_generation_datetime_cols=['date_time'],
)
experiment = make_experiment(train_data.copy(), **fg_options)
estimator = experiment.run()

In [13]:
# Inspect the steps of the pipeline returned by experiment.run() to confirm
# that a feature-generation step is part of it.
estimator.steps

[('data_clean',
  DataCleanStep(cv=True,
                data_cleaner_args={'correct_object_dtype': True,
                                   'drop_columns': None,
                                   'drop_constant_columns': True,
                                   'drop_duplicated_columns': False,
                                   'drop_idness_columns': True,
                                   'drop_label_nan_rows': True,
                                   'int_convert_to': 'float', 'nan_chars': None,
                                   'reduce_mem_usage': False,
                                   'reserve_columns': ['date_time']},
                name='data_clean')),
 ('feature_generation',
  FeatureGenerationStep(datetime_cols=['date_time'], latlong_cols=[],
                        name='feature_generation', text_cols=[])),
 ('estimator',
  GreedyEnsemble(weight=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], scores=[-331.35453353661273, -331.35453353661273, -331.35453353661273, -

In [14]:
# Score the feature-generation estimator on the same held-out split with RMSE.
scorer = metric_to_scoring('rmse')
signed_score = scorer(estimator, X_test, y_test)
# Multiply by the scorer's sign to display the metric as a plain RMSE
# (presumably the scorer returns a negated error so that "greater is better"
# -- TODO confirm against the hypernets scorer implementation).
score = signed_score * scorer._sign
score

316.9958244736899