## Prepare train_data and test_data

In [1]:
import pandas as pd
from hypergbm import make_experiment
from hypernets.tabular.feature_generators import FeatureGenerationTransformer
from hypernets.tabular.metrics import metric_to_scoring
from sklearn.model_selection import train_test_split


In [2]:
# Read the pre-split train and test sets from their compressed CSV files.
train_data = pd.read_csv('datasets/West_Nile_Virus_II/train.csv.gz')
test_data = pd.read_csv('datasets/West_Nile_Virus_II/test.csv.gz')

# Separate the 'WnvPresent' label from the features. The loaded frames are left
# untouched because later cells pass them (label included) to make_experiment.
y_train = train_data['WnvPresent'].copy()
X_train = train_data.drop(columns=['WnvPresent'])
y_test = test_data['WnvPresent'].copy()
X_test = test_data.drop(columns=['WnvPresent'])

In [3]:
# Preview the first rows of the training features (the label column was already removed).
X_train.head()

Unnamed: 0.1,Unnamed: 0,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,Latitude1,Longitude1,AddressAccuracy,NumMosquitos
0,6709,"South Cottage Grove Avenue, Chicago, IL, USA",CULEX PIPIENS/RESTUANS,10,S COTTAGE GROVE,T102,"1000 S COTTAGE GROVE, Chicago, IL",41.750498,-87.605294,32.601011,-86.680736,5,6
1,789,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX PIPIENS,15,W WEBSTER AVE,T045,"1500 W WEBSTER AVE, Chicago, IL",41.9216,-87.666455,61.302501,-158.77502,8,2
2,6556,"ORD Terminal 5, O'Hare International Airport, ...",CULEX RESTUANS,10,W OHARE AIRPORT,T900,"1000 W OHARE AIRPORT, Chicago, IL",41.974689,-87.890615,34.168219,-111.930907,9,16
3,8170,"ORD Terminal 5, O'Hare International Airport, ...",CULEX RESTUANS,10,W OHARE AIRPORT,T900,"1000 W OHARE AIRPORT, Chicago, IL",41.974689,-87.890615,34.751928,-92.131378,9,2
4,6517,"3700 South Pulaski Road, Chicago, IL 60623, USA",CULEX PIPIENS/RESTUANS,37,S PULASKI RD,T063,"3700 S PULASKI RD, Chicago, IL",41.82561,-87.726549,37.271875,-119.270415,9,10


In [4]:
# Summarise the column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8404 entries, 0 to 8403
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              8404 non-null   int64  
 1   Address                 8404 non-null   object 
 2   Species                 8404 non-null   object 
 3   Block                   8404 non-null   int64  
 4   Street                  8404 non-null   object 
 5   Trap                    8404 non-null   object 
 6   AddressNumberAndStreet  8404 non-null   object 
 7   Latitude                8404 non-null   float64
 8   Longitude               8404 non-null   float64
 9   Latitude1               8404 non-null   float64
 10  Longitude1              8404 non-null   float64
 11  AddressAccuracy         8404 non-null   int64  
 12  NumMosquitos            8404 non-null   int64  
dtypes: float64(4), int64(4), object(5)
memory usage: 853.7+ KB


# Without Feature Generation

In [5]:
# Baseline run: search on the training columns as loaded, without feature generation.
experiment = make_experiment(
    train_data.copy(),               # hand the experiment a copy of the loaded frame
    target='WnvPresent',
    reward_metric='f1',
    random_state=8888,
    max_trials=30,
    class_balancing='ClassWeight',   # alternative that was tried: 'RandomUnderSampling'
)
estimator = experiment.run()

In [6]:
# Evaluate the baseline estimator on the held-out test split, treating label 1 as positive.
scorer = metric_to_scoring('f1', pos_label=1)
raw_score = scorer(estimator, X_test, y_test)
# NOTE(review): `_sign` is an underscore-prefixed attribute of the scorer object;
# the scorer result is multiplied by it before being reported.
score = raw_score * scorer._sign
score

0.2196796338672769

# About Feature Generation

See [FeatureTools](https://github.com/alteryx/featuretools) for more details.

In [7]:
# Pack each latitude/longitude pair into one tuple-valued column so it can be passed
# as a lat/long column to feature generation. The columns are added to train_data and
# test_data in place because the following cells read them from those frames.
latlong_sources = {
    'latlong1': ['Latitude', 'Longitude'],
    'latlong2': ['Latitude1', 'Longitude1'],
}
for latlong_name, coord_cols in latlong_sources.items():
    train_data[latlong_name] = train_data[coord_cols].apply(tuple, axis=1)
    test_data[latlong_name] = test_data[coord_cols].apply(tuple, axis=1)

# Re-derive the feature/label splits so that they include the new lat/long columns.
X_train = train_data.copy()
y_train = X_train.pop('WnvPresent')
X_test = test_data.copy()
y_test = X_test.pop('WnvPresent')

In [8]:
# Fit the feature generator on the training lat/long columns only, then apply the
# fitted transformer to the same columns of the test frame.
latlong_columns = ('latlong1', 'latlong2')

featureGenerationTransformer = FeatureGenerationTransformer(latlong_cols=list(latlong_columns))

train_latlong = pd.DataFrame(train_data[list(latlong_columns)])
featureGenerationTransformer.fit(train_latlong)

test_latlong = pd.DataFrame(test_data[list(latlong_columns)])
X_test_new = featureGenerationTransformer.transform(test_latlong)

In [9]:
# Inspect the columns and dtypes of the transformed test frame.
X_test_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2102 entries, 0 to 2101
Data columns (total 5 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   latlong1                                               2102 non-null   object 
 1   latlong2                                               2102 non-null   object 
 2   GEOHASH__latlong1__                                    2102 non-null   object 
 3   GEOHASH__latlong2__                                    2102 non-null   object 
 4   HAVERSINE__latlong1____latlong2____unit__kilometers__  2102 non-null   float64
dtypes: float64(1), object(4)
memory usage: 98.5+ KB


In [10]:
# Preview the first rows of the transformed test frame.
X_test_new.head()

Unnamed: 0_level_0,latlong1,latlong2,GEOHASH__latlong1__,GEOHASH__latlong2__,HAVERSINE__latlong1____latlong2____unit__kilometers__
e_hypernets_ft_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"(41.948167, -87.730698)","(32.6010112, -86.6807365)",dp3weskmy945,djf37wc48kyq,1043.476383
1,"(41.903002, -87.688267)","(61.3025006, -158.7750198)",dp3wks5qem9p,b6zhdzz8cykk,5052.504275
2,"(41.974689, -87.890615)","(34.1682185, -111.930907)",dp3wb12021pk,9w06kunkkcd6,2266.646309
3,"(41.869107, -87.696293)","(34.7519275, -92.1313784)",dp3whqj90r0r,9ynmxqruxkc4,880.605092
4,"(41.825902, -87.667827)","(37.2718745, -119.2704153)",dp3tvn4jx94p,9qehjcjumuur,2740.466848


# Enable Feature Generation in HyperGBM

Set `feature_generation=True` to enable feature generation in HyperGBM experiment. Possible options:
* feature_generation : bool (default False), whether to enable feature generation.
* feature_generation_trans_primitives: list (default None), FeatureTools transform primitives list.
* feature_generation_categories_cols: list (default None), column name list to generate new features as FeatureTools Categorical variables.
* feature_generation_continuous_cols: list (default detected from X_train), column name list to generate new features as FeatureTools Numeric variables.
* feature_generation_datetime_cols: list (default detected from X_train), column name list to generate new features as FeatureTools Datetime variables.
* feature_generation_latlong_cols: list (default None), column name list to generate new features as FeatureTools LatLong variables.
* feature_generation_text_cols: list (default None), column name list to generate new features as FeatureTools Text(NaturalLanguage) variables.

In [11]:
# Same search settings as the baseline experiment, with feature generation switched on.
# train_data now carries the 'latlong1'/'latlong2' tuple columns.
experiment = make_experiment(
    train_data.copy(),
    target='WnvPresent',
    reward_metric='f1',
    random_state=8888,
    max_trials=30,
    class_balancing='ClassWeight',
    feature_generation=True,
)
estimator = experiment.run()

In [12]:
# Show the steps of the estimator returned by the feature-generation experiment.
estimator.steps

[('data_clean',
  DataCleanStep(cv=True,
                data_cleaner_args={'correct_object_dtype': True,
                                   'drop_columns': None,
                                   'drop_constant_columns': True,
                                   'drop_duplicated_columns': False,
                                   'drop_idness_columns': True,
                                   'drop_label_nan_rows': True,
                                   'int_convert_to': 'float', 'nan_chars': None,
                                   'reduce_mem_usage': False,
                                   'reserve_columns': ['latlong1', 'latlong2',
                                                       'Address',
                                                       'AddressNumberAndStreet']},
                name='data_clean')),
 ('feature_generation',
  FeatureGenerationStep(datetime_cols=[], latlong_cols=['latlong1', 'latlong2'],
                        name='feature_generation',
          

In [13]:
# Score the feature-generation estimator on the held-out test set exactly the way the
# baseline estimator was scored (explicit positive label, result multiplied by the
# scorer's sign), so that the two F1 values are directly comparable.
scorer = metric_to_scoring('f1', pos_label=1)
score = scorer(estimator, X_test, y_test) * scorer._sign
score

0.22807017543859642