In [1]:
import sys

# Put the current working directory on the module search path
# (presumably so the `competition` package imported below resolves when the
# notebook is started from the repository root -- confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
if '.' not in sys.path:
    sys.path.append('.')

In [2]:
import yaml
import numpy as np
import pandas as pd

In [3]:
from competition.warehouse import Warehouse
from competition.featurise import Featuriser

In [4]:
# Load the run configuration from YAML.
# safe_load is used instead of yaml.load(..., yaml.Loader): the config shown
# below contains only plain mappings, lists and scalars, and safe_load refuses
# YAML tags that would instantiate arbitrary Python objects.
# NOTE(review): the echoed config shows a calcer key "target_base'" with a
# trailing apostrophe -- looks like a typo in configs/test.yaml; confirm.
with open('configs/test.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
config

{'warehouse': {'data_path': 'input/',
  'train': True,
  'create_args': {'use_geo': True,
   'use_era5': True,
   'era5_metrics': '*',
   'era5_years': '*'}},
 'featurise': {'calcers': {'dates_features': {},
   'geo_cat_features': {},
   'geo_neighbors_features': {'count_neighbors': 5},
   'grib_features': {'metric': 'temp',
    'pooling_size': 3,
    'lags': [0, 1, 2, 3],
    'agg_funcs': ['max', 'mean']},
   "target_base'": {}}}}

In [5]:
# Instantiate the Warehouse from the `warehouse` section of the config and
# call `create` with the `create_args` mapping; the result is kept as `engine`.
engine = Warehouse(
    data_path=config['warehouse']['data_path'],
    train=config['warehouse']['train'],
).create(config['warehouse']['create_args'])
engine

In [5]:
# Run the Featuriser over `engine` with the calcers declared under
# `featurise.calcers`, then print the resulting frame's column/dtype summary.
features_df = (
    Featuriser(engine)
    .get_features(config['featurise']['calcers'])
)
features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 488103 entries, 0 to 488102
Data columns (total 51 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   dt                          488103 non-null  object 
 1   grid_index                  488103 non-null  int64  
 2   month                       488103 non-null  int64  
 3   week                        488103 non-null  int64  
 4   day_of_week                 488103 non-null  int64  
 5   geo_population              163421 non-null  object 
 6   geo_place                   201614 non-null  object 
 7   geo_cn5_city_mean_distance  488103 non-null  float64
 8   geo_cn5_city_max_distance   488103 non-null  float64
 9   geo_cn5_city_min_distance   488103 non-null  float64
 10  temp_stl1_ws3_max_lag_0     488024 non-null  float32
 11  temp_stl1_ws3_mean_lag_0    488024 non-null  float32
 12  temp_t2m_ws3_max_lag_0      488024 non-null  float32
 13  temp_t2m_ws3_m

In [6]:
# List the column labels of the feature frame.
features_df.columns

Index(['dt', 'grid_index', 'month', 'week', 'day_of_week', 'geo_population',
       'geo_place', 'geo_cn5_city_mean_distance', 'geo_cn5_city_max_distance',
       'geo_cn5_city_min_distance', 'temp_stl1_ws3_max_lag_0',
       'temp_stl1_ws3_mean_lag_0', 'temp_t2m_ws3_max_lag_0',
       'temp_t2m_ws3_mean_lag_0', 'temp_d2m_ws3_max_lag_0',
       'temp_d2m_ws3_mean_lag_0', 'temp_skt_ws3_max_lag_0',
       'temp_skt_ws3_mean_lag_0', 'temp_stl1_ws3_max_lag_1',
       'temp_stl1_ws3_mean_lag_1', 'temp_t2m_ws3_max_lag_1',
       'temp_t2m_ws3_mean_lag_1', 'temp_d2m_ws3_max_lag_1',
       'temp_d2m_ws3_mean_lag_1', 'temp_skt_ws3_max_lag_1',
       'temp_skt_ws3_mean_lag_1', 'temp_stl1_ws3_max_lag_2',
       'temp_stl1_ws3_mean_lag_2', 'temp_t2m_ws3_max_lag_2',
       'temp_t2m_ws3_mean_lag_2', 'temp_d2m_ws3_max_lag_2',
       'temp_d2m_ws3_mean_lag_2', 'temp_skt_ws3_max_lag_2',
       'temp_skt_ws3_mean_lag_2', 'temp_stl1_ws3_max_lag_3',
       'temp_stl1_ws3_mean_lag_3', 'temp_t2m_ws3_max_la

In [7]:
# Preview the first rows of the feature frame.
features_df.head()

Unnamed: 0,dt,grid_index,month,week,day_of_week,geo_population,geo_place,geo_cn5_city_mean_distance,geo_cn5_city_max_distance,geo_cn5_city_min_distance,...,temp_skt_ws3_mean_lag_3,infire_day_1,infire_day_2,infire_day_3,infire_day_4,infire_day_5,infire_day_6,infire_day_7,infire_day_8,infire_day_num
0,2020-05-04,143,5,19,0,,,0.344685,0.378005,0.288359,...,288.899872,0,0,0,0,0,0,0,0,0
1,2021-02-24,891,2,8,2,,,0.290611,0.404587,0.16458,...,269.359772,0,0,0,0,0,0,0,0,0
2,2021-02-27,891,2,8,5,,,0.290611,0.404587,0.16458,...,266.279785,0,0,0,0,0,0,0,0,0
3,2021-04-01,892,4,13,3,2054.0,village,0.216737,0.292387,0.096216,...,274.353943,0,0,0,0,0,0,0,0,0
4,2020-03-14,893,3,11,5,,village,0.189796,0.267262,0.103796,...,282.899078,0,0,0,0,0,0,0,0,0


In [8]:
# Summary statistics for every column (include='all' also covers object
# columns); transposed so that each feature is a row.
features_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
dt,488103,478.0,2020-04-06,3179.0,,,,,,,
grid_index,488103,,,,63888.7,28129.1,143.0,43198.0,58824.0,85844.0,150385.0
month,488103,,,,5.5763,2.99957,1.0,3.0,4.0,8.0,12.0
week,488103,,,,22.8233,13.2975,1.0,13.0,18.0,33.0,53.0
day_of_week,488103,,,,2.98554,2.00231,0.0,1.0,3.0,5.0,6.0
geo_population,163421,4495.0,343,557.0,,,,,,,
geo_place,201614,3.0,village,162508.0,,,,,,,
geo_cn5_city_mean_distance,488103,,,,0.939561,1.16253,0.102042,0.190475,0.389698,1.29134,12.3188
geo_cn5_city_max_distance,488103,,,,1.24092,1.45142,0.127769,0.271617,0.540167,1.72956,13.4317
geo_cn5_city_min_distance,488103,,,,0.566772,0.891286,0.000457111,0.0869435,0.188916,0.718968,11.523
