In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (including the local `src` package imported in the next cell) are reloaded
# before each cell executes and edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [12]:
import pandas as pd

from src.paths import PREPROCESSED_DATA_DIR
from src.data import aggregate_columns_with_lag, select_features
from src.config import NUM_AGG_FEATURES, NUM_WEATHER_FEATURES, UNDERSAMPLE, RANDOM_SEED

In [3]:
# Load the pickled train and weather frames from PREPROCESSED_DATA_DIR.
train_pkl = PREPROCESSED_DATA_DIR / 'preprocessed_train.pkl'
weather_pkl = PREPROCESSED_DATA_DIR / 'clean_weather.pkl'

df_train = pd.read_pickle(train_pkl)

# Weather is indexed by 'Date'; the train frame keeps 'Date' as a regular
# column, and later cells join the two on it after `reset_index()`.
df_weather = pd.read_pickle(weather_pkl)
df_weather = df_weather.set_index('Date')

# Quick schema check of both inputs.
print(f'Train columns: {df_train.columns.to_list()}')
print(f'Weather columns: {df_weather.columns.to_list()}')

Train columns: ['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'Dayofyear', 'Week', 'Month', 'Year', 'WnvPresent']
Weather columns: ['Tmax_1', 'Tmin_1', 'Tavg_1', 'DewPoint_1', 'WetBulb_1', 'PrecipTotal_1', 'Tmax_2', 'Tmin_2', 'Tavg_2', 'DewPoint_2', 'WetBulb_2', 'PrecipTotal_2']


### Adding lagged and aggregated weather features

In [4]:
# Build mean-aggregated weather features for a grid of lags and window sizes.
# NOTE(review): the (start, stop, step) tuples look range-style -- the resulting
# column names carry lags l1..l13 and windows w1..w10, and 5 lags x 4 windows
# x 12 weather columns matches the 240 columns shown below. Confirm against
# `aggregate_columns_with_lag` in src.data.
agg_spec = {
    'lag_range': (1, 14, 3),
    'window_range': (1, 11, 3),
    'agg_func': 'mean',
}
df_agg = aggregate_columns_with_lag(df_weather, **agg_spec)
df_agg.head()

Unnamed: 0_level_0,Tmax_1_mean_l1_w1,Tmin_1_mean_l1_w1,Tavg_1_mean_l1_w1,DewPoint_1_mean_l1_w1,WetBulb_1_mean_l1_w1,PrecipTotal_1_mean_l1_w1,Tmax_2_mean_l1_w1,Tmin_2_mean_l1_w1,Tavg_2_mean_l1_w1,DewPoint_2_mean_l1_w1,...,Tavg_1_mean_l13_w10,DewPoint_1_mean_l13_w10,WetBulb_1_mean_l13_w10,PrecipTotal_1_mean_l13_w10,Tmax_2_mean_l13_w10,Tmin_2_mean_l13_w10,Tavg_2_mean_l13_w10,DewPoint_2_mean_l13_w10,WetBulb_2_mean_l13_w10,PrecipTotal_2_mean_l13_w10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-23,87.0,55.0,71.0,46.0,58.0,0.0,87.0,60.0,74.0,44.0,...,62.3,45.2,53.5,0.0133,74.6,53.2,64.0,45.2,54.0,0.0022
2007-05-24,89.0,60.0,75.0,49.0,61.0,0.0,89.0,66.0,78.0,48.0,...,61.7,44.3,53.0,0.0133,73.5,52.9,63.3,44.5,53.4,0.0022
2007-05-25,88.0,63.0,76.0,54.0,63.0,0.02,88.0,66.0,77.0,53.0,...,62.1,43.7,52.9,0.0133,74.0,53.3,63.7,44.0,53.3,0.0022
2007-05-26,70.0,56.0,63.0,44.0,53.0,0.001,66.0,57.0,62.0,46.0,...,62.1,43.0,52.7,0.0133,74.2,52.9,63.6,43.2,52.9,0.0022
2007-05-27,65.0,49.0,57.0,55.0,56.0,1.01,70.0,50.0,60.0,55.0,...,63.6,43.6,53.6,0.0132,75.4,53.2,64.4,43.5,53.7,0.0022


In [5]:
# List the generated feature names; the pattern visible in the output is
# <weather column>_<agg func>_l<lag>_w<window>.
df_agg.columns

Index(['Tmax_1_mean_l1_w1', 'Tmin_1_mean_l1_w1', 'Tavg_1_mean_l1_w1',
       'DewPoint_1_mean_l1_w1', 'WetBulb_1_mean_l1_w1',
       'PrecipTotal_1_mean_l1_w1', 'Tmax_2_mean_l1_w1', 'Tmin_2_mean_l1_w1',
       'Tavg_2_mean_l1_w1', 'DewPoint_2_mean_l1_w1',
       ...
       'Tavg_1_mean_l13_w10', 'DewPoint_1_mean_l13_w10',
       'WetBulb_1_mean_l13_w10', 'PrecipTotal_1_mean_l13_w10',
       'Tmax_2_mean_l13_w10', 'Tmin_2_mean_l13_w10', 'Tavg_2_mean_l13_w10',
       'DewPoint_2_mean_l13_w10', 'WetBulb_2_mean_l13_w10',
       'PrecipTotal_2_mean_l13_w10'],
      dtype='object', length=240)

### Feature selection

In [7]:
# Preview the training rows; 'Date' is the key used for the weather joins below
# and 'WnvPresent' is the label column.
df_train.head()

Unnamed: 0,Date,Species,Trap,Latitude,Longitude,Dayofyear,Week,Month,Year,WnvPresent
201,2007-07-02,1.0,T002,41.95469,-87.800991,183,27,7,2007,0
202,2007-07-02,2.0,T002,41.95469,-87.800991,183,27,7,2007,0
205,2007-07-02,1.0,T015,41.974089,-87.824812,183,27,7,2007,0
206,2007-07-02,1.0,T045,41.9216,-87.666455,183,27,7,2007,0
207,2007-07-02,2.0,T045,41.9216,-87.666455,183,27,7,2007,0


In [8]:
# Preview the Date-indexed weather frame (columns are suffixed _1 / _2).
df_weather.head()

Unnamed: 0_level_0,Tmax_1,Tmin_1,Tavg_1,DewPoint_1,WetBulb_1,PrecipTotal_1,Tmax_2,Tmin_2,Tavg_2,DewPoint_2,WetBulb_2,PrecipTotal_2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2007-05-01,83,50,67,51,56.0,0.0,84,52,68,51,57.0,0.0
2007-05-02,59,42,51,42,47.0,0.0,60,43,52,42,47.0,0.0
2007-05-03,66,46,56,40,48.0,0.0,67,48,58,40,50.0,0.0
2007-05-04,66,49,58,41,50.0,0.001,78,51,64,42,50.0,0.0
2007-05-05,66,53,60,38,49.0,0.001,66,54,60,39,50.0,0.001


In [9]:
# Preview the lagged/aggregated weather features next to the two frames above.
# NOTE(review): this repeats the display from the cell that builds df_agg.
df_agg.head()

Unnamed: 0_level_0,Tmax_1_mean_l1_w1,Tmin_1_mean_l1_w1,Tavg_1_mean_l1_w1,DewPoint_1_mean_l1_w1,WetBulb_1_mean_l1_w1,PrecipTotal_1_mean_l1_w1,Tmax_2_mean_l1_w1,Tmin_2_mean_l1_w1,Tavg_2_mean_l1_w1,DewPoint_2_mean_l1_w1,...,Tavg_1_mean_l13_w10,DewPoint_1_mean_l13_w10,WetBulb_1_mean_l13_w10,PrecipTotal_1_mean_l13_w10,Tmax_2_mean_l13_w10,Tmin_2_mean_l13_w10,Tavg_2_mean_l13_w10,DewPoint_2_mean_l13_w10,WetBulb_2_mean_l13_w10,PrecipTotal_2_mean_l13_w10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-23,87.0,55.0,71.0,46.0,58.0,0.0,87.0,60.0,74.0,44.0,...,62.3,45.2,53.5,0.0133,74.6,53.2,64.0,45.2,54.0,0.0022
2007-05-24,89.0,60.0,75.0,49.0,61.0,0.0,89.0,66.0,78.0,48.0,...,61.7,44.3,53.0,0.0133,73.5,52.9,63.3,44.5,53.4,0.0022
2007-05-25,88.0,63.0,76.0,54.0,63.0,0.02,88.0,66.0,77.0,53.0,...,62.1,43.7,52.9,0.0133,74.0,53.3,63.7,44.0,53.3,0.0022
2007-05-26,70.0,56.0,63.0,44.0,53.0,0.001,66.0,57.0,62.0,46.0,...,62.1,43.0,52.7,0.0133,74.2,52.9,63.6,43.2,52.9,0.0022
2007-05-27,65.0,49.0,57.0,55.0,56.0,1.01,70.0,50.0,60.0,55.0,...,63.6,43.6,53.6,0.0132,75.4,53.2,64.4,43.5,53.7,0.0022


In [13]:
# Optional class rebalancing, controlled by UNDERSAMPLE from src.config:
# keep every WnvPresent == 1 row and only a 15% sample of the WnvPresent == 0
# rows, drawn within each (Trap, Year, Month) group and seeded with RANDOM_SEED.
if not UNDERSAMPLE:
    # No rebalancing: work on a copy so df_train itself stays untouched.
    data_train = df_train.copy()
else:
    negatives = df_train.loc[df_train['WnvPresent'] == 0]
    data_train_0 = (
        negatives
        .groupby(['Trap', 'Year', 'Month'])
        .sample(frac=0.15, random_state=RANDOM_SEED)
    )
    data_train_1 = df_train.loc[df_train['WnvPresent'] == 1]
    data_train = pd.concat([data_train_0, data_train_1])

In [14]:
# Join the daily weather readings onto the training rows by 'Date'.
# Weather columns left over from an earlier run of this cell are dropped first
# (a no-op on the first run). Without that, re-running the cell merges them a
# second time, pandas suffixes the duplicates with '_x'/'_y', and the column
# selection below raises KeyError.
# Note: pd.merge defaults to an inner join, so training rows whose 'Date' has
# no weather record are dropped here.
data_train = pd.merge(
    data_train.drop(columns=df_weather.columns, errors='ignore'),
    df_weather.reset_index(),
    on='Date',
)
X_train = data_train[df_weather.columns]
labels = data_train['WnvPresent']

# Pick NUM_WEATHER_FEATURES of the raw weather columns with the project helper.
# NOTE(review): 'lin_regr' presumably selects a linear-regression estimator
# inside select_features -- confirm in src.data.
sfs_forward = select_features(X_train, labels, NUM_WEATHER_FEATURES, 'lin_regr')

# X_train's columns are exactly df_weather.columns (same order), so the
# selector's support mask can index df_weather.columns directly.
selected_weather_cols = df_weather.columns[sfs_forward.get_support()].to_list()
selected_weather_cols

In [16]:
# Join the lagged/aggregated weather features onto the training rows by 'Date'.
# Aggregate columns left over from an earlier run of this cell are dropped
# first (a no-op on the first run). Without that, re-running the cell merges
# them a second time, pandas suffixes the duplicates with '_x'/'_y', and the
# column selection below raises KeyError.
# Note: pd.merge defaults to an inner join, so training rows whose 'Date' is
# missing from df_agg are dropped here.
data_train = pd.merge(
    data_train.drop(columns=df_agg.columns, errors='ignore'),
    df_agg.reset_index(),
    on='Date',
)
X_train = data_train[df_agg.columns]
labels = data_train['WnvPresent']

# Pick NUM_AGG_FEATURES of the aggregated columns with the project helper.
# NOTE(review): 'lin_regr' presumably selects a linear-regression estimator
# inside select_features -- confirm in src.data.
sfs_forward = select_features(X_train, labels, NUM_AGG_FEATURES, 'lin_regr')

# X_train's columns are exactly df_agg.columns (same order), so the selector's
# support mask can index df_agg.columns directly.
selected_agg_cols = df_agg.columns[sfs_forward.get_support()].to_list()
selected_agg_cols

['DewPoint_1_mean_l1_w4',
 'PrecipTotal_2_mean_l7_w1',
 'Tmin_1_mean_l10_w1',
 'DewPoint_1_mean_l10_w1',
 'Tmax_2_mean_l13_w10']

In [17]:
# Assemble the final training table: the original train columns plus the
# selected raw and aggregated weather features, minus the columns excluded
# from the saved table.
# The final column list is built up front and selected in one step instead of
# select-then-`drop(inplace=True)`. The result has the same columns in the same
# order, but no frame is mutated in place and the cell can be re-run (the old
# version raised KeyError on a second run because 'Date' etc. were already gone).
excluded_cols = ['Date', 'Month', 'Year', 'Trap']
final_cols = [
    col
    for col in df_train.columns.to_list() + selected_weather_cols + selected_agg_cols
    if col not in excluded_cols
]
data_train = data_train[final_cols]

# Persist the result next to the other preprocessed artifacts.
data_train.to_pickle(PREPROCESSED_DATA_DIR / 'data_train.pkl')