In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell runs, so edits to the project's `src` package are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [23]:
import pandas as pd

from src.paths import PREPROCESSED_DATA_DIR
from src.data import aggregate_columns_with_lag, select_features, FeatureSelector
from src.config import NUM_AGG_FEATURES, NUM_WEATHER_FEATURES, UNDERSAMPLE, RANDOM_SEED
from src.config import SELECTOR, LAG_RANGE, WINDOW_RANGE
from src.training import undersample

In [3]:
# Read the preprocessed train and test frames from PREPROCESSED_DATA_DIR
# (train first, then test).
df_train, df_test = (
    pd.read_pickle(PREPROCESSED_DATA_DIR / name)
    for name in ('preprocessed_train.pkl', 'preprocessed_test.pkl')
)

# Read the cleaned weather table and index it by its 'Date' column.
df_weather = pd.read_pickle(PREPROCESSED_DATA_DIR / 'clean_weather.pkl')
df_weather = df_weather.set_index('Date')

# Quick sanity check of which columns are available in each frame.
print(f'Train columns: {df_train.columns.to_list()}')
print(f'Weather columns: {df_weather.columns.to_list()}')

Train columns: ['Date', 'Trap', 'Latitude', 'Longitude', 'WnvPresent', 'Month', 'Year', 'Week', 'Dayofyear', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS']
Weather columns: ['Tmax_1', 'Tmin_1', 'Tavg_1', 'DewPoint_1', 'WetBulb_1', 'PrecipTotal_1', 'Tmax_2', 'Tmin_2', 'Tavg_2', 'DewPoint_2', 'WetBulb_2', 'PrecipTotal_2']


### Adding lagged and aggregated weather features

In [4]:
# Derive lagged, rolling-mean weather features from the daily weather table.
# Output columns are named '<column>_mean_l<lag>_w<window>'; presumably one column
# per (weather column, lag, window) combination drawn from LAG_RANGE / WINDOW_RANGE
# — confirm in src.data.aggregate_columns_with_lag.
df_agg = aggregate_columns_with_lag(
    df_weather, lag_range=LAG_RANGE, window_range=WINDOW_RANGE, agg_func='mean'
)
df_agg.head()

Unnamed: 0_level_0,Tmax_1_mean_l1_w1,Tmin_1_mean_l1_w1,Tavg_1_mean_l1_w1,DewPoint_1_mean_l1_w1,WetBulb_1_mean_l1_w1,PrecipTotal_1_mean_l1_w1,Tmax_2_mean_l1_w1,Tmin_2_mean_l1_w1,Tavg_2_mean_l1_w1,DewPoint_2_mean_l1_w1,...,Tavg_1_mean_l13_w10,DewPoint_1_mean_l13_w10,WetBulb_1_mean_l13_w10,PrecipTotal_1_mean_l13_w10,Tmax_2_mean_l13_w10,Tmin_2_mean_l13_w10,Tavg_2_mean_l13_w10,DewPoint_2_mean_l13_w10,WetBulb_2_mean_l13_w10,PrecipTotal_2_mean_l13_w10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-23,87.0,55.0,71.0,46.0,58.0,0.0,87.0,60.0,74.0,44.0,...,62.3,45.2,53.5,0.0133,74.6,53.2,64.0,45.2,54.0,0.0022
2007-05-24,89.0,60.0,75.0,49.0,61.0,0.0,89.0,66.0,78.0,48.0,...,61.7,44.3,53.0,0.0133,73.5,52.9,63.3,44.5,53.4,0.0022
2007-05-25,88.0,63.0,76.0,54.0,63.0,0.02,88.0,66.0,77.0,53.0,...,62.1,43.7,52.9,0.0133,74.0,53.3,63.7,44.0,53.3,0.0022
2007-05-26,70.0,56.0,63.0,44.0,53.0,0.001,66.0,57.0,62.0,46.0,...,62.1,43.0,52.7,0.0133,74.2,52.9,63.6,43.2,52.9,0.0022
2007-05-27,65.0,49.0,57.0,55.0,56.0,1.01,70.0,50.0,60.0,55.0,...,63.6,43.6,53.6,0.0132,75.4,53.2,64.4,43.5,53.7,0.0022


In [5]:
# List the generated feature names to check the '<column>_mean_l<lag>_w<window>' naming.
df_agg.columns

Index(['Tmax_1_mean_l1_w1', 'Tmin_1_mean_l1_w1', 'Tavg_1_mean_l1_w1',
       'DewPoint_1_mean_l1_w1', 'WetBulb_1_mean_l1_w1',
       'PrecipTotal_1_mean_l1_w1', 'Tmax_2_mean_l1_w1', 'Tmin_2_mean_l1_w1',
       'Tavg_2_mean_l1_w1', 'DewPoint_2_mean_l1_w1',
       ...
       'Tavg_1_mean_l13_w10', 'DewPoint_1_mean_l13_w10',
       'WetBulb_1_mean_l13_w10', 'PrecipTotal_1_mean_l13_w10',
       'Tmax_2_mean_l13_w10', 'Tmin_2_mean_l13_w10', 'Tavg_2_mean_l13_w10',
       'DewPoint_2_mean_l13_w10', 'WetBulb_2_mean_l13_w10',
       'PrecipTotal_2_mean_l13_w10'],
      dtype='object', length=240)

### Feature selection

In [6]:
# Preview the first rows of the training frame before any weather features are attached.
df_train.head()

Unnamed: 0,Date,Trap,Latitude,Longitude,WnvPresent,Month,Year,Week,Dayofyear,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS
0,2007-07-02,T002,41.95469,-87.800991,0,7,2007,27,183,0.0,1.0,0.0
1,2007-07-02,T002,41.95469,-87.800991,0,7,2007,27,183,0.0,0.0,1.0
2,2007-07-02,T015,41.974089,-87.824812,0,7,2007,27,183,0.0,1.0,0.0
3,2007-07-02,T045,41.9216,-87.666455,0,7,2007,27,183,0.0,1.0,0.0
4,2007-07-02,T045,41.9216,-87.666455,0,7,2007,27,183,0.0,0.0,1.0


In [7]:
# (rows, columns) of the training frame.
df_train.shape

(6017, 12)

In [8]:
# Preview the daily weather table (indexed by Date) that the aggregates are built from.
df_weather.head()

Unnamed: 0_level_0,Tmax_1,Tmin_1,Tavg_1,DewPoint_1,WetBulb_1,PrecipTotal_1,Tmax_2,Tmin_2,Tavg_2,DewPoint_2,WetBulb_2,PrecipTotal_2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2007-05-01,83,50,67,51,56.0,0.0,84,52,68,51,57.0,0.0
2007-05-02,59,42,51,42,47.0,0.0,60,43,52,42,47.0,0.0
2007-05-03,66,46,56,40,48.0,0.0,67,48,58,40,50.0,0.0
2007-05-04,66,49,58,41,50.0,0.001,78,51,64,42,50.0,0.0
2007-05-05,66,53,60,38,49.0,0.001,66,54,60,39,50.0,0.001


In [9]:
# Preview the lagged/aggregated weather features (same frame that was displayed
# when df_agg was created).
df_agg.head()

Unnamed: 0_level_0,Tmax_1_mean_l1_w1,Tmin_1_mean_l1_w1,Tavg_1_mean_l1_w1,DewPoint_1_mean_l1_w1,WetBulb_1_mean_l1_w1,PrecipTotal_1_mean_l1_w1,Tmax_2_mean_l1_w1,Tmin_2_mean_l1_w1,Tavg_2_mean_l1_w1,DewPoint_2_mean_l1_w1,...,Tavg_1_mean_l13_w10,DewPoint_1_mean_l13_w10,WetBulb_1_mean_l13_w10,PrecipTotal_1_mean_l13_w10,Tmax_2_mean_l13_w10,Tmin_2_mean_l13_w10,Tavg_2_mean_l13_w10,DewPoint_2_mean_l13_w10,WetBulb_2_mean_l13_w10,PrecipTotal_2_mean_l13_w10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-23,87.0,55.0,71.0,46.0,58.0,0.0,87.0,60.0,74.0,44.0,...,62.3,45.2,53.5,0.0133,74.6,53.2,64.0,45.2,54.0,0.0022
2007-05-24,89.0,60.0,75.0,49.0,61.0,0.0,89.0,66.0,78.0,48.0,...,61.7,44.3,53.0,0.0133,73.5,52.9,63.3,44.5,53.4,0.0022
2007-05-25,88.0,63.0,76.0,54.0,63.0,0.02,88.0,66.0,77.0,53.0,...,62.1,43.7,52.9,0.0133,74.0,53.3,63.7,44.0,53.3,0.0022
2007-05-26,70.0,56.0,63.0,44.0,53.0,0.001,66.0,57.0,62.0,46.0,...,62.1,43.0,52.7,0.0133,74.2,52.9,63.6,43.2,52.9,0.0022
2007-05-27,65.0,49.0,57.0,55.0,56.0,1.01,70.0,50.0,60.0,55.0,...,63.6,43.6,53.6,0.0132,75.4,53.2,64.4,43.5,53.7,0.0022


In [10]:
# Start the modelling frame from df_train: pass it through undersample() when the
# UNDERSAMPLE config flag is set, otherwise take a plain copy so that df_train
# itself is not modified by later steps.
# NOTE(review): RANDOM_SEED is imported in this notebook but not passed here —
# confirm that undersample() is seeded internally so the result is reproducible.
data_train = undersample(df_train) if UNDERSAMPLE else df_train.copy()

In [24]:
# Build the selector from the raw and aggregated weather tables, call
# fit_transform on the training frame, then transform the test frame with the
# same selector instance.
# NOTE(review): data_train is both the input and the output of fit_transform, so
# re-running this cell without first re-running the undersampling cell feeds the
# already-transformed frame back in.
# NOTE(review): this cell emits pandas' SettingWithCopyWarning; presumably it is
# raised inside FeatureSelector — confirm in src.data. df_test is passed without
# a copy, so verify that transform() does not modify it in place.
feature_selector = FeatureSelector(
    df_weather,
    df_agg,
    NUM_WEATHER_FEATURES,
    NUM_AGG_FEATURES,
    SELECTOR,
)
data_train = feature_selector.fit_transform(data_train)
data_test = feature_selector.transform(df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ]


In [25]:
# Preview the training frame after feature selection (selected weather and
# aggregated columns are now attached).
data_train.head()

Unnamed: 0,Latitude,Longitude,WnvPresent,Week,Dayofyear,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Tmin_1,DewPoint_1,PrecipTotal_1,DewPoint_2,PrecipTotal_2,DewPoint_2_mean_l1_w1,WetBulb_2_mean_l1_w1,Tmin_1_mean_l10_w1,Tmin_2_mean_l10_w7,Tavg_2_mean_l13_w1
0,41.95469,-87.800991,0,29,199,0.0,1.0,0.0,69,68,1.55,69,0.92,65.0,69.0,75.0,67.0,79.0
1,41.95469,-87.800991,0,27,183,0.0,1.0,0.0,53,50,0.0,50,0.0,51.0,59.0,61.0,66.714286,73.0
2,41.95469,-87.800991,0,31,213,0.0,1.0,0.0,69,62,0.0,63,0.0,60.0,68.0,58.0,64.0,73.0
3,41.95469,-87.800991,0,32,219,1.0,0.0,0.0,72,73,1.31,73,0.06,72.0,74.0,66.0,66.142857,77.0
4,41.95469,-87.800991,0,39,267,1.0,0.0,0.0,68,63,0.0,62,0.0,50.0,59.0,45.0,54.714286,61.0


In [26]:
# Preview the transformed test frame; compare its feature columns with data_train.
data_test.head()

Unnamed: 0,Id,Latitude,Longitude,Week,Dayofyear,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Tmin_1,DewPoint_1,PrecipTotal_1,DewPoint_2,PrecipTotal_2,DewPoint_2_mean_l1_w1,WetBulb_2_mean_l1_w1,Tmin_1_mean_l10_w1,Tmin_2_mean_l10_w7,Tavg_2_mean_l13_w1
0,3634,41.95469,-87.800991,27,183,0.0,1.0,0.0,58,51,0.0,50,0.0,49.0,58.0,61.0,58.714286,67.0
1,3635,41.95469,-87.800991,27,183,0.0,0.0,1.0,58,51,0.0,50,0.0,49.0,58.0,61.0,58.714286,67.0
2,3636,41.95469,-87.800991,27,183,1.0,0.0,0.0,58,51,0.0,50,0.0,49.0,58.0,61.0,58.714286,67.0
3,3650,41.974089,-87.824812,27,183,0.0,1.0,0.0,58,51,0.0,50,0.0,49.0,58.0,61.0,58.714286,67.0
4,3651,41.974089,-87.824812,27,183,0.0,0.0,1.0,58,51,0.0,50,0.0,49.0,58.0,61.0,58.714286,67.0


In [29]:
# Column dtypes of the final training frame.
data_train.dtypes

Latitude                          float64
Longitude                         float64
WnvPresent                          int64
Week                               UInt32
Dayofyear                           int32
Species_CULEX PIPIENS             float64
Species_CULEX PIPIENS/RESTUANS    float64
Species_CULEX RESTUANS            float64
Tmin_1                              int64
DewPoint_1                          int64
PrecipTotal_1                     float64
DewPoint_2                          int64
PrecipTotal_2                     float64
DewPoint_2_mean_l1_w1             float64
WetBulb_2_mean_l1_w1              float64
Tmin_1_mean_l10_w1                float64
Tmin_2_mean_l10_w7                float64
Tavg_2_mean_l13_w1                float64
dtype: object

In [30]:
# Column dtypes of the final test frame; compare against data_train.dtypes.
data_test.dtypes

Id                                  int64
Latitude                          float64
Longitude                         float64
Week                               UInt32
Dayofyear                           int32
Species_CULEX PIPIENS             float64
Species_CULEX PIPIENS/RESTUANS    float64
Species_CULEX RESTUANS            float64
Tmin_1                              int64
DewPoint_1                          int64
PrecipTotal_1                     float64
DewPoint_2                          int64
PrecipTotal_2                     float64
DewPoint_2_mean_l1_w1             float64
WetBulb_2_mean_l1_w1              float64
Tmin_1_mean_l10_w1                float64
Tmin_2_mean_l10_w7                float64
Tavg_2_mean_l13_w1                float64
dtype: object

In [None]:
# Persist the feature-selected train and test frames to PREPROCESSED_DATA_DIR.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# pickles on disk may not correspond to the outputs shown above — re-run to confirm.
data_train.to_pickle(PREPROCESSED_DATA_DIR / 'data_train.pkl')
data_test.to_pickle(PREPROCESSED_DATA_DIR / 'data_test.pkl')
