In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's own helper modules take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from src.paths import RAW_DATA_DIR, PREPROCESSED_DATA_DIR
from src.data import RowFilterTransformer, SpeciesEncoder, split_date, select_columns

# Use seaborn's dark grid theme for every figure drawn in this notebook
sns.set_style(style='darkgrid')

### Cleaning weather data

In [3]:
# Load the raw weather observations and print a quick summary of their size,
# their columns, and how many cells hold the 'M' (missing) placeholder.
weather_csv_path = RAW_DATA_DIR / 'weather.csv'
weather = pd.read_csv(weather_csv_path)

row_count = weather.shape[0]
column_names = weather.columns.to_list()
print(f'\n---WEATHER DATA---\nNumber of rows: {row_count}\nColumns: {column_names}\n')

# Total number of cells, across all columns, equal to the string "M"
missing_marker_total = (weather == "M").to_numpy().sum()
print(f'Number of "M": {missing_marker_total}')


---WEATHER DATA---
Number of rows: 2944
Columns: ['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed']

Number of "M": 7415


In [4]:
# Keep only the weather columns used in the rest of the notebook.
columns_to_stay = ['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'CodeSum','PrecipTotal']
# .copy() detaches the subset from the raw frame: the following cells assign to
# columns of `weather`, and doing that on a bare column selection triggers
# pandas' SettingWithCopyWarning (and is ambiguous about what gets modified).
weather = weather[columns_to_stay].copy()

# look closer at missing values in weather data:
# per-column count of 'M' placeholders and their share of all rows (in percent)
missing_weather = (weather == 'M').sum().to_frame('number')
missing_weather['percent'] = (missing_weather['number'] / weather.shape[0] * 100).round(1)
missing_weather

Unnamed: 0,number,percent
Station,0,0.0
Date,0,0.0
Tmax,0,0.0
Tmin,0,0.0
Tavg,11,0.4
DewPoint,0,0.0
WetBulb,4,0.1
CodeSum,0,0.0
PrecipTotal,2,0.1


In [5]:
# Fill the missing daily average temperature ('M') with the midpoint of that
# day's Tmax and Tmin, then store the whole column as integers.
tavg_missing = weather.Tavg == 'M'

# Integer round-half-up of (Tmax + Tmin) / 2. The recorded Tavg values follow
# this convention (e.g. Tmax 86 / Tmin 69 is recorded as 78), whereas casting
# the float mean with astype('int') truncates 77.5 down to 77 and leaves the
# imputed rows biased low relative to the observed ones.
tavg_midpoint = (weather.Tmax + weather.Tmin + 1) // 2

# Non-missing entries are still strings at this point; astype('int') parses them.
weather['Tavg'] = np.where(tavg_missing, tavg_midpoint, weather.Tavg).astype('int')

# Next column to handle: rows whose WetBulb reading is missing
weather[weather.WetBulb=='M']

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,CodeSum,PrecipTotal
848,1,2009-06-26,86,69,78,60,M,,0.0
2410,1,2013-08-10,81,64,73,57,M,,0.0
2412,1,2013-08-11,81,60,71,61,M,RA,0.01
2415,2,2013-08-12,85,69,77,63,M,RA,0.66


In [6]:
# Impute the missing WetBulb readings ('M') with a linear regression on
# Tmax, Tmin and DewPoint, fitted on the rows where WetBulb is known.
lr = LinearRegression()

wetbulb_features = ['Tmax', 'Tmin', 'DewPoint']
# Computed once and reused for the train/predict split and the write-back
wetbulb_missing = weather.WetBulb == 'M'

# Guard: when nothing is left to impute (e.g. the cell is run a second time)
# the prediction frame is empty and scikit-learn's predict() rejects it.
if wetbulb_missing.any():
    X_train = weather.loc[~wetbulb_missing, wetbulb_features]
    y_train = weather.loc[~wetbulb_missing, 'WetBulb'].astype(float)
    X_test = weather.loc[wetbulb_missing, wetbulb_features]

    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # Predictions are rounded to whole degrees before being written back
    weather.loc[wetbulb_missing, 'WetBulb'] = y_pred.round()

# The column held strings because of the 'M' placeholders; make it numeric
weather['WetBulb'] = weather.WetBulb.astype(float)

# Next column to handle: rows whose PrecipTotal reading is missing
weather[weather.PrecipTotal=='M']

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,CodeSum,PrecipTotal
117,2,2007-06-28,73,61,67,56,61.0,,M
119,2,2007-06-29,71,56,64,56,60.0,,M


In [7]:
# Dates on which some station has a missing ('M') PrecipTotal reading
missing_dates = weather.loc[weather.PrecipTotal == 'M', 'Date']

# Show every station's record for those dates so the missing readings can be
# compared with what the other station reported on the same day.
weather[weather.Date.isin(missing_dates)]

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,CodeSum,PrecipTotal
116,1,2007-06-28,74,58,66,55,60.0,,0.00
117,2,2007-06-28,73,61,67,56,61.0,,M
118,1,2007-06-29,70,56,63,55,59.0,,0.00
119,2,2007-06-29,71,56,64,56,60.0,,M


In [8]:
# Dates on which station 1 recorded exactly '0.00' precipitation
station1_dry = (weather.Station == 1) & (weather.PrecipTotal == '0.00')
no_precip1_dates = weather.loc[station1_dry, 'Date']

# Station 2 rows on those same dates ...
station2_same_day = (weather.Station == 2) & weather.Date.isin(no_precip1_dates)
# ... whose PrecipTotal is anything other than '0.00' ...
precip_not_zero = weather.PrecipTotal != '0.00'
# ... with a blank weather-code field ...
codesum_blank = weather.CodeSum == ' '
# ... in June: Date is still a 'YYYY-MM-DD' string here, so the character at
# position -4 is the last digit of the month.
month_is_june = weather.Date.str[-4] == '6'

weather[station2_same_day & precip_not_zero & codesum_blank & month_is_june]

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,CodeSum,PrecipTotal
117,2,2007-06-28,73,61,67,56,61.0,,M
119,2,2007-06-29,71,56,64,56,60.0,,M
1173,2,2010-06-04,83,60,72,59,64.0,,T
1961,2,2012-06-30,89,70,80,68,72.0,,T
2667,2,2014-06-15,86,64,75,58,65.0,,T


In [9]:
# Value substituted for both trace ('T') and missing ('M') precipitation
# readings; kept as a string because it is swapped in before the float cast.
TRACE_PRECIP = '0.001'

# Non-inplace drop with errors='ignore' so re-running the cell does not raise
# KeyError once CodeSum is already gone.
weather = weather.drop(columns='CodeSum', errors='ignore')

# Strip padding, then replace the sentinels by exact match only. A substring
# str.replace would also rewrite any other value that merely contains T or M.
# astype(str) keeps the chain working when the column is already numeric.
weather['PrecipTotal'] = (
    weather.PrecipTotal
    .astype(str)
    .str.strip()
    .replace({'T': TRACE_PRECIP, 'M': TRACE_PRECIP})
    .astype('float')
)

weather['Date'] = pd.to_datetime(weather.Date)

# Confirm that every remaining column is numeric or datetime
weather.dtypes

Station                 int64
Date           datetime64[ns]
Tmax                    int64
Tmin                    int64
Tavg                    int64
DewPoint                int64
WetBulb               float64
PrecipTotal           float64
dtype: object

In [10]:
# Reshape from one row per (station, date) to one row per date: split the table
# by station, drop the now-constant Station column, and join the two halves on
# Date. Station 1 columns get the suffix '_1', station 2 columns '_2'.
from_station_1 = weather.Station == 1
from_station_2 = weather.Station == 2

weather_st1 = weather.loc[from_station_1].drop(columns='Station')
weather_st2 = weather.loc[from_station_2].drop(columns='Station')

weather = pd.merge(weather_st1, weather_st2, on='Date', suffixes=['_1', '_2'])

In [11]:
# Preview the merged per-date weather table (first five rows)
weather.head(n=5)

Unnamed: 0,Date,Tmax_1,Tmin_1,Tavg_1,DewPoint_1,WetBulb_1,PrecipTotal_1,Tmax_2,Tmin_2,Tavg_2,DewPoint_2,WetBulb_2,PrecipTotal_2
0,2007-05-01,83,50,67,51,56.0,0.0,84,52,68,51,57.0,0.0
1,2007-05-02,59,42,51,42,47.0,0.0,60,43,52,42,47.0,0.0
2,2007-05-03,66,46,56,40,48.0,0.0,67,48,58,40,50.0,0.0
3,2007-05-04,66,49,58,41,50.0,0.001,78,51,64,42,50.0,0.0
4,2007-05-05,66,53,60,38,49.0,0.001,66,54,60,39,50.0,0.001


In [12]:
# Persist the cleaned, per-date weather table
clean_weather_path = PREPROCESSED_DATA_DIR / 'clean_weather.pkl'
weather.to_pickle(clean_weather_path)

### Transforming training and test data based on EDA findings

In [12]:
# Load the raw train and test sets from the raw-data directory
data_train, data_test = (
    pd.read_csv(RAW_DATA_DIR / csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [13]:
# Preprocessing steps, applied in this order. What each project helper does is
# defined in src.data; see that module for details.
date_transformer = FunctionTransformer(split_date)
row_filter_transformer = RowFilterTransformer()
species_encoder = SpeciesEncoder()
cols_selector = FunctionTransformer(select_columns)

# No `memory=` caching here. Every step finishes in ~0.0s, so a disk cache buys
# nothing, and it has two costs:
#   * joblib keys cached results on the pickled transformer and its input, not
#     on the source code of split_date / SpeciesEncoder, so edits to src.data
#     picked up by autoreload can be masked by stale cached output;
#   * with caching on, the pipeline fits clones, leaving the instances named
#     above unfitted and useless for inspection.
data_preprocessing_pipeline = make_pipeline(
    date_transformer,
    row_filter_transformer,
    species_encoder,
    cols_selector,
    verbose=True
)

# Fit on the training set only, then apply the fitted steps to the test set
data_train = data_preprocessing_pipeline.fit_transform(data_train)
data_test = data_preprocessing_pipeline.transform(data_test)

[Pipeline]  (step 2 of 4) Processing rowfiltertransformer, total=   0.0s
[Pipeline]  (step 4 of 4) Processing functiontransformer-2, total=   0.0s


In [17]:
# (rows, columns) of the preprocessed training set, followed by its first rows
print(data_train.shape)
data_train.head(n=5)

(6017, 12)


Unnamed: 0,Date,Trap,Latitude,Longitude,WnvPresent,Month,Year,Week,Dayofyear,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS
0,2007-07-02,T002,41.95469,-87.800991,0,7,2007,27,183,0.0,1.0,0.0
1,2007-07-02,T002,41.95469,-87.800991,0,7,2007,27,183,0.0,0.0,1.0
2,2007-07-02,T015,41.974089,-87.824812,0,7,2007,27,183,0.0,1.0,0.0
3,2007-07-02,T045,41.9216,-87.666455,0,7,2007,27,183,0.0,1.0,0.0
4,2007-07-02,T045,41.9216,-87.666455,0,7,2007,27,183,0.0,0.0,1.0


In [18]:
# (rows, columns) of the preprocessed test set, followed by its first rows
print(data_test.shape)
data_test.head(n=5)

(16269, 12)


Unnamed: 0,Id,Date,Trap,Latitude,Longitude,Month,Year,Week,Dayofyear,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS
0,3634,2008-07-01,T002,41.95469,-87.800991,7,2008,27,183,0.0,1.0,0.0
1,3635,2008-07-01,T002,41.95469,-87.800991,7,2008,27,183,0.0,0.0,1.0
2,3636,2008-07-01,T002,41.95469,-87.800991,7,2008,27,183,1.0,0.0,0.0
3,3650,2008-07-01,T015,41.974089,-87.824812,7,2008,27,183,0.0,1.0,0.0
4,3651,2008-07-01,T015,41.974089,-87.824812,7,2008,27,183,0.0,0.0,1.0


In [19]:
# Persist both preprocessed sets next to the cleaned weather table
preprocessed_outputs = (
    (data_train, 'preprocessed_train.pkl'),
    (data_test, 'preprocessed_test.pkl'),
)
for frame, pickle_name in preprocessed_outputs:
    frame.to_pickle(PREPROCESSED_DATA_DIR / pickle_name)