In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [3]:
from src.data import read_raw_data


data = read_raw_data()

In [4]:
from src.data import clean_weather


data['weather'] = clean_weather(data['weather'])

In [5]:
from src.data import remove_address, split_date

data['train'] = split_date(data['train'])
data['train'] = remove_address(data['train'])
data['test'] = data['train'][data['train']['Year'].isin([2013])]
data['train'] = data['train'][data['train']['Year'].isin([2009, 2011])]

In [6]:
data['train'].shape, data['test'].shape

((4303, 12), (2392, 12))

In [7]:
data['train']['WnvPresent'].mean(), data['test']['WnvPresent'].mean()

(np.float64(0.017662096211945155), np.float64(0.09991638795986622))

In [8]:
from src.data import MonthSpeciesTrapTransformer

transformer = MonthSpeciesTrapTransformer()

data['train'] = transformer.fit_transform(data['train'])
data['test'] = transformer.transform(data['test'])

In [9]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


species_oh_encoder = OneHotEncoder(sparse_output=False)
encoded_train = species_oh_encoder.fit_transform(data['train']['Species'].to_frame())
species_train = pd.DataFrame(
    encoded_train,
    columns=species_oh_encoder.get_feature_names_out()
)
data['train'] = pd.concat(
    [
        data['train'].reset_index(drop=True).drop('Species', axis=1),
        species_train
    ], 
    axis=1
)

encoded_test = species_oh_encoder.transform(data['test']['Species'].to_frame())
species_test = pd.DataFrame(
    encoded_test,
    columns=species_oh_encoder.get_feature_names_out()
)
data['test'] = pd.concat(
    [
        data['test'].reset_index(drop=True).drop('Species', axis=1),
        species_test
    ],
    axis=1
)

In [10]:
cols = ['Date', 'Latitude', 'Longitude', 'Dayofyear',
       'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS',
       'Species_CULEX RESTUANS', 'WnvPresent']
data['train'] = data['train'][cols]
data['test'] = data['test'][cols]

In [11]:
from src.features import aggregate_columns_with_lag


agg_df = aggregate_columns_with_lag(data['weather'], lags=[1,8,15], windows=[7], agg_func='mean')
data['weather'] = pd.concat(
    [data['weather'], agg_df],
    axis=1
).reset_index()

In [12]:
data['train'] = data['train'].merge(data['weather'].reset_index(), on='Date')
data['test'] = data['test'].merge(data['weather'].reset_index(), on='Date')
data['train'].drop('Date', axis=1, inplace=True)
data['test'].drop('Date', axis=1, inplace=True)

In [13]:
X_train = data['train'].copy()
y_train = X_train.pop('WnvPresent')

# data['test'].dropna(inplace=True)
X_test = data['test'].copy()
y_test = X_test.pop('WnvPresent')

print(y_train.mean(), y_test.mean())

0.02445302445302445 0.12236503856041131


In [14]:
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RandomUnderSampler, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold

from src.evaluate import evaluate


classifiers = {
    'lr': LogisticRegression(),
    # 'svc': SVC(probability=True),
    # 'rand_forest': RandomForestClassifier(),
    'lgbm_unb': LGBMClassifier(is_unbalance=True, colsample_bytree=0.6, verbose=-1),
    'lgbm': LGBMClassifier(is_unbalance=False),
    # 'xgb': XGBClassifier(),
    'bag_lr': BaggingClassifier(LogisticRegression(), n_estimators=10, max_samples=0.8, max_features=0.6)
}

def run_one_clf(clf):

    steps = [
        ('scaler', StandardScaler()),
        # ('smote', SMOTE(sampling_strategy=0.2)),
        # ('adasyn', ADASYN(sampling_strategy=0.2)),
        # ('tomek', TomekLinks()),
        # ('enn', EditedNearestNeighbours()),
        ('clf', clf)
    ]

    pipeline = Pipeline(steps=steps)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)

    cv_results = cross_validate(pipeline, X_train, y_train, scoring='roc_auc', cv=cv)

    cv_score = cv_results['test_score'].mean().round(4)
    
    pipeline.fit(X_train, y_train)
    
    return cv_score, pipeline

In [15]:
cv_score, pipeline = run_one_clf(classifiers['bag_lr'])
print(cv_score)

0.7786


In [16]:
from sklearn.metrics import roc_auc_score


y_pred = pipeline.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_pred).round(4)

np.float64(0.5671)