In [5]:
import pandas as pd
import numpy as np
from pyHSICLasso import HSICLasso
from matrixprofile import *
import lightgbm as lgb
import plotly_express as px
from sklearn.metrics import *

In [6]:
def generate_features(df, features, windows=range(2, 8), functions=[('mean', np.mean),('std', np.std),('max', np.max),('min', np.min)]):
    """Add matrix-profile and rolling-window columns to ``df`` in place.

    For every (feature, window) pair two kinds of columns are written:

    * ``matrix_profile.{window}.{feature}`` -- element 0 of
      ``matrixProfile.stomp(values, window)`` preceded by ``window - 1`` zeros.
    * ``{name}.{window}.{feature}`` for each ``(name, callable)`` entry of
      ``functions`` -- the callable applied over ``rolling(window)``.

    Parameters
    ----------
    df : DataFrame; modified in place and also returned.
    features : iterable of column names of ``df`` to expand.
    windows : iterable of integer window lengths.
    functions : sequence of ``(name, callable)`` pairs; read only, never mutated.

    Returns
    -------
    The same ``df`` object with the new columns added.
    """
    for column_name in features:
        for width in windows:
            # Front padding of width - 1 zeros; presumably stomp yields
            # len(df) - width + 1 values, so this restores the frame length
            # -- TODO confirm against the matrixprofile docs.
            front_padding = np.zeros(width - 1)
            profile = matrixProfile.stomp(df[column_name].values, width)[0]
            df[f'matrix_profile.{width}.{column_name}'] = np.concatenate((front_padding, profile), axis=0)

            for spec in functions:
                label = spec[0]
                reducer = spec[1]
                rolled = df[column_name].rolling(width).apply(reducer)
                df[f'{label}.{width}.{column_name}'] = rolled
    return df

In [7]:
# Download the dengue training features, training labels and test features
# straight from the public DrivenData bucket on S3 (needs network access).
training_samples = pd.read_csv('https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_features_train.csv')
training_labels = pd.read_csv('https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_labels_train.csv')
testing_samples = pd.read_csv('https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_features_test.csv')

In [8]:
# Number of leading rows of `samples` treated as training data
# (used as `.iloc[:training_split]` in later cells).
# NOTE(review): presumably equals len(training_samples) -- confirm instead of hard-coding.
training_split = 1456

In [14]:
# Load the cached, pre-expanded feature table and discard the unnamed
# leading column that the CSV carries.
samples = pd.read_csv('exploded_features.csv').drop('Unnamed: 0', axis=1)

In [9]:
# Deliberately disabled (`if False`): this block rebuilds 'exploded_features.csv',
# the file that the cell assigning `samples` from disk reads. Flip the guard to
# regenerate the cache.
if False:
    # Stack train and test rows so derived features are computed over both.
    # NOTE(review): concat is called without ignore_index, so index labels
    # presumably repeat across the two frames -- confirm that is intended.
    samples = pd.concat([training_samples, testing_samples])
    # Columns to expand: everything except the identifier/date fields.
    features = [column for column in samples.columns if column not in ['city', 'year', 'week_start_date', 'weekofyear']]
    # Fill gaps before the window features are computed.
    samples = samples.interpolate()
    # Adds the matrix-profile and rolling-statistic columns to `samples` in place.
    generate_features(samples, features)
    # Computed before 'city_class' is added, so 'city_class' is not in this list.
    trainable_features = [column for column in samples.columns if column not in ['city', 'year', 'week_start_date']]
    # Integer code per distinct value of 'city'.
    samples['city_class'] = pd.factorize(samples['city'])[0]
    # Presumably fills the NaNs the rolling windows leave at the start of each
    # column: interpolate first, then back-fill whatever remains.
    samples[trainable_features] = samples[trainable_features].interpolate().bfill()
    # Written with the index included; presumably the source of the
    # 'Unnamed: 0' column that is dropped when the file is read back.
    samples.to_csv('exploded_features.csv')

In [47]:
# Candidate predictors: every column of `samples` except the identifier/date fields.
trainable_features = samples.columns[~samples.columns.isin(['city', 'year', 'week_start_date'])].tolist()

# Hand the training rows and their targets to HSIC Lasso, keeping the feature
# names attached.
# NOTE(review): only input() is called here; no fitting call (e.g. a regression
# step) is visible in this notebook even though get_features() is used later --
# confirm the selection step is run before that call.
lasso = HSICLasso()
lasso.input(
    samples[trainable_features].iloc[:training_split].values,
    training_labels['total_cases'].values,
    featname=trainable_features,
)


Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.



B 20 must be an exact divisor of the number of samples 1456. Number of blocks 72.8 will be approximated to 72.



True

In [56]:
def test_features(features, validation_start=1100):
    """Train a LightGBM regressor on ``features`` and return its hold-out MAE.

    Rows ``[0, validation_start)`` of ``samples`` / ``training_labels`` are used
    for fitting and rows ``[validation_start, training_split)`` for validation.
    The validation rows also drive early stopping, so the returned score is
    measured on the same rows that chose the stopping round.

    Parameters
    ----------
    features : list of column names of ``samples`` to train on.
    validation_start : int, default 1100
        Positional index of the first validation row.

    Returns
    -------
    float
        ``mean_absolute_error`` between the validation targets and the
        predictions made at the booster's best iteration.
    """
    targets = training_labels['total_cases'].values
    y_training = targets[:validation_start]
    # Bound the labels with training_split too, so they always line up with
    # x_validation (previously the labels were sliced to the end of the array).
    y_validation = targets[validation_start:training_split]
    x_training = samples[features].iloc[:validation_start]
    x_validation = samples[features].iloc[validation_start:training_split]

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(x_training, y_training)
    lgb_eval = lgb.Dataset(x_validation, y_validation, reference=lgb_train)

    # specify your configurations as a dict
    params = {
        'objective': 'regression',
        'metric': {'l1'},
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # Early stopping goes through the callback API: the `early_stopping_rounds=`
    # keyword of lgb.train was removed in LightGBM 4.0, while the callback is
    # also accepted by older releases.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=60,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(5)])

    y_pred = gbm.predict(x_validation, num_iteration=gbm.best_iteration)
    return mean_absolute_error(y_validation, y_pred)

In [57]:
# Baseline: validation MAE when training on every candidate feature.
test_features(trainable_features)

[1]	valid_0's l1: 21.5584
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 20.5923
[3]	valid_0's l1: 19.593
[4]	valid_0's l1: 18.6549
[5]	valid_0's l1: 18.032
[6]	valid_0's l1: 17.5497
[7]	valid_0's l1: 17.2754
[8]	valid_0's l1: 16.8484
[9]	valid_0's l1: 16.7138
[10]	valid_0's l1: 16.4835
[11]	valid_0's l1: 15.8829
[12]	valid_0's l1: 15.4834
[13]	valid_0's l1: 15.0537
[14]	valid_0's l1: 14.5942
[15]	valid_0's l1: 14.1352
[16]	valid_0's l1: 13.6777
[17]	valid_0's l1: 13.2438
[18]	valid_0's l1: 13.0048
[19]	valid_0's l1: 12.6867
[20]	valid_0's l1: 12.4209
[21]	valid_0's l1: 12.2834
[22]	valid_0's l1: 12.1504
[23]	valid_0's l1: 11.9002
[24]	valid_0's l1: 11.6406
[25]	valid_0's l1: 11.3399
[26]	valid_0's l1: 11.3135
[27]	valid_0's l1: 11.277
[28]	valid_0's l1: 11.2832
[29]	valid_0's l1: 11.2728
[30]	valid_0's l1: 11.2657
[31]	valid_0's l1: 11.0592
[32]	valid_0's l1: 10.8621
[33]	valid_0's l1: 10.6553
[34]	valid_0's l1: 10.4665
[35]	valid_0's l1: 10.3152
[36]	v

10.078008952243577

In [59]:
# Same evaluation, restricted to the features reported by the HSIC Lasso object.
test_features(lasso.get_features())

[1]	valid_0's l1: 21.7609
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 20.7669
[3]	valid_0's l1: 19.9261
[4]	valid_0's l1: 19.0925
[5]	valid_0's l1: 18.3671
[6]	valid_0's l1: 17.7698
[7]	valid_0's l1: 17.0796
[8]	valid_0's l1: 16.4766
[9]	valid_0's l1: 15.8094
[10]	valid_0's l1: 15.2683
[11]	valid_0's l1: 14.9449
[12]	valid_0's l1: 14.5762
[13]	valid_0's l1: 14.222
[14]	valid_0's l1: 13.7928
[15]	valid_0's l1: 13.5307
[16]	valid_0's l1: 13.1986
[17]	valid_0's l1: 12.8428
[18]	valid_0's l1: 12.4441
[19]	valid_0's l1: 12.1487
[20]	valid_0's l1: 11.8782
[21]	valid_0's l1: 11.6026
[22]	valid_0's l1: 11.3279
[23]	valid_0's l1: 11.0825
[24]	valid_0's l1: 10.9277
[25]	valid_0's l1: 10.6532
[26]	valid_0's l1: 10.4049
[27]	valid_0's l1: 10.1859
[28]	valid_0's l1: 9.93326
[29]	valid_0's l1: 9.76791
[30]	valid_0's l1: 9.58436
[31]	valid_0's l1: 9.4219
[32]	valid_0's l1: 9.29935
[33]	valid_0's l1: 9.11777
[34]	valid_0's l1: 8.92957
[35]	valid_0's l1: 8.87137
[36]	

7.9220323563136255

In [112]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(x_train[features], y_train['total_cases'])

# specify your configurations as a dict
params = {
    'objective': 'regression',
    'metric': {'l1'},
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=40)

In [115]:
# Expand the test frame with the same window features as the training data.
# NOTE(review): `test` is not assigned in any cell visible before this one --
# confirm its origin.

# Identifier/date columns are excluded from expansion.
features = test.columns[~test.columns.isin(['city', 'year', 'week_start_date', 'weekofyear'])].tolist()
test = test.interpolate()
# Set the identifier/date columns aside before the frame is expanded.
info = test.loc[:, ['city', 'year', 'week_start_date', 'weekofyear']]
test = generate_features(test, features)


divide by zero encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in less


invalid value encountered in minimum


divide by zero encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in sqrt



In [117]:
# Model inputs for prediction: every column of `test` except the
# identifier/date fields ('weekofyear' is kept).
# NOTE(review): this cell's execution count (117) is higher than that of the
# merge cell that follows it (116) -- confirm the intended run order.
features = [column for column in test.columns if column not in ['city', 'year', 'week_start_date']]

In [116]:
# Join the identifier columns back onto the expanded test frame; no `on=` is
# given, so the join keys are left to merge's default.
# NOTE(review): `test` presumably still contains these columns already
# (generate_features only adds columns) -- confirm this merge is needed.
test = pd.merge(info, test)

In [140]:
# Predict case counts for the prepared test frame with the final booster.
# NOTE(review): the visible training call for `gbm` passes no validation set or
# early stopping -- confirm what `gbm.best_iteration` holds in that case and
# that it selects the intended number of rounds.
y_pred = gbm.predict(test[features], num_iteration=gbm.best_iteration)

In [141]:
# Round the predictions and store them as integer case counts.
# NOTE(review): nothing here prevents a negative count if a prediction rounds
# below zero -- consider clipping at 0.
test['total_cases'] = np.around(y_pred).astype(np.int64)

In [142]:
# Keep only the columns written to the submission file, then save it
# without the index column.
submission = test.loc[:, ['city', 'year', 'weekofyear', 'total_cases']]
submission.to_csv('submission.csv', index=False)