In [1]:
import pandas as pd
import numpy as np
from pyHSICLasso import HSICLasso
from matrixprofile import *
import lightgbm as lgb
import plotly_express as px
from sklearn.metrics import *

In [2]:
def generate_features(df, features, windows=range(2, 8), functions=(('mean', np.mean), ('std', np.std), ('max', np.max), ('min', np.min))):
    """Expand each column in ``features`` into windowed derived columns.

    For every ``feature`` and every ``window`` the following columns are added:

    * ``matrix_profile.{window}.{feature}`` -- the first element returned by
      ``matrixProfile.stomp`` for that column, left-padded with ``window - 1``
      NaNs so it lines up with the frame's rows.
    * ``{name}.{window}.{feature}`` -- ``function`` applied over a rolling
      window of the column, for each ``(name, function)`` entry in
      ``functions``.

    Gaps are interpolated before and after the expansion, remaining leading
    gaps are back-filled, and any column that still contains a missing value
    is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    features : iterable of str
        Names of the columns to expand. An empty iterable only cleans the frame.
    windows : iterable of int
        Window lengths to use.
    functions : iterable of (str, callable)
        Label/aggregator pairs applied to each rolling window.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the derived columns appended.

    Notes
    -----
    Only numeric columns are interpolated. Recent pandas releases deprecate
    interpolating object-dtype columns, so text columns are skipped here; they
    still go through the final back-fill.

    NOTE(review): the frame is processed as one contiguous series. If it
    stacks several groups (the notebook's data has a ``city`` column), windows
    near a group boundary mix rows from both groups -- confirm this is
    intended.
    """
    def _interpolate_numeric(frame):
        # Return a copy of ``frame`` with only its numeric columns interpolated.
        result = frame.copy()
        numeric_columns = list(result.select_dtypes(include='number').columns)
        if numeric_columns:
            result[numeric_columns] = result[numeric_columns].interpolate()
        return result

    df = _interpolate_numeric(df)
    for feature in features:
        for window in windows:
            profile = matrixProfile.stomp(df[feature].values, window)[0]
            # The profile is shorter than the column; pad the front with NaN
            # so it can be stored alongside the original rows.
            df[f'matrix_profile.{window}.{feature}'] = np.pad(profile, (window - 1, 0), 'constant', constant_values=np.nan)
            for entry in functions:
                name, function = entry[0], entry[1]
                df[f'{name}.{window}.{feature}'] = df[feature].rolling(window).apply(function)
    # Rolling windows and the padded profile start with NaNs: interpolate what
    # can be interpolated, then back-fill the leading rows.
    df = _interpolate_numeric(df).bfill()
    # Anything still missing (e.g. an all-NaN column) cannot be used downstream.
    return df.dropna(axis=1)

In [3]:
def train_model(sample, label, features):
    """Fit a LightGBM regressor on the labelled rows of ``sample``.

    Only the first ``len(label)`` rows of ``sample`` are used, restricted to
    the ``features`` columns; the target is ``label['total_cases']``.
    Returns the booster produced by ``lgb.train`` after 80 boosting rounds.
    """
    booster_params = dict(
        objective='regression',
        metric={'l1'},
        num_leaves=32,
        learning_rate=0.05,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=1,
    )
    # Keep only as many sample rows as there are labels.
    labelled_rows = sample[features].iloc[:len(label)]
    targets = label['total_cases']
    training_set = lgb.Dataset(labelled_rows, targets)
    return lgb.train(booster_params, training_set, num_boost_round=80)

In [4]:
# Download the dengue training features, training labels and test features
# (three CSV files on DrivenData's S3 bucket), in that order.
training_samples, training_labels, testing_samples = map(
    pd.read_csv,
    (
        'https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_features_train.csv',
        'https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_labels_train.csv',
        'https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_features_test.csv',
    ),
)

In [5]:
# Every column except the row identifiers can be fed to a model ...
trainable_features = [name for name in training_samples.columns if name not in ('city', 'year', 'week_start_date')]
# ... and every one of those except the week number gets windowed features.
explodable_features = [name for name in trainable_features if name != 'weekofyear']
# Expand the training and test frames independently with the same column list.
exploded_samples = generate_features(training_samples, explodable_features)
exploded_test = generate_features(testing_samples, explodable_features)


divide by zero encountered in true_divide


divide by zero encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in less


invalid value encountered in minimum


invalid value encountered in true_divide


invalid value encountered in sqrt



In [6]:
def get_lasso_features(sample, label, num_features):
    """Select ``num_features`` columns of ``sample`` with HSIC Lasso regression.

    The ``city``, ``year`` and ``week_start_date`` columns are excluded from
    the candidates. Only the first ``len(label)`` rows of ``sample`` are used,
    paired with ``label['total_cases']``. Returns whatever
    ``HSICLasso.get_features()`` reports after the regression.
    """
    excluded = ('city', 'year', 'week_start_date')
    candidates = [name for name in sample.columns if name not in excluded]
    # Keep only as many sample rows as there are labels.
    design_matrix = sample[candidates].iloc[:len(label)].values
    targets = label['total_cases'].values

    selector = HSICLasso()
    selector.input(design_matrix, targets, featname=candidates)
    selector.regression(num_features)
    return selector.get_features()

In [7]:
# Ask HSIC Lasso for 100 columns of the expanded training frame.
features = get_lasso_features(
    sample=exploded_samples,
    label=training_labels,
    num_features=100,
)

Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.



B 20 must be an exact divisor of the number of samples 1456. Number of blocks 72.8 will be approximated to 72.


invalid value encountered in true_divide


invalid value encountered in less_equal


invalid value encountered in less



In [23]:
# Fit on the selected columns of the expanded training frame, then score the
# same columns of the expanded test frame.
model = train_model(sample=exploded_samples, label=training_labels, features=features)
predictions = model.predict(
    exploded_test[features],
    num_iteration=model.best_iteration,
)

In [24]:
# Round the predictions to whole numbers and store them as 64-bit integers
# in a new column of the test frame.
testing_samples['total_cases'] = np.round(predictions, 0).astype(np.int64)

In [25]:
# Keep only the identifier columns plus the prediction, and write them to a
# CSV file without the row index.
submission = testing_samples.loc[:, ['city', 'year', 'weekofyear', 'total_cases']]
submission.to_csv('submission_mixed_data.csv', index=False)