In [1]:
import pandas as pd
import numpy as np
from pyHSICLasso import HSICLasso
from matrixprofile import *
import lightgbm as lgb
import plotly_express as px
from sklearn.metrics import *

In [2]:
def generate_features(df, features, windows=range(2, 8), functions=[('mean', np.mean),('std', np.std),('max', np.max),('min', np.min)]):
    """Add matrix-profile and rolling-window columns to ``df`` for each feature.

    For every column name in ``features`` and every size in ``windows`` the
    following columns are written:

    * ``matrix_profile.{window}.{feature}`` -- the first element returned by
      ``matrixProfile.stomp(values, window)``, left-padded with ``window - 1``
      NaN values (presumably so its length matches the source column --
      confirm against the matrixprofile documentation).
    * ``{name}.{window}.{feature}`` for each ``(name, callable)`` entry of
      ``functions`` -- ``callable`` applied over a rolling window of that size.

    Args:
        df: DataFrame holding the source columns; it is modified in place.
        features: iterable of column names to expand.
        windows: iterable of window sizes.
        functions: sequence of ``(name, callable)`` pairs used for the rolling
            statistics.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    for column in features:
        for width in windows:
            source = df[column]
            profile = matrixProfile.stomp(source.values, width)[0]
            df[f'matrix_profile.{width}.{column}'] = np.pad(
                profile, (width - 1, 0), 'constant', constant_values=np.nan
            )
            for statistic in functions:
                stat_name, stat_func = statistic[0], statistic[1]
                df[f'{stat_name}.{width}.{column}'] = df[column].rolling(width).apply(stat_func)
    return df

In [3]:
def get_lasso_features(sample, label, regression):
    """Select features with HSIC Lasso and return what ``get_features()`` yields.

    Columns containing any NaN are dropped first. The ``city``, ``year`` and
    ``week_start_date`` columns are never offered to the selector. Only the
    first ``len(label)`` rows of ``sample`` are used, paired with
    ``label['total_cases']``.

    Args:
        sample: DataFrame of candidate feature columns.
        label: DataFrame with a ``total_cases`` column.
        regression: value forwarded to ``HSICLasso.regression`` (presumably the
            number of features to select -- confirm against pyHSICLasso docs).

    Returns:
        The result of ``HSICLasso.get_features()``.
    """
    excluded = ['city', 'year', 'week_start_date']
    complete = sample.dropna(axis=1)
    candidate_names = [name for name in complete.columns if name not in excluded]

    inputs = complete[candidate_names].iloc[:len(label)].values
    targets = label['total_cases'].values

    selector = HSICLasso()
    selector.input(inputs, targets, featname=candidate_names)
    selector.regression(regression)
    return selector.get_features()

In [4]:
# Download the training features, training labels and test features
# (in that order) straight from the competition's public bucket.
training_samples, training_labels, testing_samples = map(pd.read_csv, (
    'https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_features_train.csv',
    'https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_labels_train.csv',
    'https://s3.amazonaws.com:443/drivendata/data/44/public/dengue_features_test.csv',
))

In [5]:
# Replace each frame with a list of per-city frames. groupby sorts its keys by
# default, so the three lists share the same city order.
# NOTE: this rebinds the names loaded in the previous cell, so the cell cannot
# be re-run without re-running the load cell first.
training_samples = [city_frame for _city, city_frame in training_samples.groupby('city')]
training_labels = [city_frame for _city, city_frame in training_labels.groupby('city')]
testing_samples = [city_frame for _city, city_frame in testing_samples.groupby('city')]

In [6]:
# Load the per-city feature tables from local CSV files. The file names match
# the `{city}_exploded_features.csv` pattern written by the disabled
# feature-generation cell further down.
dengue_samples = [
    pd.read_csv(csv_path)
    for csv_path in ('iq_exploded_features.csv', 'sj_exploded_features.csv')
]

In [7]:
# Run HSIC Lasso once per city, passing 100 as the `regression` argument.
lasso_features = [
    get_lasso_features(city_sample, city_label, 100)
    for city_sample, city_label in zip(dengue_samples, training_labels)
]

Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.



divide by zero encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in less_equal


invalid value encountered in less



Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.



B 20 must be an exact divisor of the number of samples 936. Number of blocks 46.8 will be approximated to 46.


divide by zero encountered in true_divide


divide by zero encountered in true_divide



In [8]:
# Disabled one-off cell (guarded by `if False:`). When enabled it builds the
# per-city feature tables (training rows followed by test rows) and writes each
# one to `{city}_exploded_features.csv`.
if False:
    # Rebuild the list from scratch instead of appending: `dengue_samples` may
    # already hold the frames loaded from CSV, and appending to it would make
    # the loop below process stale frames as well.
    dengue_samples = [
        pd.concat([training_sample, testing_sample])
        for training_sample, testing_sample in zip(training_samples, testing_samples)
    ]
    for i, sample in enumerate(dengue_samples):
        # Fill gaps in the raw columns before deriving windowed features.
        sample = sample.interpolate()
        generate_features(sample, [column for column in sample.columns if column not in ['city', 'year', 'week_start_date', 'weekofyear']])
        trainable_features = [column for column in sample.columns if column not in ['city', 'year', 'week_start_date']]
        # Interpolate the derived columns, then back-fill what is still missing
        # at the start. `.bfill()` replaces the deprecated
        # `fillna(method='bfill')` spelling.
        sample[trainable_features] = sample[trainable_features].interpolate().bfill()
        city = sample['city'].iloc[0]
        sample.to_csv(f'{city}_exploded_features.csv', index=False)
        dengue_samples[i] = sample

In [9]:
def test_features(sample, label, features, split=0.7):
    """Train LightGBM on a leading slice of the data and score the remainder.

    The first ``int(len(label) * split)`` rows are used for training; the rows
    from there up to ``len(label)`` are used for validation and early stopping.
    Rows of ``sample`` beyond ``len(label)`` are ignored.

    Args:
        sample: DataFrame containing at least the columns in ``features``.
        label: DataFrame with a ``total_cases`` column.
        features: list of column names to train on.
        split: fraction of the labelled rows used for training.

    Returns:
        Mean absolute error of the model's predictions on the validation rows.
    """
    # Compute the split position once so every slice agrees on it.
    cutoff = int(len(label) * split)

    targets = label['total_cases'].values
    y_training = targets[:cutoff]
    y_validation = targets[cutoff:]

    selected = sample[features]
    x_training = selected.iloc[:cutoff]
    x_validation = selected.iloc[cutoff:len(label)]

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(x_training, y_training)
    lgb_eval = lgb.Dataset(x_validation, y_validation, reference=lgb_train)

    # specify your configurations as a dict
    params = {
        'objective': 'regression',
        'metric': {'l1'},
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # Early stopping is requested through a callback: the
    # `early_stopping_rounds=` keyword of `lgb.train` was removed in
    # LightGBM 4.0, while the callback form is accepted by older releases too.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=60,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=5)])

    y_pred = gbm.predict(x_validation, num_iteration=gbm.best_iteration)
    return mean_absolute_error(y_validation, y_pred)

In [33]:
def train_model(sample, label, features):
    """Fit a LightGBM regressor on the labelled rows and return the booster.

    Args:
        sample: DataFrame containing at least the columns in ``features``; only
            its first ``len(label)`` rows are used.
        label: DataFrame with a ``total_cases`` column used as the target.
        features: list of column names to train on.

    Returns:
        The booster returned by ``lgb.train`` after 80 boosting rounds (no
        validation set, no early stopping).
    """
    labelled_rows = sample[features].iloc[:len(label)]
    train_set = lgb.Dataset(labelled_rows, label['total_cases'])

    booster_params = {
        'objective': 'regression',
        'metric': {'l1'},
        'num_leaves': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 1
    }
    return lgb.train(booster_params, train_set, num_boost_round=80)

In [34]:
def predict_values(sample, label, features, model):
    """Predict for the rows of ``sample`` that come after the labelled rows.

    Args:
        sample: DataFrame containing at least the columns in ``features``.
        label: object whose length gives the number of leading rows to skip.
        features: list of column names passed to the model.
        model: object exposing ``predict(data, num_iteration=...)`` and a
            ``best_iteration`` attribute.

    Returns:
        Whatever ``model.predict`` returns for rows ``len(label):`` of
        ``sample[features]``.
    """
    unlabelled_rows = sample[features].iloc[len(label):]
    iteration = model.best_iteration
    return model.predict(unlabelled_rows, num_iteration=iteration)

In [35]:
# Train one model per city, using that city's selected features.
models = [
    train_model(city_sample, city_label, city_features)
    for city_sample, city_label, city_features in zip(dengue_samples, training_labels, lasso_features)
]

In [36]:
# Predict per city, round to the nearest whole number and cast to int64.
predictions = [
    np.around(predict_values(city_sample, city_label, city_features, city_model)).astype(np.int64)
    for city_sample, city_label, city_features, city_model
    in zip(dengue_samples, training_labels, lasso_features, models)
]

In [37]:
# Attach each city's predictions to its test frame. Pairing the two lists with
# zip avoids hard-coding the number of cities.
for city_testing_sample, city_predictions in zip(testing_samples, predictions):
    city_testing_sample['total_cases'] = city_predictions

In [38]:
# Put the per-city test frames in descending city order. On the first run this
# matches reversing the groupby-sorted list; unlike an in-place reverse(), it
# gives the same order when the cell is re-run.
# NOTE(review): presumably the submission expects this city order -- confirm.
testing_samples.sort(key=lambda city_frame: city_frame['city'].iloc[0], reverse=True)

In [39]:
# Stack the per-city test frames, keep only the four submission columns and
# write them to disk without the index.
submission = pd.concat(testing_samples).loc[:, ['city', 'year', 'weekofyear', 'total_cases']]
submission.to_csv('submission.csv', index=False)