# TODO
- try different wavelets and classifier parameters
- stacking classifier?
- sample from classes such that they all have same cardinality
- try with CNN (or RNN)

# Imports & setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import scipy
from collections import Counter
import pywt
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [2]:
n_cores = 1
random_state = 71
data_directory = 'data/'

waveletname = 'db31'
level = 10

# Data import

In [3]:
X_train_df = pd.read_csv(data_directory + 'X_train.csv', index_col='id')
y_train_df = pd.read_csv(data_directory + 'y_train.csv', index_col='id')
X_test_df = pd.read_csv(data_directory + 'X_test.csv', index_col='id')

# Length adjustments

### Drop trailing NaN

In [4]:
def drop_trailing_na(df: pd.DataFrame):
    return [df.loc[i].dropna().to_numpy() for i in range(df.shape[0])]

X_train_full = drop_trailing_na(X_train_df)
y_train_full = y_train_df['y'].to_numpy()
X_test = drop_trailing_na(X_test_df)

# Separation in training and validation

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=random_state)

# Features extraction

### Features to extract

In [6]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    entropy = scipy.stats.entropy(probabilities)
    return entropy

def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]
 
def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]
 
def get_features(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics

### Noise handling

In [7]:
def wavelet_transform(signal):
    return pywt.wavedec(signal, waveletname, level=level)

## Feature extraction

In [8]:
def get_dataset_features(data):
    list_features = []
    for signal in data:
        list_coeff = wavelet_transform(signal)
        features = []
        for coeff in list_coeff:
            features += get_features(coeff)
        list_features.append(features)
    return list_features

X_train_extracted = get_dataset_features(X_train)
X_val_extracted = get_dataset_features(X_val)



# Classification

### Classifier definition

In [9]:
#cls = GradientBoostingClassifier(n_estimators=100, verbose=1, random_state=random_state)
cls = XGBClassifier(seed=random_state) #(n_estimators=100, gamma=1, reg_alpha=3, reg_lambda=0, max_depth=10, min_child_weight=0, colsample_bytree=0.85, seed=random_state)

## Classifier fit and evaluation

In [10]:
cls.fit(X_train_extracted, y_train.T.ravel())

y_train_pred = cls.predict(X_train_extracted)
y_val_pred = cls.predict(X_val_extracted)

In [11]:
train_score = f1_score(y_train, y_train_pred, average='micro')
val_score = f1_score(y_val, y_val_pred, average='micro')

print(train_score, val_score)

1.0 0.673828125


# Final classification

In [12]:
X_train_full_extracted = get_dataset_features(X_train_full)
X_test_extracted = get_dataset_features(X_test)



In [None]:
y_test_pred = cls.fit(X_train_full_extracted, y_train_full.T.ravel()).predict(X_test_extracted)

### Writing results

In [None]:
table = pd.DataFrame({'id': np.arange(0, y_test_pred.shape[0]), 'y': y_test_pred.flatten()})
table.to_csv(data_directory + 'y_test_pred.csv', index=False)