In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import rampwf as rw
from rampwf.score_types import BaseScoreType
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import label_binarize

In [2]:
problem_title = 'Teleray alert: Radiological events classification'

# Class label names handed to the multiclass prediction type below.
_prediction_label_names = [
    'Weather',
    'AtmosphericRejection',
    'ElectronicPeak1',
    'ElectronicPeak2',
    'ElectronicPeak3',
]

# Prediction container built by rampwf for these labels.
Predictions = rw.prediction_types.make_multiclass(label_names=_prediction_label_names)


In [3]:
# Workflow: instance of rampwf's FeatureExtractorClassifier.
workflow = rw.workflows.FeatureExtractorClassifier()

In [4]:
class weighted_ROCAUC(BaseScoreType):
    """Support-weighted ROC AUC for multiclass probability predictions.

    The true labels are one-hot encoded, an AUC is computed for each class
    column, and the per-class AUCs are averaged with weights proportional to
    the number of true samples of each class (``average='weighted'``).
    """

    is_lower_the_better = False
    minimum = 0.0
    maximum = 1.0

    def __init__(self, name='weighted_roc_auc', precision=2):
        """Store the display name and the number of decimals of the score."""
        self.name = name
        self.precision = precision

    def score_function(self, ground_truths, predictions):
        """Compute the weighted average AUC from two prediction objects.

        Parameters
        ----------
        ground_truths : object exposing ``y_pred_label_index``
            True labels; assumed to be integer class indices in
            ``0 .. n_classes - 1`` -- TODO confirm against rampwf.
        predictions : object exposing ``y_pred``
            Predicted scores, expected shape (n_samples, n_classes).

        Returns
        -------
        float
            The support-weighted average of the per-class ROC AUCs.
        """
        y_proba = predictions.y_pred  # shape (n_samples, n_classes)
        y_true_index = ground_truths.y_pred_label_index
        # Binarize against the full, fixed class range taken from the
        # prediction matrix rather than against the labels that happen to be
        # present: this keeps column k of the one-hot matrix aligned with
        # column k of y_proba even when a class is missing from y_true.
        n_classes = y_proba.shape[1]
        y_true_proba = label_binarize(
            y_true_index, classes=np.arange(n_classes))  # (n_samples, n_classes)
        self.check_y_pred_dimensions(y_true_proba, y_proba)
        return self.__call__(y_true_proba, y_proba)

    def __call__(self, y_true_proba, y_proba):
        """Return the support-weighted ROC AUC of ``y_proba`` vs. one-hot ``y_true_proba``."""
        return roc_auc_score(y_true_proba, y_proba, average='weighted')


# Score types: the custom weighted ROC AUC defined in this file, followed by
# rampwf's balanced accuracy registered under the name 'acc'.
score_types = [weighted_ROCAUC(), rw.score_types.BalancedAccuracy(name='acc')]

In [5]:
# Cross-validation scheme
def get_cv(X, y):
    """Return the train/test index splits of a seeded 5-fold stratified CV.

    Parameters
    ----------
    X : array-like
        Samples to split.
    y : array-like
        Labels used for stratification.

    Returns
    -------
    generator
        The result of ``StratifiedKFold.split(X, y)``.
    """
    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn releases raise ValueError when a seed is passed without it.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
    return cv.split(X, y)

In [10]:
#Data reading 
def _read_data(f_name_events, f_name_timeSeries):
    df_events= pickle.load(open(f_name_events, 'rb'))
    df_timeSeries= pickle.load(open(f_name_timeSeries, 'rb'))
    y= df_events.Label
    return df_events, df_timeSeries, y

def get_train_data(path='.'):
    """Read the training events and time series stored under ``<path>/data/public``.

    Returns whatever ``_read_data`` returns for the two training pickle files.
    """
    public_dir = os.path.join(path, 'data/public')
    events_path = os.path.join(public_dir, 'train_events.pickle')
    series_path = os.path.join(public_dir, 'train_time_series.pickle')
    return _read_data(events_path, series_path)


def get_test_data(path='.'):
    """Read the test events and time series stored under ``<path>/data/public``.

    Parameters
    ----------
    path : str, optional
        Root directory of the kit. Defaults to ``'.'``, the same default as
        ``get_train_data``; previously the body referenced an undefined
        ``path`` name and raised ``NameError``.

    Returns
    -------
    tuple
        Whatever ``_read_data`` returns for the two test pickle files.
    """
    f_name_events = 'test_events_ramp.pickle'
    f_name_timeSeries = 'test_time_series.pickle'

    f_name_events = os.path.join(path, 'data/public', f_name_events)
    f_name_timeSeries = os.path.join(path, 'data/public', f_name_timeSeries)
    return _read_data(f_name_events, f_name_timeSeries)