In [2]:
#Import required python packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import importlib
import statistics
import time
from sklearn import metrics
from skmultiflow.data.data_stream import DataStream

In [None]:
# Import helper scripts
from helper_functions import keras_NN
from helper_functions import methods_sampling
from helper_functions import control_methods
from helper_functions import SQA_preprocessing
from helper_functions import params

In [None]:
# Load datasets
# The file names are written as raw strings: sequences such as '\S', '\e' and '\A' are
# invalid escape sequences in ordinary string literals (SyntaxWarning on Python 3.12+,
# scheduled to become an error). The resulting paths are unchanged.
# NOTE(review): the backslash separators make these paths Windows-specific.
df_sqa = pd.read_csv(params.filepath_project_folder + r'\SQA_full_prepro_data.csv', index_col=0)
df_features_kaco = pd.read_csv(params.filepath_project_folder + r'\features_kaco.csv', index_col=0)
df_A600 = pd.read_csv(params.filepath_project_folder + r'\extra_features\A600_Date.csv', index_col=0)

In [None]:
# Source column of the label and the cut-off value handed to the binarisation helper
target_col, threshold = 'delta_A600_E100', 400

# Derive the binary label from the raw SQA data
df_SQA_cat = SQA_preprocessing.get_binary_SQA_Cats(
    df_sqa,
    target_col,
    threshold,
)

In [None]:
# Attach the binary label column and the A600 data to the features (index-based joins),
# then discard every row that still contains a missing value
data_full = (
    df_features_kaco
    .join(df_SQA_cat['delta_A600_E100_cat'])
    .join(df_A600)
    .dropna()
)

In [None]:
# Order the rows by 'Datetime_A600'; the column is only needed for sorting and is removed afterwards
data_full = (
    data_full
    .sort_values(by='Datetime_A600')
    .drop(columns=['Datetime_A600'])
)

In [None]:
# Cast every column to int, then cut away the first and the last 2000 samples
# NOTE(review): re-running this cell trims the data again; restart from the join cell when re-executing
data_full = data_full.astype(int).iloc[2000:-2000]

In [None]:
# Factory for a freshly initialised aNN classifier
# (control_methods is reloaded first so that edits to the helper script are picked up)
importlib.reload(control_methods)
def get_clf():
    """Return a new classifier built by control_methods.get_binary_NN_clf.

    The input shape handed to the helper is the number of columns of
    `df_features_kaco`.
    """
    return control_methods.get_binary_NN_clf(len(df_features_kaco.columns))

In [None]:
# Wrap the prepared dataset in a DataStream and get it ready for reading
stream = DataStream(data=data_full)
stream.prepare_for_use()

In [None]:
# Number of samples still available in the stream (shown as the cell output)
stream.n_remaining_samples()

In [None]:
# Method to calculate statistics for the Page-Hinkley test
# Module-level default kept for backward compatibility; compute_ph_statistics no longer
# reads it, because a constant divisor of 1 made the running mean equal to the latest score.
sample_count = 1
def compute_ph_statistics(mean_score, ph_sum, score, ph_sum_list, n_samples=None):
    """Update the running Page-Hinkley statistics with one new score.

    Args:
        mean_score: running mean of the scores observed since the last reset.
        ph_sum: cumulative sum of the deviations (score - running mean) so far.
        score: the newly observed score.
        ph_sum_list: history of the cumulative sums since the last reset. The new
            cumulative sum is appended to this list in place.
        n_samples: 1-based position of `score` in the current run. When omitted it
            is derived as ``len(ph_sum_list) + 1``, which is correct as long as the
            history holds one entry per previous call and is emptied on reset.

    Returns:
        Tuple ``(mean_new, ph_sum_new, ph_sum_list, statistic_max, statistic_min)``:
        the updated mean, the updated cumulative sum, the (mutated) history, the
        distance of the current sum below the historic maximum and its distance
        above the historic minimum.
    """
    if n_samples is None:
        # One history entry is appended per call, so this is the index of the new score
        n_samples = len(ph_sum_list) + 1

    # Incremental update of the mean
    mean_new = mean_score + ((score - mean_score) / n_samples)

    # Compute new sum of deviations
    ph_sum_new = ph_sum + (score - mean_new)
    ph_sum_list.append(ph_sum_new)

    # Compute change statistics (the new sum is already part of the history)
    statistic_max = max(ph_sum_list) - ph_sum_new
    statistic_min = ph_sum_new - min(ph_sum_list)

    return (mean_new, ph_sum_new, ph_sum_list, statistic_max, statistic_min)

In [None]:
# Supplies the start state that the Page-Hinkley variables are set back to once a drift is detected
def reset_values():
    """Return the initial state as (mean, cumulative sum, sum history, sample count).

    A new, empty history list is created on every call.
    """
    return (0, 0, [], 1)

In [None]:
# Parameters of the online learning experiments
init_training = 50000       # number of samples drawn from the stream for the initial training
C = 10                      # count threshold used to decide when a batch of stream samples is complete
batch_size = 16             # batch_size handed to clf.fit
epochs = 6                  # epochs handed to clf.fit when updating / retraining
retrain_batch_size = 20     # only used to derive `iters` below

# Number of fixed-size batches that fit into the data left after the initial training
iters = int(
    (len(data_full) - init_training) / retrain_batch_size
)

In [None]:
# Resampling step that is applied to the data before every training call
def sample(X, y):
    """Combine features and target into one frame and resample it.

    `X` and `y` are each wrapped in a DataFrame and joined; the overlapping target
    column receives the suffix '_target'. The joined frame is passed to
    methods_sampling.sample_data with target column '0_target' and method
    'SMOTEENN', and the helper's result is returned.
    """
    combined = pd.DataFrame(X).join(pd.DataFrame(y), rsuffix='_target')
    return methods_sampling.sample_data(combined, '0_target', 'SMOTEENN')

In [None]:
# Notrain scenario: the classifier is trained once on the initial window and afterwards
# only evaluated on the batches collected from the stream
# Initialize
late_count = 0   # sum of the labels in the current batch (= late-class samples for 0/1 labels)
counter = 0      # number of samples in the current batch
iterator = 0     # number of evaluated batches
X_list = []
y_list = []

# Initial training
stream.restart()
X, y = stream.next_sample(init_training)
clf = get_clf()
df_sampled = sample(X, y)
clf.fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=1)

scores = []

# Stream simulation
while stream.n_remaining_samples() > 1:
    # Get the next item from the stream
    X, y = stream.next_sample(1)
    X_list.append(X)
    y_list.append(y)

    # Count the instances of the late class so far (not named `sum`, to keep the builtin usable)
    late_count = late_count + y[0]
    counter = counter + 1

    # NOTE(review): unlike the retraining scenarios (`counter - sum >= C`), this check only
    # guarantees C late-class samples; for 0/1 labels `counter >= C` is already implied by
    # `late_count >= C`. The batches can therefore differ from the other scenarios, which is
    # presumably why a later cell aligns the result with `scores[1:]`. Left unchanged because
    # that cell depends on the current number of entries in `scores`.
    if late_count >= C and counter >= C:
        counter = 0
        iterator = iterator + 1
        late_count = 0

        X = np.vstack(X_list)
        y = np.vstack(y_list)

        # Perform prediction using the current predictor
        prediction = clf.predict(X)
        y_pred = (prediction > 0.5)

        # Calculate the F2-Score. fbeta_score expects (y_true, y_pred); passing them the other
        # way round swaps precision and recall. labels=[0, 1] keeps index 1 valid even when a
        # batch and its predictions contain a single class only.
        current_f2_score = metrics.fbeta_score(y, y_pred, labels=[0, 1], average=None, beta=2)
        scores.append(current_f2_score[1])

        X_list = []
        y_list = []

In [None]:
# Batch retrain scenario: after every evaluated batch a new classifier is trained on that batch
# Initialize
late_count = 0   # sum of the labels in the current batch (= late-class samples for 0/1 labels)
counter = 0      # number of samples in the current batch
iterator = 0     # number of evaluated batches
X_list = []
y_list = []

# Initial training
stream.restart()
X, y = stream.next_sample(init_training)
clf = get_clf()
df_sampled = sample(X, y)
clf.fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=1)

scores_retraining = []

# Continuous training
while stream.n_remaining_samples() > 1:
    # Get next sample from the stream
    X, y = stream.next_sample(1)
    X_list.append(X)
    y_list.append(y)

    # Sum up the samples of the late class (not named `sum`, to keep the builtin usable)
    late_count = late_count + y[0]
    counter = counter + 1

    # Check if at least C samples per class are present
    if late_count >= C and counter - late_count >= C:
        iterator = iterator + 1
        # Progress output; the batch size is printed before the counters are reset
        # (printing `counter` after the reset always showed 0)
        print(counter)
        print(iterator)
        counter = 0
        late_count = 0

        X = np.vstack(X_list)
        y = np.vstack(y_list)

        # Perform prediction using the current predictor
        prediction = clf.predict(X)
        y_pred = (prediction > 0.5)

        # Calculate the F2-Score on the data sample. fbeta_score expects (y_true, y_pred);
        # passing them the other way round swaps precision and recall.
        current_f2_score = metrics.fbeta_score(y, y_pred, average=None, beta=2)
        scores_retraining.append(current_f2_score[1])

        # Initialize a new classifier and train it on the last batch
        clf = get_clf()
        df_sampled = sample(X, y)
        clf.fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=epochs)

        X_list = []
        y_list = []

In [None]:
# Partial fit scenario: the existing classifier is updated with every evaluated batch
# Initialize
late_count = 0   # sum of the labels in the current batch (= late-class samples for 0/1 labels)
counter = 0      # number of samples in the current batch
iterator = 0     # number of evaluated batches
X_list = []
y_list = []

# Initial training
stream.restart()
X, y = stream.next_sample(init_training)
clf = get_clf()
df_sampled = sample(X, y)
clf.fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=1)

scores_partial_fit = []

# Continuous training
while stream.n_remaining_samples() > 1:
    # Get next sample from the stream
    X, y = stream.next_sample(1)
    X_list.append(X)
    y_list.append(y)

    # Sum up the samples of the late class (not named `sum`, to keep the builtin usable)
    late_count = late_count + y[0]
    counter = counter + 1

    # Check if at least C samples per class are present
    if late_count >= C and counter - late_count >= C:
        iterator = iterator + 1
        # Progress output; the batch size is printed before the counters are reset
        # (printing `counter` after the reset always showed 0)
        print(counter)
        print(iterator)
        counter = 0
        late_count = 0

        X = np.vstack(X_list)
        y = np.vstack(y_list)

        # Perform prediction using the current predictor
        prediction = clf.predict(X)
        y_pred = (prediction > 0.5)

        # Calculate the F2-Score on the data sample. fbeta_score expects (y_true, y_pred);
        # passing them the other way round swaps precision and recall.
        current_f2_score = metrics.fbeta_score(y, y_pred, average=None, beta=2)
        scores_partial_fit.append(current_f2_score[1])

        # Update the current predictor with the new samples (no re-initialisation)
        df_sampled = sample(X, y)
        clf.fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=epochs)

        X_list = []
        y_list = []

In [None]:
# Active learning scenario: one classifier per Page-Hinkley threshold (lambda). Every
# classifier is updated with each batch; when its PH statistic exceeds the threshold it is
# replaced by a freshly initialised classifier before that update.
# Initialize
late_count = 0   # sum of the labels in the current batch (= late-class samples for 0/1 labels)
counter = 0      # number of samples in the current batch
iterator = 0     # number of evaluated batches
X_list = []
y_list = []

# Thresholds for change detection, keyed by the suffix used for the result variables.
# The insertion order defines the order in which the classifiers are trained and evaluated.
ph_lambdas = {
    '005': 0.05,
    '01': 0.1,
    '025': 0.25,
    '1': 1,
    '2': 2,
    '5': 5,
    '10': 10,
}

# Pretrain the initial classifiers (all on the same resampled initial window)
# NOTE(review): this pretraining uses `epochs` epochs, whereas the other scenario cells
# pretrain with epochs = 1 - confirm that the difference is intended.
stream.restart()
X, y = stream.next_sample(init_training)
df_sampled = sample(X, y)

detectors = {}
for key, ph_lambda in ph_lambdas.items():
    detector_clf = get_clf()
    detector_clf.fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=epochs)
    detectors[key] = {
        'lambda': ph_lambda,   # detection threshold
        'clf': detector_clf,   # current classifier
        'mean': 0,             # running mean of the scores since the last reset
        'sum': 0,              # cumulative PH sum since the last reset
        'sums': [],            # history of the cumulative PH sums since the last reset
        'scores': [],          # F2-Score of the late class for every evaluated batch
    }

# Keep the per-lambda result names used by the evaluation cells. The lists are the same
# objects that are appended to below, so they also hold partial results if the loop is
# interrupted. All other per-lambda state (classifier, PH values) lives in `detectors`.
ph_score_list_005 = detectors['005']['scores']
ph_score_list_01 = detectors['01']['scores']
ph_score_list_025 = detectors['025']['scores']
ph_score_list_1 = detectors['1']['scores']
ph_score_list_2 = detectors['2']['scores']
ph_score_list_5 = detectors['5']['scores']
ph_score_list_10 = detectors['10']['scores']

# Get next sample from stream
while stream.n_remaining_samples() > 1:
    X, y = stream.next_sample(1)
    X_list.append(X)
    y_list.append(y)

    # Capture number of samples from the late class (not named `sum`, to keep the builtin usable)
    late_count = late_count + y[0]
    counter = counter + 1

    # If at least C instances per class are present
    if late_count >= C and counter - late_count >= C:
        iterator = iterator + 1
        # Progress output; the batch size is printed before the counters are reset
        # (printing `counter` after the reset always showed 0)
        print(counter)
        print(iterator)
        counter = 0
        late_count = 0

        X = np.vstack(X_list)
        y = np.vstack(y_list)
        df_sampled = sample(X, y)

        # Same procedure for every lambda
        for key, state in detectors.items():
            # Predict on the current classifier
            prediction = state['clf'].predict(X)
            y_pred = (prediction > 0.5)

            # Calculate the F2-Score. fbeta_score expects (y_true, y_pred); passing them the
            # other way round swaps precision and recall.
            current_f2_score = metrics.fbeta_score(y, y_pred, average=None, beta=2)
            state['scores'].append(current_f2_score[1])

            # Calculate the PH statistics
            state['mean'], state['sum'], state['sums'], statistic_max, statistic_min = compute_ph_statistics(
                state['mean'], state['sum'], current_f2_score[1], state['sums'])

            # Check if a retraining or update is recommended
            if statistic_max > state['lambda'] or statistic_min > state['lambda']:
                # The sample counter returned by reset_values() is not needed here and is discarded
                state['mean'], state['sum'], state['sums'], _ = reset_values()
                print('Drift, No. of iterations:', iterator, 'lambda:', state['lambda'])

                # Get a new instance of the classifier
                state['clf'] = get_clf()

            # Update/train the current classifier on the last samples from the stream
            state['clf'].fit(df_sampled.iloc[:, :-1], df_sampled.iloc[:, -1:], batch_size=batch_size, epochs=epochs)

        X_list = []
        y_list = []

In [None]:
# Number of batch scores recorded for the classifier with lambda = 5
len(ph_score_list_5)

In [None]:
# Collect the F2-Scores of the four compared scenarios in one frame for plotting.
# NOTE(review): the first entry of `scores` is dropped; presumably the no-training run
# yields one batch more than the other runs - confirm. All columns must have equal length.
data = pd.DataFrame({
    'No Online Learning': scores[1:],
    'Batch Retraining': scores_retraining,
    'Incremental Learning': scores_partial_fit,
    'Active Decision Learning': ph_score_list_5,
})
data.describe()

In [None]:
# Plot the F2-Scores of the different scenarios (one line per column of `data`)
data.plot()

In [None]:
# Plot all scenarios and print mean F2-Scores
# (series, legend label) pairs in plotting order
f2_curves = [
    (scores_retraining, 'Entire Retraining for every batch'),
    (scores_partial_fit, 'Partial Fit for every batch'),
    (ph_score_list_005, '"Smart" Retraining: PH-Change Detection 0,05'),
    (ph_score_list_01, '"Smart" Retraining: PH-Change Detection 0,1'),
    (ph_score_list_025, '"Smart" Retraining: PH-Change Detection 0,25'),
    (ph_score_list_1, '"Smart" Retraining: PH-Change Detection 1'),
    (ph_score_list_2, '"Smart" Retraining: PH-Change Detection 2'),
    (ph_score_list_5, '"Smart" Retraining: PH-Change Detection 5'),
    (ph_score_list_10, '"Smart" Retraining: PH-Change Detection 10'),
    (scores, 'No Retraining'),
]

plt.figure(figsize=(10,4))
for curve_values, curve_label in f2_curves:
    plt.plot(curve_values, label = curve_label)
plt.title('Development of F2-Score of late-class over time')
plt.xlabel('Evaluated batch')
plt.ylabel('F2-Score')
plt.legend()

# Save before showing: once plt.show() has run the figure is released, and a later
# savefig() would write an empty image. The raw string avoids the invalid escape '\c'.
filepath = params.filepath_project_folder + r'\concept_drift_f1_scores.png'
plt.savefig(filepath, bbox_inches = 'tight')
plt.show()

# The collected values are F2-Scores of the late class, so they are labelled as such
# (the previous output called them "accuracy")
print('Mean F2-score retraining', np.mean(scores_retraining))
print('Mean F2-score partial fit', np.mean(scores_partial_fit))
print('Mean F2-score single training', np.mean(scores))
print('Mean F2-score ph training 0,05', np.mean(ph_score_list_005))
print('Mean F2-score ph training 0,1', np.mean(ph_score_list_01))
print('Mean F2-score ph training 0,25', np.mean(ph_score_list_025))
print('Mean F2-score ph training 1', np.mean(ph_score_list_1))
print('Mean F2-score ph training 2', np.mean(ph_score_list_2))
print('Mean F2-score ph training 5', np.mean(ph_score_list_5))
print('Mean F2-score ph training 10', np.mean(ph_score_list_10))