In [None]:
!pip install gdown==4.6.0
!pip install scikit-learn==1.0
!pip install scipy==1.7.1

In [None]:
# Download the file with this id via gdown.
# NOTE(review): presumably this is the dataset archive extracted in the next cell -- confirm.
!gdown 1RR4VvIQ6jiBneVuPXwdq1E5FK8USI_-U

In [None]:
# Unpack the gzip-compressed tarball into the working directory (-v lists every extracted file).
!tar -zxvf finger_in_the_power_datasets.tar.gz

In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from tqdm import tqdm
from scipy.stats import skew, entropy, median_abs_deviation
from scipy.signal import find_peaks
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Machine-group name -> CSV file of that group, in the order the groups are
# processed by the driver loop.
PATHS = dict([
    ('Desk-4590', 'finger_in_the_power_datasets/Desk-4590.csv'),
    ('Srv-2630-v3', 'finger_in_the_power_datasets/Srv-2630-v3.csv'),
    ('Srv-2630L-v4', 'finger_in_the_power_datasets/Srv-2630L-v4.csv'),
    ('Srv-5220', 'finger_in_the_power_datasets/Srv-5220.csv'),
    ('Srv-6130', 'finger_in_the_power_datasets/Srv-6130.csv'),
    ('Srv-AMD', 'finger_in_the_power_datasets/Srv-AMD.csv'),
])

In [None]:
class Clipper(BaseEstimator, TransformerMixin):
    """Clip values to a quantile range learned during ``fit``.

    ``fit`` computes two quantiles with ``np.quantile`` without an ``axis``
    argument, i.e. over all values of ``X`` taken together (not per column).
    ``transform`` then replaces every value below the lower bound, or above the
    upper bound, by the respective bound.

    Parameters
    ----------
    low_quantile : float
        Quantile (as accepted by ``np.quantile``) used as the lower bound.
    high_quantile : float
        Quantile (as accepted by ``np.quantile``) used as the upper bound.
    """

    def __init__(self, low_quantile, high_quantile):
        self.low_quantile = low_quantile
        self.high_quantile = high_quantile

        # Bounds learned by fit(); they stay None until fit() has been called.
        self.low_quantile_val = None
        self.high_quantile_val = None

    def fit(self, X, y=None):
        """Learn the clipping bounds from ``X``.

        Parameters
        ----------
        X : array-like
            Data the two quantiles are computed on.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self : Clipper
            The fitted transformer.
        """
        self.low_quantile_val = np.quantile(X, self.low_quantile)
        self.high_quantile_val = np.quantile(X, self.high_quantile)

        # scikit-learn's estimator contract requires fit() to return self;
        # TransformerMixin.fit_transform and chained fit(...).transform(...)
        # calls fail on a None return value.
        return self

    def transform(self, X, y=None):
        """Return a clipped copy of ``X``; the input itself is left untouched.

        Parameters
        ----------
        X : array-like
            Data to clip.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Copy of ``X`` (made with ``np.copy``) with out-of-range values
            replaced by the learned bounds.
        """
        X = np.copy(X)
        X[X < self.low_quantile_val] = self.low_quantile_val
        X[X > self.high_quantile_val] = self.high_quantile_val

        return X

In [None]:
def mean_temps(df):
    """Collapse all temperature columns of ``df`` into one row-wise mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'machine'`` column, columns whose name contains
        ``'power_consumption'`` and columns whose name contains
        ``'temperature'``.

    Returns
    -------
    pandas.DataFrame
        New frame holding the power-consumption columns, then ``'machine'``,
        then a single ``'temperature'`` column with the per-row mean of the
        original temperature columns. ``df`` itself is not modified.
    """
    kept_columns = []
    temperature_columns = []
    for name in df.columns:
        if 'power_consumption' in name:
            kept_columns.append(name)
        if 'temperature' in name:
            temperature_columns.append(name)
    kept_columns.append('machine')

    row_mean_temperature = df[temperature_columns].mean(axis=1)
    # assign() works on a copy, so the caller's frame stays as it was.
    return df[kept_columns].assign(temperature=row_mean_temperature)

def get_consumption_with_temp_dfs(dfs):
    """Apply the temperature-averaging step to a list of frames.

    The column layout is taken from the first frame and applied to all of
    them: columns whose name contains ``'power_consumption'`` plus
    ``'machine'`` are kept, and columns whose name contains ``'temperature'``
    are replaced by one ``'temperature'`` column holding their per-row mean.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that all contain the columns selected from ``dfs[0]``.

    Returns
    -------
    list of pandas.DataFrame
        One new frame per input frame, in the same order. An empty input
        yields an empty list. The input frames are not modified.
    """
    # Without a first frame there is no column layout to read; the previous
    # unconditional dfs[0] raised IndexError here.
    if len(dfs) == 0:
        return []

    reference_columns = dfs[0].columns
    power_consumption_clms = [clm for clm in reference_columns if 'power_consumption' in clm] + ['machine']
    temp_clms = [clm for clm in reference_columns if 'temperature' in clm]

    dfs_consumption_and_temp = []
    for source_df in dfs:
        df = source_df[power_consumption_clms].copy()
        # .values: assign positionally, independent of the frame's index.
        df['temperature'] = source_df[temp_clms].mean(axis=1).values
        dfs_consumption_and_temp.append(df)

    return dfs_consumption_and_temp
 
 
def generate_trace_level_fetures(X):
    """Describe every trace (row of ``X``) by a fixed set of summary statistics.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array with one trace per row.

    Returns
    -------
    numpy.ndarray
        One row per trace with the columns, in order: mean, std, skew,
        entropy, the 10th..90th percentiles in steps of 10, |mean - median|,
        number of peaks found by ``find_peaks``, median absolute deviation,
        and the mean, median and std of the absolute differences between
        consecutive samples.
    """
    # Statistics that numpy/scipy can compute for all rows at once.
    features = {
        'mean': np.mean(X, axis=1),
        'std': np.std(X, axis=1),
        'skew': skew(X, axis=1),
        'entropy': entropy(X, axis=1),
    }

    for percent in range(10, 100, 10):
        features[f'precentile_{percent}'] = np.percentile(X, percent, axis=1)

    features['dist_mean_from_median'] = np.abs(features['mean'] - features['precentile_50'])

    # Per-trace statistics are accumulated in lists that are registered in the
    # dict right away, so the column order of the result is fixed here.
    peak_counts = []
    features['num_of_peaks'] = peak_counts

    features['median_abs_deviation'] = median_abs_deviation(X, axis=1)

    step_means, step_medians, step_stds = [], [], []
    features['diff_seq_mean'] = step_means
    features['diff_seq_median'] = step_medians
    features['diff_seq_std'] = step_stds

    for trace in tqdm(X):
        peak_positions, _ = find_peaks(trace)
        peak_counts.append(len(peak_positions))

        # Absolute change between each sample and the one right after it.
        abs_steps = np.abs(trace[:-1] - trace[1:])
        step_means.append(np.mean(abs_steps))
        step_medians.append(np.median(abs_steps))
        step_stds.append(np.std(abs_steps))

    return pd.DataFrame(features).values


def train_test_split_by_time(X, y, test_size):
    """Split every class in row order: leading rows train, trailing rows test.

    For each distinct label in ``y`` the rows of that label are taken in their
    existing order; the first ``int((1 - test_size) * n)`` of them go to the
    training set and the rest to the test set. The per-label pieces are then
    concatenated in the order of ``np.unique(y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Label of each row of ``X``.
    test_size : float
        Fraction of each label's rows that is held out for testing.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X parts are DataFrames with
        the columns of ``X``, the y parts are Series. All four get a fresh
        default index.
    """
    train_fraction = 1 - test_size
    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for label in np.unique(y):
        is_label = y == label
        X_label, y_label = X[is_label], y[is_label]

        # Rows before this position are training rows, the rest are test rows.
        split_at = int(train_fraction * X_label.shape[0])

        X_train_parts.append(X_label[:split_at])
        y_train_parts.append(y_label[:split_at])
        X_test_parts.append(X_label[split_at:])
        y_test_parts.append(y_label[split_at:])

    def as_frame(parts):
        return pd.DataFrame(np.concatenate(parts), columns=X.columns)

    def as_series(parts):
        return pd.Series(np.concatenate(parts))

    return (as_frame(X_train_parts), as_frame(X_test_parts),
            as_series(y_train_parts), as_series(y_test_parts))

def from_df_into_X_y(df, has_temperature=True, test_size=0.2, clip_quantiles=(0.01, 0.99)):
    """Turn one raw collection into train/test feature matrices and labels.

    Steps:

    1. If ``has_temperature``, replace the temperature columns by their
       per-row mean (``mean_temps``).
    2. Split per machine into a leading train part and a trailing test part
       (``train_test_split_by_time``).
    3. Fit a ``Clipper`` on the training power-consumption values only and
       apply it to both train and test, so no test data leaks into the bounds.
    4. Compute the trace-level features (``generate_trace_level_fetures``).
    5. If ``has_temperature``, append the mean temperature as the LAST feature
       column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw collection with a ``'machine'`` label column.
    has_temperature : bool, default True
        Whether ``df`` carries temperature columns that should be averaged and
        appended to the features.
    test_size : float, default 0.2
        Fraction of each machine's rows held out for testing.
    clip_quantiles : tuple of (float, float), default (0.01, 0.99)
        Lower and upper quantile handed to the ``Clipper``.

    Returns
    -------
    tuple
        ``(X_train_features, y_train, X_test_features, y_test)`` -- note the
        order: features and labels of the train part first, then of the test
        part.
    """
    low_quantile, high_quantile = clip_quantiles

    if has_temperature:
        df = mean_temps(df)
    X_train, X_test, y_train, y_test = train_test_split_by_time(df.drop(['machine'], axis=1),
                                                                df['machine'],
                                                                test_size)

    if has_temperature:
        # The temperature is not part of the power trace; keep it out of the
        # clipping and of the trace statistics.
        X_consumption_train = X_train.drop(['temperature'], axis=1)
        X_consumption_test = X_test.drop(['temperature'], axis=1)
    else:
        X_consumption_train = X_train
        X_consumption_test = X_test

    clipper = Clipper(low_quantile, high_quantile)
    clipper.fit(X_consumption_train)

    X_consumption_train = clipper.transform(X_consumption_train)
    X_consumption_test = clipper.transform(X_consumption_test)

    X_train_features = generate_trace_level_fetures(X_consumption_train)
    X_test_features = generate_trace_level_fetures(X_consumption_test)

    if has_temperature:
        X_train_features = np.hstack([X_train_features, X_train['temperature'].values.reshape(-1, 1)])
        X_test_features = np.hstack([X_test_features, X_test['temperature'].values.reshape(-1, 1)])

    return X_train_features, y_train, X_test_features, y_test

def train_classifier_and_report(X_train, y_train, X_test, y_test, use_temperature, plot_conf_mat, group_name):
    """Fit a random forest, score it on the test set and optionally plot it.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        2-D feature matrices. When ``use_temperature`` is False their last
        column is dropped, on the assumption that it holds the temperature.
    y_train, y_test : array-like
        Labels belonging to the rows of ``X_train`` / ``X_test``.
    use_temperature : bool
        Keep (True) or drop (False) the last feature column.
    plot_conf_mat : bool
        Whether to show a row-normalised confusion-matrix heatmap.
    group_name : str
        Name used in the plot title.

    Returns
    -------
    tuple
        ``(clf, acc)``: the fitted classifier and its test accuracy.
    """
    if use_temperature:
        train_features, test_features = X_train, X_test
    else:
        # The temperature is expected to be the last column.
        train_features, test_features = X_train[:, :-1], X_test[:, :-1]

    clf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
    clf.fit(train_features, y_train)

    predictions = clf.predict(test_features)
    acc = accuracy_score(y_test, predictions)

    if not plot_conf_mat:
        return clf, acc

    class_order = sorted(clf.classes_)
    # normalize='true': every row (true class) sums to 1.
    normalised_matrix = confusion_matrix(y_test, predictions, labels=class_order, normalize='true')

    plt.figure(figsize=(10, 7))
    palette = sns.cubehelix_palette(start=2, rot=0, dark=0.4, light=1, as_cmap=True)
    ax = sns.heatmap(normalised_matrix, cmap=palette)

    # Draw a frame around the heatmap.
    for spine in ax.spines.values():
        spine.set_visible(True)

    plt.xticks([])
    plt.yticks([])

    plt.title(f'Confusion Matrix for {group_name}')
    plt.show()

    return clf, acc
    

In [None]:
# Train and evaluate one classifier (two when temperature is available) per machine group.
for group_name, path in PATHS.items():
    print(group_name)
    df = pd.read_csv(path)
    # Chance level: one over the number of distinct machines in the collection.
    base_rate = 1 / df['machine'].nunique()

    # The Srv-AMD collection is processed without temperature columns.
    has_temperature = group_name != 'Srv-AMD'

    X_train, y_train, X_test, y_test = from_df_into_X_y(df, has_temperature)

    acc_with_temp = None
    if has_temperature:
        clf_with_temp, acc_with_temp = train_classifier_and_report(X_train, y_train, X_test, y_test, use_temperature=True, plot_conf_mat=True, group_name=group_name)

    # use_temperature=False makes the helper drop the LAST feature column,
    # assuming it is the temperature. When the collection has no temperature
    # the last column is a real trace feature, so it must be kept: passing
    # use_temperature=True there simply means "leave the columns alone".
    clf_without_temp, acc_without_temp = train_classifier_and_report(X_train, y_train, X_test, y_test, use_temperature=not has_temperature, plot_conf_mat=not has_temperature, group_name=group_name)
    print(f'Group {group_name}, acc with temperature: {acc_with_temp}, acc without temperature: {acc_without_temp}, base rate: {base_rate}')
    print('-----------')