# Self Organized Map

In [None]:
import numpy as np
import pandas as pd
import sys
import os
module_path = os.path.abspath(os.path.join('../..'))
sys.path.insert(1, module_path + '/src/')
from sklearn.model_selection import KFold

import warnings
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning, FitFailedWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=FitFailedWarning)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sktime.transformations.panel.rocket import Rocket
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import time


import utility
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns
import matplotlib
from matplotlib.colors import LinearSegmentedColormap
font = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 10, weight = 1000)
font_small = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 8, weight = 1000)


from sktime.datasets import load_UCR_UEA_dataset
from sktime.transformations.panel.catch22_features import Catch22


def get_dataset_partitions(X, y, ft = 'rocket', fs = 'pca'):
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
    
    norm = MinMaxScaler()
    X_train_full = pd.DataFrame(norm.fit_transform(X_train_full))
    X_test = pd.DataFrame(norm.fit_transform(X_test))
    k = min([X_test.shape[0], 100])
    print('Chosen number of components: ', k)
    if (ft == 'rocket') and (fs == 'pca'):  
        pca = PCA(n_components = k)
        X_train_full = pca.fit_transform(X_train_full)
        X_train_full = pd.DataFrame(X_train_full)
        
        X_test = pca.fit_transform(X_test)
        X_test = pd.DataFrame(X_test)
    elif (ft == 'rocket') and (fs == 'select'):
        select = SelectKBest(chi2, k=k)
        X_train_full = select.fit_transform(X_train_full, y_train_full)
        indices = select.get_support(indices = True)
        X_test = X_test[indices]
    return pd.DataFrame(X_train_full), y_train_full, pd.DataFrame(X_test), y_test


def get_initial_classifier(X_train_full,y_train_full, clf, grid, no_grid_search = False):
    grid_cv=GridSearchCV(clf,grid,cv=5)
    grid_cv.fit(X_train_full,y_train_full)
    return grid_cv.best_estimator_

def get_helping_classifier(X_train, y_train, X_val, y_val, clf, grid):
    '''Get the helping classifier
    
    returns a classifier, which classifies if something is likely to be classified wrongly
    
    '''
    new_X = X_val

    grid_cv=GridSearchCV(clf,grid,cv=5)
    grid_cv.fit(X_train,y_train)
    clf_2 = grid_cv.best_estimator_
        
    y_pred = clf_2.predict(X_val)
    new_y = pd.Series(np.hstack([y_pred != y_val]))
    
    grid_cv=GridSearchCV(clf,grid,cv=5)
    grid_cv.fit(new_X,new_y)
    return grid_cv.best_estimator_ , grid_cv.best_score_



def get_helping_classifier_alt(X_train, y_train, X_val, y_val, clf, grid):
    '''Get the helping classifier
    
    returns a classifier, which classifies if something is likely to be classified wrongly
    
    ALT: Here a Logisic regression classifier is always used 
    to detect if a sample is likely to be classified incorrectly
    '''
    grid_cv=GridSearchCV(clf,grid,cv=5)
    grid_cv.fit(X_train,y_train)
    clf = grid_cv.best_estimator_
    
    y_pred = clf.predict(X_val)
    new_X = X_val
    new_y = pd.Series(np.hstack([y_pred != y_val]))
    
    grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
    clf=LogisticRegression()
    grid_cv=GridSearchCV(clf,grid,cv=5)
    grid_cv.fit(new_X,new_y)
    
    return grid_cv.best_estimator_ , grid_cv.best_score_

def get_ensemble_helping_classifier(X_train, y_train, clf, grid):
    kf = KFold(n_splits=5, random_state=None, shuffle=False)
    #kf.get_n_splits(X_train)
    classifiers = []
    for train_index, val_index in kf.split(X_train):
        clf_temp, score_temp = get_helping_classifier(X_train.iloc[train_index], y_train.iloc[train_index], X_train.iloc[val_index], y_train.iloc[val_index], clf, grid)
        classifiers.append((clf_temp, score_temp))
    return classifiers

def make_ensemble_classification(X_test, ensemble):
    y_pred = []
    for e in ensemble: 
        clf = e[0]
        score = e[1]
        y_pred_temp = clf.predict(X_test)
        y_pred.append((score * (y_pred_temp + 1)) -1)
    y_pred = np.array(y_pred)
    y_pred_sum = y_pred.sum(axis = 0)
    idx_true = np.where(y_pred_sum > 0)[0]
    
    y_pred = y_pred.mean(axis = 0)
    y_pred = np.round(y_pred).astype(int)
    
    y_pred[idx_true] = 1
    return y_pred

def filter_test_set_ensemble(X_test, y_test, ensemble):
    y_pred = make_ensemble_classification(X_test, ensemble)
    to_del = np.where(y_pred == 1)[0]
    X_test = X_test.reset_index(drop = True)
    y_test = y_test.reset_index(drop = True)
    X_test_new = X_test[~X_test.index.isin(to_del)]
    y_test_new = y_test[~y_test.index.isin(to_del)]
    return X_test_new, y_test_new


def plot_cm(clf, X_test, y_test, class_names, color_index = 2):
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    colors = ["#F94144", "#F3722C", '#F8961E', '#F9C74F','#90BE6D', '#43AA8B','#577590']

    font = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 10, weight = 1000)

    colors_2 = ['#FFFFFF', colors[color_index]]
    cmap_name = 'my colormap'
    font_small = FontProperties(fname =  module_path + '/src/visualization/CharterRegular.ttf', size = 6, weight = 1000)

    cm_map = LinearSegmentedColormap.from_list(cmap_name, colors_2)


    f, ax = plt.subplots(1,1) # 1 x 1 array , can also be any other size
    f.set_size_inches(5, 5)

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    ax = sns.heatmap(cm, annot=True,
                fmt='.2%', cmap=cm_map, xticklabels=class_names,yticklabels=class_names )
    cbar = ax.collections[0].colorbar
    for label in ax.get_yticklabels() :
        label.set_fontproperties(font_small)
    for label in ax.get_xticklabels() :
        label.set_fontproperties(font_small)
    ax.set_ylabel('True Label', fontproperties = font)
    ax.set_xlabel('Predicted Label', fontproperties = font)
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)

    for child in ax.get_children():
        if isinstance(child, matplotlib.text.Text):
            child.set_fontproperties(font)
    for l in cbar.ax.yaxis.get_ticklabels():
        l.set_fontproperties(font_small)
        
    plt.show()


def print_and_plot_to_compare(X_test, y_test, X_test_2, y_test_2, clf, class_names):
    y_pred = clf.predict(X_test)
    acc_1 = accuracy_score(y_test, y_pred)
    print(f'Accuracy score for the initial classification: {acc_1} ')
    plot_cm(clf, X_test, y_test, class_names)
    
    y_pred_2 = clf.predict(X_test_2)
    acc_2 = accuracy_score(y_test_2, y_pred_2)
    print(f'Accuracy score for the double classification: {acc_2} ')
    plot_cm(clf, X_test_2, y_test_2, class_names, color_index = 5)
    return acc_1 , acc_2
    
def compare_single_double_classifier(name_UCR, class_names, result_dict, feature_transform = 'rocket', feature_select = 'pca'):    
    if name_UCR != 'lungSound':
        X, y = load_UCR_UEA_dataset(name_UCR, return_X_y=True)
    else: 
        X, y = load_from_tsfile_to_dataframe(module_path + '/data/ts_files/crackleNoCrackleSamleLength3000.ts')
    if (feature_transform == 'rocket'):
        rocket = Rocket()  # by default, ROCKET uses 10,000 kernels
        rocket.fit(X)
        X = rocket.transform(X)
    else:
        c22f = Catch22()
        c22f.fit(X, y)
        X = c22f.transform(X)
    X_train_full, y_train_full , X_test, y_test = get_dataset_partitions(X,y, feature_transform, feature_select)
    ############ K nearest neighbours 
    max_nn = round((0.8*X.shape[0])//8)
    k_range = list(range(1,max_nn))
    weight_options = ["uniform", "distance"]
    grid = dict(n_neighbors = k_range, weights = weight_options)
    clf = KNeighborsClassifier()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    X_test_2, y_test_2 = filter_test_set_ensemble(X_test, y_test, clf_ensemble)

    print('K-NN results:')
    acc_1, acc_2 = print_and_plot_to_compare(X_test, y_test, X_test_2, y_test_2, clf_full, class_names)
    result_dict[ f'K-nn, {feature_transform}, {feature_select}'] = acc_1
    result_dict[ f'K-nn double, {feature_transform}, {feature_select}'] = acc_2


    grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}
    clf=LogisticRegression()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    X_test_2, y_test_2 = filter_test_set_ensemble(X_test, y_test, clf_ensemble)

    print('Logistic Regression results: ')
    acc_1, acc_2 = print_and_plot_to_compare(X_test, y_test, X_test_2, y_test_2, clf_full, class_names)
    result_dict[ f'Logistic reg, {feature_transform}, {feature_select}'] = acc_1
    result_dict[ f'Logistic reg double, {feature_transform}, {feature_select}'] = acc_2
    return result_dict

def compare_rocket_c22(name_UCR, class_names):
    print('%%%%%%%%%%%%%%%% \n\n FEATURE SELECTION = PCA ')
    print('%%%%%%%%%%%%%%%% \n\n ROCKET ')
    start = time.time()
    ret_dict = {}
    ret_dict = compare_single_double_classifier(name_UCR, class_names, ret_dict, feature_transform='rocket', feature_select = 'pca')
    
    print('Time to compute ROCKET: ', (time.time() - start))

    print('%%%%%%%%%%%%%%%% \n\n C22 ')
    start = time.time()
    ret_dict = compare_single_double_classifier(name_UCR, class_names,ret_dict, feature_transform='c22')
    print('Time to compute c22: ', (time.time() - start))
    
    print('%%%%%%%%%%%%%%%% \n\n FEATURE SELECTION = SELECT K BEST ')
    print('%%%%%%%%%%%%%%%% \n\n ROCKET ')
    start = time.time()
    ret_dict = compare_single_double_classifier(name_UCR, class_names,ret_dict, feature_transform='rocket', feature_select = 'select')
    print('Time to compute ROCKET: ', (time.time() - start))

    print('%%%%%%%%%%%%%%%% \n\n c22 ')
    start = time.time()
    ret_dict = compare_single_double_classifier(name_UCR, class_names, ret_dict, feature_transform='c22')
    print('Time to compute c22: ', (time.time() - start))
    return ret_dict

def classify(som, data, X_train, y_train):
    """Classifies each sample in data in one of the classes definited
    using the method labels_map.
    Returns a list of the same length of data where the i-th element
    is the class assigned to data[i].
    """
    winmap = som.labels_map(X_train, y_train)
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    for d in data:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result

In [70]:
metrics = {}

import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from matplotlib.patches import Patch
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest

from minisom import MiniSom
from sklearn.preprocessing import minmax_scale, scale

import seaborn as sns
from matplotlib.font_manager import FontProperties
from sklearn.metrics import confusion_matrix
from matplotlib.colors import LinearSegmentedColormap
import matplotlib
from sklearn.metrics import classification_report
import sys
module_path = os.path.abspath(os.path.join('../..'))
sys.path.insert(1, module_path + '/src')

import utility
    
def classify(som, data, X_train, y_train):
    """Classifies each sample in data in one of the classes definited
    using the method labels_map.
    Returns a list of the same length of data where the i-th element
    is the class assigned to data[i].
    """
    winmap = som.labels_map(X_train, y_train)
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    for d in data:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result

In [75]:
size_som = 13
decomp_methods = ['noDecomp', 'EMD', 'EEMD', 'DWT',  'EMD_DWT','EEMD_DWT']
metrics = {}
for decomp in decomp_methods:
    
    X,y = utility.get_X_y('noDecomp', fs_pca = True, k = 30)

    try:
        data , labels = X.to_numpy(),y.to_numpy() 
    except AttributeError:
        data , labels = X,y.to_numpy() 
           
    X_train, X_test, y_train, y_test = train_test_split(data, labels, stratify=labels)
    X_train, X_val, y_train, y_val =  train_test_split(X_train, y_train, stratify=y_train)
    
    print(X_val.shape)
    print(y_val.shape)
    som = MiniSom(size_som, size_som, data.shape[1], sigma=5, learning_rate=0.5, 
                  neighborhood_function='triangle', random_seed=10)
    som.pca_weights_init(X_train)
    som.train_random(X_train, 2500)

    y_pred = classify(som, X_test, X_train, y_train)
    print(len(y_train))
    rep = classification_report(y_test, y_pred, output_dict=True)
    
    metrics['SOM & '+ decomp] = {'accuracy': rep['accuracy'], 
                       'precision': rep['macro avg']['precision'],
                       'recall': rep['macro avg']['recall'],
                       'f1-score': rep['macro avg']['f1-score']}

    print(f'Decomposition method: {decomp} \n \n ')
    #utility.report_to_latex_table(rep)
    #utility.plot_conf_matrix(y_test, y_pred, decomp, 'SOM')
print(metrics)
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

(594, 30)
(594,)
1780
Decomposition method: noDecomp 
 
 
(594, 30)
(594,)
1780
Decomposition method: EMD 
 
 
(594, 30)
(594,)
1780
Decomposition method: EEMD 
 
 
(594, 30)
(594,)
1780
Decomposition method: DWT 
 
 
(594, 30)
(594,)
1780
Decomposition method: EMD_DWT 
 
 
(594, 30)
(594,)
1780
Decomposition method: EEMD_DWT 
 
 
{'SOM & noDecomp': {'accuracy': 0.6654040404040404, 'precision': 0.6668217253216131, 'recall': 0.6658163265306123, 'f1-score': 0.6650147238382531}, 'SOM & EMD': {'accuracy': 0.6363636363636364, 'precision': 0.63922081961689, 'recall': 0.637015306122449, 'f1-score': 0.6351328005528292}, 'SOM & EEMD': {'accuracy': 0.6590909090909091, 'precision': 0.6618616677440207, 'recall': 0.6596938775510204, 'f1-score': 0.6581294964028777}, 'SOM & DWT': {'accuracy': 0.6691919191919192, 'precision': 0.6748917748917749, 'recall': 0.6700510204081632, 'f1-score': 0.667152160662154}, 'SOM & EMD_DWT': {'accuracy': 0.6224747474747475, 'precision': 0.6267178273948004, 'recall': 0.6

In [68]:
decomp = 'noDecomp'
X_Y = {
   'Filtering': utility.get_X_y(decomp, fs_filter= True,k = 30),
    'PCA': utility.get_X_y(decomp, fs_pca= True,k = 30),

    'Autoencoder': utility.get_X_y(decomp, fs_auto_encoder= True,k = 30), 
    'No feature selection': utility.get_X_y(decomp), 
    
}
metrics = {}
for key, val in X_Y.items():
    X, y = val
    
    try:
        data , labels = X.to_numpy(),y.to_numpy() 
    except AttributeError:
        data , labels = X,y.to_numpy() 
    
    X_train, X_test, y_train, y_test = train_test_split(data, labels, stratify=labels)
    X_train, X_val, y_train, y_val =  train_test_split(X_train, y_train, stratify=y_train)
    
    som = MiniSom(size_som, size_som, data.shape[1], sigma=5, learning_rate=0.5, 
                  neighborhood_function='triangle', random_seed=10)
    som.pca_weights_init(X_train)
    som.train_random(X_train, 2500)

    y_pred = classify(som, X_test, X_train, y_train)

    rep = classification_report(y_test, y_pred, output_dict=True)
    
    metrics['SOM, ' + key] = {'accuracy': rep['accuracy'], 
                       'precision': rep['macro avg']['precision'],
                       'recall': rep['macro avg']['recall'],
                       'f1-score': rep['macro avg']['f1-score']}

In [69]:
metrics

{'SOM, Filtering': {'accuracy': 0.6477272727272727,
  'precision': 0.6496764222930551,
  'recall': 0.6482397959183674,
  'f1-score': 0.6470379640884993},
 'SOM, PCA': {'accuracy': 0.6755050505050505,
  'precision': 0.6777262629189607,
  'recall': 0.6760204081632653,
  'f1-score': 0.6748700959524886},
 'SOM, Autoencoder': {'accuracy': 0.6161616161616161,
  'precision': 0.6161616161616161,
  'recall': 0.6161734693877551,
  'f1-score': 0.6161518251154248},
 'SOM, No feature selection': {'accuracy': 0.6679292929292929,
  'precision': 0.6693625485226653,
  'recall': 0.6683418367346938,
  'f1-score': 0.6675429146017382}}