In [2]:
from models import AutoEncoder

In [3]:
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Execute the feature-extraction notebook before anything else.
# NOTE(review): get_data_nbaiot and get_astrolavos_data are called below but are not
# defined in this notebook; presumably this %run provides them -- confirm.
%run extract_features.ipynb

## Input Data Read

In [5]:
# Load the first dataset. The result is passed to confusion_matrix below, which indexes
# it by key and reads `.shape[0]` of each value (i.e. a mapping of per-device sample arrays).
nbaiot_data = get_data_nbaiot()

In [6]:
def confusion_matrix(data, pca_enabled=False, percentile=98, number_of_clusters=3, max_iter=sys.maxsize, random_state=None):
    """Measure how much of every device's held-out data each device's KMeans model accepts.

    For each training device, its samples are split into a training part and a
    held-out part of ``num_samples`` rows, where ``num_samples`` is half the row
    count of the smallest device. A StandardScaler (and, optionally, a
    100-component PCA) is fitted on the training part, KMeans is fitted on the
    transformed training data, and the acceptance threshold is the
    ``percentile``-th percentile of the training samples' distance to their
    nearest centroid. A held-out sample (from any device) is "accepted" when its
    nearest-centroid distance is strictly below that threshold.

    Args:
        data: Mapping of device id -> 2-D array-like of samples (one row per
            sample). The held-out parts of all devices are concatenated row-wise,
            so all devices are assumed to share the same feature columns.
        pca_enabled: When True, project the standardized data onto 100 principal
            components before clustering.
        percentile: Percentile of the training distances used as the threshold.
        number_of_clusters: Number of KMeans clusters.
        max_iter: Maximum number of devices (in key order) to train a model for.
            A value <= 0 trains none and yields an empty result.
        random_state: Optional seed forwarded to KMeans and PCA. The train/held-out
            split is always seeded with 42, independently of this argument.

    Returns:
        dict mapping each training device id to a Counter that maps every device
        id in ``data`` to the fraction of that device's held-out samples accepted
        by the training device's model (0.0 when none were accepted).

    Raises:
        ValueError: If models are to be trained but the smallest device has fewer
            than two samples, so no held-out split is possible.
    """
    if not data:
        return {}

    device_ids = list(data.keys())
    num_samples = min(data[dev_id].shape[0] for dev_id in device_ids) // 2
    print(f"using {num_samples} points for each device")

    # Exactly `max_iter` training devices; a non-positive budget selects none.
    train_ids = device_ids[:max(max_iter, 0)]
    if not train_ids:
        return {}
    if num_samples < 1:
        raise ValueError("every device needs at least 2 samples to form a held-out split")

    # The split is deterministic (fixed seed), so each device's held-out part is the
    # same no matter which device is being trained on: compute it once per device.
    held_out = {}
    for dev_id in device_ids:
        _, held_out[dev_id] = train_test_split(data[dev_id], test_size=num_samples, random_state=42)

    res = {}
    for id_1 in train_ids:
        # Same seed and sizes as above, so this training part is the exact
        # complement of held_out[id_1].
        train_data, _ = train_test_split(data[id_1], test_size=num_samples, random_state=42)
        print(held_out[id_1].shape)

        # Evaluation set: the training device's own held-out rows first, then the others.
        eval_ids = [id_1] + [dev_id for dev_id in device_ids if dev_id != id_1]
        test_data = np.concatenate([held_out[dev_id] for dev_id in eval_ids], axis=0)
        labels = np.concatenate([np.full(num_samples, dev_id) for dev_id in eval_ids], axis=0)
        print(f"Amount of test data per class {Counter(labels)}")

        print("Standardizing data")
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_data)
        test_scaled = scaler.transform(test_data)

        if pca_enabled:
            print("Using PCA")
            pca_component_num = 100
            pca = PCA(n_components=pca_component_num, random_state=random_state)
            pca.fit(train_scaled)
            train_scaled = pca.transform(train_scaled)
            test_scaled = pca.transform(test_scaled)
        else:
            print("NOT using PCA")

        # KMeans.transform returns the distance to every centroid; the row-wise
        # minimum is the distance to the nearest one.
        kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=random_state)
        kmeans.fit(train_scaled)

        train_distances = kmeans.transform(train_scaled).min(axis=1)
        threshold = np.percentile(train_distances, percentile)
        print(f"KMeans threshold is {threshold}")

        test_distances = kmeans.transform(test_scaled).min(axis=1)
        accepted_counts = Counter(labels[test_distances < threshold])
        print(accepted_counts)

        # Store a rate for every device, including explicit zeros, so that tabulating
        # the result (e.g. with pd.DataFrame) shows 0 rather than a missing value.
        res[id_1] = Counter({dev_id: accepted_counts[dev_id] / num_samples for dev_id in device_ids})
    return res

In [7]:
%%capture
# Run the analysis on nbaiot_data with default settings. %%capture silences the
# progress messages that confusion_matrix prints.
res = confusion_matrix(nbaiot_data)

In [8]:
# Square table keyed by the devices present in `res`:
# final_res[train_id][test_id] is the value res holds for that (train, test) pair.
final_res = {
    train_id: {test_id: res[train_id][test_id] for test_id in res}
    for train_id in res
}

In [None]:
# Annotated heatmap of final_res: DataFrame columns (x axis) are the outer keys,
# i.e. the training devices; rows (y axis) are the devices being tested.
nbaiot_fig, nbaiot_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pd.DataFrame(final_res), annot=True, ax=nbaiot_ax)
nbaiot_ax.set_xlabel("Train Device")
nbaiot_ax.set_ylabel("Test Device")
nbaiot_fig.tight_layout()
# NOTE(review): absolute, machine-specific output path.
nbaiot_fig.savefig("/data/thomas/Principals/FL/nbaiot.pdf")


In [9]:
# Load the second dataset; it is passed to confusion_matrix below.
# NOTE(review): absolute, machine-specific input path -- consider a configurable data directory.
iot_lab_data = get_astrolavos_data("/data/thomas/Principals/testcases/FL4IOT/data/IOTLab/grouped_bigram_features_big")

In [None]:
%%time
%%capture
# Time the run and silence confusion_matrix's printed progress. max_iter=8 caps how
# many devices a model is trained for.
# NOTE: this rebinds `res`, replacing the result computed from nbaiot_data above.
res = confusion_matrix(iot_lab_data, max_iter=8)

In [None]:
# Annotated heatmap of `res`. pd.DataFrame(res) puts the outer keys of `res` (the
# training devices) on the columns / x axis and the inner keys (the tested devices)
# on the rows / y axis, so the axes are labelled like the first heatmap.
iot_fig, iot_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(pd.DataFrame(res), annot=True, ax=iot_ax)
iot_ax.set_xlabel("Train Device")
iot_ax.set_ylabel("Test Device")
iot_fig.tight_layout()