In [1]:
from models import AutoEncoder
import torch
import torch.nn as nn

In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import seaborn as sns
import sys
from tqdm.auto import tqdm

In [3]:
%run extract_features.ipynb

## Input Data Read

In [4]:
def get_astrolavos_data(directory, min_samples=1000):
    """Load every pickled sample file in ``directory`` into a dict keyed by device id.

    The device id is the part of the file name that precedes the first
    underscore. Files with fewer than ``min_samples`` rows are reported and
    skipped. Files are visited in sorted path order; if several files share
    an id, the one visited last wins.

    Args:
        directory: Folder whose entries are all treated as pickle files.
        min_samples: Minimum size of the first array dimension a file must
            have in order to be kept.

    Returns:
        dict mapping device id (str) to the loaded ``numpy.ndarray``.
    """
    # Local imports: these modules are not imported by the notebook's import
    # cells, so do not depend on another notebook leaking them into the kernel.
    import glob
    import os
    import pickle

    res = {}
    # Sorted so the dict's insertion order is reproducible across runs.
    for path in sorted(glob.glob(f"{directory}/*")):
        device_id = os.path.basename(path).split("_")[0]
        # SECURITY: pickle.load can execute arbitrary code; only point this
        # function at trusted files.
        with open(path, 'rb') as handle:
            loaded_data = np.array(pickle.load(handle))
        if loaded_data.shape[0] < min_samples:
            # Report the path itself, not the (already closed) file handle.
            print(f"{path} not enough data")
        else:
            res[device_id] = loaded_data
            print(device_id, loaded_data.shape[0])
    return res

In [5]:
#res = get_astrolavos_data("/data/thomas/Principals/FL/deployment/iotlab_data_filtered")
# NOTE(review): get_astrolavos_data2 is not defined in the cells of this notebook;
# presumably it is provided by the %run of extract_features.ipynb — confirm.
raw_data = get_astrolavos_data2("/data/thomas/Principals/FL/deployment/iotlab_data_bigrams")

In [9]:
def data_preprocess(data, pca_enabled=True, max_iter=sys.maxsize, pca_component_num=100):
    """Build per-device train/test sets, standardized with a per-device scaler.

    Every device contributes the same number of training and test rows:
    half the row count of the smallest device. For each device, the scaler
    (and PCA, when enabled) is fitted on that device's training rows only.
    The device's test set is its own held-out rows followed by the held-out
    rows of every other device, passed through the same scaler/PCA.

    Args:
        data: dict mapping device id to a 2-D array (rows are samples).
        pca_enabled: When truthy, project the standardized data with PCA.
        max_iter: Unused; kept so existing calls keep working.
        pca_component_num: Number of PCA components (only used when
            ``pca_enabled`` is truthy).

    Returns:
        dict mapping device id to a dict with the keys ``"train_data"``,
        ``"test_data"`` and ``"test_labels"``.
    """
    num_samples = int(min(samples.shape[0] for samples in data.values()) / 2)
    print(f"using {num_samples} points for each device")
    train_data = {}
    test_data = {}
    labels = {}
    for device_id in data.keys():
        print(f"Getting train/test data for {device_id}")
        train_data[device_id], test_data[device_id], _, labels[device_id] = train_test_split(
            data[device_id],
            np.full(len(data[device_id]), device_id),
            test_size=num_samples,
            train_size=num_samples,
            random_state=42,
        )
        print(f"train data for {device_id} {train_data[device_id].shape}")
        print(f"test data for {device_id} {test_data[device_id].shape}")
        print("-------------------------")

    res = {}
    print("Generating train/test split")
    for device_id in tqdm(data.keys()):
        # Own held-out rows first, then every other device in key order.
        order = [device_id] + [other for other in data.keys() if other != device_id]
        entry = {
            "train_data": train_data[device_id],
            # One concatenate per device instead of growing the array pairwise.
            "test_data": np.concatenate([test_data[key] for key in order], axis=0),
            "test_labels": np.concatenate([labels[key] for key in order], axis=0),
        }

        # In-place scaling is acceptable here: each array is transformed exactly once.
        scaler = StandardScaler(copy=False)
        print(f"Standardizing data for {device_id} ..")
        entry["train_data"] = scaler.fit_transform(entry["train_data"])
        entry["test_data"] = scaler.transform(entry["test_data"])

        if pca_enabled:
            print(f"using PCA with {pca_component_num} components")
            pca = PCA(copy=True, iterated_power='auto', n_components=pca_component_num, random_state=None, whiten=False, svd_solver='auto', tol=0.0)
            entry["train_data"] = pca.fit_transform(entry["train_data"])
            entry["test_data"] = pca.transform(entry["test_data"])
        res[device_id] = entry
    print("Done")

    return res
                

In [10]:
def data_preprocess_new(data, pca_enabled=True, max_iter=sys.maxsize, pca_component_num=100):
    """Build per-device train sets and one shared test set, using a global scaler.

    Every device contributes the same number of training and test rows:
    half the row count of the smallest device. A single StandardScaler (and
    PCA, when enabled) is fitted on the concatenation of all devices'
    training rows and then applied to each device's training rows and to
    the concatenated test rows of all devices.

    Args:
        data: dict mapping device id to a 2-D array (rows are samples).
        pca_enabled: When truthy, project the standardized data with PCA.
        max_iter: Unused; kept so existing calls keep working.
        pca_component_num: Number of PCA components (only used when
            ``pca_enabled`` is truthy).

    Returns:
        Tuple ``(output, full_training_data)``. ``output`` maps device id to
        a dict with the keys ``"train_data"``, ``"test_data"`` and
        ``"test_labels"``; the test array and label array are the same
        objects for every device. ``full_training_data`` is the unscaled
        float64 concatenation of all training rows.
    """
    num_samples = int(min(samples.shape[0] for samples in data.values()) / 2)
    print(f"using {num_samples} points for each device")
    train_data = {}
    test_data = {}
    labels = {}
    for device_id in data.keys():
        print(f"Getting train/test data for {device_id}")
        train_data[device_id], test_data[device_id], _, labels[device_id] = train_test_split(
            data[device_id],
            np.full(len(data[device_id]), device_id),
            test_size=num_samples,
            train_size=num_samples,
            random_state=42,
        )
        print(f"train data for {device_id} {train_data[device_id].shape}, test data for {device_id} {test_data[device_id].shape}")
        print("-------------------------")

    print("Generating train/test split")
    device_ids = list(data.keys())

    # Concatenate once per array instead of growing it inside a loop; the
    # float64 cast keeps the dtype the caller previously received.
    full_training_data = np.concatenate(
        [train_data[device_id] for device_id in device_ids], axis=0
    ).astype(np.float64, copy=False)
    print(f"Concatenated training data has shape {full_training_data.shape}")

    full_test_data = np.concatenate(
        [test_data[device_id] for device_id in device_ids], axis=0
    ).astype(np.float64, copy=False)
    # Built from the label arrays only, so no float placeholder is mixed in.
    full_test_labels = np.concatenate([labels[device_id] for device_id in device_ids], axis=0)
    print(f"Concatenated test data has shape {full_test_data.shape}")

    # copy=True (the default) is required: with copy=False the shared test
    # matrix would be re-scaled in place on every call to transform().
    scaler = StandardScaler()
    scaler.fit(full_training_data)

    print("Scaling data...")
    # The test matrix is identical for every device, so scale it once.
    scaled_test_data = scaler.transform(full_test_data)
    output = {}
    for device_id in device_ids:
        output[device_id] = {
            "train_data": scaler.transform(train_data[device_id]),
            "test_data": scaled_test_data,
            "test_labels": full_test_labels,
        }
    print("Scaling done")

    if pca_enabled:
        print("PCA enabled")
        pca = PCA(copy=True, iterated_power='auto', n_components=pca_component_num, random_state=None, whiten=False, svd_solver='auto', tol=0.0)
        # Fit on the *scaled* training data so the projection matches the
        # scaled arrays it is applied to below.
        pca.fit(scaler.transform(full_training_data))
        projected_test_data = pca.transform(scaled_test_data)
        for device_id in device_ids:
            output[device_id]["train_data"] = pca.transform(output[device_id]["train_data"])
            output[device_id]["test_data"] = projected_test_data

    return output, full_training_data

In [None]:
%%time
# Per-device preprocessing with PCA disabled, so the feature columns are only standardized.
input_d = data_preprocess(raw_data, pca_enabled=False)
#input_d, full_training_data = data_preprocess_new(raw_data, pca_enabled=False)

In [None]:

# Loss *class* (not an instance): every use site instantiates it with the
# reduction it needs.
LOSS_FUNC = nn.KLDivLoss
# LOSS_FUNC = nn.BCELoss

def get_local_threshold(model, train_data):
    """Compute a cutoff of mean + 3 standard deviations of the per-sample loss.

    The model is moved to the CPU and put in eval mode. The loss built from
    ``LOSS_FUNC`` is evaluated between ``model(x)`` and ``x`` for every batch,
    averaged over dimension 1 to get one value per sample, and the cutoff is
    derived from those values.

    Args:
        model: A ``torch.nn.Module`` whose output has the same shape as its input.
        train_data: Iterable of 2-D batches (tensors) to score.

    Returns:
        0-dimensional tensor holding ``mean + 3 * std`` of the per-sample loss.
    """
    device = 'cpu'
    model.to(device)
    model.eval()
    # reduction="none" keeps one value per element. The default ("mean")
    # collapses each batch to a scalar, which torch.cat and mean(dim=1)
    # below cannot process.
    threshold_func = LOSS_FUNC(reduction="none")
    batch_losses = []
    # Pure evaluation: do not build autograd graphs for every batch.
    with torch.no_grad():
        for x in train_data:
            x = x.to(device).float()
            batch_losses.append(threshold_func(model(x), x))
    per_sample_loss = torch.cat(batch_losses).mean(dim=1)
    threshold_global = torch.mean(per_sample_loss) + 3 * torch.std(per_sample_loss)
    return threshold_global


# Train one autoencoder per device, derive its loss cutoff from the training
# data, then count how many test samples of every device fall under the cutoff.
# res[train_id][test_label] = (#samples at or below the cutoff) / (#training samples).
res = {}
for id in input_d.keys():
    res[id] = {}
    # Only the first `input_size` feature columns are fed to the model.
    #input_size = input_d[id]['train_data'].shape[1]
    input_size=100
    sample_num = input_d[id]['train_data'].shape[0]
    print(f"input size {input_size}, num samples {sample_num}")
    model = AutoEncoder(input_size)
    learning_rate =  0.03#0.03
    device='cpu'
    epochs = 3
    batch_size = 64

    model.to(device)
    model.train()

    criterion = LOSS_FUNC().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epoch_loss = []
    train_data = torch.utils.data.DataLoader(input_d[id]['train_data'][:,:input_size], batch_size=batch_size, shuffle=False, num_workers=0)
    for epoch in range(epochs):
        batch_loss = []
        # Size the progress bar from the loader instead of assuming 5000 samples.
        for batch_idx, x in tqdm(enumerate(train_data), total=len(train_data)):
            x = x.to(device).float()
            optimizer.zero_grad()
            decode = model(x)
            loss = criterion(decode, x)
            loss.backward()
            optimizer.step()
            batch_loss.append(loss.item())
        epoch_loss.append(sum(batch_loss) / len(batch_loss))
        # Prints the running mean over all epochs so far, not this epoch's loss.
        print("Client \tEpoch: {}\tLoss: {:.6f}".format(epoch, sum(epoch_loss) / len(epoch_loss))) 
    threshold = get_local_threshold(model, train_data)


    model.to(device)
    model.eval()
    # Element-wise loss is required so that mean(dim=1) yields one value per
    # sample; the default reduction returns a scalar per batch.
    threshold_func = LOSS_FUNC(reduction="none")

    # np.unique gives a deterministic (sorted) label order, unlike set().
    for i in np.unique(input_d[id]['test_labels']):

        idx = np.where(input_d[id]['test_labels'] == i)
        test_data = input_d[id]['test_data'][idx]
        test_data = torch.utils.data.DataLoader(test_data[:,:input_size], batch_size=batch_size, shuffle=False, num_workers=0)
        same = 0
        different = 0
        with torch.no_grad():
            for batch_idx, x in enumerate(test_data):
                x = x.to(device).float()
                diff = threshold_func(model(x), x)
                mse = diff.mean(dim=1)
                # Plain ints keep the counters valid even for an empty loader.
                different += int((mse > threshold).sum())
                same += int((mse <= threshold).sum())
        print(f"train({id}) vs test({i}): same class={same}, different class={different}")
        print("--------------------------")
        res[id][i] = same / sample_num
        
                
    

In [None]:
# Set up a single-device experiment for 'NestCamera' using an MSE criterion.
id = 'NestCamera'
# Resets this device's entry in the shared results dict.
res[id] = {}
# This value is overwritten by the next line, so the model always gets 100 inputs.
input_size = input_d[id]['train_data'].shape[1]
input_size=100
sample_num = input_d[id]['train_data'].shape[0]
print(f"input size {input_size}, num samples {sample_num}")
model = AutoEncoder(input_size)
learning_rate =  0.03#0.03
device='cpu'
epochs = 3
batch_size = 64

model.to(device)
model.train()

criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
epoch_loss = []
# Only the first `input_size` feature columns are loaded; batch order is fixed (shuffle=False).
train_data = torch.utils.data.DataLoader(input_d[id]['train_data'][:,:input_size], batch_size=batch_size, shuffle=False, num_workers=0)

In [None]:
# Train the model on `train_data` for `epochs` passes, using the objects
# (model, criterion, optimizer, epoch_loss) left in the kernel by an earlier cell.
for epoch in range(epochs):
    batch_loss = []
    for batch_idx, x in tqdm(enumerate(train_data)):
        x = x.to(device).float()
        optimizer.zero_grad()
        decode = model(x)
        loss = criterion(decode, x)
        loss.backward()
        optimizer.step()
        batch_loss.append(loss.item())
    epoch_loss.append(sum(batch_loss) / len(batch_loss))
    # Prints the running mean over all epochs so far, not this epoch's loss.
    print("Client \tEpoch: {}\tLoss: {:.6f}".format(epoch, sum(epoch_loss) / len(epoch_loss))) 
# NOTE(review): get_local_threshold builds its loss from LOSS_FUNC, whereas the
# criterion used for training here is nn.MSELoss — confirm this mismatch is intended.
threshold = get_local_threshold(model, train_data)

In [None]:
# Copy of `res` with its outer keys in sorted order; the inner dicts are shared, not copied.
sorted_dict = dict(sorted(res.items()))

In [None]:
# Replace every inner mapping with a copy whose keys are in sorted order.
for i in sorted_dict:
    sorted_dict[i] = dict(sorted(sorted_dict[i].items()))

In [None]:
# Outer keys of sorted_dict become the columns, inner keys become the row index.
# NOTE(review): `pd` is not imported in the cells shown here; presumably it is
# provided by the %run of extract_features.ipynb — confirm.
df = pd.DataFrame(sorted_dict)

In [None]:
# Annotated heatmap of `df`, written to a PDF at a hard-coded absolute path.
myplot = sns.heatmap(df,annot=True)
# Rotate the column tick labels so they are drawn vertically.
myplot.set_xticklabels(myplot.get_xticklabels(), rotation=90)
plt.xlabel("Train Device")
plt.ylabel("Test Device")
plt.tight_layout()
plt.savefig("/data/thomas/Principals/FL/2g.pdf") 

In [72]:
def get_local_threshold_test(model, train_data):
    """Return the per-sample mean squared error between ``model(x)`` and ``x``.

    The model is moved to the CPU and switched to eval mode. Every batch of
    ``train_data`` is cast to float, scored element-wise with MSE, and the
    scores are averaged over dimension 1.

    Args:
        model: A ``torch.nn.Module`` whose output has the same shape as its input.
        train_data: Iterable of 2-D batches (tensors).

    Returns:
        1-D tensor with one mean squared error per sample, in batch order.
    """
    cpu = 'cpu'
    model.to(cpu)
    model.eval()
    elementwise_mse = nn.MSELoss(reduction="none")

    def _score(batch):
        # Element-wise squared error of the model output against its own input.
        batch = batch.to(cpu).float()
        return elementwise_mse(model(batch), batch)

    errors = [_score(batch) for batch in train_data]
    return torch.cat(errors).mean(dim=1)

In [None]:
# Train an MSE autoencoder on all feature columns and collect the per-sample
# errors returned by get_local_threshold_test.
# The trailing `break` stops after the first device in input_d.
res = {}
for id in input_d.keys():
    res[id] = {}
    input_size = input_d[id]['train_data'].shape[1]
    sample_num = input_d[id]['train_data'].shape[0]
    print(f"input size {input_size}, num samples {sample_num}")
    model = AutoEncoder(input_size)
    learning_rate =  0.03#0.03
    device='cpu'
    epochs = 10
    batch_size = 64

    model.to(device)
    model.train()

    criterion = nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epoch_loss = []
    train_data = torch.utils.data.DataLoader(input_d[id]['train_data'], batch_size=batch_size, shuffle=False, num_workers=0)
    for epoch in range(epochs):
        batch_loss = []
        for batch_idx, x in enumerate(train_data):
            x = x.to(device).float()
            optimizer.zero_grad()
            decode = model(x)
            loss = criterion(decode, x)
            loss.backward()
            optimizer.step()
            batch_loss.append(loss.item())
        epoch_loss.append(sum(batch_loss) / len(batch_loss))
        # Prints the running mean over all epochs so far, not this epoch's loss.
        print("Client \tEpoch: {}\tLoss: {:.6f}".format(epoch, sum(epoch_loss) / len(epoch_loss))) 
    # Despite the name, `threshold` holds one error value per training sample, not a single cutoff.
    threshold = get_local_threshold_test(model, train_data)
    break

In [98]:
# Cutoff at mean + 3 standard deviations of the values held in `threshold`.
tr = torch.mean(threshold) + 3* torch.std(threshold)

In [None]:
# Display the cutoff.
tr

In [86]:
# NumPy view of the values in `threshold`, detached from the autograd graph.
r = threshold.detach().numpy()

In [None]:
# Mean + 2 standard deviations of r, computed with NumPy.
np.std(r)*2 + np.mean(r)

In [None]:
# Mean of r.
np.mean(r) 

In [None]:
# Fraction of the values in r that lie below mean + 1 standard deviation.
r[r < (np.mean(r) + 1*np.std(r))].shape[0] / r.shape[0]

In [None]:
# Fraction of the values in r that lie below mean + 2 standard deviations.
r[r < (np.mean(r) + 2*np.std(r))].shape[0] / r.shape[0]

In [None]:
# Fraction of the values in r that lie below mean + 3 standard deviations.
r[r < (np.mean(r) + 3*np.std(r))].shape[0] / r.shape[0]

In [None]:
# Histogram of the values held in `threshold`.
n, bins, patches = plt.hist(threshold.detach().numpy())
# Save before show(): once show() has rendered the figure (as the inline
# backend does), a later savefig() writes an empty canvas.
plt.savefig("/data/thomas/Principals/FL/mse.pdf") 
plt.show()