In [None]:
import os
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # expose only physical GPU 1 to this process (it then appears as cuda:0)
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torch.optim as optim
from torch.autograd import Variable
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, Birch
import csv
from sklearn.mixture import GaussianMixture

from utils.LNL_DOH import Data_Process
from utils.CNN_AE_updated import CNN_AE_Classifier
from utils.model_MoCo import MoCo

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score, silhouette_samples
from scipy.optimize import linear_sum_assignment
import math

import warnings
warnings.filterwarnings("ignore")

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from geomloss import SamplesLoss

class SoftClusteringWassersteinLoss(nn.Module):
    """Soft-clustering loss with a Sinkhorn separation term between clusters.

    The module owns ``num_clusters`` learnable centroids. ``forward`` returns

    * an intra-cluster term: the softmax-weighted mean distance of every
      sample to the centroids (to be minimised), plus
    * an inter-cluster term: the negated Sinkhorn divergence between every
      pair of non-empty clusters (so minimising the total pushes clusters
      apart).

    Args:
        num_clusters: number of centroids.
        feature_dim: dimensionality of the feature vectors / centroids.
    """

    def __init__(self, num_clusters, feature_dim):
        super().__init__()
        self.num_clusters = num_clusters
        # Learnable centroids, initialised from a standard normal distribution.
        self.centroids = nn.Parameter(torch.randn(num_clusters, feature_dim))
        self.wasserstein_loss = SamplesLoss("sinkhorn", p=2, blur=0.01)

    def forward(self, features):
        """Compute the clustering loss for a batch of feature vectors.

        Args:
            features: 2-D tensor whose second dimension matches the centroids
                (it is passed to ``torch.cdist`` against ``self.centroids``).

        Returns:
            ``(total_loss, soft_assignments)`` where ``soft_assignments`` has
            one row per sample and one column per centroid.
        """
        # Keep the centroids on the same device as the incoming features.
        if self.centroids.device != features.device:
            self.centroids.data = self.centroids.data.to(features.device)

        # Distance from every sample to every centroid.
        distances = torch.cdist(features, self.centroids)

        # Soft assignment: closer centroids receive a larger weight.
        soft_assignments = F.softmax(-distances, dim=1)

        # Intra-cluster term (minimised).
        intra_cluster_loss = torch.mean(torch.sum(soft_assignments * distances, dim=1))

        # A sample is a member of a cluster when its assignment weight exceeds
        # 0.5.  Gather each cluster's members once instead of once per pair.
        confident = soft_assignments > 0.5
        members = [features[confident[:, k]] for k in range(self.num_clusters)]
        populated = [cluster for cluster in members if len(cluster) > 0]

        # Inter-cluster term (maximised, hence subtracted).  Stays the integer
        # 0 when fewer than two clusters have members.
        inter_cluster_loss = 0
        for position, cluster_i in enumerate(populated):
            for cluster_j in populated[position + 1:]:
                inter_cluster_loss -= self.wasserstein_loss(cluster_i, cluster_j)

        total_loss = intra_cluster_loss + inter_cluster_loss

        return total_loss, soft_assignments

In [4]:
# Experiment configuration shared by every cell below.
config = {
    # --- run identity / data ---
    'algorithm': 'TRACE',
    'dataset': 'DOH',
    'data': "./Malicious_TLS/DOH21_new.csv",

    # --- output and label-noise setup ---
    'savedir': './results',
    'noise_pattern': 'sym',  # 'asym' or 'sym'
    'INCV_C_list': [0.5],
    'percent': 0.7,
    # 'seed': 1,

    # --- generic training settings ---
    'batch_size': 256,
    'num_workers': 1,
    'epochs': 100,
    'adjust_lr': 1,
    'learning_rate': 1e-2,

    # --- MoCo / prototype settings ---
    'embedding_size': 128,
    'moco_queue': 8192,
    'moco_m': 0.999,
    'temperature': 0.1,
    'alpha': 0.5,
    'pseudo_th': 0.8,
    'proto_m': 0.999,
    'lr': 0.05,
    'cos': False,
    'schedule': [40, 80],
    'w_proto': 1,
    'w_inst': 1,
    'print_freq': 300,

    # --- dataset-dependent values (some are overwritten inside TRACE) ---
    'num_class': 2,
    'low_dim': 16,
    'train_size': 0,
    'val_size': 0,
    'input_dim': 120,
}

In [5]:
from sklearn.metrics import classification_report, precision_recall_fscore_support
def adjust_learning_rate(optimizer, epoch, config):
    """Set the learning rate of every parameter group for the given epoch.

    Starting from ``config['lr']``, either a cosine schedule over
    ``config['epochs']`` is applied (when ``config['cos']`` is truthy) or the
    rate is multiplied by 0.1 for each milestone in ``config['schedule']``
    that ``epoch`` has reached.
    """
    current_lr = config['lr']
    if not config['cos']:
        # Step-wise schedule: one decay factor per milestone.
        for milestone in config['schedule']:
            factor = 0.1 if epoch >= milestone else 1.
            current_lr *= factor
    else:
        # Cosine schedule.
        current_lr *= 0.5 * (1. + math.cos(math.pi * epoch / config['epochs']))

    for group in optimizer.param_groups:
        group['lr'] = current_lr

class AverageMeter(object):
    """Keeps the most recent value and the running weighted mean of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero the current value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Builds e.g. '{name} {val:f} ({avg:f})' and fills it from the
        # instance attributes.
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))

class ProgressMeter(object):
    """Prints a tab-separated progress line: prefix, batch counter, meters."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the line for batch index ``batch``."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        print('\t'.join([header, *map(str, self.meters)]))

    def _get_batch_fmtstr(self, num_batches):
        """Return a template such as '[{:3d}/100]' sized to ``num_batches``."""
        width = len(str(num_batches // 1))
        counter = '{:%dd}' % width
        return '[{}/{}]'.format(counter, counter.format(num_batches))

def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each k in ``topk``.

    Args:
        output: 2-D score tensor, one row per sample and one column per class.
        target: tensor of class indices with one entry per sample; it is
            flattened, so both ``(N,)`` and ``(N, 1)`` layouts are accepted.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk``.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk highest scores per sample, transposed to
        # (maxk, N) so that row r holds every sample's rank-r prediction.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.reshape(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # `correct` inherits the transposed (non-contiguous) layout, so a
            # plain .view(-1) raises for k > 1; reshape copies when required.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
    
def acc_and_f1(y_true, y_pred, num_cluster, csv_path):
    """Score cluster assignments against ground truth after optimal relabelling.

    Cluster ids are matched one-to-one to class ids with the Hungarian
    algorithm so that the number of agreeing samples is maximised; accuracy
    and a classification report are then computed on the relabelled
    predictions.

    Args:
        y_true: array of ground-truth labels (cast to int64).
        y_pred: array of predicted cluster ids, same size as ``y_true``.
        num_cluster: side length of the square contingency matrix; labels are
            used directly as indices into it.
        csv_path: unused; kept so existing call sites continue to work.

    Returns:
        ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report(..., digits=4)``.
    """
    y_true = y_true.astype(np.int64)
    y_pred = y_pred.astype(np.int64)
    assert y_pred.size == y_true.size

    # Contingency matrix: w[c, t] counts samples in cluster c with true class t.
    # np.add.at accumulates repeated (c, t) pairs, unlike plain fancy-index +=.
    w = np.zeros((num_cluster, num_cluster))
    np.add.at(w, (y_pred, y_true), 1)

    # linear_sum_assignment minimises cost, so turn counts into costs.
    cluster_ids, class_ids = linear_sum_assignment(w.max() - w)
    matched_accuracy = w[cluster_ids, class_ids].sum() / y_pred.size

    # Relabel the predictions with their matched class ids.
    new_y_pred = np.zeros_like(y_pred)
    for cluster_id, class_id in zip(cluster_ids, class_ids):
        new_y_pred[y_pred == cluster_id] = class_id

    report_text = classification_report(y_true, new_y_pred, digits=4)

    return matched_accuracy, report_text

In [6]:

from sklearn.metrics import classification_report




def TRACE():
    """Train the MoCo/auto-encoder classifier and evaluate it after every epoch.

    Reads the module-level ``config`` dict and the module-level ``INCV_b`` /
    ``INCV_c`` values (set by the driver loop), loads ``config['data']``,
    splits it into train/validation parts, and trains with the sum of the
    classification, prototype, reconstruction and soft-clustering losses.

    Side effects: overwrites ``config['train_size']``, ``config['val_size']``,
    ``config['num_class']`` and ``config['input_dim']``; prints one progress
    line and one validation line per epoch.

    Returns:
        ``(cls_acc, KMeans_acc)``: per-epoch validation top-1 accuracy of the
        classifier head and the per-epoch value returned by
        ``KMeans_model_evaluation``.
    """
    cls_acc = []
    KMeans_acc = []

    data = pd.read_csv(config['data'])
    class_le = LabelEncoder()
    data['Label'] = class_le.fit_transform(data['Label'])
    # An optional config['seed'] makes the split reproducible; without it the
    # split stays random, exactly as before.
    df_train = data.sample(frac=config['percent'], random_state=config.get('seed'))
    df_val = data[~data.index.isin(df_train.index)]

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)

    train_dataset = Data_Process(data=df_train,
                                 train=True,
                                 transform=transforms.ToTensor(),
                                 noise_type=config['noise_pattern'],
                                 INCV_b=INCV_b,
                                 INCV_c=INCV_c)

    val_dataset = Data_Process(data=df_val,
                               train=False,
                               transform=transforms.ToTensor(),
                               noise_type=config['noise_pattern'],
                               INCV_b=INCV_b,
                               INCV_c=INCV_c)

    config['train_size'] = len(train_dataset)
    config['val_size'] = len(val_dataset)
    config['num_class'] = len(np.unique(data['Label']))
    config['input_dim'] = 120

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=config['batch_size'],
                                               num_workers=config['num_workers'],
                                               drop_last=True,
                                               shuffle=True)

    # NOTE(review): drop_last=True also discards the final partial validation
    # batch, so validation metrics ignore up to batch_size - 1 samples.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=config['batch_size'],
                                             num_workers=config['num_workers'],
                                             drop_last=True,
                                             shuffle=False)

    ##############################################################################

    print('building model...')

    model = MoCo(CNN_AE_Classifier, config)
    model.cuda()

    criterion2 = nn.CrossEntropyLoss()
    reconstruction_criterion = nn.MSELoss()

    # Build the clustering loss ONCE.  Creating it inside the batch loop
    # re-randomised its centroids on every step and kept them out of the
    # optimizer, so they could never be learned.  The previous hard-coded
    # sizes (23 clusters, 128 features) remain the defaults.
    cluster_criterion = SoftClusteringWassersteinLoss(
        num_clusters=config.get('num_clusters', 23),
        feature_dim=config.get('cluster_feature_dim', 128),
    ).cuda()

    optimizer = torch.optim.SGD(
        list(model.parameters()) + list(cluster_criterion.parameters()),
        lr=config['lr'],
        momentum=0.9,
        weight_decay=1e-4)

    # NOTE(review): the range starts at 1, so config['epochs'] - 1 epochs are
    # run and `is_proto=(epoch > 0)` below is always True; confirm intended.
    for epoch in range(1, config['epochs']):
        print('epoch:', epoch)

        adjust_learning_rate(optimizer, epoch, config)

        batch_time = AverageMeter('Time', ':1.2f')
        data_time = AverageMeter('Data', ':1.2f')
        acc_cls = AverageMeter('Acc@Cls', ':2.2f')
        acc_proto = AverageMeter('Acc@Proto', ':2.2f')

        progress = ProgressMeter(
            len(train_loader),
            [batch_time, data_time, acc_cls, acc_proto],
            prefix="Epoch: [{}]".format(epoch))

        # ---- training ----
        model.train()
        cluster_criterion.train()
        end = time.time()

        for i, (x, target_, indexes) in enumerate(train_loader):
            # Flatten every sample to (1, features); the batch dimension is
            # read from the tensor instead of assumed from the config.
            x = x.reshape(x.size(0), 1, -1).cuda()
            target_ = target_.cuda()

            data_time.update(time.time() - end)

            # compute model output
            cls_out, target, logits, x_q, logits_proto, u = \
                model(x, target_, config, is_eval=False, is_proto=(epoch > 0))

            loss_proto = criterion2(logits_proto, target.squeeze(1))
            acc = accuracy(logits_proto, target)[0]
            acc_proto.update(acc[0])

            loss_cls = criterion2(cls_out, target.squeeze(1))
            loss_war, assignments = cluster_criterion(u)
            loss_AE = reconstruction_criterion(x, x_q)
            loss = loss_cls + config['w_proto'] * loss_proto + loss_AE + loss_war

            # log accuracy
            acc = accuracy(cls_out, target)[0]
            acc_cls.update(acc[0])

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        # One summary line per epoch (running means of the meters above).
        progress.display(len(train_loader))

        # ---- evaluation ----
        with torch.no_grad():
            print('==> Evaluation...')
            model.eval()
            top1_acc = AverageMeter("Top1")

            all_preds = []
            all_targets = []

            for batch_idx, (x, target_, indexes) in enumerate(val_loader):
                x = x.reshape(x.size(0), 1, -1).cuda()
                target_ = target_.cuda()

                outputs, _, target = model(x, target_, config, is_eval=True)
                acc1 = accuracy(outputs, target)
                top1_acc.update(acc1[0])

                # Store predictions and (flattened) targets for the epoch metrics.
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_targets.extend(target.reshape(-1).cpu().numpy())

            all_preds = np.array(all_preds)
            all_targets = np.array(all_targets)

            val_top1 = float(top1_acc.avg)

            # Macro-averaged metrics over the whole validation pass.
            precision, recall, f1, _ = precision_recall_fscore_support(
                all_targets, all_preds, average='macro')

            # NOTE(review): KMeans_model_evaluation is not defined in the
            # visible notebook cells; it must be in scope before TRACE() runs.
            val_ACC = KMeans_model_evaluation(model=model,
                                              train_dataloader=train_loader,
                                              val_dataloader=val_loader)

        print('val Acc@1 {:.2f}  macro-P {:.4f}  macro-R {:.4f}  macro-F1 {:.4f}  KMeans {}'.format(
            val_top1, precision, recall, f1, val_ACC))

        cls_acc.append(val_top1)
        KMeans_acc.append(val_ACC)

    return cls_acc, KMeans_acc

In [None]:
# Run one full TRACE experiment per configured INCV_c value.  INCV_b and
# INCV_c are module-level names that TRACE() reads when building its datasets.
for INCV_c in config['INCV_C_list']:
    # Asymmetric noise uses INCV_b = 0; otherwise INCV_b mirrors INCV_c.
    INCV_b = 0 if config['noise_pattern'] == 'asym' else INCV_c

    print("INCV_c:", INCV_c)
    TRACE()