In [11]:
# from utils.data_process import load_keel_dataset, keel_dataset_preprocess, get_keel_dataset_batch_list, batch_list_to_df
# from utils.train import init_net, init_optimizers, train
# from utils.generate_new_samples import generate_new_1_sample

import tensorflow as tf
import os


# 设置GPU相关信息
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"  # 指定哪块GPU训练
# config = tf.compat.v1.ConfigProto()
# # 设置最大占有GPU不超过显存的80%（可选）
# # config.gpu_options.per_process_gpu_memory_fraction=0.8
# config.gpu_options.allow_growth = True  # 设置动态分配GPU内存
# sess = tf.compat.v1.Session(config=config)


# Latent encoding dimension enc_dim, i.e. the dimensionality of z
enc_dim = 64
# Number of training epochs
epochs = 300
# Batch size
batch_size = 16
# Weights alpha_1 and alpha_2 used when computing the VAE loss
vae_loss_paramaters_dict = {"alpha_1": 1.5, "alpha_2": 0.1}
# Base path for the dataset files and for saving performance metrics
project_path = './'


## data process

In [12]:
"""
数据处理
    加载keel数据集数据
    数据预处理
    根据交叉验证获取训练集和测试集batch
    将一个batch转为array, df
    将一个batch_list转为array, df
"""


import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier


def load_keel_dataset(dataset_path, keel_dataset_name):
    """
    Load a KEEL dataset stored as CSV into a pandas.DataFrame.

    The file location is the plain string concatenation
    ``dataset_path + keel_dataset_name``, so ``dataset_path`` must already end
    with a path separator.

    :param dataset_path : directory prefix of the dataset (including the trailing separator)
    :param keel_dataset_name : file name of the KEEL dataset to load
    :return df : the loaded data; every column that can be parsed as numbers is
                 numeric, all other columns are returned unchanged
    """

    def _to_numeric_or_keep(column):
        # Equivalent to the deprecated ``pd.to_numeric(column, errors='ignore')``:
        # convert when possible, otherwise hand the column back untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = pd.read_csv(dataset_path + keel_dataset_name)
    df = df.apply(_to_numeric_or_keep)

    return df


def keel_dataset_preprocess(keel_dataset_df):
    """
    Add a nearest-neighbour based weight column named 'knn' to the dataset.

    A 10-NN model is fitted on all columns except the last (features), with the
    last column as label.  For every sample the labels of its 10 nearest
    neighbours (queried against the same data, so the sample itself is part of
    the fitted set) are summed, and the weight

        1.0 * (10 - label_sum + 1) / (label_sum + 1)

    is stored.  With 0/1 labels this equals
    (majority neighbours + 1) / (minority neighbours + 1)
    -- assumes label 1 marks the minority class; TODO confirm against the data.

    NOTE(review): label re-mapping, one-hot encoding and normalisation are NOT
    performed here, although an earlier version of this docstring listed them.

    :param : keel_dataset_df : dataset as pandas.DataFrame, label in the last column.
                               The frame is modified in place ('knn' is inserted
                               just before the label column).

    :return : keel_dataset_df : the same DataFrame object, with the 'knn' column added
    """
    n_neighbors = 10
    features = np.array(keel_dataset_df.iloc[:, :-1])
    labels = np.array(keel_dataset_df.iloc[:, -1])

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(features, labels)

    # One batched neighbour query for all rows instead of one kneighbors() call per row.
    neighbor_index = knn.kneighbors(features, return_distance=False)
    label_sum = labels[neighbor_index].sum(axis=1)
    weights = 1.0 * (n_neighbors - label_sum + 1) / (label_sum + 1)

    keel_dataset_df.insert(loc=len(keel_dataset_df.columns) - 1, column='knn', value=weights)

    return keel_dataset_df


def get_keel_dataset_batch_list(keel_dataset_df, batch_size=16, my_random_state=2022):
    """
    Split the preprocessed dataset into training / validation tf.data batches.

    For each class (label 0 and label 1, read from the last column) one quarter
    of the samples is drawn at random as validation data; the rest is used for
    training.  The draw is seeded with ``my_random_state`` so that the split is
    reproducible.

    :param : keel_dataset_df : preprocessed KEEL dataset (pandas.DataFrame, label in the last column)
    :param : batch_size : requested minority-class batch size; it is reduced to
                          ``n_minority_train // 4 + 1`` when ``batch_size * 4``
                          exceeds the number of minority training samples
    :param : my_random_state : seed for the validation sampling (None = unseeded)

    :return : batch_list : [train_batch_list, val_batch_list] where
                           train_batch_list = [majority_dataset, minority_dataset]
                           and val_batch_list is a single tf.data.Dataset
    :raises ValueError : if the dataset contains no samples with label 1
    """
    label_column = keel_dataset_df.columns[-1]

    # Per-class frames with a fresh 0..n-1 index so positions and labels coincide.
    data_0_df = keel_dataset_df[keel_dataset_df[label_column] == 0].reset_index(drop=True)
    data_1_df = keel_dataset_df[keel_dataset_df[label_column] == 1].reset_index(drop=True)

    # Dedicated, seeded generator: the split no longer depends on global RNG state.
    rng = random.Random(my_random_state)
    class_0_all_index = list(range(data_0_df.shape[0]))
    class_1_all_index = list(range(data_1_df.shape[0]))
    class_0_val_index = rng.sample(class_0_all_index, data_0_df.shape[0] // 4)
    class_1_val_index = rng.sample(class_1_all_index, data_1_df.shape[0] // 4)

    # Everything not drawn for validation is training data (sorted for a stable order).
    class_0_train_index = sorted(set(class_0_all_index).difference(class_0_val_index))
    class_1_train_index = sorted(set(class_1_all_index).difference(class_1_val_index))

    if not class_1_train_index:
        raise ValueError("keel_dataset_df contains no minority-class samples (label 1)")

    train_dataset_0_df = data_0_df.iloc[class_0_train_index]
    train_dataset_1_df = data_1_df.iloc[class_1_train_index]
    val_dataset_df = pd.concat([data_0_df.iloc[class_0_val_index],
                                data_1_df.iloc[class_1_val_index]], axis=0)

    # Integer majority/minority ratio of the training set, rounded up by one.
    rate = (len(class_0_train_index) // len(class_1_train_index)) + 1

    # Shrink the batch size when there are too few minority samples for 4 full batches.
    if batch_size * 4 > len(class_1_train_index):
        batch_size = (len(class_1_train_index) // 4) + 1

    def _as_dataset(frame, size):
        # Features = all columns but the last, labels = last column.
        # shuffle() is applied after batch(), i.e. whole batches are shuffled;
        # no seed is passed to shuffle(), so batch order is not fixed by my_random_state.
        features = frame.iloc[:, :-1].values
        labels = frame.iloc[:, -1].values
        return tf.data.Dataset.from_tensor_slices((features, labels)).batch(size).shuffle(100)

    # The majority class uses a batch size `rate` times larger than the minority class.
    train_batch_list = [_as_dataset(train_dataset_0_df, batch_size * rate),
                        _as_dataset(train_dataset_1_df, batch_size)]
    val_batch_list = _as_dataset(val_dataset_df, batch_size)

    batch_list = [train_batch_list, val_batch_list]

    return batch_list


def batch_to_array(one_batch):
    """
    Convert one (features, labels) batch into a single 2-D np.array.

    The labels are turned into a column and appended to the right of the
    feature matrix, so the label ends up in the last column.
    """
    features, labels = one_batch[0], one_batch[1]

    feature_matrix = np.asarray(features)
    # (n,) -> (n, 1) so it can be concatenated column-wise
    label_column = np.expand_dims(np.transpose(np.asarray(labels)), axis=1)

    return np.concatenate((feature_matrix, label_column), axis=1)


def batch_list_to_array(one_batch_list):
    """
    Convert an iterable of batches [batch_1, batch_2, ..., batch_n] into one np.array.

    Every batch is converted with batch_to_array and the results are stacked
    vertically (row-wise).
    """
    per_batch_arrays = [batch_to_array(one_batch=batch) for batch in one_batch_list]
    return np.concatenate(per_batch_arrays, axis=0)


def batch_to_df(one_batch, columns):
    """
    Convert one batch into a pandas.DataFrame with the given column names.

    The batch is first flattened to a 2-D array by batch_to_array.
    """
    return pd.DataFrame(batch_to_array(one_batch=one_batch), columns=columns)


def batch_list_to_df(one_batch_list, columns):
    """
    Convert an iterable of batches [batch_1, batch_2, ..., batch_n] into a
    pandas.DataFrame with the given column names.
    """
    # Stack all batches into one array, then wrap it in a DataFrame.
    stacked = batch_list_to_array(one_batch_list=one_batch_list)
    return pd.DataFrame(stacked, columns=columns)





In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

def select_1_samples_by_lr(data_0_df, data_1_df, migrate_data_1_df):
    """
    Select migrated minority samples with the help of a logistic-regression model.

    A LogisticRegression classifier is fitted on the original majority and
    minority samples (features = all columns but the last, label = last
    column).  Every migrated sample is scored with ``predict_proba(...)[:, -1]``
    (probability of the last class known to the model -- presumably label 1;
    confirm against the data).  The samples are ranked by ascending score and
    the first ``len(data_0_df) - len(data_1_df)`` of them are returned.

    None of the input frames is modified.

    :param : data_0_df : original majority-class samples
    :param : data_1_df : original minority-class samples
    :param : migrate_data_1_df : generated (migrated) minority samples; NaN features are scored as 0
    :return : the selected migrated samples, indexed 0..m-1, with the same columns as
              ``migrate_data_1_df`` (if fewer candidates than needed exist, all are returned)
    """
    original_data = pd.concat([data_0_df, data_1_df], axis=0)
    train_x = original_data.iloc[:, :-1]
    train_y = original_data.iloc[:, -1]

    lr = LogisticRegression()
    lr.fit(train_x, train_y)

    test_x = migrate_data_1_df.iloc[:, :-1].fillna(0)
    proba_array = lr.predict_proba(test_x)[:, -1]

    # Number of samples required to balance the two classes (never negative).
    m = max(data_0_df.shape[0] - data_1_df.shape[0], 0)

    # assign() works on a copy, so the caller's frame keeps its columns and order.
    ranked = (
        migrate_data_1_df.assign(proba=proba_array)
        .sort_values(by="proba", ascending=True)
        .reset_index(drop=True)
    )
    # Drop the helper score column again before returning.
    selected_migrate_data_1_df = ranked.iloc[:m].drop(columns='proba')

    return selected_migrate_data_1_df


def select_1_samples_by_rf(data_0_df, data_1_df, migrate_data_1_df):
    """
    Select migrated minority samples with the help of a random-forest model.

    A RandomForestClassifier is fitted on the original majority and minority
    samples (features = all columns but the last, label = last column).  Every
    migrated sample is scored with ``predict_proba(...)[:, -1]`` (probability of
    the last class known to the model -- presumably label 1; confirm against
    the data).  The samples are ranked by ascending score and the first
    ``len(data_0_df) - len(data_1_df)`` of them are returned.

    None of the input frames is modified.

    :param : data_0_df : original majority-class samples
    :param : data_1_df : original minority-class samples
    :param : migrate_data_1_df : generated (migrated) minority samples; NaN features are scored as 0
    :return : the selected migrated samples, indexed 0..m-1, with the same columns as
              ``migrate_data_1_df`` (if fewer candidates than needed exist, all are returned)
    """
    original_data = pd.concat([data_0_df, data_1_df], axis=0)
    train_x = original_data.iloc[:, :-1]
    train_y = original_data.iloc[:, -1]

    rf = RandomForestClassifier()
    rf.fit(train_x, train_y)

    test_x = migrate_data_1_df.iloc[:, :-1].fillna(0)
    proba_array = rf.predict_proba(test_x)[:, -1]

    # Number of samples required to balance the two classes (never negative).
    m = max(data_0_df.shape[0] - data_1_df.shape[0], 0)

    # assign() works on a copy, so the caller's frame keeps its columns and order.
    ranked = (
        migrate_data_1_df.assign(proba=proba_array)
        .sort_values(by="proba", ascending=True)
        .reset_index(drop=True)
    )
    # Drop the helper score column again before returning.
    selected_migrate_data_1_df = ranked.iloc[:m].drop(columns='proba')

    return selected_migrate_data_1_df


def select_1_samples_by_svm(data_0_df, data_1_df, migrate_data_1_df):
    """
    Select migrated minority samples with the help of an SVM.

    An ``SVC(probability=True)`` is fitted on the original majority and minority
    samples (features = all columns but the last, label = last column).  Every
    migrated sample is scored with ``predict_proba(...)[:, -1]`` (probability of
    the last class known to the model -- presumably label 1; confirm against
    the data).  The samples are ranked by ascending score and the first
    ``len(data_0_df) - len(data_1_df)`` of them are returned.

    None of the input frames is modified.

    :param : data_0_df : original majority-class samples
    :param : data_1_df : original minority-class samples
    :param : migrate_data_1_df : generated (migrated) minority samples; NaN features are scored as 0
    :return : the selected migrated samples, indexed 0..m-1, with the same columns as
              ``migrate_data_1_df`` (if fewer candidates than needed exist, all are returned)
    """
    original_data = pd.concat([data_0_df, data_1_df], axis=0)
    train_x = original_data.iloc[:, :-1]
    train_y = original_data.iloc[:, -1]

    svm = SVC(probability=True)
    svm.fit(train_x, train_y)

    test_x = migrate_data_1_df.iloc[:, :-1].fillna(0)
    proba_array = svm.predict_proba(test_x)[:, -1]

    # Number of samples required to balance the two classes (never negative).
    m = max(data_0_df.shape[0] - data_1_df.shape[0], 0)

    # assign() works on a copy, so the caller's frame keeps its columns and order.
    ranked = (
        migrate_data_1_df.assign(proba=proba_array)
        .sort_values(by="proba", ascending=True)
        .reset_index(drop=True)
    )
    # Drop the helper score column again before returning.
    selected_migrate_data_1_df = ranked.iloc[:m].drop(columns='proba')

    return selected_migrate_data_1_df


def generate_new_1_sample(optimized_smsg_net_dict, train_batch, columns):
    """
    Generate new minority-class samples with the trained networks and build
    balanced training sets from them.

    Every majority batch is encoded, mapped to a minority latent code by the
    mapping network, sampled and decoded; the decoded samples get label 1.0.
    Three selectors (LR, RF, SVM) then each pick a subset of the generated
    samples, which is appended to the original majority and minority samples.

    The networks are run in inference mode (``is_training=False``) so that
    Dropout does not randomly zero activations while generating samples.

    :param : optimized_smsg_net_dict : trained networks; the keys 'enc', 'dec' and 'map_net' are used
    :param : train_batch : [majority_batches, minority_batches] of the training set
    :param : columns : column names used for the DataFrames that are built

    :return : dict with keys 'lr', 'rf' and 'svm'; each value is a DataFrame holding the
              original majority samples, the original minority samples and the generated
              samples chosen by the respective selector
    """

    # Trained encoder, decoder and latent mapping network
    enc = optimized_smsg_net_dict.get('enc')
    dec = optimized_smsg_net_dict.get('dec')
    map_net = optimized_smsg_net_dict.get('map_net')

    # Generated ("migrated") minority batches, one per majority batch
    migrate_data_1_batch_list = []

    for data_0_batch in train_batch[0]:
        x_0, y_0 = data_0_batch
        # The networks expect float32 input
        x_0 = tf.cast(x_0, dtype=tf.float32)
        # Encode the majority samples
        mean_and_stddec_0 = enc(x_0, is_training=False)

        # Map the majority code to a minority code and split it into its two distribution parameters
        mean_and_stddec_0_1 = map_net(mean_and_stddec_0, is_training=False)
        mean_0_1, stddec_0_1 = tf.split(mean_and_stddec_0_1, 2, 1)
        z_0_1 = sample_z(mean_0_1, stddec_0_1)
        # Decode the migrated code into a generated minority sample
        x_0_1 = dec(z_0_1, is_training=False)
        # All generated samples are labelled 1.0
        y_0_1 = tf.constant(1.0, shape=y_0.shape)
        migrate_data_1_batch_list.append((x_0_1, y_0_1))

    # DataFrames of the original majority, original minority and generated minority samples
    data_0_df = batch_list_to_df(train_batch[0], columns=columns)
    data_1_df = batch_list_to_df(train_batch[1], columns=columns)
    migrate_data_1_df = batch_list_to_df(migrate_data_1_batch_list, columns=columns)

    selectors = {
        'lr': select_1_samples_by_lr,
        'rf': select_1_samples_by_rf,
        'svm': select_1_samples_by_svm,
    }

    balanced_train_dataset_df_dict = {}
    for method_name, select_samples in selectors.items():
        # Each selector gets private deep copies so none of them can affect the frames used by the others.
        selected_df = select_samples(data_0_df.copy(deep=True),
                                     data_1_df.copy(deep=True),
                                     migrate_data_1_df.copy(deep=True))
        balanced_train_dataset_df_dict[method_name] = pd.concat(
            [data_0_df, data_1_df, selected_df], axis=0)

    return balanced_train_dataset_df_dict


## train

In [14]:
"""
smsg_pro_pro训练
"""
from tensorflow.keras import layers
from tensorflow import keras
import numpy as np
import tensorflow as tf
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, f1_score
from sklearn.svm import SVC

def init_net(x_dim, enc_dim):
    """
    Build all networks of the SMSG_PRO model.

    NOTE(review): an identical ``init_net`` is defined again further down in
    this cell; the later definition is the one in effect after the cell runs.

    :param : x_dim : dimensionality of a sample
    :param : enc_dim : dimensionality of the latent encoding

    :return : smsg_net_dict = {'enc': encoder, 'dec': decoder, 'map_net': mapping network,
                               'dis': discriminator built with x_dim,
                               'dis_map': discriminator built with enc_dim}
    """
    return {
        'enc': Encoder(enc_dim),                  # encoder
        'dec': Decoder(x_dim),                    # decoder
        'map_net': Map_Net(enc_dim),              # latent mapping network
        'dis': Discriminator(x_dim),              # discriminator for samples
        'dis_map': Discriminator(x_dim=enc_dim),  # discriminator for latent codes
    }


def init_optimizers(learning_rate=2e-4, beta_1=0.5):
    """
    Create one independent Adam optimizer per network.

    NOTE(review): an identical ``init_optimizers`` is defined again further down
    in this cell; the later definition is the one in effect after the cell runs.

    :param : learning_rate default = 2e-4
    :param : beta_1 default = 0.5

    :return : optimizers_dict with the keys 'optimizer_enc', 'optimizer_dec',
              'optimizer_map_net', 'optimizer_dis' and 'optimizer_dis_map'
    """
    optimizer_names = ('optimizer_enc', 'optimizer_dec', 'optimizer_map_net',
                       'optimizer_dis', 'optimizer_dis_map')

    return {
        name: tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=beta_1)
        for name in optimizer_names
    }
    
def batch_list_to_df(one_batch_list, columns):
    """
    Convert an iterable of batches [batch_1, batch_2, ..., batch_n] into a
    pandas.DataFrame with the given column names.

    NOTE(review): this repeats the ``batch_list_to_df`` defined in the
    data-processing cell above and shadows it once this cell has run.
    """
    return pd.DataFrame(
        batch_list_to_array(one_batch_list=one_batch_list),
        columns=columns,
    )


class Discriminator(tf.keras.Model):
    """
    Fully connected discriminator.

    Five hidden blocks (Dense -> LeakyReLU(0.2) -> Dropout(0.5)) with widths
    x_dim * 2, 4096, 1024, 256 and 64, followed by a single sigmoid output unit.
    """

    def __init__(self, x_dim):
        super(Discriminator, self).__init__()
        self.x_dim = x_dim

        # Widths of the hidden Dense layers, in order.
        hidden_widths = (self.x_dim * 2, 256 * 16, 128 * 8, 64 * 4, 64)

        stack = []
        for width in hidden_widths:
            stack.extend([
                layers.Dense(width, kernel_initializer='glorot_uniform'),
                layers.LeakyReLU(0.2),
                layers.Dropout(0.5),
            ])

        # Output layer: one unit squashed to (0, 1)
        stack.append(layers.Dense(1, kernel_initializer='glorot_uniform'))
        stack.append(layers.Activation('sigmoid'))

        self.Line = tf.keras.Sequential(stack)

    def call(self, x, is_training=1):
        """Return the discriminator output for ``x``; ``is_training`` is forwarded as Keras' ``training`` flag."""
        return self.Line(x, training=is_training)

def sample_z(mean, stddev):
    """
    Sample z with the reparameterization trick:

        z = mean + eps * exp(stddev),   eps ~ N(0, 1)

    ``stddev`` is exponentiated before it scales the noise, i.e. it is treated
    as the logarithm of the standard deviation (get_vae_prior_loss makes the
    same assumption through ``tf.exp(stddec)``).

    The noise is drawn with ``tf.random.normal`` (unit standard deviation).
    ``tf.random_normal_initializer()`` without arguments must not be used here:
    its default standard deviation is 0.05, not 1.

    NOTE(review): ``sample_z`` is defined a second time further down in this
    cell; the later definition is the one in effect after the cell runs.

    :param : mean : mean of the latent distribution
    :param : stddev : log standard deviation of the latent distribution
    :return : z, same shape as ``mean``
    """
    # tf.shape() also works when the batch dimension is only known at run time.
    eps = tf.random.normal(shape=tf.shape(mean), dtype=mean.dtype)

    return mean + eps * tf.exp(stddev)

def get_svm_performance_dict(train_datatset_df, test_datatset_df):
    """
    Train an SVM on the balanced training set and score it on the test set.

    Features are all columns but the last; the label is the last column.

    :param : train_datatset_df : balanced training set
    :param : test_datatset_df : original test set

    :return : {'method_name': 'svm', 'f1': F1 of class index 1, 'gmean': sqrt(recall_0 * recall_1)}
    """
    train_x, train_y = np.array(train_datatset_df.iloc[:, :-1]), np.array(train_datatset_df.iloc[:, -1])
    test_x, test_y = np.array(test_datatset_df.iloc[:, :-1]), np.array(test_datatset_df.iloc[:, -1])

    # Fit the classifier and predict the test labels
    classifier = SVC()
    classifier.fit(train_x, train_y)
    predicted_y = classifier.predict(test_x)

    # Per-class scores; index 1 is the second class in sorted label order
    per_class_f1 = f1_score(test_y, predicted_y, average=None)
    my_f1 = per_class_f1[1]
    per_class_recall = recall_score(test_y, predicted_y, average=None)
    my_gmean = math.sqrt(per_class_recall[0] * per_class_recall[1])

    return {'method_name': 'svm', 'f1': my_f1, 'gmean': my_gmean}


def get_lr_performance_dict(train_datatset_df, test_datatset_df):
    """
    Train a logistic regression on the balanced training set and score it on the test set.

    Features are all columns but the last; the label is the last column.

    :param : train_datatset_df : balanced training set
    :param : test_datatset_df : original test set

    :return : {'method_name': 'lr', 'f1': F1 of class index 1, 'gmean': sqrt(recall_0 * recall_1)}
    """
    train_x, train_y = np.array(train_datatset_df.iloc[:, :-1]), np.array(train_datatset_df.iloc[:, -1])
    test_x, test_y = np.array(test_datatset_df.iloc[:, :-1]), np.array(test_datatset_df.iloc[:, -1])

    # Fit the classifier and predict the test labels
    classifier = LogisticRegression()
    classifier.fit(train_x, train_y)
    predicted_y = classifier.predict(test_x)

    # Index 0 is the F1 of class 0, index 1 the F1 of class 1; class 1 (minority) is the one wanted here
    per_class_f1 = f1_score(test_y, predicted_y, average=None)
    my_f1 = per_class_f1[1]
    per_class_recall = recall_score(test_y, predicted_y, average=None)
    my_gmean = math.sqrt(per_class_recall[0] * per_class_recall[1])

    return {'method_name': 'lr', 'f1': my_f1, 'gmean': my_gmean}


def get_rf_performance_dict(train_datatset_df, test_datatset_df):
    """
    Train a random forest on the balanced training set and score it on the test set.

    Features are all columns but the last; the label is the last column.

    :param : train_datatset_df : balanced training set
    :param : test_datatset_df : original test set

    :return : {'method_name': 'rf', 'f1': F1 of class index 1, 'gmean': sqrt(recall_0 * recall_1)}
    """
    train_x, train_y = np.array(train_datatset_df.iloc[:, :-1]), np.array(train_datatset_df.iloc[:, -1])
    test_x, test_y = np.array(test_datatset_df.iloc[:, :-1]), np.array(test_datatset_df.iloc[:, -1])

    # Fit the random forest and predict the test labels
    classifier = RandomForestClassifier()
    classifier.fit(train_x, train_y)
    predicted_y = classifier.predict(test_x)

    # Per-class scores; index 1 is the second class in sorted label order
    per_class_f1 = f1_score(test_y, predicted_y, average=None)
    my_f1 = per_class_f1[1]
    per_class_recall = recall_score(test_y, predicted_y, average=None)
    my_gmean = math.sqrt(per_class_recall[0] * per_class_recall[1])

    return {'method_name': 'rf', 'f1': my_f1, 'gmean': my_gmean}


def get_three_methods_performance_df(train_datatset_df_dict, test_datatset_df):
    """
    Evaluate SVM, LR and RF on their respective balanced training sets and
    collect the results in one pandas.DataFrame.

    The 'f1' and 'gmean' columns are numeric.  (Building the frame from a
    transposed mixed str/float array, as done previously, turned every value
    into a string.)

    :param : train_datatset_df_dict : balanced training sets, keyed by 'svm', 'lr' and 'rf'
    :param : test_datatset_df : original test set

    :return : three_methods_performance_df
               method       f1          gmean
                svm      f1-value     gmean-value
                lr       f1-value     gmean-value
                rf       f1-value     gmean-value
    """
    # Row order of the result: svm, lr, rf
    performance_dicts = [
        get_svm_performance_dict(train_datatset_df_dict.get('svm'), test_datatset_df),
        get_lr_performance_dict(train_datatset_df_dict.get('lr'), test_datatset_df),
        get_rf_performance_dict(train_datatset_df_dict.get('rf'), test_datatset_df),
    ]

    three_methods_performance_df = pd.DataFrame(
        {
            'method': [perf.get('method_name') for perf in performance_dicts],
            'f1': [perf.get('f1') for perf in performance_dicts],
            'gmean': [perf.get('gmean') for perf in performance_dicts],
        },
        columns=['method', 'f1', 'gmean'],
    )

    return three_methods_performance_df

class Map_Net(tf.keras.Model):
    """
    Mapping network.

    Maps the latent code of majority-class samples to a latent code for
    minority-class samples.  The output has ``enc_dim * 2`` values per sample
    (callers split it into two halves, the two distribution parameters).
    """
    def __init__(self, enc_dim):
        # Initialise the Keras base class before assigning any attribute.
        super(Map_Net, self).__init__()
        self.enc_dim = enc_dim * 2
        self.Map_Net0 = tf.keras.Sequential([

            keras.layers.Dense(units = 1024, kernel_initializer = 'uniform', activation = 'elu',
                kernel_regularizer=keras.regularizers.l2(0.01)),
            keras.layers.Dropout(0.5),  # dropout against over-fitting

            keras.layers.Dense(units = 512, kernel_initializer = 'uniform', activation = 'elu',
                kernel_regularizer=keras.regularizers.l2(0.01)),
            keras.layers.Dropout(0.5),  # dropout against over-fitting

            keras.layers.Dense(units = 256, kernel_initializer = 'uniform', activation = 'elu',
                kernel_regularizer=keras.regularizers.l2(0.01)),
            keras.layers.Dropout(0.5),  # dropout against over-fitting

            layers.Dense(self.enc_dim),  # fully connected layer with enc_dim * 2 output units
            layers.Dropout(0.5),  # dropout against over-fitting
            layers.LeakyReLU(0.2)  # LeakyReLU activation
        ])
        self.Map_Net1 = tf.keras.Sequential([
            layers.Dense(self.enc_dim, activation='tanh'),
        ])
    def call(self, input_enc, is_training=1):
        """
        :param : input_enc : latent code of the majority-class samples
        :param : is_training : forwarded as Keras' ``training`` flag (controls Dropout)
        :return : output_enc : resulting latent code for minority-class samples
        """
        temp = self.Map_Net0(input_enc, training=is_training)
        output_enc = self.Map_Net1(temp, training=is_training)

        return output_enc


class Encoder(tf.keras.Model):
    """
    Encoder network.

    The input is reshaped to (22, 8, 30), i.e. every sample must carry
    22 * 8 * 30 = 5280 values.  It is passed through one Conv2D layer, flattened
    and reduced by three regularised Dense/Dropout stages (1024, 256, 128
    units).  A final tanh Dense layer outputs ``enc_dim * 2`` values per sample.
    """

    def __init__(self, enc_dim):
        super(Encoder, self).__init__()
        self.enc_dim = enc_dim * 2

        def regularized_dense(units):
            # ELU Dense layer with 'uniform' initialisation and L2(0.01) weight penalty
            return layers.Dense(units = units, kernel_initializer = 'uniform', activation = 'elu',
                                kernel_regularizer=keras.regularizers.l2(0.01))

        # Feature extractor: reshape -> convolution -> flatten
        feature_layers = [
            layers.Reshape((22, 8, 30)),
            layers.Conv2D(16, (2, 2), strides=(1, 1), name='conv1', input_shape = (22, 8, 30)),
            layers.Flatten(),  # flatten the conv output for the Dense layers
        ]
        # Dense stages, each followed by dropout against over-fitting
        for units in (1024, 256, 128):
            feature_layers.append(regularized_dense(units))
            feature_layers.append(layers.Dropout(0.5))
        feature_layers.append(layers.LeakyReLU(0.2))

        self.Encoder0 = tf.keras.Sequential(feature_layers)
        self.Encoder1 = tf.keras.Sequential([
            layers.Dense(self.enc_dim, activation='tanh'),
        ])

    def call(self, x, is_training=1):
        """Encode ``x``; ``is_training`` is forwarded as Keras' ``training`` flag."""
        hidden = self.Encoder0(x, training=is_training)
        return self.Encoder1(hidden, training=is_training)


# Decoder
class Decoder(tf.keras.Model):
    """
    Decoder network.

    Dense(128) -> LeakyReLU -> reshape to (8, 16) -> two Conv1D layers ->
    Dropout -> Flatten -> tanh Dense layer with ``x_dim`` outputs.
    """

    def __init__(self, x_dim):
        super(Decoder, self).__init__()
        self.x_dim = x_dim

        decoder_layers = [
            # Expand the latent code to 128 values and view them as 8 steps x 16 channels
            layers.Dense(128),
            layers.LeakyReLU(0.2),
            layers.Reshape((8, 16)),
            # Convolutional stage
            layers.Conv1D(32, 3, padding='same'),
            layers.Conv1D(16, 3),
            layers.Dropout(0.5),
            layers.Flatten(),
            # Project to the sample dimension, values in (-1, 1)
            layers.Dense(self.x_dim, activation='tanh'),
        ]
        self.Decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x, is_training=1):
        """Decode ``x``; ``is_training`` is forwarded as Keras' ``training`` flag."""
        return self.Decoder(x, training=is_training)


def sample_z(mean, stddev):
    """
    Sample z with the reparameterization trick:

        z = mean + eps * exp(stddev),   eps ~ N(0, 1)

    ``stddev`` is exponentiated before it scales the noise, i.e. it is treated
    as the logarithm of the standard deviation (get_vae_prior_loss makes the
    same assumption through ``tf.exp(stddec)``).

    The noise is drawn with ``tf.random.normal`` (unit standard deviation).
    ``tf.random_normal_initializer()`` without arguments must not be used here:
    its default standard deviation is 0.05, not 1.

    :param : mean : mean of the latent distribution
    :param : stddev : log standard deviation of the latent distribution
    :return : z, same shape as ``mean``
    """
    # tf.shape() also works when the batch dimension is only known at run time.
    eps = tf.random.normal(shape=tf.shape(mean), dtype=mean.dtype)

    return mean + eps * tf.exp(stddev)


def get_vae_prior_loss(mean, stddec):
    """
    Prior (KL divergence) loss of the VAE.

    :param : mean : mean of the latent distribution
    :param : stddec : logarithm of the standard deviation of the latent distribution

    :return : vae_prior_loss : -mean(0.5 * (2 * stddec - exp(stddec)^2 - mean^2 + 1))
    """
    variance = tf.square(tf.exp(stddec))
    kl_terms = 2 * stddec - variance - tf.square(mean) + 1
    return - tf.reduce_mean(0.5 * kl_terms)


def get_vae_likelihood_loss(original_x, generated_x):
    """
    Likelihood (reconstruction) loss of the VAE: mean squared error.

    :param : original_x : original samples
    :param : generated_x : samples produced by the VAE

    :return : vae_likelihood_loss : mean of the squared element-wise differences
    """
    reconstruction_error = original_x - generated_x
    return tf.reduce_mean(tf.square(reconstruction_error))


def get_vae_loss(mean, stddec, original_x, generated_x, alpha_1=1.5, alpha_2=0.1):
    """
    Total VAE loss: weighted sum of the prior loss and the likelihood loss.

    :param : mean : mean of the latent distribution
    :param : stddec : second latent parameter, passed on to get_vae_prior_loss
    :param : original_x : original samples
    :param : generated_x : samples produced by the VAE
    :param : alpha_1 : weight of the prior loss, default = 1.5
    :param : alpha_2 : weight of the likelihood loss, default = 0.1

    :return : loss : alpha_1 * prior loss + alpha_2 * likelihood loss
    """
    prior_term = alpha_1 * get_vae_prior_loss(mean=mean, stddec=stddec)
    likelihood_term = alpha_2 * get_vae_likelihood_loss(original_x=original_x,
                                                        generated_x=generated_x)

    return (prior_term) + (likelihood_term)

def init_net(x_dim, enc_dim):
    """
    Build all networks of the SMSG_PRO model.

    NOTE(review): this repeats the ``init_net`` defined earlier in this cell and
    replaces it once the cell has run.

    :param : x_dim : dimensionality of a sample
    :param : enc_dim : dimensionality of the latent encoding

    :return : smsg_net_dict = {'enc': encoder, 'dec': decoder, 'map_net': mapping network,
                               'dis': discriminator built with x_dim,
                               'dis_map': discriminator built with enc_dim}
    """
    smsg_net_dict = {}
    smsg_net_dict['enc'] = Encoder(enc_dim)                  # encoder
    smsg_net_dict['dec'] = Decoder(x_dim)                    # decoder
    smsg_net_dict['map_net'] = Map_Net(enc_dim)              # latent mapping network
    smsg_net_dict['dis'] = Discriminator(x_dim)              # discriminator for samples
    smsg_net_dict['dis_map'] = Discriminator(x_dim=enc_dim)  # discriminator for latent codes
    return smsg_net_dict


def init_optimizers(learning_rate=2e-4, beta_1=0.5):
    """
    Create one independent Adam optimizer per network.

    NOTE(review): this repeats the ``init_optimizers`` defined earlier in this
    cell and replaces it once the cell has run.

    :param : learning_rate default = 2e-4
    :param : beta_1 default = 0.5

    :return : optimizers_dict with the keys 'optimizer_enc', 'optimizer_dec',
              'optimizer_map_net', 'optimizer_dis' and 'optimizer_dis_map'
    """

    def new_adam():
        # Every network gets its own optimizer instance with the same hyper-parameters.
        return tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=beta_1)

    optimizers_dict = {}
    for network_name in ('enc', 'dec', 'map_net', 'dis', 'dis_map'):
        optimizers_dict['optimizer_' + network_name] = new_adam()

    return optimizers_dict


def train(smsg_net_dict, optimizers_dict, train_batch, val_batch, vae_loss_paramaters_dict, columns, epochs=300):
    """
    Train the SMSG_PRO networks for one cross-validation fold and return the
    networks carrying the weights of the epoch that scored best on validation.

    After every epoch a balanced training set is generated with the current
    networks and scored against the validation set; the epoch with the highest
    summed score wins (the first one on ties). The weights of that epoch are
    restored into the networks before returning.

    Note: the networks in ``smsg_net_dict`` are trained in place; the returned
    dict references the same network objects.

    :param : smsg_net_dict : dict of SMSG_PRO networks ('enc', 'dec', 'map_net', 'dis', 'dis_map')
    :param : optimizers_dict : dict of optimizers, one per network
    :param : train_batch : training batches of this fold; train_batch[0] holds the majority-class
                           batches and train_batch[1] the minority-class batches
    :param : val_batch : validation batches of this fold
    :param : vae_loss_paramaters_dict : weights of the VAE loss {'alpha_1':alpha_1_value, 'alpha_2':alpha_2_value}
    :param : columns : column names used when building DataFrames from batches / generated samples
    :param : epochs : number of training epochs (must be >= 1)

    :return : optimized_smsg_net_dict : {'enc':enc, 'dec':dec, 'map_net':map_net, 'dis':dis} holding the
                                        weights of the best validation epoch
    :raises ValueError : if epochs is smaller than 1
    """
    if epochs < 1:
        # Without at least one epoch there is no validation score to select on.
        raise ValueError('epochs must be at least 1, got ' + str(epochs))

    # Fetch all SMSG_PRO networks and their optimizers
    enc = smsg_net_dict.get('enc')
    dec = smsg_net_dict.get('dec')
    map_net = smsg_net_dict.get('map_net')
    dis = smsg_net_dict.get('dis')
    dis_map = smsg_net_dict.get('dis_map')
    optimizer_enc = optimizers_dict.get('optimizer_enc')
    optimizer_dec = optimizers_dict.get('optimizer_dec')
    optimizer_map_net = optimizers_dict.get('optimizer_map_net')
    optimizer_dis = optimizers_dict.get('optimizer_dis')
    optimizer_dis_map = optimizers_dict.get('optimizer_dis_map')

    # Weights of the prior and likelihood terms of the VAE loss
    alpha_1 = vae_loss_paramaters_dict.get('alpha_1')
    alpha_2 = vae_loss_paramaters_dict.get('alpha_2')

    val_performance_list = []
    best_epoch_index = None  # 0-based index of the best epoch seen so far
    best_weights = None      # {'enc': [...], 'dec': [...], ...} weight copies of that epoch

    for epoch in range(epochs):
        # Majority-class and minority-class batches are consumed pairwise
        for data_0_batch, data_1_batch in zip(train_batch[0], train_batch[1]):
            x_0, _ = data_0_batch
            x_1, _ = data_1_batch

            # Cast the inputs to tf.float32
            x_0 = tf.cast(x_0, dtype=tf.float32)
            x_1 = tf.cast(x_1, dtype=tf.float32)

            # One tape per network: each tape is used exactly once below
            with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, \
                    tf.GradientTape() as map_net_tape, tf.GradientTape() as dis_tape, \
                    tf.GradientTape() as dis_map_tape:
                # Majority-class reconstruction
                mean_and_stddec_0 = enc(x_0)
                # Split the encoder output in half along axis 1 into the two distribution parameters
                mean_0, stddec_0 = tf.split(mean_and_stddec_0, 2, 1)
                z_0 = sample_z(mean_0, stddec_0)
                x_0_0 = dec(z_0)  # majority sample reconstructed from its own code

                # Majority -> minority migration through the mapping network
                mean_and_stddec_0_1 = map_net(mean_and_stddec_0)
                mean_0_1, stddec_0_1 = tf.split(mean_and_stddec_0_1, 2, 1)
                z_0_1 = sample_z(mean_0_1, stddec_0_1)  # intended to match z_1
                x_0_1 = dec(z_0_1)  # migrated minority sample

                # Minority-class reconstruction
                mean_and_stddec_1 = enc(x_1)
                mean_1, stddec_1 = tf.split(mean_and_stddec_1, 2, 1)
                z_1 = sample_z(mean_1, stddec_1)
                x_1_1 = dec(z_1)  # minority sample reconstructed from its own code

                # Original, reconstructed and migrated samples through the sample discriminator
                dis_output_1 = dis(x=x_1)
                # Not part of any active loss; the call is kept so the sequence of
                # forward passes through `dis` is unchanged.
                dis_output_1_1 = dis(x=x_1_1)
                dis_output_0_1 = dis(x=x_0_1)

                # Real and migrated codes through the latent-code discriminator
                dis_map_output_1 = dis_map(x=z_1)
                dis_map_output_0_1 = dis_map(x=z_0_1)

                # VAE losses (alpha_1 * prior + alpha_2 * likelihood)
                vae_0_0_likelihood_loss = get_vae_likelihood_loss(original_x=x_0, generated_x=x_0_0)
                vae_1_1_likelihood_loss = get_vae_likelihood_loss(original_x=x_1, generated_x=x_1_1)
                vae_0_0_loss = get_vae_loss(mean=mean_0, stddec=stddec_0, original_x=x_0, generated_x=x_0_0,
                                            alpha_1=alpha_1, alpha_2=alpha_2)
                vae_1_1_loss = get_vae_loss(mean=mean_1, stddec=stddec_1, original_x=x_1, generated_x=x_1_1,
                                            alpha_1=alpha_1, alpha_2=alpha_2)

                # Decoder's adversarial loss on migrated samples
                dec_gen_migration_loss = - tf.math.log(tf.reduce_mean(dis_output_0_1))

                # Consistency between the batch-mean migrated code and the batch-mean minority code
                code_consistency_loss = tf.reduce_mean(
                    tf.square(tf.reduce_mean(z_1, axis=0) - tf.reduce_mean(z_0_1, axis=0)))

                # Squared distance between batch means: migrated samples vs. minority / majority samples
                loss_distance_0_1_to_1 = tf.reduce_mean(
                    tf.square(tf.reduce_mean(x_0_1, axis=0) - tf.reduce_mean(x_1, axis=0)))
                loss_distance_0_1_to_0 = tf.reduce_mean(
                    tf.square(tf.reduce_mean(x_0_1, axis=0) - tf.reduce_mean(x_0, axis=0)))
                loss_distance = 2 * loss_distance_0_1_to_1 + 1 * loss_distance_0_1_to_0

                # Mapping network's adversarial loss on migrated codes
                loss_map_enc = - tf.math.log(tf.reduce_mean(dis_map_output_0_1))

                # Per-network objectives
                enc_loss = vae_0_0_loss + vae_1_1_loss
                dec_loss = vae_0_0_likelihood_loss + vae_1_1_likelihood_loss + dec_gen_migration_loss + loss_distance
                map_net_loss = code_consistency_loss + loss_distance + loss_map_enc
                dis_loss = - (tf.math.log(tf.reduce_mean(dis_output_1))
                              + tf.math.log(1 - tf.reduce_mean(dis_output_0_1)))
                dis_map_loss = - (tf.math.log(tf.reduce_mean(dis_map_output_1))
                                  + tf.math.log(1 - tf.reduce_mean(dis_map_output_0_1)))

            # Compute gradients and update each network with its own optimizer
            grads = enc_tape.gradient(enc_loss, enc.trainable_variables)
            optimizer_enc.apply_gradients(zip(grads, enc.trainable_variables))

            grads = dec_tape.gradient(dec_loss, dec.trainable_variables)
            optimizer_dec.apply_gradients(zip(grads, dec.trainable_variables))

            grads = map_net_tape.gradient(map_net_loss, map_net.trainable_variables)
            optimizer_map_net.apply_gradients(zip(grads, map_net.trainable_variables))

            grads = dis_tape.gradient(dis_loss, dis.trainable_variables)
            optimizer_dis.apply_gradients(zip(grads, dis.trainable_variables))

            grads = dis_map_tape.gradient(dis_map_loss, dis_map.trainable_variables)
            optimizer_dis_map.apply_gradients(zip(grads, dis_map.trainable_variables))

        # Score the networks of this epoch on the validation set
        optimized_smsg_net_dict = {'enc': enc, 'dec': dec, 'map_net': map_net, 'dis': dis}
        balanced_train_dataset_df = generate_new_1_sample(optimized_smsg_net_dict, train_batch, columns=columns)
        val_dataset_df = batch_list_to_df(val_batch, columns=columns)

        val_dataset_performance_sum = np.array(
            get_three_methods_performance_df(balanced_train_dataset_df, val_dataset_df).iloc[:, 1:]).astype(
            'double').sum()
        val_performance_list.append(val_dataset_performance_sum)

        # The network objects are shared across epochs, so keeping a reference per
        # epoch would always point at the latest weights. Copy the weights instead,
        # and only for the best epoch so far (strict '>' keeps the first on ties).
        # Assumes the networks expose Keras-style get_weights()/set_weights() - TODO confirm.
        if best_epoch_index is None or val_dataset_performance_sum > val_performance_list[best_epoch_index]:
            best_epoch_index = epoch
            best_weights = {net_name: net.get_weights() for net_name, net in optimized_smsg_net_dict.items()}

        # Progress message every 100 epochs
        if (epoch + 1) % 100 == 0:
            print('epoch' + str(epoch + 1) + '训练完成！', end='\t')

    # Restore the weights of the epoch that performed best on the validation set
    max_index = best_epoch_index
    print('最好的epoch是：' + str(max_index))
    optimized_smsg_net_dict = {'enc': enc, 'dec': dec, 'map_net': map_net, 'dis': dis}
    for net_name, net in optimized_smsg_net_dict.items():
        net.set_weights(best_weights[net_name])

    return optimized_smsg_net_dict


## generate new samples

## main

In [15]:
# Five cross-validation splits: for every dataset file of every split, train
# SMSG and write the balanced training sets to disk.
for i in range(1, 6):
    # Both paths keep their trailing '/' because they are joined with file
    # names by plain string concatenation (here and in load_keel_dataset).
    dataset_path = project_path + 'eeg/split/' + str(i) + '/train/'
    os.makedirs(dataset_path, exist_ok=True)
    balanced_dataset_path = project_path + 'eeg/split/' + str(i) + '/balanced_train/'
    os.makedirs(balanced_dataset_path, exist_ok=True)

    # Dataset files of this split. Hidden entries and sub-directories (for
    # example Jupyter's .ipynb_checkpoints) are not datasets and are skipped;
    # sorting makes the processing order deterministic.
    keel_dataset_name_list = sorted(
        entry_name for entry_name in os.listdir(dataset_path)
        if not entry_name.startswith('.') and os.path.isfile(os.path.join(dataset_path, entry_name)))

    for keel_dataset_name in keel_dataset_name_list:
        # Load the dataset
        my_df = load_keel_dataset(dataset_path, keel_dataset_name)
        # Preprocess the dataset
        my_df_after_preprocess = keel_dataset_preprocess(my_df)
        print('数据集预处理成功', end='\t')
        # Feature dimension of x (all columns but one), needed to initialise the networks
        x_dim = my_df_after_preprocess.shape[1] - 1
        columns = my_df_after_preprocess.columns
        # Training and validation batches
        batch_list = get_keel_dataset_batch_list(my_df_after_preprocess, batch_size=batch_size, my_random_state=2022)
        train_batch = batch_list[0]
        val_batch = batch_list[1]
        print('\t取出训练batches成功', end='\t')
        # Initialise the SMSG networks and their optimizers
        smsg_net_dict = init_net(x_dim=x_dim, enc_dim=enc_dim)
        optimizers_dict = init_optimizers(learning_rate=2e-4, beta_1=0.5)
        print('\t初始化网络和优化器成功', end='\t')
        # Train and obtain the optimised networks
        optimized_smsg_net_dict = train(smsg_net_dict=smsg_net_dict, optimizers_dict=optimizers_dict,
                                        train_batch=train_batch, val_batch=val_batch,
                                        vae_loss_paramaters_dict=vae_loss_paramaters_dict,
                                        columns=columns, epochs=epochs)
        print('\t网络优化成功', end='\t')
        # Generate new minority-class samples to obtain the balanced dataset(s)
        balanced_train_dataset_df = generate_new_1_sample(optimized_smsg_net_dict, train_batch, columns=columns)
        # Strip the real extension instead of assuming it is exactly four characters long
        dataset_stem = os.path.splitext(keel_dataset_name)[0]
        for method_name in ('rf', 'lr', 'svm'):
            balanced_train_dataset_df.get(method_name).to_csv(
                balanced_dataset_path + dataset_stem + '_' + method_name + '.csv', index=False)

        print('\t生成新的少数类样本成功', end='\t')
        print(keel_dataset_name + "结束！！！")

print('jieshu')

数据集预处理成功		取出训练batches成功		初始化网络和优化器成功	

InvalidArgumentError: Exception encountered when calling layer "reshape_4" "                 f"(type Reshape).

{{function_node __wrapped__Reshape_device_/job:localhost/replica:0/task:0/device:CPU:0}} Input to reshape is a tensor with 1188 values, but the requested shape has 190080 [Op:Reshape]

Call arguments received by layer "reshape_4" "                 f"(type Reshape):
  • inputs=tf.Tensor(shape=(36, 33), dtype=float32)