In [1]:
import numpy as np
import pandas as pd


def result_show(src_data, imputed_data, M_matrix):
    '''
    Compare imputed values with the ground truth at the missing positions.

    Prints the first 10 true values and the first 10 imputed values taken
    from the missing positions, then returns the total squared error over
    all missing positions.

    Parameters
    ----------
    src_data : array-like, shape (batch, dim)
        Complete ground-truth data.
    imputed_data : array-like, shape (batch, dim)
        Data after imputation.
    M_matrix : array-like, shape (batch, dim)
        Mask in which 0 marks a missing entry; every other value is treated
        as observed.

    Returns
    -------
    float
        Sum of squared differences between truth and imputation over the
        missing entries. This is a sum, not a mean, and it is not normalised
        per column, so columns with a large numeric scale dominate it.
    '''
    # Convert everything to ndarrays first: boolean-mask indexing on a
    # DataFrame keeps the 2-D shape (NaN-filled) instead of selecting entries.
    missing = np.asarray(M_matrix) == 0
    src_imputed = np.asarray(src_data)[missing]
    gen_imputed = np.asarray(imputed_data)[missing]
    print("原始数据",src_imputed[:10])
    print("插补数据",gen_imputed[:10])
    squared_error = np.sum((src_imputed - gen_imputed) ** 2)
    return float(squared_error)

def get_missing(data, p_miss, random_seed=0):
    '''
    Simulate missing entries in a complete dataset.

    Every entry is dropped independently of the data values: a uniform
    random number in [0, 1) is drawn per entry and the entry is kept only
    when that number is greater than the column's missing probability.
    Dropped entries are replaced with NaN in the returned copy; the input
    is not modified.

    Parameters
    ----------
    data : pandas.DataFrame or array-like, shape (num, Dim)
        Complete data.
    p_miss : float or array-like of shape (Dim,)
        Probability that an entry is missing. A scalar applies to every
        column; a vector gives one probability per column.
    random_seed : int, default 0
        Seed of the private random generator used for the draws.

    Returns
    -------
    missing_data : same container type as ``data``
        Copy of ``data`` with NaN at the missing positions. Integer/boolean
        ndarrays are converted to float so that NaN can be stored.
    Missing : numpy.ndarray of float, shape (num, Dim)
        Mask matrix: 1 marks an observed entry, 0 a missing one.
    '''
    is_frame = isinstance(data, pd.DataFrame)
    values = data if is_frame else np.asarray(data)
    num, Dim = values.shape

    # One threshold per column; a scalar p_miss is broadcast to all columns.
    p_miss_vec = np.broadcast_to(np.asarray(p_miss, dtype=float), (Dim,))

    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.uniform, without reseeding the global generator.
    rng = np.random.RandomState(random_seed)
    # Drawn as (Dim, num) and transposed so that column i receives the i-th
    # consecutive batch of `num` draws, exactly like a column-by-column loop.
    draws = rng.uniform(0., 1., size=(Dim, num)).T
    present = draws > p_miss_vec
    Missing = present.astype(float)

    if is_frame:
        # where() returns a new frame and upcasts columns that must hold NaN.
        missing_data = data.where(present)
    else:
        if values.dtype.kind in 'fcO':
            missing_data = values.copy()
        else:
            # Integer/boolean arrays cannot store NaN; astype() also copies.
            missing_data = values.astype(np.float64)
        missing_data[~present] = np.nan
    return missing_data, Missing

### 根据完整数据集模拟得到缺失数据集

In [None]:
import numpy as np
import pandas as pd
# Load the complete dataset and mask each entry with probability 0.3.
data = pd.read_csv('Letter清洗后.csv')
miss_data, miss_matrix = get_missing(data, p_miss=0.3)

# Persist the first `index` rows of the masked data, the ground truth and
# the mask matrix as a small batch.
index = 100
miss_data.head(index).to_csv('缺失数据min_batch.csv', index=False)
data.head(index).to_csv('真实数据min_batch.csv', index=False)
pd.DataFrame(miss_matrix).head(index).to_csv('缺失矩阵min_batch.csv', index=False)

### 多种方法插补缺失数据
- 参看网站: https://blog.csdn.net/qq_38958113/article/details/98220246

In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
import miceforest as mf
import numpy as np
import pandas as pd
import sys
# Put the local VAE project on the import path so its modules can be imported.
# NOTE(review): this is an absolute, machine-specific path; the import below
# only resolves where that directory exists — consider a configurable root.
sys.path.append(r'C:\王源权\西南财经大学\硕士毕业论文\模型代码\VAE_pytorchLightning')
from dataModule.dataset1 import FlatDataset

# Stack the 'src_data' field of the first `number` samples into one array
# (the ground truth), then mask each entry with probability 0.3.
data_set = FlatDataset('无标签Spam.csv')
number = 20
src_data = np.stack([data_set[row_id]['src_data'] for row_id in range(number)])
miss_data, miss_matrix = get_missing(src_data, p_miss=0.3, random_seed=99)

# miss_data = pd.read_csv('无标签缺失数据min_batch.csv').to_numpy()
# src_data = pd.read_csv('无标签真实数据min_batch.csv').to_numpy()
# miss_matrix = pd.read_csv('无标签缺失矩阵min_batch.csv').to_numpy()

#### 采用列均值填补

In [14]:
# Baseline: fill every NaN with the mean of its column.
# For discrete features the filled values could be rounded with np.around.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_data = imp_mean.fit_transform(miss_data)
result_show(src_data, imputed_data, miss_matrix)

原始数据 [0.64 0.   0.64 0.   1.93 0.   0.   0.   0.   0.  ]
插补数据 [0.17777778 0.49846154 0.28714286 0.03307692 1.97352941 0.39363636
 0.20214286 0.         0.019375   0.        ]


3890825.575453561

#### 采用回归器方式进行插补

In [15]:
# Regression-based imputation with scikit-learn's IterativeImputer.
imp = IterativeImputer(random_state=0, max_iter=36)
# fit() returns the estimator itself, so fit and transform are chained here;
# the two steps are kept separate rather than switched to fit_transform.
imputed_data = imp.fit(miss_data).transform(miss_data)
result_show(src_data, imputed_data, miss_matrix)



原始数据 [0.64 0.   0.64 0.   1.93 0.   0.   0.   0.   0.  ]
插补数据 [ 0.55594938  0.48360971  0.28772599  0.03655715  1.92279175 -0.38549468
  0.12365756  0.          0.055217    0.        ]


899086.8336186797

#### 采用knn插值方法

In [16]:
# k-nearest-neighbours imputation; n_neighbors is a hyper-parameter that
# still needs tuning.
imputer_knn = KNNImputer(n_neighbors=3)
imputed_data = imputer_knn.fit_transform(miss_data)
result_show(src_data, imputed_data, miss_matrix)

原始数据 [0.64 0.   0.64 0.   1.93 0.   0.   0.   0.   0.  ]
插补数据 [0.23       0.21       0.33333333 0.         2.48       0.
 0.14333333 0.         0.         0.        ]


3551686.127585561

#### 使用多重插补方法miceforest
- 参考网站：https://blog.csdn.net/upluck/article/details/111868990
- mf.ampute_data(miss_data, perc=0.25, random_state=1991)可以根据比例制造缺失数据集

In [17]:
# Multiple imputation with miceforest: the kernel holds 4 imputed datasets
# and keeps the values of every iteration.
kernel = mf.ImputationKernel(miss_data, datasets=4, save_all_iterations=True, random_state=1991)
kernel.mice(10)
# Evaluate only dataset 0 of the 4; this step takes a while.
imputed_data = kernel.complete_data(0)
result_show(src_data, imputed_data, miss_matrix)

原始数据 [0.64 0.   0.64 0.   1.93 0.   0.   0.   0.   0.  ]
插补数据 [0.28 0.   0.   0.   1.88 0.   0.   0.   0.   0.  ]


2991587.0104910037

#### 采用VAE模型插补结果

In [18]:
from modelModule.model2 import VAE2
import torch

In [19]:
# Build the model and load its weights; the model dimension must match the
# number of feature columns in the data.
model = VAE2(dim=57, nhead=3)
checkpoint_path = r'C:\王源权\西南财经大学\硕士毕业论文\模型代码\VAE_pytorchLightning\模型参数保存\Spam_model2_norm_way_mean_norm_\version_1\epoch=227-step=3420.ckpt'
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; load_state_dict copies the tensors into the model.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
# Strip only a *leading* 'model.' from each key. str.replace would also remove
# the substring from the middle of a name (e.g. 'sub_model.weight').
# NOTE(review): assumes the checkpoint wraps the network under a single
# 'model.' prefix — confirm against the training module.
model.load_state_dict({
    (k[len('model.'):] if k.startswith('model.') else k): v
    for k, v in checkpoint['state_dict'].items()
})
model.eval()
# Load the training set to obtain the global min/max values the model is
# given before inference.
train_dataset = FlatDataset(r'C:\王源权\西南财经大学\硕士毕业论文\模型代码\VAE_pytorchLightning\Spam_train.csv', data_norm='mean_norm')
model.get_global_min_max(global_max=train_dataset.Max_Val, global_min=train_dataset.Min_Val)
print('模型载入成功')
imputed_data, _ = model.inference(miss_data, miss_matrix)
result_show(src_data, imputed_data, miss_matrix)

模型载入成功
原始数据 [0.64 0.   0.64 0.   1.93 0.   0.   0.   0.   0.  ]
插补数据 [ 1.01899262  0.58106679  0.65615669  0.32079956  2.07123064 -0.07889239
  0.06986575  0.74187349 -0.20302457  0.24425268]


3974363.7786406972

In [20]:
# Display the most recently computed imputation matrix.
# NOTE(review): in the recorded output the displayed rows are almost identical
# to each other — worth checking whether the model output varies per sample.
imputed_data

array([[ 2.19716623e-01,  1.01899262e+00, -2.92007744e-03, ...,
         3.87677391e+00,  1.67422871e+01,  3.02092849e+02],
       [ 2.19641037e-01,  1.01999877e+00, -3.37557110e-03, ...,
         3.89506820e+00,  1.71484385e+01,  3.01454281e+02],
       [ 2.19741367e-01,  1.01917114e+00, -2.74495515e-03, ...,
         3.87073627e+00,  1.65504260e+01,  3.02244406e+02],
       ...,
       [ 2.19598354e-01,  1.01828654e+00, -2.32814652e-03, ...,
         3.87110601e+00,  1.64658758e+01,  3.02616740e+02],
       [ 2.19424203e-01,  1.01691033e+00, -2.74029888e-03, ...,
         3.87811887e+00,  1.72094237e+01,  3.01610964e+02],
       [ 2.18725516e-01,  1.01581502e+00, -1.91040078e-03, ...,
         3.91219547e+00,  1.72485853e+01,  3.01342105e+02]])