In [2]:
import os
import random
import pandas as pd
import numpy as np

In [3]:
# 设置随机种子，保证实验可复现
def set_reproducibility(seed=10):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

set_reproducibility(10)

In [None]:
# 读取 NSL-KDD-10000 数据集
input_filename = "0-NSL-KDD-10000.csv"
data = pd.read_csv(input_filename, header=0)
header = data.columns  # 保存列名
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,http,SF,151,7061,0,0,0,0,...,255,1.00,0.00,0.01,0.02,0.00,0.00,0.00,0.00,1
1,0,tcp,http,SF,205,770,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.01,0.01,0.02,0.02,1
2,0,tcp,ftp_data,SF,1830,0,0,0,0,0,...,16,0.06,0.06,0.06,0.00,0.82,0.00,0.01,0.00,1
3,0,udp,domain_u,SF,45,127,0,0,0,0,...,254,1.00,0.01,0.00,0.00,0.00,0.00,0.00,0.00,1
4,0,udp,private,SF,28,0,0,3,0,0,...,57,0.22,0.78,0.22,0.00,0.09,0.00,0.69,0.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0,icmp,eco_i,SF,8,0,0,0,0,0,...,204,1.00,0.00,1.00,0.50,0.00,0.00,0.00,0.00,0
9995,0,udp,domain_u,SF,45,110,0,0,0,0,...,65,0.93,0.03,0.01,0.00,0.00,0.00,0.00,0.00,1
9996,1,tcp,smtp,SF,773,330,0,0,0,0,...,147,0.53,0.04,0.01,0.01,0.01,0.01,0.00,0.00,1
9997,0,tcp,private,S0,0,0,0,0,0,0,...,3,0.01,0.10,0.00,0.00,1.00,1.00,0.00,0.00,0


In [5]:
# 选择需要添加随机扰动的列
columns_to_modify = [0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]

In [None]:
# 1. 正态分布 (Gaussian)
def Normally_distributed_disturbance(data, columns_to_modify, mu=0, sigma=1, output_filename="NSL-KDD-Gaussian.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.normal(mu, sigma, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 2. 均匀分布 (Uniform)
def Uniformly_distributed_disturbance(data, columns_to_modify, low=-1, high=1, output_filename="NSL-KDD-Uniform.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.uniform(low, high, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 3. 拉普拉斯分布 (Laplacian)
def Laplacian_distributed_disturbance(data, columns_to_modify, mu=0, scale=1, output_filename="NSL-KDD-Laplace.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.laplace(mu, scale, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 4. 伽马分布 (Gamma) - 适用于非负数据
def Gamma_distributed_disturbance(data, columns_to_modify, shape=2, scale=1, output_filename="NSL-KDD-Gamma.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.gamma(shape, scale, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 5. 泊松分布 (Poisson) - 适用于离散数据
def Poisson_distributed_disturbance(data, columns_to_modify, lam=1, output_filename="NSL-KDD-Poisson.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.poisson(lam, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 6. 二项分布 (Binomial) - 适用于二值化数据
def Binomial_distributed_disturbance(data, columns_to_modify, n=1, p=0.5, output_filename="NSL-KDD-Binomial.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.binomial(n, p, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 7. Beta 分布 (Beta) - 适用于0-1归一化数据
def Beta_distributed_disturbance(data, columns_to_modify, alpha=2, beta=5, output_filename="NSL-KDD-Beta.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.beta(alpha, beta, size=modified_data.shape[0] - 1)
            modified_data.iloc[1:, col] += random_noise
            modified_data.iloc[1:, col] = np.clip(modified_data.iloc[1:, col], 0, 1)  # 限制在0-1范围
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename

# 8. 柯西分布 (Cauchy) - 极端长尾扰动
def Cauchy_distributed_disturbance(data, columns_to_modify, loc=0, scale=1, output_filename="NSL-KDD-Cauchy.csv"):
    modified_data = data.copy()
    for col in columns_to_modify:
        if col < len(modified_data.columns) and pd.api.types.is_numeric_dtype(modified_data.iloc[1:, col]):
            random_noise = np.random.standard_cauchy(size=modified_data.shape[0] - 1) * scale + loc
            modified_data.iloc[1:, col] += random_noise
        else:
            print(f"⚠️ 警告: 列索引 {col} 不存在或不是数值类型，跳过该列。")
    return modified_data, output_filename


In [None]:
# 基于正态分布的扰动
data_with_noise, output_filename = Normally_distributed_disturbance(data, columns_to_modify, mu=0, sigma=1)
# 基于正态分布的扰动
# data_with_noise, output_filename = Uniformly_distributed_disturbance(data, columns_to_modify, low=-2, high=2)
# 基于正态分布的扰动
# data_with_noise, output_filename = Laplacian_distributed_disturbance(data, columns_to_modify, mu=0, scale=0.5)
# 基于伽马分布的扰动
# data_with_noise, output_filename = Gamma_distributed_disturbance(data, columns_to_modify, shape=2, scale=1)
# 基于泊松分布的扰动
# data_with_noise, output_filename = Poisson_distributed_disturbance(data, columns_to_modify, lam=3)
# 基于二项分布的扰动
# data_with_noise, output_filename = Binomial_distributed_disturbance(data, columns_to_modify, n=1, p=0.5)
# 基于 Beta 分布的扰动
# data_with_noise, output_filename = Beta_distributed_disturbance(data, columns_to_modify, alpha=2, beta=5)
# 基于柯西分布的扰动
# data_with_noise, output_filename = Cauchy_distributed_disturbance(data, columns_to_modify, loc=0, scale=1)
# 保存新的扰动数据到CSV文件，并保留列名
data_with_noise.to_csv(output_filename, index=False, header=True)
print(f"已生成扰动数据文件: {output_filename}")

  0.64737781]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  modified_data.iloc[1:, col] += random_noise
 -7.17012892e-01  4.33093662e+01]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  modified_data.iloc[1:, col] += random_noise
 -3.20151527e-01  1.36671203e+02]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  modified_data.iloc[1:, col] += random_noise
 -0.56315377]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  modified_data.iloc[1:, col] += random_noise
 -1.37352718]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  modified_data.iloc[1:, col] += random_noise
  2.18473621]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  modified_data.iloc[1:, col] += random_noise
 -0.75618735]' has dtype incompatible with int64, please explicitly cast

已生成扰动数据文件: 0-NSL-KDD-10000-PDG.csv
