In [9]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# 机器学习和数据处理库
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# imbalanced-learn 库，专门用于处理类别不均衡问题
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 可视化库
import matplotlib.pyplot as plt
import seaborn as sns

print("====================================================")
print("===      第 5.1.3 节: 数据预处理策略 (数据采样)      ===")
print("====================================================")

# --- 步骤 1: 加载已处理好的最终数据集 ---
processed_data_path = os.path.join('data', 'processed', 'final_processed.csv')

try:
    final_df = pd.read_csv(processed_data_path)
    print(f"\n[INFO] 已成功从 '{processed_data_path}' 加载数据集。")
    print(f"数据集形状: {final_df.shape}")
except FileNotFoundError:
    print(f"[ERROR] 文件未找到: '{processed_data_path}'。请确保您已运行并保存了之前的预处理代码。")
    # 如果文件不存在，退出后续操作
    exit()

===      第 5.1.3 节: 数据预处理策略 (数据采样)      ===

[INFO] 已成功从 'data\processed\final_processed.csv' 加载数据集。
数据集形状: (4312063, 10)


In [10]:
# --- 步骤 2: 数据编码 (将所有数据转换为数值型) ---
print("\n[INFO] 正在对数据进行编码...")

# 2.0 统一目标标签名称
#     确保'Legitimate'和'Legitimate_traffic'等统一为一个名称
print("\n[INFO] 正在统一目标标签的名称...")
label_mapping_dict = {
    'Legitimate': 'Legitimate',
    'TCPSYN-flood': 'TCPSYN-flood',
    'UDP-flood': 'UDP-flood',
    # 需要统一的标签
    'Legitimate_traffic': 'Legitimate',
    'TCP_syn_flood_attack': 'TCPSYN-flood',
    'UDP_flood_attack': 'UDP-flood',
    'ICMP_flood_attack': 'ICMP-flood',
    'DNS_flood_attack': 'DNS-flood',
    'HTTP_flood_attack': 'HTTP-flood'
}

# 使用 .map() 函数高效地应用这个映射
final_df['Label'] = final_df['Label'].map(label_mapping_dict)

print("标签名称统一完成。统一后的标签分布情况:")
print(final_df['Label'].value_counts())


[INFO] 正在对数据进行编码...

[INFO] 正在统一目标标签的名称...
标签名称统一完成。统一后的标签分布情况:
Label
Legitimate      3072442
TCPSYN-flood     610074
UDP-flood        335018
ICMP-flood       124847
DNS-flood        102745
HTTP-flood        66937
Name: count, dtype: int64


In [11]:
# 2.1 编码特征 (X)
# 'Protocol' 是类别型特征，我们使用 One-Hot 编码 (pd.get_dummies)
# 这会为每个协议类型创建一个新的二元 (0/1) 列，避免了模型错误地理解协议之间的顺序关系
X = pd.get_dummies(final_df.drop('Label', axis=1), columns=['Protocol'], dtype=int)
print(f"特征集 X 编码完成。新的特征形状: {X.shape}")
print(f"新的特征列: {X.columns.tolist()}...")

特征集 X 编码完成。新的特征形状: (4312063, 19)
新的特征列: ['Timestamp', 'TTL', 'Length', 'SYN', 'ACK', 'RST', 'PSH', 'FIN', 'Protocol_DNS', 'Protocol_Generic Routing Encapsulation', 'Protocol_HTTP', 'Protocol_ICMP', 'Protocol_ICMP,ICMP', 'Protocol_ICMP,TCP', 'Protocol_ICMP,UDP', 'Protocol_IGMP', 'Protocol_IPv6', 'Protocol_TCP', 'Protocol_UDP']...


In [12]:
# 2.2 编码目标标签 (y)
# 'Label' 是我们的目标，我们使用 LabelEncoder 将其转换为整数 (0, 1, 2...)
le = LabelEncoder()
y = le.fit_transform(final_df['Label'])
# 保存编码器和类别映射关系，以便后续解码
label_mapping = {index: label for index, label in enumerate(le.classes_)}
print("\n目标标签 y 编码完成。")
print("标签映射关系 (编码 -> 原始标签):")
print(label_mapping)


目标标签 y 编码完成。
标签映射关系 (编码 -> 原始标签):
{0: 'DNS-flood', 1: 'HTTP-flood', 2: 'ICMP-flood', 3: 'Legitimate', 4: 'TCPSYN-flood', 5: 'UDP-flood'}


In [18]:
# --- 步骤 3: 划分训练集和测试集 ---
# 严格按照论文的比例: 30% 训练, 70% 测试
# stratify=y: 确保划分后的训练集和测试集中的类别分布与原始数据集相似，这在处理不均衡数据时非常重要
# random_state=42: 确保每次运行代码时，划分结果都是一样的，便于复现
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.7, 
    random_state=42, 
    stratify=y
)
print("\n[INFO] 数据集已划分为训练集和测试集。")
print(f"训练集形状: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"测试集形状: X_test={X_test.shape}, y_test={y_test.shape}")

import pandas as pd
import numpy as np
import os
from collections import Counter

# 使用绝对路径
output_dir = os.path.join('data', 'splitted')
os.makedirs(output_dir, exist_ok=True)

# === 保存训练集 ===
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_train_df['Label_code'] = y_train

train_path = os.path.join(output_dir, 'train_original.csv')
X_train_df.to_csv(train_path, index=False)

print(f"训练集已保存: '{train_path}'")
print(f"训练集形状: {X_train_df.shape}")

# === 保存测试集 ===
X_test_df = pd.DataFrame(X_test, columns=X.columns)
X_test_df['Label_code'] = y_test

test_path = os.path.join(output_dir, 'test.csv')
X_test_df.to_csv(test_path, index=False)

print(f"\n测试集已保存: '{test_path}'")
print(f"测试集形状: {X_test_df.shape}")

# === 验证保存的数据 ===
print("\n[验证] 训练集类别分布:")
print(Counter(y_train))

print("\n[验证] 测试集类别分布:")
print(Counter(y_test))

# # === 可选：同时保存为npy格式 ===
# np.save(os.path.join(output_dir, 'X_train.npy'), X_train)
# np.save(os.path.join(output_dir, 'y_train.npy'), y_train)
# np.save(os.path.join(output_dir, 'X_test.npy'), X_test)
# np.save(os.path.join(output_dir, 'y_test.npy'), y_test)

print(f"\n[INFO] 数据已保存到目录: {output_dir}")


[INFO] 数据集已划分为训练集和测试集。
训练集形状: X_train=(1293618, 19), y_train=(1293618,)
测试集形状: X_test=(3018445, 19), y_test=(3018445,)
训练集已保存: 'data\splitted\train_original.csv'
训练集形状: (1293618, 20)

测试集已保存: 'data\splitted\test.csv'
测试集形状: (3018445, 20)

[验证] 训练集类别分布:
Counter({np.int64(3): 921732, np.int64(4): 183022, np.int64(5): 100505, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})

[验证] 测试集类别分布:
Counter({np.int64(3): 2150710, np.int64(4): 427052, np.int64(5): 234513, np.int64(2): 87393, np.int64(0): 71921, np.int64(1): 46856})

[INFO] 数据已保存到目录: data\splitted


In [14]:
# --- 步骤 4: 应用数据采样技术 (仅对训练集) ---
print("\n[INFO] 正在对训练集应用数据采样技术...")

# 打印原始训练集的类别分布
print("\n原始训练集的类别分布:")
original_counts = Counter(y_train)
for label_code, count in sorted(original_counts.items()):
    print(f"  - {label_mapping[label_code]}: {count} 个样本")


[INFO] 正在对训练集应用数据采样技术...

原始训练集的类别分布:
  - DNS-flood: 30824 个样本
  - HTTP-flood: 20081 个样本
  - ICMP-flood: 37454 个样本
  - Legitimate: 921732 个样本
  - TCPSYN-flood: 183022 个样本
  - UDP-flood: 100505 个样本


In [15]:
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from collections import Counter

# 查看原始训练集分布
print("原始训练集的类别分布:")
print(Counter(y_train))

# 步骤1: 应用 Tomek Links 移除所有多数类的边界样本
tomek = TomekLinks(sampling_strategy='not minority', n_jobs=-1)
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)

print("\n应用 Tomek Links 后的类别分布:")
print(Counter(y_train_tomek))
print(f"移除的样本数: {len(y_train) - len(y_train_tomek)}")

# 步骤2: 应用 Random UnderSampling 使所有类别样本数相等
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_tomek, y_train_tomek)

print("\n应用 Random UnderSampling 后的类别分布:")
print(Counter(y_train_resampled))
print(f"最终训练集样本数: {len(y_train_resampled)}")

output_dir = os.path.join('data', 'resampled')
os.makedirs(output_dir, exist_ok=True)

# 1. 将采样后的 X (NumPy数组) 转换回带列名的 DataFrame
X_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train.columns)

# 2. 将采样后的 y (NumPy数组) 直接作为新的一列添加到 X_resampled_df 中
#    因为它们的长度和顺序由同一个函数生成，所以保证对齐
X_resampled_df['Label_code'] = y_train_resampled

# 3. 此时，final_resampled_df 就是 X_resampled_df
final_resampled_df = X_resampled_df

# 4. 定义保存路径并保存为 CSV 文件
output_path_csv = os.path.join(output_dir, 'train_TLink_RUS.csv')
final_resampled_df.to_csv(output_path_csv, index=False)

# 5. 打印保存信息
print(f"\n数据已保存为CSV文件: '{output_path_csv}'")
print(f"保存的数据形状: {final_resampled_df.shape}")

# np.save(os.path.join(output_dir, 'X_train_TLink_RUS.npy'), X_train_resampled)
# np.save(os.path.join(output_dir, 'y_train_TLink_RUS.npy'), y_train_resampled)

# print(f"\n数据已保存到: {output_dir}")
# print(f"- X_train_TLink_RUS.npy: {X_train_resampled.shape}")
# print(f"- y_train_TLink_RUS.npy: {y_train_resampled.shape}")

原始训练集的类别分布:
Counter({np.int64(3): 921732, np.int64(4): 183022, np.int64(5): 100505, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})

应用 Tomek Links 后的类别分布:
Counter({np.int64(3): 921024, np.int64(4): 183022, np.int64(5): 100271, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})
移除的样本数: 942

应用 Random UnderSampling 后的类别分布:
Counter({np.int64(0): 20081, np.int64(1): 20081, np.int64(2): 20081, np.int64(3): 20081, np.int64(4): 20081, np.int64(5): 20081})
最终训练集样本数: 120486

数据已保存为CSV文件: 'data\resampled\train_TLink_RUS.csv'
保存的数据形状: (120486, 20)


In [16]:
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
import numpy as np
import os

print("原始训练集的类别分布:")
print(Counter(y_train))

# 强制转换为 numpy array
X_train_array = np.array(X_train)
y_train_array = np.array(y_train)

# 找出多数类
class_counts = Counter(y_train_array)
majority_class = max(class_counts, key=class_counts.get)
minority_classes = [cls for cls in class_counts.keys() if cls != majority_class]

print(f"\n多数类: {majority_class}, 少数类: {minority_classes}")

# 分离多数类和少数类
majority_idx = y_train_array == majority_class
minority_idx = ~majority_idx

X_majority = X_train_array[majority_idx]
y_majority = y_train_array[majority_idx]
X_minority = X_train_array[minority_idx]
y_minority = y_train_array[minority_idx]

print(f"多数类样本数: {len(y_majority)}")
print(f"少数类样本数: {len(y_minority)}")

# CBMP 参数设置
n_clusters = 10
n_minority = len(y_minority)

print(f"\n开始 MiniBatch K-Means 聚类，聚类数: {n_clusters}")

# 步骤1: 对多数类进行聚类
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=10000, n_init=3, max_iter=100)
cluster_labels = kmeans.fit_predict(X_majority)

print("聚类完成")

# 步骤2: 为每个簇计算采样数量（C1和C2各一半）
sampled_indices_random = []  # C1: 随机选择
sampled_indices_centroid = []  # C2: 距离簇心最近

for i in range(n_clusters):
    cluster_mask = cluster_labels == i
    cluster_indices = np.where(cluster_mask)[0]
    cluster_size = len(cluster_indices)
    
    # 计算该簇的采样数量（完整的 s_i，不分半）
    r_i = cluster_size / len(y_majority)
    s_i = int(r_i * n_minority)
    
    
    print(f"簇 {i}: 样本数={cluster_size}, 比例={r_i:.4f}, 采样数={s_i}")
    
    if s_i > 0:
        # 策略1: 随机选择 s_i 个样本
        random_sample = np.random.choice(
            cluster_indices, 
            size=min(s_i, cluster_size),  # ← 修改：使用完整的 s_i
            replace=False
        )
        sampled_indices_random.extend(random_sample)
        
        # 策略2: 选择距离簇心最近的 s_i 个样本
        cluster_center = kmeans.cluster_centers_[i]
        X_cluster = X_majority[cluster_indices]
        distances = np.linalg.norm(X_cluster - cluster_center, axis=1)
        closest_indices = cluster_indices[
            np.argsort(distances)[:min(s_i, cluster_size)]  # ← 修改：使用完整的 s_i
        ]
        sampled_indices_centroid.extend(closest_indices)

# 步骤3: 合并采样结果
X_majority_sampled_C1 = X_majority[sampled_indices_random]
y_majority_sampled_C1 = y_majority[sampled_indices_random]

X_majority_sampled_C2 = X_majority[sampled_indices_centroid]
y_majority_sampled_C2 = y_majority[sampled_indices_centroid]

# 合并 C1、C2 和少数类
X_resampled = np.vstack([X_minority, X_majority_sampled_C1, X_majority_sampled_C2])
y_resampled = np.hstack([y_minority, y_majority_sampled_C1, y_majority_sampled_C2])

print(f"\n采样后的类别分布:")
print(Counter(y_resampled))
print(f"最终训练集样本数: {len(y_resampled)}")

# 保存处理后的数据
output_dir = os.path.join('data', 'resampled')
os.makedirs(output_dir, exist_ok=True)

# 步骤4: 保存为CSV格式

# 将采样后的 X (NumPy数组) 转换回带列名的 DataFrame
X_resampled_df = pd.DataFrame(X_resampled, columns=X_train.columns)

# 添加标签列
X_resampled_df['Label_code'] = y_resampled

# 保存为CSV
output_path_csv = os.path.join(output_dir, 'train_CBMP.csv')
X_resampled_df.to_csv(output_path_csv, index=False)

print(f"\n数据已保存为CSV文件: '{output_path_csv}'")
print(f"保存的数据形状: {X_resampled_df.shape}")

# np.save(os.path.join(output_dir, 'X_train_CBMP.npy'), X_resampled)
# np.save(os.path.join(output_dir, 'y_train_CBMP.npy'), y_resampled)

# print(f"\n数据已保存到: {output_dir}")
# print(f"- X_train_CBMP.npy: {X_resampled.shape}")
# print(f"- y_train_CBMP.npy: {y_resampled.shape}")

X_train_resampled = X_resampled
y_train_resampled = y_resampled

原始训练集的类别分布:
Counter({np.int64(3): 921732, np.int64(4): 183022, np.int64(5): 100505, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})

多数类: 3, 少数类: [np.int64(5), np.int64(1), np.int64(4), np.int64(0), np.int64(2)]
多数类样本数: 921732
少数类样本数: 371886

开始 MiniBatch K-Means 聚类，聚类数: 10
聚类完成
簇 0: 样本数=127019, 比例=0.1378, 采样数=51247
簇 1: 样本数=112421, 比例=0.1220, 采样数=45357
簇 2: 样本数=152946, 比例=0.1659, 采样数=61708
簇 3: 样本数=120599, 比例=0.1308, 采样数=48657
簇 4: 样本数=5752, 比例=0.0062, 采样数=2320
簇 5: 样本数=296585, 比例=0.3218, 采样数=119661
簇 6: 样本数=13739, 比例=0.0149, 采样数=5543
簇 7: 样本数=32356, 比例=0.0351, 采样数=13054
簇 8: 样本数=3936, 比例=0.0043, 采样数=1588
簇 9: 样本数=56379, 比例=0.0612, 采样数=22746

采样后的类别分布:
Counter({np.int64(3): 743762, np.int64(4): 183022, np.int64(5): 100505, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})
最终训练集样本数: 1115648

数据已保存为CSV文件: 'data\resampled\train_CBMP.csv'
保存的数据形状: (1115648, 20)


In [17]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np
import os

print("原始训练集的类别分布:")
print(Counter(y_train))

# 应用 SMOTE 过采样
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\n应用 SMOTE 后的类别分布:")
print(Counter(y_train_resampled))
print(f"原始训练集样本数: {len(y_train)}")
print(f"过采样后训练集样本数: {len(y_train_resampled)}")
print(f"新增样本数: {len(y_train_resampled) - len(y_train)}")

# 保存处理后的数据
output_dir = os.path.join('data', 'resampled')
os.makedirs(output_dir, exist_ok=True)

# # 方式1: 保存为 .npy 格式
# np.save(os.path.join(output_dir, 'X_train_SMOTE.npy'), X_train_resampled)
# np.save(os.path.join(output_dir, 'y_train_SMOTE.npy'), y_train_resampled)

# print(f"\n数据已保存到: {output_dir}")
# print(f"- X_train_SMOTE.npy: {X_train_resampled.shape}")
# print(f"- y_train_SMOTE.npy: {y_train_resampled.shape}")

# 方式2: 保存为 CSV 格式
X_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train.columns)
X_resampled_df['Label_code'] = y_train_resampled

output_path_csv = os.path.join(output_dir, 'train_SMOTE.csv')
X_resampled_df.to_csv(output_path_csv, index=False)

print(f"\n数据已保存为CSV文件: '{output_path_csv}'")
print(f"保存的数据形状: {X_resampled_df.shape}")

原始训练集的类别分布:
Counter({np.int64(3): 921732, np.int64(4): 183022, np.int64(5): 100505, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})

应用 SMOTE 后的类别分布:
Counter({np.int64(5): 921732, np.int64(3): 921732, np.int64(1): 921732, np.int64(4): 921732, np.int64(0): 921732, np.int64(2): 921732})
原始训练集样本数: 1293618
过采样后训练集样本数: 5530392
新增样本数: 4236774

数据已保存为CSV文件: 'data\resampled\train_SMOTE.csv'
保存的数据形状: (5530392, 20)


In [34]:
from sklearn.decomposition import KernelPCA
from collections import Counter
import numpy as np
import pandas as pd
import os

print("原始训练集的类别分布:")
print(Counter(y_train))

# 转换为 numpy array
X_train_array = np.array(X_train)
y_train_array = np.array(y_train)

# 找出所有类别及其样本数
class_counts = Counter(y_train_array)
max_samples = max(class_counts.values())
majority_class = max(class_counts, key=class_counts.get)

print(f"\n类别分布: {dict(class_counts)}")
print(f"多数类: {majority_class}, 样本数: {max_samples}")

# 初始化结果列表
X_resampled_list = []
y_resampled_list = []

# 对每个少数类进行LICIC处理
for cls in class_counts.keys():
    cls_idx = y_train_array == cls
    X_cls = X_train_array[cls_idx]
    y_cls = y_train_array[cls_idx]
    
    current_samples = len(X_cls)
    samples_needed = max_samples - current_samples
    
    print(f"\n类别 {cls}: 当前样本数={current_samples}, 需要生成={samples_needed}")
    
    # 保留原始样本
    X_resampled_list.append(X_cls)
    y_resampled_list.append(y_cls)
    
    if samples_needed > 0 and current_samples < max_samples:  # 少数类需要过采样
        # 步骤1: 使用 KPCA 进行核主成分分析
        n_components = min(current_samples - 1, X_cls.shape[1], 20)
        kpca = KernelPCA(n_components=n_components, kernel='rbf', 
                        gamma=1.0 / X_cls.shape[1], random_state=42, 
                        fit_inverse_transform=True)
        X_kpca = kpca.fit_transform(X_cls)
        
        print(f"  KPCA变换: {X_cls.shape} -> {X_kpca.shape}")
        
        # 步骤2: 识别不重要成分（可调比例）
        less_important_ratio = 0.5  # ← 修改：设为可调参数
        n_less_important = max(1, int(n_components * less_important_ratio))
        important_indices = np.array(range(n_components - n_less_important))
        less_important_indices = np.array(range(n_components - n_less_important, n_components))
        
        print(f"  重要成分数量: {len(important_indices)}")
        print(f"  不重要成分数量: {len(less_important_indices)}")
        
        # 步骤3: 生成新样本
        synthetic_samples_kpca = []
        
        for _ in range(samples_needed):
            # 随机选择基础样本（用于重要成分）
            base_idx = np.random.choice(len(X_kpca))
            new_sample = X_kpca[base_idx].copy()
            
            # 重要成分：完全保持不变（已经在copy中保留）
            
            # 不重要成分：从多个样本中组合
            # 随机选择另外2个样本用于不重要成分的混合
            idx1, idx2 = np.random.choice(len(X_kpca), 2, replace=True)
            sample1_less_imp = X_kpca[idx1][less_important_indices]
            sample2_less_imp = X_kpca[idx2][less_important_indices]
            
            # 对这些不重要成分进行排列
            permuted_indices_1 = np.random.permutation(len(less_important_indices))
            permuted_indices_2 = np.random.permutation(len(less_important_indices))
            
            # 线性组合排列后的不重要成分
            beta = np.random.uniform(0.0, 1.0)
            new_sample[less_important_indices] = (
                beta * sample1_less_imp[permuted_indices_1] +
                (1 - beta) * sample2_less_imp[permuted_indices_2]
            )
            
            synthetic_samples_kpca.append(new_sample)
        
        synthetic_samples_kpca = np.array(synthetic_samples_kpca)
        
        # 步骤4: 使用 KPCA 的逆变换映射回原始空间
        try:
            X_synthetic = kpca.inverse_transform(synthetic_samples_kpca)
            print(f"  成功使用KPCA逆变换生成 {samples_needed} 个合成样本")
        except Exception as e:
            # 如果逆变换失败，使用近似方法
            print(f"  警告: KPCA逆变换失败 ({str(e)})，使用最近邻近似方法")
            X_synthetic = np.zeros((samples_needed, X_cls.shape[1]))
            for i, sample_kpca in enumerate(synthetic_samples_kpca):
                distances = np.linalg.norm(X_kpca - sample_kpca, axis=1)
                nearest_idx = np.argmin(distances)
                X_synthetic[i] = X_cls[nearest_idx]
            print(f"  使用近似方法生成 {samples_needed} 个合成样本")
        
        y_synthetic = np.full(samples_needed, cls)
        
        X_resampled_list.append(X_synthetic)
        y_resampled_list.append(y_synthetic)

# 合并所有类别
X_train_resampled = np.vstack(X_resampled_list)
y_train_resampled = np.hstack(y_resampled_list)

print(f"\n应用 LICIC 后的类别分布:")
print(Counter(y_train_resampled))
print(f"原始训练集样本数: {len(y_train_array)}")
print(f"过采样后训练集样本数: {len(y_train_resampled)}")
print(f"新增样本数: {len(y_train_resampled) - len(y_train_array)}")

# 保存处理后的数据
output_dir = './resampled_data'
os.makedirs(output_dir, exist_ok=True)

# 保存为CSV格式
X_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train.columns)
X_resampled_df['Label_code'] = y_train_resampled

output_path_csv = os.path.join(output_dir, 'train_LICIC.csv')
X_resampled_df.to_csv(output_path_csv, index=False)

print(f"\n数据已保存为CSV文件: '{output_path_csv}'")
print(f"保存的数据形状: {X_resampled_df.shape}")

# 可选：同时保存为npy格式（取消注释以启用）
# np.save(os.path.join(output_dir, 'X_train_LICIC.npy'), X_train_resampled)
# np.save(os.path.join(output_dir, 'y_train_LICIC.npy'), y_train_resampled)

原始训练集的类别分布:
Counter({np.int64(3): 921732, np.int64(4): 183022, np.int64(5): 100505, np.int64(2): 37454, np.int64(0): 30824, np.int64(1): 20081})

类别分布: {np.int64(5): 100505, np.int64(3): 921732, np.int64(1): 20081, np.int64(4): 183022, np.int64(0): 30824, np.int64(2): 37454}
多数类: 3, 样本数: 921732

类别 5: 当前样本数=100505, 需要生成=821227


MemoryError: Unable to allocate 75.3 GiB for an array with shape (100505, 100505) and data type float64