In [1]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# can be accessed through the local filesystem.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q font-roboto
!pip install -q git+https://github.com/googlefonts/fontmake.git
!wget -q https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf
!wget -q https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Bold.otf


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m2.1/2.4 MB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for font-roboto (setup.py) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90

In [3]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register both Noto Sans CJK SC font files with matplotlib's font manager.
for font_file in ('NotoSansCJKsc-Regular.otf', 'NotoSansCJKsc-Bold.otf'):
    fm.fontManager.addfont(font_file)

# Use the CJK face for sans-serif text and disable the Unicode minus glyph
# for negative tick labels.
plt.rcParams.update({
    'font.sans-serif': ['Noto Sans CJK SC'],
    'axes.unicode_minus': False,
})


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

# Read the training features and labels from the mounted Drive folder and
# expose them as NumPy arrays (labels come from the 'label' column).
features_frame = pd.read_csv('/content/drive/MyDrive/9417pro/X_train.csv')
labels_frame = pd.read_csv('/content/drive/MyDrive/9417pro/y_train.csv')
X_train = features_frame.values
y_train = labels_frame['label'].values


# 1. Data analysis
# ----------------------------------
# Report the dimensions of the feature matrix and the label vector.
print("训练集形状:", X_train.shape)
print("标签形状:", y_train.shape)

# Number of samples per class, ordered by class label.
label_counts = pd.Series(y_train).value_counts().sort_index()
print("类别分布:")
print(label_counts)

# Size of the largest class divided by the size of the smallest class.
imbalance_ratio = label_counts.max() / label_counts.min()
print("最大类别与最小类别的不平衡比例:", imbalance_ratio)

# Bar chart of the per-class sample counts.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=label_counts.index, y=label_counts.values, palette='viridis', ax=ax)
ax.set_title("训练集类别分布")
ax.set_xlabel("类别")
ax.set_ylabel("样本数")
plt.show()


# 2. HCBOU
# -------------------------------

def generate_synthetic_samples(X_cluster, n_samples, k_neighbors=5, random_state=42):
    """Generate SMOTE-like synthetic points by local interpolation in a cluster.

    Each synthetic point lies on the segment between a randomly chosen sample
    of ``X_cluster`` and one of that sample's nearest neighbours inside the
    same cluster.

    Parameters
    ----------
    X_cluster : np.ndarray of shape (n_cluster_samples, n_features)
        Samples belonging to one cluster.
    n_samples : int
        Number of synthetic points to create.
    k_neighbors : int, default=5
        Number of nearest neighbours (excluding the sample itself) that may be
        chosen as interpolation partner. Capped by the cluster size.
    random_state : int, default=42
        Seed of the private random generator used for all draws.

    Returns
    -------
    np.ndarray of shape (n_samples, n_features)
        The synthetic points. When the cluster holds fewer than two samples
        the existing sample is simply repeated.
    """
    X_cluster = np.asarray(X_cluster)
    n_samples_cluster = X_cluster.shape[0]

    if n_samples <= 0:
        # Keep the result 2-D so callers can always np.vstack it.
        return np.empty((0, X_cluster.shape[1]))
    if n_samples_cluster < 2:
        # Too few samples to interpolate: duplicate what is there.
        return np.tile(X_cluster, (n_samples, 1))

    # Private generator: reproducible without reseeding the global NumPy RNG,
    # which would silently change random draws made by the caller.
    rng = np.random.RandomState(random_state)

    # A query point is returned as its own nearest neighbour, so ask for one
    # extra neighbour to really have ``k_neighbors`` candidates.
    n_query = min(k_neighbors + 1, n_samples_cluster)
    nn = NearestNeighbors(n_neighbors=n_query).fit(X_cluster)
    # The neighbour lists do not depend on the loop below; compute them once.
    neighbor_table = nn.kneighbors(X_cluster, return_distance=False)

    synthetic_samples = []
    for _ in range(n_samples):
        idx = rng.randint(0, n_samples_cluster)
        sample = X_cluster[idx]
        row = neighbor_table[idx]
        # Drop the sample itself by index (with duplicated points it is not
        # guaranteed to be listed first) and keep at most k_neighbors partners.
        candidates = row[row != idx][:k_neighbors]
        neighbor_idx = rng.choice(candidates) if candidates.size else idx
        neighbor = X_cluster[neighbor_idx]
        lam = rng.rand()  # interpolation weight in [0, 1)
        synthetic_samples.append(sample + lam * (neighbor - sample))
    return np.array(synthetic_samples)

def _allocate_by_size(cluster_sizes, total):
    """Split ``total`` into integer quotas proportional to ``cluster_sizes``.

    Uses largest-remainder rounding so the quotas always sum to ``total``.
    """
    cluster_sizes = np.asarray(cluster_sizes, dtype=np.int64)
    denom = int(cluster_sizes.sum())
    scaled = cluster_sizes * int(total)
    quotas = scaled // denom
    leftover = int(total) - int(quotas.sum())
    if leftover > 0:
        # Hand the units lost to flooring to the largest fractional parts.
        order = np.argsort(-(scaled % denom), kind="stable")
        quotas[order[:leftover]] += 1
    return quotas


def hc_bou_resample(X, y, random_state=42, minority_k=3):
    """Balance a dataset by cluster-based under- and over-sampling.

    The target size per class is ``S = floor(n_samples / n_classes)``.

    * Classes with more than ``S`` samples are replaced by the ``S`` KMeans
      cluster centres of that class (under-sampling).
    * Classes with exactly ``S`` samples are kept unchanged.
    * Classes with fewer than ``S`` samples keep their original samples, are
      split into up to ``minority_k`` KMeans clusters, and receive
      ``S - n_class`` synthetic samples distributed over the clusters in
      proportion to cluster size (over-sampling).

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray of shape (n_samples,)
        Class labels.
    random_state : int, default=42
        Seed for KMeans and for the per-cluster synthetic sample generation.
    minority_k : int, default=3
        Maximum number of clusters used inside a minority class.

    Returns
    -------
    X_resampled : np.ndarray
        Resampled features, grouped by class in sorted label order.
    y_resampled : np.ndarray
        Labels matching ``X_resampled``; every class has ``S`` entries.
    """
    # Private generator so the global NumPy RNG state is left untouched.
    rng = np.random.RandomState(random_state)
    unique_labels = np.unique(y)
    C = len(unique_labels)
    N = X.shape[0]
    S = N // C
    print(f"目标每类样本数 S: {S}")

    X_resampled_list = []
    y_resampled_list = []

    for label in unique_labels:
        X_class = X[y == label]
        n_class = X_class.shape[0]

        if n_class >= S:
            print(f"类别 {label} 为多数类, 原始样本数: {n_class}")
            if n_class == S:
                # Already at the target size: clustering into as many clusters
                # as samples would only reproduce the samples themselves.
                X_new = X_class.copy()
            else:
                # Under-sample: S cluster centres represent the class.
                km = KMeans(n_clusters=S, random_state=random_state)
                km.fit(X_class)
                X_new = km.cluster_centers_
        else:
            print(f"类别 {label} 为少数类, 原始样本数: {n_class}")
            # Cluster the minority class so interpolation stays local.
            km = KMeans(n_clusters=min(minority_k, n_class), random_state=random_state)
            km.fit(X_class)
            labels_cluster = km.labels_

            n_to_generate = S - n_class
            cluster_ids, cluster_sizes = np.unique(labels_cluster, return_counts=True)
            # Quotas sum exactly to n_to_generate, so the class reaches S
            # instead of falling short by the flooring remainder.
            quotas = _allocate_by_size(cluster_sizes, n_to_generate)

            parts = [X_class]  # original samples are kept
            for cluster, n_synth in zip(cluster_ids, quotas):
                if n_synth <= 0:
                    continue
                # A distinct seed per cluster avoids replaying the same random
                # draw sequence in every cluster.
                cluster_seed = int(rng.randint(0, 2**31 - 1))
                parts.append(
                    generate_synthetic_samples(
                        X_class[labels_cluster == cluster],
                        int(n_synth),
                        k_neighbors=5,
                        random_state=cluster_seed,
                    )
                )
            X_new = np.vstack(parts)

        X_resampled_list.append(X_new)
        y_resampled_list.append(np.full(X_new.shape[0], label))

    # Merge all classes.
    X_resampled = np.vstack(X_resampled_list)
    y_resampled = np.hstack(y_resampled_list)
    print("重采样后总样本数:", X_resampled.shape[0])
    return X_resampled, y_resampled


# 3. Apply the resampler
# ----------------------------------

# Rebalance the training set with the HCBOU procedure.
X_resampled, y_resampled = hc_bou_resample(X_train, y_train, random_state=42, minority_k=3)

# Inspect the class distribution after resampling.
resampled_label_counts = pd.Series(y_resampled).value_counts().sort_index()
print("重采样后各类别分布:")
print(resampled_label_counts)

# Bar chart of the per-class sample counts after resampling.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=resampled_label_counts.index,
    y=resampled_label_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title("重采样后训练集类别分布")
ax.set_xlabel("类别")
ax.set_ylabel("样本数")
plt.show()

# ----------------------------------
# 4. Modelling (random-forest validation)
# ----------------------------------
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

clf = RandomForestClassifier(random_state=42, n_estimators=100)

# Resample INSIDE each fold and score on the untouched validation rows.
# Cross-validating the already-resampled data leaks information: synthetic
# points and cluster centres derived from validation rows end up in the
# training folds, which inflates the reported F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for train_idx, val_idx in cv.split(X_train, y_train):
    X_fold, y_fold = hc_bou_resample(
        X_train[train_idx], y_train[train_idx], random_state=42, minority_k=3
    )
    # Fresh, unfitted copy of the classifier for every fold.
    fold_model = clone(clf).fit(X_fold, y_fold)
    fold_predictions = fold_model.predict(X_train[val_idx])
    fold_scores.append(f1_score(y_train[val_idx], fold_predictions, average='macro'))

scores = np.array(fold_scores)
print("使用重采样数据后随机森林的 F1-macro 交叉验证得分:", scores.mean())


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/9417pro/X_train.csv'