In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# 加载数据
xref_file = "opamps-xref-cleaned.csv"  
features_file = "opamps-features.csv"  

# 读取数据
xref_data = pd.read_csv(xref_file)
features_data = pd.read_csv(features_file)

# Cross Reference Type 转换为相似性分数
cross_ref_score_mapping = {
    "D": 0.2, 
    "C": 0.5, "C/Upgrade": 0.5, "C/Downgrade": 0.5,
    "B": 0.8, "B/Upgrade": 0.8, "B/Downgrade": 0.8,
    "A": 1.0
}
xref_data["Similarity_Score"] = xref_data["Cross Reference Type"].map(cross_ref_score_mapping)

# 数据抽样：根据 Cross Reference Type 进行分层抽样
sampled_data = []
for label, group in xref_data.groupby("Cross Reference Type"):
    if label in cross_ref_score_mapping:  # 忽略 "SF" 类别
        # 根据类别样本数量动态调整抽样数
        sample_size = 1000 if len(group) > 1000 else len(group)
        sampled_data.append(group.sample(sample_size, random_state=42))

xref_sampled = pd.concat(sampled_data, ignore_index=True)

# 特征拼接与提取
features_list = []
labels = []

for _, row in xref_sampled.iterrows():
    # 获取 STMicro 和 Competitor 的 MPN 和 MANUFACTURER
    st_mpn, st_name = row["STMicro MPN"], row["STMicro Name"]
    competitor_mpn, competitor_name = row["Competitor MPN"], row["Competitor Name"]

    # 查找 STMicro 和 Competitor 在 features_data 中的记录
    st_features = features_data[
        (features_data["MPN"] == st_mpn) & 
        (features_data["MANUFACTURER"] == st_name)
    ]
    competitor_features = features_data[
        (features_data["MPN"] == competitor_mpn) & 
        (features_data["MANUFACTURER"] == competitor_name)
    ]

    #  将 DataFrame 转化为 Series，例如 [[1 2 3]] 转化为 [1 2 3]
    st_features = st_features.iloc[0]
    competitor_features = competitor_features.iloc[0]

    # 提取数值型特征
    numeric_features = ["Maximum Input Offset Voltage", "Maximum Single Supply Voltage", 
                        "Minimum Single Supply Voltage", "Number of Channels per Chip", "Typical Gain Bandwidth Product"]
    st_values = st_features[numeric_features].values
    competitor_values = competitor_features[numeric_features].values

    # 计算特征差异，得到0到1之间的一个值
    feature_diff = np.zeros_like(st_values, dtype=float)  # 初始化差异数组

    for i in range(len(st_values)):
        if st_values[i] == competitor_values[i]:
            feature_diff[i] = 0  # 如果两值相等，差异为 0
        else:
            max_val = max(st_values[i], competitor_values[i])
            min_val = min(st_values[i], competitor_values[i])
            feature_diff[i] = (max_val - min_val) / max_val  # 按照 (大数 - 小数) / 大数 计算差异

    # 处理字符串特征（Supplier_Package）
    supplier_diff = 0 if st_features["Supplier_Package"] == competitor_features["Supplier_Package"] else 1

    # 拼接特征
    combined_features = np.append(feature_diff, supplier_diff)
    features_list.append(combined_features)

    # 添加目标标签
    labels.append(row["Similarity_Score"])

# 转换为数组格式
X = np.array(features_list)
y = np.array(labels)

# 划分训练集和测试集 (80% 训练集, 20% 测试集)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 计算样本权重 (针对训练集)
sample_weights = np.array([1 / cross_ref_score_mapping.get(label, 1) for label in y_train])

# 定义参数网格
param_grid = {
    'n_estimators': [50, 100, 200],  # 决策树数量
    'max_depth': [None, 10, 20, 30],  # 最大深度
    'min_samples_split': [2, 5, 10],  # 最小分裂样本数
    'min_samples_leaf': [1, 2, 4],  # 叶子节点的最小样本数
}

# 定义随机森林模型
rf_model = RandomForestRegressor(random_state=42)

# 定义 GridSearchCV，用于自动调参
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='r2',  # 使用 R² 作为评估指标
    cv=5,  # 使用 5 折交叉验证
    verbose=2,  # 输出详细信息
    n_jobs=-1  # 使用所有可用 CPU 进行并行计算
)

# 使用样本权重进行训练
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

# 输出最佳参数
print("最佳参数：", grid_search.best_params_)

# 使用最佳模型进行预测
best_model = grid_search.best_estimator_

# 测试集评估
test_predictions = best_model.predict(X_test)
test_loss = mean_squared_error(y_test, test_predictions)  # 计算 MSE 作为损失
test_accuracy = r2_score(y_test, test_predictions)  # 计算 R² 作为准确率

# 打印结果
print(f"测试集 Loss (MSE): {test_loss:.4f}")
print(f"测试集 Accuracy (R²): {test_accuracy:.4f}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
最佳参数： {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
测试集 Loss (MSE): 0.0230
测试集 Accuracy (R²): 0.5363
