In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Standardize the input data
def standard_input(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled_df

def load_data_AUS():
    # 数据文件路径
    path = '/home/gehongfei/project/TabGNN/dataset/AUS.csv'
    # 读取没有表头且以空白字符分隔的数据
    df = pd.read_csv(path, sep='\s+')
    
    # 获取列数，假设最后一列为标签
    n_cols = df.shape[1]
    # 为前 n_cols-1 列生成特征列名，最后一列命名为 'label'
    feature_cols = [f'feature_{i}' for i in range(n_cols - 1)]
    df.columns = feature_cols + ['label']
    
    # 分离特征 X 和标签 y
    y = df['label']
    X = df.drop(columns=['label'])
    
    # 标准化特征数据
    X = standard_input(X)
    return X, y

# 调用函数加载数据
X, y = load_data_AUS()


# Split data into train (70%), validation (10%), and test (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000),  # Increase max_iter for convergence
    "LDA": LinearDiscriminantAnalysis()
}

# Evaluate models on test set
results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred, average='weighted')
    rec = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    
    results[model_name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    }
    
    print(f"\n{model_name} Performance:")
    print(classification_report(y_test, y_test_pred, digits=4))

# Report summary for all models
print("\nTest Performance Summary:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")


Training Decision Tree...

Decision Tree Performance:
              precision    recall  f1-score   support

           0     0.8506    0.9024    0.8757        82
           1     0.8431    0.7679    0.8037        56

    accuracy                         0.8478       138
   macro avg     0.8469    0.8351    0.8397       138
weighted avg     0.8476    0.8478    0.8465       138

Training Logistic Regression...

Logistic Regression Performance:
              precision    recall  f1-score   support

           0     0.8734    0.8415    0.8571        82
           1     0.7797    0.8214    0.8000        56

    accuracy                         0.8333       138
   macro avg     0.8265    0.8314    0.8286       138
weighted avg     0.8354    0.8333    0.8340       138

Training SVM...

SVM Performance:
              precision    recall  f1-score   support

           0     0.8846    0.8415    0.8625        82
           1     0.7833    0.8393    0.8103        56

    accuracy                



In [9]:
# 使用随机森林模型进行训练，并生成测试集上的分类报告
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred_rf = rf_model.predict(X_test)

# 输出测试集上的分类报告
print("Test Set Classification Report:")
print(classification_report(y_test, y_pred_rf, digits=4))


# 为了便于比较，将预测结果转换为与 y_test 相同的索引
y_pred_rf_series = pd.Series(y_pred_rf, index=y_test.index)

# 统计各类样本的索引集合
tp_idx = y_test[(y_test == 1) & (y_pred_rf_series == 1)].index.tolist()
tn_idx = y_test[(y_test == 0) & (y_pred_rf_series == 0)].index.tolist()
fp_idx = y_test[(y_test == 0) & (y_pred_rf_series == 1)].index.tolist()
fn_idx = y_test[(y_test == 1) & (y_pred_rf_series == 0)].index.tolist()

print("True Positives (1被分为1):", tp_idx)
print("True Negatives (0被分为0):", tn_idx)
print("False Positives (0被分为1):", fp_idx)
print("False Negatives (1被分为0):", fn_idx)


import json

# 将四个列表合并为一个列表
error_idx_list = [tp_idx, tn_idx, fp_idx, fn_idx]

# 定义要保存的文件名
filename = "AUS-RF.json"

# 保存为 JSON 文件
with open(filename, "w") as f:
    json.dump(error_idx_list, f, indent=4)

print(f"索引集合已保存至 {filename}")


# 如果只需要错误样本（FP和FN）的索引集合，可以合并如下：
error_idx = {
    "[0,1]": fp_idx,  # 真实为0，但预测为1
    "[1,0]": fn_idx   # 真实为1，但预测为0
}
print("错误样本的索引集合:", error_idx)

# 读取 JSON 文件
with open(filename, "r") as f:
    loaded_error_idx_list = json.load(f)

print("加载的索引集合:", loaded_error_idx_list)


Test Set Classification Report:
              precision    recall  f1-score   support

           0     0.8588    0.8902    0.8743        82
           1     0.8302    0.7857    0.8073        56

    accuracy                         0.8478       138
   macro avg     0.8445    0.8380    0.8408       138
weighted avg     0.8472    0.8478    0.8471       138

True Positives (1被分为1): [497, 405, 28, 685, 65, 321, 182, 210, 278, 599, 601, 55, 11, 539, 204, 412, 56, 314, 264, 603, 621, 235, 671, 320, 120, 380, 39, 360, 220, 518, 292, 624, 29, 42, 145, 487, 662, 499, 174, 286, 227, 420, 158, 163]
True Negatives (0被分为0): [431, 81, 164, 425, 24, 645, 18, 133, 501, 86, 131, 669, 629, 310, 327, 51, 208, 631, 135, 534, 356, 569, 432, 60, 677, 244, 281, 31, 335, 381, 527, 576, 611, 296, 652, 353, 218, 90, 199, 357, 72, 556, 250, 110, 595, 657, 354, 192, 493, 82, 212, 284, 289, 109, 165, 209, 377, 54, 552, 318, 136, 6, 515, 608, 92, 606, 640, 686, 260, 61, 554, 132, 404]
False Positives (0被分为1): [522