In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Standardize the input data
def standard_input(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled_df

# Load SGER1000 data and prepare X and y
def load_data_SGER1000():
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None

    y = df['kredit']
    X = df.drop(columns=['kredit'])
    X = standard_input(X)
    return X, y

# Load the dataset
X, y = load_data_SGER1000()

# Split data into train (70%), validation (10%), and test (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000),  # Increase max_iter for convergence
    "LDA": LinearDiscriminantAnalysis()
}

# Evaluate models on test set
results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred, average='weighted')
    rec = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    
    results[model_name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    }
    
    print(f"\n{model_name} Performance:")
    print(classification_report(y_test, y_test_pred))

# Report summary for all models
print("\nTest Performance Summary:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")


Training Decision Tree...

Decision Tree Performance:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       143
           1       0.45      0.42      0.44        57

    accuracy                           0.69       200
   macro avg       0.61      0.61      0.61       200
weighted avg       0.68      0.69      0.69       200

Training Logistic Regression...

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       143
           1       0.53      0.42      0.47        57

    accuracy                           0.73       200
   macro avg       0.66      0.64      0.64       200
weighted avg       0.71      0.73      0.72       200

Training SVM...

SVM Performance:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       143
           1       0.62      0.42      0.50        57

    accuracy                

In [2]:
# 使用随机森林模型进行训练，并生成测试集上的分类报告
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred_rf = rf_model.predict(X_test)

# 输出测试集上的分类报告
print("Test Set Classification Report:")
print(classification_report(y_test, y_pred_rf, digits=4))


# 为了便于比较，将预测结果转换为与 y_test 相同的索引
y_pred_rf_series = pd.Series(y_pred_rf, index=y_test.index)

# 统计各类样本的索引集合
tp_idx = y_test[(y_test == 1) & (y_pred_rf_series == 1)].index.tolist()
tn_idx = y_test[(y_test == 0) & (y_pred_rf_series == 0)].index.tolist()
fp_idx = y_test[(y_test == 0) & (y_pred_rf_series == 1)].index.tolist()
fn_idx = y_test[(y_test == 1) & (y_pred_rf_series == 0)].index.tolist()

print("True Positives (1被分为1):", tp_idx)
print("True Negatives (0被分为0):", tn_idx)
print("False Positives (0被分为1):", fp_idx)
print("False Negatives (1被分为0):", fn_idx)

# 如果只需要错误样本（FP和FN）的索引集合，可以合并如下：
error_idx = {
    "[0,1]": fp_idx,  # 真实为0，但预测为1
    "[1,0]": fn_idx   # 真实为1，但预测为0
}
print("错误样本的索引集合:", error_idx)


Test Set Classification Report:
              precision    recall  f1-score   support

           0     0.8052    0.8671    0.8350       143
           1     0.5870    0.4737    0.5243        57

    accuracy                         0.7550       200
   macro avg     0.6961    0.6704    0.6796       200
weighted avg     0.7430    0.7550    0.7465       200

True Positives (1被分为1): [798, 977, 917, 963, 907, 950, 822, 883, 994, 893, 583, 800, 941, 811, 584, 866, 904, 868, 810, 689, 802, 975, 902, 521, 643, 986, 708]
True Negatives (0被分为0): [557, 136, 544, 332, 678, 363, 10, 277, 377, 141, 668, 589, 286, 82, 312, 697, 192, 499, 210, 331, 294, 741, 652, 359, 296, 39, 209, 33, 536, 494, 333, 289, 428, 81, 88, 522, 481, 626, 346, 371, 617, 355, 168, 344, 199, 239, 275, 318, 662, 227, 650, 307, 86, 590, 213, 351, 70, 280, 677, 244, 523, 543, 139, 2, 670, 265, 84, 109, 184, 501, 165, 30, 72, 559, 408, 290, 215, 314, 603, 118, 462, 254, 740, 432, 281, 259, 65, 731, 445, 292, 67, 381, 49, 60, 587