In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Standardize the input data
def standard_input(X):
    """Standardize every column of ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and then applied to it, so each
    column is rescaled using statistics computed from the rows passed in.

    Args:
        X: pandas DataFrame of numeric feature columns.

    Returns:
        A new DataFrame holding the scaled values, with the same column
        labels and the same row index as ``X``.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # fit_transform returns a bare array; re-attach both the column labels and
    # the caller's row index. Without the index the frame would fall back to a
    # RangeIndex and stop aligning with labels that share X's index.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    return X_scaled_df

def load_data_AUS(path='/home/gehongfei/project/TabGNN/dataset/AUS.csv'):
    """Load the AUS dataset and return standardized features plus labels.

    The file is read as a header-less, whitespace-delimited table. The last
    column is treated as the class label; every other column is a feature.

    Args:
        path: Location of the data file. Defaults to the path that used to be
            hard-coded here, so existing ``load_data_AUS()`` calls behave as
            before.

    Returns:
        Tuple ``(X, y)``. ``X`` is the feature DataFrame (columns named
        ``feature_0``, ``feature_1``, ...) after being passed through
        ``standard_input``; ``y`` is the ``label`` column as a Series.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    df = pd.read_csv(path, sep=r'\s+', header=None)

    # Name the first n_cols - 1 columns as features and the last one 'label'.
    n_cols = df.shape[1]
    feature_cols = [f'feature_{i}' for i in range(n_cols - 1)]
    df.columns = feature_cols + ['label']

    # Separate the features X from the label y.
    y = df['label']
    X = df.drop(columns=['label'])

    # Standardize the feature columns.
    X = standard_input(X)
    return X, y

# Call the loader to get the standardized feature matrix X and the labels y
X, y = load_data_AUS()


# Split data into train (70%), validation (10%), and test (20%).
# The first call holds out 30% of the rows; the second sends 2/3 of that
# hold-out to the test set and the remaining 1/3 to the validation set.
# NOTE(review): X is standardized inside load_data_AUS() before this split, so
# the scaler was fitted on rows that end up in the validation and test sets.
# NOTE(review): X_val / y_val are not referenced by the code visible in this cell.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

# Define models.
# Estimators that draw random numbers while fitting (tree construction,
# bootstrap sampling, weight initialisation) get a fixed random_state so that
# re-running the notebook reproduces the same scores. 42 matches the seed
# already used for the train/validation/test split.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000, random_state=42),  # Increase max_iter for convergence
    "LDA": LinearDiscriminantAnalysis()
}

# Evaluate models on test set.
# results maps each model name to a dict of its test-set metrics.
results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)

    # Precision, recall and F1 are averaged over classes weighted by support.
    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred, average='weighted')
    rec = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')

    results[model_name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    }

    print(f"\n{model_name} Performance:")
    print(classification_report(y_test, y_test_pred, digits=4))

# Report summary for all models.
# The loop variables are deliberately not called `model`: reusing that name
# would rebind the fitted estimator left over from the loop above to a string.
print("\nTest Performance Summary:")
for summary_name, summary_metrics in results.items():
    print(f"{summary_name}: {summary_metrics}")


Training Decision Tree...

Decision Tree Performance:
              precision    recall  f1-score   support

           0     0.8353    0.8659    0.8503        82
           1     0.7925    0.7500    0.7706        56

    accuracy                         0.8188       138
   macro avg     0.8139    0.8079    0.8105       138
weighted avg     0.8179    0.8188    0.8180       138

Training Logistic Regression...

Logistic Regression Performance:
              precision    recall  f1-score   support

           0     0.8734    0.8415    0.8571        82
           1     0.7797    0.8214    0.8000        56

    accuracy                         0.8333       138
   macro avg     0.8265    0.8314    0.8286       138
weighted avg     0.8354    0.8333    0.8340       138

Training SVM...

SVM Performance:
              precision    recall  f1-score   support

           0     0.8846    0.8415    0.8625        82
           1     0.7833    0.8393    0.8103        56

    accuracy                

