In [7]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Standardize the input data
def standard_input(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled_df


def load_data_DEF(random_state=42):
    # CSV 文件路径（根据需要修改路径）
    path = '/home/gehongfei/project/TabGNN/dataset/DEF.csv'
    
    # 读取 CSV 文件，CSV 文件的分隔符为逗号
    df = pd.read_csv(path, sep=',')
    
    # 检查目标变量 'label' 是否存在
    target_col = 'label'
    if target_col not in df.columns:
        print(f"Error: '{target_col}' column not found in the dataset.")
        return None, None, None, None, None, None, None, None, None, None, None
    
    # 将目标变量和特征进行分离
    y = df[target_col]
    # 如果存在 'ID' 列，则将其和目标变量一起移除
    if "ID" in df.columns:
        X = df.drop(columns=["ID", target_col])
    else:
        X = df.drop(columns=[target_col])
    
    # 划分训练集、验证集和测试集
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp
    )
    
    # 创建节点 mask（这里假设每一行数据对应图中的一个节点）
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    
    # 根据原始数据的索引设置 mask
    train_mask[X_train.index] = True
    val_mask[X_valid.index] = True
    test_mask[X_test.index] = True
    
    # 标准化输入（请确保 standard_input 函数已经定义）
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test = standard_input(X_test)
    
    return X, y

# Load the dataset
X, y = load_data_DEF()

# Split data into train (70%), validation (10%), and test (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000),  # Increase max_iter for convergence
    "LDA": LinearDiscriminantAnalysis()
}

# Evaluate models on test set
results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred, average='weighted')
    rec = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    
    results[model_name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    }
    
    print(f"\n{model_name} Performance:")
    print(classification_report(y_test, y_test_pred))

# Report summary for all models
print("\nTest Performance Summary:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")


Training Decision Tree...

Decision Tree Performance:
              precision    recall  f1-score   support

           0       0.84      0.81      0.83      4692
           1       0.39      0.43      0.41      1308

    accuracy                           0.73      6000
   macro avg       0.62      0.62      0.62      6000
weighted avg       0.74      0.73      0.74      6000

Training Logistic Regression...

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4692
           1       0.69      0.24      0.36      1308

    accuracy                           0.81      6000
   macro avg       0.76      0.60      0.62      6000
weighted avg       0.79      0.81      0.77      6000

Training SVM...

SVM Performance:
              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4692
           1       0.69      0.34      0.45      1308

    accuracy                

In [5]:
X.shape

(30000, 23)

In [6]:
y.shape

(30000,)