In [24]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

# Standardize the input data
def standard_input(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled_df

# Load SGER1000 data and prepare X and y
def load_data_SGER1000():
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None

    y = df['kredit']
    X = df.drop(columns=['kredit'])
    X = standard_input(X)
    return X, y

# Load the dataset
X, y = load_data_SGER1000()

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000),  # Increase max_iter for convergence
    "LDA": LinearDiscriminantAnalysis()
}

# Define scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Evaluate models using 5-fold cross-validation
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    # Using cross_validate to evaluate multiple metrics
    scores = cross_validate(model, X, y, cv=5, scoring=scoring)
    
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision: {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall: {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1-score: {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")
    print("-" * 50)


Evaluating Decision Tree...
Decision Tree - Accuracy: 0.6650 ± 0.0454
Decision Tree - Precision: 0.4498 ± 0.0606
Decision Tree - Recall: 0.4633 ± 0.0488
Decision Tree - F1-score: 0.4548 ± 0.0489
--------------------------------------------------
Evaluating Logistic Regression...
Logistic Regression - Accuracy: 0.7360 ± 0.0926
Logistic Regression - Precision: 0.6088 ± 0.1658
Logistic Regression - Recall: 0.4567 ± 0.1153
Logistic Regression - F1-score: 0.5140 ± 0.1287
--------------------------------------------------
Evaluating SVM...
SVM - Accuracy: 0.7410 ± 0.0560
SVM - Precision: 0.6241 ± 0.1311
SVM - Recall: 0.4067 ± 0.0672
SVM - F1-score: 0.4872 ± 0.0824
--------------------------------------------------
Evaluating Random Forest...
Random Forest - Accuracy: 0.7430 ± 0.0543
Random Forest - Precision: 0.6443 ± 0.1358
Random Forest - Recall: 0.3767 ± 0.0696
Random Forest - F1-score: 0.4693 ± 0.0834
--------------------------------------------------
Evaluating Naive Bayes...
Naive Baye

In [30]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Standardize the input data
def standard_input(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled_df

# Load SGER1000 data and prepare X and y
def load_data_SGER1000():
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None

    y = df['kredit']
    X = df.drop(columns=['kredit'])
    X = standard_input(X)
    return X, y

# Load the dataset
X, y = load_data_SGER1000()

# Split data into train (70%), validation (10%), and test (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000),  # Increase max_iter for convergence
    "LDA": LinearDiscriminantAnalysis()
}

# Evaluate models on test set
results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred, average='weighted')
    rec = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    
    results[model_name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    }
    
    print(f"\n{model_name} Performance:")
    print(classification_report(y_test, y_test_pred))

# Report summary for all models
print("\nTest Performance Summary:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")


Training Decision Tree...

Decision Tree Performance:
              precision    recall  f1-score   support

           0       0.77      0.82      0.80       143
           1       0.47      0.40      0.43        57

    accuracy                           0.70       200
   macro avg       0.62      0.61      0.61       200
weighted avg       0.69      0.70      0.69       200

Training Logistic Regression...

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       143
           1       0.53      0.42      0.47        57

    accuracy                           0.73       200
   macro avg       0.66      0.64      0.64       200
weighted avg       0.71      0.73      0.72       200

Training SVM...

SVM Performance:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       143
           1       0.62      0.42      0.50        57

    accuracy                

In [34]:
X.shape

(1400, 20)