In [6]:
# util function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix
import torch

def standard_input(X):
    # 标准化输入
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return pd.DataFrame(X_scaled, columns=X.columns)

def load_data_SGER_RAW(random_state=42):
    # 读取以空格分隔的SGER CSV文件
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    # 确保 'kredit' 列存在
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None, None, None, None, None
    # 目标变量和特征
    y = df['kredit']
    X = df.drop(columns=['kredit'])
    # 划分训练集、验证集和测试集
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=random_state, stratify=y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp)
    # 计算节点数并创建 mask
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    # 获取索引并设置 mask
    train_mask[X_train.index] = True
    val_mask[X_valid.index] = True
    test_mask[X_test.index] = True
    # 标准化输入
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test = standard_input(X_test)
    return X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask
X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask = load_data_SGER_RAW()
X.shape


(1000,)

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report

class ResidualMLP(nn.Module):
    def __init__(self, input_dim, hidden_size, num_layers, dropout_rate):
        super(ResidualMLP, self).__init__()
        self.input_layer = nn.Linear(input_dim, hidden_size)
        self.hidden_layers = nn.ModuleList([
            nn.Linear(hidden_size, hidden_size) for _ in range(num_layers - 1)
        ])
        self.output_layer = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout_rate)
        self.num_layers = num_layers

    def forward(self, x):
        out = F.relu(self.input_layer(x))
        for layer in self.hidden_layers:
            residual = out
            out = F.relu(layer(out))
            out = self.dropout(out)
            out += residual  # Add residual connection
        out = torch.sigmoid(self.output_layer(out))
        return out

class BinaryClassifier:
    def __init__(self, input_dim, hidden_size=64, num_layers=3, dropout_rate=0.5, learning_rate=0.001):
        self.model = ResidualMLP(input_dim, hidden_size, num_layers, dropout_rate)
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def train(self, X_train, y_train, X_valid, y_valid, epochs=20, batch_size=32):
        train_dataset = TensorDataset(torch.tensor(X_train.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1))
        valid_dataset = TensorDataset(torch.tensor(X_valid.values, dtype=torch.float32), 
                                      torch.tensor(y_valid.values, dtype=torch.float32).unsqueeze(1))
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

        for epoch in range(epochs):
            self.model.train()
            train_loss = 0.0
            for X_batch, y_batch in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()

            valid_loss = self.evaluate(valid_loader)
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader)}, Validation Loss: {valid_loss}")

    def evaluate(self, data_loader):
        self.model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in data_loader:
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                total_loss += loss.item()
        return total_loss / len(data_loader)

    def save_model(self, path):
        torch.save(self.model.state_dict(), path)

    def load_model(self, path):
        self.model.load_state_dict(torch.load(path))
        self.model.eval()

    def predict_with_uncertainty(self, X_sample, n_iter=100):
        self.model.train()  # Enable dropout
        X_sample = torch.tensor(X_sample.values, dtype=torch.float32)
        predictions = torch.zeros(n_iter, X_sample.size(0))

        for i in range(n_iter):
            outputs = self.model(X_sample)
            predictions[i] = outputs.squeeze()

        mean_prediction = predictions.mean(dim=0)
        uncertainty = predictions.std(dim=0)
        predicted_classes = (mean_prediction > 0.5).int()

        return predicted_classes, uncertainty

    def predict(self, X_data):
        self.model.eval()
        X_tensor = torch.tensor(X_data.values, dtype=torch.float32)
        with torch.no_grad():
            outputs = self.model(X_tensor)
        predicted_classes = (outputs.squeeze() > 0.5).int()
        return predicted_classes

    def compute_similarity_and_uncertainty(self, X_data, n_iter=100):
        self.model.train()  # Enable dropout
        X_tensor = torch.tensor(X_data.values, dtype=torch.float32)
        
        with torch.no_grad():
            output_vectors = self.model.input_layer(X_tensor)
            for layer in self.model.hidden_layers:
                output_vectors = F.relu(layer(output_vectors))
                output_vectors = self.model.dropout(output_vectors)

        similarity_matrix = torch.mm(output_vectors, output_vectors.t())
        
        predictions = torch.zeros(n_iter, X_tensor.size(0))
        for i in range(n_iter):
            outputs = self.model(X_tensor)
            predictions[i] = outputs.squeeze()

        mean_prediction = predictions.mean(dim=0)
        uncertainty = predictions.std(dim=0)
        predicted_classes = (mean_prediction > 0.5).int()

        uncertainty_matrix = torch.zeros(X_tensor.size(0), X_tensor.size(0))
        for i in range(X_tensor.size(0)):
            for j in range(X_tensor.size(0)):
                if predicted_classes[i] == predicted_classes[j]:
                    uncertainty_matrix[i, j] = uncertainty[i] + uncertainty[j]
                else:
                    uncertainty_matrix[i, j] = 0

        return similarity_matrix, uncertainty_matrix

# Initialize and train the model
input_dim = X.shape[1]
classifier = BinaryClassifier(input_dim=input_dim, hidden_size=128, num_layers=4, dropout_rate=0.3, learning_rate=0.001)
classifier.train(X_train, y_train, X_valid, y_valid, epochs=30, batch_size=64)

# Save the model
classifier.save_model('residual_mlp_model.pth')

# Load the model
classifier.load_model('residual_mlp_model.pth')

# Predict on the test set
predictions = classifier.predict(X_test)

# Calculate metrics
test_accuracy = accuracy_score(y_test, predictions)
test_recall = recall_score(y_test, predictions)
test_f1 = f1_score(y_test, predictions)
test_precision = precision_score(y_test, predictions)

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")
print(f"Test Precision: {test_precision}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

# Predict a single sample
sample = X_test.iloc[[0]]
print("Sample to Predict:", sample)

predicted_classes, uncertainty = classifier.predict_with_uncertainty(sample)

print("Predicted Class:", predicted_classes.item())
print("Uncertainty:", uncertainty.item())

# Compute similarity and uncertainty matrices
similarity_matrix, uncertainty_matrix = classifier.compute_similarity_and_uncertainty(pd.concat([X_train, X_valid, X_test]))
print("Similarity Matrix:", similarity_matrix)
print("Uncertainty Matrix:", uncertainty_matrix)


Epoch 1/30, Train Loss: 0.6003749641505155, Validation Loss: 0.5305873602628708
Epoch 2/30, Train Loss: 0.5224356055259705, Validation Loss: 0.4850348085165024
Epoch 3/30, Train Loss: 0.4759216010570526, Validation Loss: 0.4665784388780594
Epoch 4/30, Train Loss: 0.44562107595530426, Validation Loss: 0.4634212255477905
Epoch 5/30, Train Loss: 0.43111248449845746, Validation Loss: 0.46111078560352325
Epoch 6/30, Train Loss: 0.4162434122779153, Validation Loss: 0.4752398729324341
Epoch 7/30, Train Loss: 0.4016761075366627, Validation Loss: 0.47323934733867645
Epoch 8/30, Train Loss: 0.38446633924137463, Validation Loss: 0.47396431863307953
Epoch 9/30, Train Loss: 0.3728475123643875, Validation Loss: 0.47851546108722687
Epoch 10/30, Train Loss: 0.35500493916598236, Validation Loss: 0.47707679867744446
Epoch 11/30, Train Loss: 0.33597189458933746, Validation Loss: 0.5017910897731781
Epoch 12/30, Train Loss: 0.32580102709206665, Validation Loss: 0.5007322132587433
Epoch 13/30, Train Loss: 0