In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import time
import psutil
import os
import random

# Set random seed for reproducibility
seed = 999
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# If using CUDA, set the seed for GPU as well
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU


adult_node_model

In [3]:
# read data
train_data = pd.read_csv('../data/adult/adult.data', header=None)
test_data = pd.read_csv('../data/adult/adult.test', header=None, skiprows=1)

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
train_data.columns = test_data.columns = columns

# cleaning data
def clean_data(df):
    df = df.replace('?', np.nan)
    df = df.dropna()
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df["income"] = df["income"].replace({">50K.": 1, "<=50K.": 0, ">50K": 1, "<=50K": 0})
    return df

train_data = clean_data(train_data)
test_data = clean_data(test_data)

# label encoding
X_train = train_data.drop(columns=["income"])
y_train = train_data["income"]
X_test = test_data.drop(columns=["income"])
y_test = test_data["income"]

# One-Hot Encoding 
X_combined = pd.get_dummies(pd.concat([X_train, X_test], axis=0))
X_train = X_combined.iloc[:len(X_train)].copy().astype(np.float32)
X_test = X_combined.iloc[len(X_train):].copy().astype(np.float32)

# standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# transform to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# create dataset and dataloader
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# define the model
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.model(x)

# initialize the model, loss function, optimizer
input_dim = X_train_tensor.shape[1]
model = NodeModel(input_dim=input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# train the model
start_time = time.time()
cpu_usage_start = psutil.cpu_percent(interval=1)

EPOCHS = 30
for epoch in range(EPOCHS):
    model.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
    scheduler.step()

    if (epoch + 1) % 5 == 0:
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                output = model(batch_x)
                pred_label = torch.argmax(output, dim=1)
                all_preds.append(pred_label)
                all_labels.append(batch_y)

        all_preds = torch.cat(all_preds)
        all_labels = torch.cat(all_labels)
        acc = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)
        print(f"Epoch {epoch+1} | Loss: {loss.item():.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

# evaluate the model
end_time = time.time()
cpu_usage_end = psutil.cpu_percent(interval=1)
duration = end_time - start_time
avg_cpu = (cpu_usage_start + cpu_usage_end) / 2

print(f"\ntrain_time: {duration:.2f} seconds")
print(f"avg_cpu: {avg_cpu:.2f}%")


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df["income"] = df["income"].replace({">50K.": 1, "<=50K.": 0, ">50K": 1, "<=50K": 0})


Epoch 5 | Loss: 0.4499 | Acc: 0.8542 | F1: 0.6669 | LR: 0.001000
Epoch 10 | Loss: 0.2747 | Acc: 0.8533 | F1: 0.6507 | LR: 0.000500
Epoch 15 | Loss: 0.3630 | Acc: 0.8529 | F1: 0.6588 | LR: 0.000500
Epoch 20 | Loss: 0.3608 | Acc: 0.8536 | F1: 0.6534 | LR: 0.000250
Epoch 25 | Loss: 0.4244 | Acc: 0.8528 | F1: 0.6574 | LR: 0.000250
Epoch 30 | Loss: 0.2182 | Acc: 0.8531 | F1: 0.6562 | LR: 0.000125

train_time: 36.51 seconds
avg_cpu: 10.80%


bank_marketing_node_model

In [5]:
from sklearn.model_selection import train_test_split

# read bank data
bank_data = pd.read_csv('../data/bank_marketing/bank_additional_full.csv', sep=';')
bank_data = bank_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)
bank_data['y'] = bank_data['y'].map({'yes': 1, 'no': 0})


X = pd.get_dummies(bank_data.drop(columns=['y']))
y = bank_data['y']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# transform to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# create dataset and dataloader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=256, shuffle=True)

# define the model
input_dim = X_train_tensor.shape[1]
model = NodeModel(input_dim=input_dim)  # 重用你的 NodeModel

# initialize the model, loss function, optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# train the model
for epoch in range(30):
    start_time = time.time()
    model.train()
    total_loss = 0.0

    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    scheduler.step()

    # evaluate the model
    model.eval()
    with torch.no_grad():
        pred = model(X_test_tensor)
        pred_label = torch.argmax(pred, dim=1)
        acc = accuracy_score(y_test_tensor, pred_label)
        f1 = f1_score(y_test_tensor, pred_label)

    if (epoch + 1) % 5 == 0:
        elapsed = time.time() - start_time
        cpu = psutil.cpu_percent(interval=0.1)
        print(f"Epoch {epoch+1:2d} | Loss: {total_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")


  bank_data = bank_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Epoch  5 | Loss: 23.9557 | Acc: 0.9122 | F1: 0.5745 | LR: 0.001000 | Time: 0.60s | CPU: 25.9%
Epoch 10 | Loss: 22.9828 | Acc: 0.9154 | F1: 0.5794 | LR: 0.000500 | Time: 0.60s | CPU: 20.0%
Epoch 15 | Loss: 22.2285 | Acc: 0.9154 | F1: 0.6117 | LR: 0.000500 | Time: 0.60s | CPU: 31.2%
Epoch 20 | Loss: 21.8395 | Acc: 0.9155 | F1: 0.6032 | LR: 0.000250 | Time: 0.58s | CPU: 19.1%
Epoch 25 | Loss: 21.8064 | Acc: 0.9144 | F1: 0.5865 | LR: 0.000250 | Time: 0.62s | CPU: 41.7%
Epoch 30 | Loss: 21.5286 | Acc: 0.9158 | F1: 0.5970 | LR: 0.000125 | Time: 0.61s | CPU: 18.6%


covertype_node_model

In [8]:
# read  data
covtype_data = pd.read_csv('../data/covertype/covtype.data', header=None)


X = covtype_data.iloc[:, :-1]
y = covtype_data.iloc[:, -1] - 1  

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# transform to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# dataloader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=256, shuffle=True)

# define the model
input_dim = X_train_tensor.shape[1]
output_dim = len(np.unique(y_train))  # == 7
model = NodeModel(input_dim=input_dim, output_dim=output_dim)

# initialize the model, loss function, optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# train the model
for epoch in range(30):
    start_time = time.time()
    model.train()
    total_loss = 0.0

    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    scheduler.step()

    # evaluate the model
    model.eval()
    with torch.no_grad():
        pred = model(X_test_tensor)
        pred_label = torch.argmax(pred, dim=1)
        acc = accuracy_score(y_test_tensor, pred_label)
        f1 = f1_score(y_test_tensor, pred_label, average='macro')  # Modify here for multiclass F1 score

    if (epoch + 1) % 5 == 0:
        elapsed = time.time() - start_time
        cpu = psutil.cpu_percent(interval=0.1)
        print(f"Epoch {epoch+1:2d} | Loss: {total_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")


Epoch  5 | Loss: 998.3166 | Acc: 0.7852 | F1: 0.5762 | LR: 0.001000 | Time: 8.16s | CPU: 16.3%
Epoch 10 | Loss: 954.9634 | Acc: 0.7997 | F1: 0.6518 | LR: 0.000500 | Time: 8.10s | CPU: 18.4%
Epoch 15 | Loss: 927.3428 | Acc: 0.8088 | F1: 0.6806 | LR: 0.000500 | Time: 8.18s | CPU: 16.7%
Epoch 20 | Loss: 920.6391 | Acc: 0.8138 | F1: 0.6830 | LR: 0.000250 | Time: 8.38s | CPU: 21.0%
Epoch 25 | Loss: 907.8465 | Acc: 0.8166 | F1: 0.6943 | LR: 0.000250 | Time: 10.10s | CPU: 22.0%
Epoch 30 | Loss: 904.5499 | Acc: 0.8169 | F1: 0.6903 | LR: 0.000125 | Time: 9.15s | CPU: 22.6%


小样本

In [None]:
# read data
covtype_data = pd.read_csv('../data/covertype/covtype.data', header=None)
columns = [f'feature_{i}' for i in range(1, 55)] + ['target']
covtype_data.columns = columns

# label encoding
X = covtype_data.drop(columns=['target'])
y = covtype_data['target'] - 1  # 使标签从 0 开始

# train-test split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# rtansform to tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# define the model
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=7, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.model(x)

# define the training function
def train_model(X_train, y_train, X_test, y_test, batch_size, lr_scheduler=None):
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = NodeModel(input_dim=X_train.shape[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # uesr-defined learning rate scheduler
    if lr_scheduler:
        scheduler = lr_scheduler(optimizer, step_size=10, gamma=0.5)  

    EPOCHS = 30
    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0

        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(output, 1)
            correct_preds += (predicted == target).sum().item()
            total_preds += target.size(0)

        # calculate average loss and accuracy
        train_loss = running_loss / len(train_loader)
        train_acc = correct_preds / total_preds

        
        if (epoch + 1) % 5 == 0:
            model.eval()
            with torch.no_grad():
                correct_preds = 0
                total_preds = 0
                for data, target in test_loader:
                    output = model(data)
                    _, predicted = torch.max(output, 1)
                    correct_preds += (predicted == target).sum().item()
                    total_preds += target.size(0)

                test_acc = correct_preds / total_preds
                if lr_scheduler:
                    print(f"Epoch {epoch+1} | Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")
                else:
                    print(f"Epoch {epoch+1} | Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

        if lr_scheduler:
            scheduler.step()

# try with different sample sizes
sample_sizes = [0.1, 0.5]  # 1.0
batch_size = 64
lr_scheduler = torch.optim.lr_scheduler.StepLR  

for sample_size in sample_sizes:
    print(f"\nTraining with {int(sample_size*100)}% of the data...")

    # select a subset of the data
    X_sub, _, y_sub, _ = train_test_split(X_tensor, y_tensor, train_size=sample_size, random_state=42)

    X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_sub, y_sub, train_size=0.8, random_state=42)

    train_model(X_train_sub, y_train_sub, X_test_sub, y_test_sub, batch_size, lr_scheduler)

# train with 100% of the data
print("\nTraining with 100% of the data...")
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_tensor, y_tensor, train_size=0.8, random_state=42)
train_model(X_train_full, y_train_full, X_test_full, y_test_full, batch_size, lr_scheduler)



Training with 10% of the data...
Epoch 5 | Loss: 0.6275 | Train Acc: 0.7329 | Test Acc: 0.7469 | LR: 0.001000
Epoch 10 | Loss: 0.5861 | Train Acc: 0.7490 | Test Acc: 0.7603 | LR: 0.001000
Epoch 15 | Loss: 0.5612 | Train Acc: 0.7586 | Test Acc: 0.7727 | LR: 0.000500
Epoch 20 | Loss: 0.5537 | Train Acc: 0.7613 | Test Acc: 0.7743 | LR: 0.000500
Epoch 25 | Loss: 0.5443 | Train Acc: 0.7657 | Test Acc: 0.7798 | LR: 0.000250
Epoch 30 | Loss: 0.5372 | Train Acc: 0.7654 | Test Acc: 0.7789 | LR: 0.000250

Training with 50% of the data...
Epoch 5 | Loss: 0.5591 | Train Acc: 0.7581 | Test Acc: 0.7793 | LR: 0.001000
Epoch 10 | Loss: 0.5360 | Train Acc: 0.7684 | Test Acc: 0.7931 | LR: 0.001000
Epoch 15 | Loss: 0.5185 | Train Acc: 0.7782 | Test Acc: 0.8037 | LR: 0.000500
Epoch 20 | Loss: 0.5128 | Train Acc: 0.7810 | Test Acc: 0.8084 | LR: 0.000500
Epoch 25 | Loss: 0.5040 | Train Acc: 0.7841 | Test Acc: 0.8116 | LR: 0.000250
Epoch 30 | Loss: 0.5028 | Train Acc: 0.7856 | Test Acc: 0.8121 | LR: 0.00025

poker_hand_node_model

In [3]:
ph_testing_data = pd.read_csv('../data/poker_hand/poker_hand_testing.data', header=None)
ph_trainingtrue_data = pd.read_csv('../data/poker_hand/poker_hand_training-true.data', header=None)

columns = [f'feature_{i}' for i in range(1, 11)] + ['target']
ph_testing_data.columns = columns
ph_trainingtrue_data.columns = columns

X_train = ph_trainingtrue_data.drop(columns=['target'])
y_train = ph_trainingtrue_data['target']
X_test = ph_testing_data.drop(columns=['target'])
y_test = ph_testing_data['target']

# standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# create dataset and dataloader
BATCH_SIZE = 256
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# define the model
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=10, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.model(x)

input_dim = X_train_tensor.shape[1]
model = NodeModel(input_dim=input_dim)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# train the model
EPOCHS = 30
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    start_time = time.time()

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    scheduler.step()
    
    model.eval()
    with torch.no_grad():
        pred = model(X_test_tensor)
        pred_label = torch.argmax(pred, dim=1)
        acc = accuracy_score(y_test_tensor, pred_label)
        f1 = f1_score(y_test_tensor, pred_label, average='weighted')

    end_time = time.time()
    cpu = psutil.cpu_percent(interval=0.1)

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {end_time - start_time:.2f}s | CPU: {cpu}%")


Epoch  5 | Loss: 96.7761 | Acc: 0.5223 | F1: 0.4135 | LR: 0.001000 | Time: 1.31s | CPU: 12.5%
Epoch 10 | Loss: 94.8041 | Acc: 0.5453 | F1: 0.4935 | LR: 0.000500 | Time: 1.31s | CPU: 1.8%
Epoch 15 | Loss: 94.3800 | Acc: 0.5446 | F1: 0.4874 | LR: 0.000500 | Time: 1.28s | CPU: 10.2%
Epoch 20 | Loss: 94.0692 | Acc: 0.5468 | F1: 0.4985 | LR: 0.000250 | Time: 1.32s | CPU: 7.1%
Epoch 25 | Loss: 93.8110 | Acc: 0.5466 | F1: 0.4968 | LR: 0.000250 | Time: 1.23s | CPU: 7.3%
Epoch 30 | Loss: 93.6370 | Acc: 0.5467 | F1: 0.4956 | LR: 0.000125 | Time: 1.19s | CPU: 15.6%


wine_node_model

In [None]:
# read wine data
wine_red_data = pd.read_csv('../data/wine_quality/winequality_red.csv', sep=';', header=0)
wine_white_data = pd.read_csv('../data/wine_quality/winequality_white.csv', sep=';', header=0)

# cleaning data
wine_red_data = wine_red_data.apply(pd.to_numeric, errors='coerce').dropna()
wine_white_data = wine_white_data.apply(pd.to_numeric, errors='coerce').dropna()

# label encoding
X_red = wine_red_data.drop(columns=['quality'])
y_red = wine_red_data['quality']
X_white = wine_white_data.drop(columns=['quality'])
y_white = wine_white_data['quality']

# standardization
scaler_red = StandardScaler()
X_red_scaled = scaler_red.fit_transform(X_red)
scaler_white = StandardScaler()
X_white_scaled = scaler_white.fit_transform(X_white)

# transform to tensor
X_red_tensor = torch.tensor(X_red_scaled, dtype=torch.float32)
y_red_tensor = torch.tensor(y_red.values, dtype=torch.long)
X_white_tensor = torch.tensor(X_white_scaled, dtype=torch.float32)
y_white_tensor = torch.tensor(y_white.values, dtype=torch.long)

# create dataset and dataloader
BATCH_SIZE = 256
train_loader_red = DataLoader(TensorDataset(X_red_tensor, y_red_tensor), batch_size=BATCH_SIZE, shuffle=True)
train_loader_white = DataLoader(TensorDataset(X_white_tensor, y_white_tensor), batch_size=BATCH_SIZE, shuffle=True)

# define the model
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=11, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.model(x)

def train_model(X_tensor, y_tensor, loader, label='Red Wine'):
    input_dim = X_tensor.shape[1]
    model = NodeModel(input_dim=input_dim, output_dim=11)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    EPOCHS = 30
    print(f"\n--- Training on {label} Dataset ---")
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        start_time = time.time()
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        scheduler.step()

        model.eval()
        with torch.no_grad():
            pred = model(X_tensor)
            pred_label = torch.argmax(pred, dim=1)
            acc = accuracy_score(y_tensor, pred_label)
            f1 = f1_score(y_tensor, pred_label, average='weighted')

        cpu = psutil.cpu_percent(interval=0.1)
        elapsed = time.time() - start_time

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")

# train the model on both datasets
train_model(X_red_tensor, y_red_tensor, train_loader_red, label='Red Wine')
train_model(X_white_tensor, y_white_tensor, train_loader_white, label='White Wine')



--- Training on Red Wine Dataset ---
Epoch  5 | Loss: 9.9527 | Acc: 0.5572 | F1: 0.4961 | LR: 0.001000 | Time: 0.12s | CPU: 43.4%
Epoch 10 | Loss: 7.6873 | Acc: 0.5941 | F1: 0.5662 | LR: 0.000500 | Time: 0.15s | CPU: 26.7%
Epoch 15 | Loss: 7.4430 | Acc: 0.5947 | F1: 0.5659 | LR: 0.000500 | Time: 0.12s | CPU: 27.8%
Epoch 20 | Loss: 7.3417 | Acc: 0.5960 | F1: 0.5703 | LR: 0.000250 | Time: 0.14s | CPU: 20.4%
Epoch 25 | Loss: 7.1813 | Acc: 0.5985 | F1: 0.5734 | LR: 0.000250 | Time: 0.14s | CPU: 18.2%
Epoch 30 | Loss: 7.1662 | Acc: 0.5966 | F1: 0.5715 | LR: 0.000125 | Time: 0.14s | CPU: 21.8%

--- Training on White Wine Dataset ---
Epoch  5 | Loss: 24.6664 | Acc: 0.5310 | F1: 0.4649 | LR: 0.001000 | Time: 0.18s | CPU: 16.7%
Epoch 10 | Loss: 22.5499 | Acc: 0.5498 | F1: 0.5160 | LR: 0.000500 | Time: 0.20s | CPU: 23.6%
Epoch 15 | Loss: 22.5320 | Acc: 0.5551 | F1: 0.5218 | LR: 0.000500 | Time: 0.20s | CPU: 27.1%
Epoch 20 | Loss: 22.0785 | Acc: 0.5570 | F1: 0.5239 | LR: 0.000250 | Time: 0.20s |

California Housing

In [15]:
from sklearn.datasets import fetch_california_housing


# Load California Housing dataset
california_data = fetch_california_housing()

# Extract features and target
X = california_data.data
y = california_data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)  # Convert target to column vector
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# DataLoader
BATCH_SIZE = 256
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=BATCH_SIZE, shuffle=True)

# Define the model (for regression)
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=1, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.model(x)

# Initialize model
input_dim = X_train_tensor.shape[1]
model = NodeModel(input_dim=input_dim)

# Loss function and optimizer
criterion = nn.MSELoss()  # For regression, use MSELoss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop
EPOCHS = 30
for epoch in range(EPOCHS):
    start_time = time.time()
    model.train()
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    scheduler.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        pred = model(X_test_tensor)
        mse = criterion(pred, y_test_tensor)
        rmse = torch.sqrt(mse)  # RMSE

    cpu = psutil.cpu_percent(interval=0.1)
    elapsed = time.time() - start_time

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | RMSE: {rmse.item():.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")


Epoch  5 | Loss: 42.9162 | RMSE: 0.7011 | LR: 0.001000 | Time: 0.34s | CPU: 30.4%
Epoch 10 | Loss: 36.8705 | RMSE: 0.6592 | LR: 0.000500 | Time: 0.36s | CPU: 25.0%
Epoch 15 | Loss: 33.6907 | RMSE: 0.6417 | LR: 0.000500 | Time: 0.37s | CPU: 29.8%
Epoch 20 | Loss: 31.8468 | RMSE: 0.6303 | LR: 0.000250 | Time: 0.36s | CPU: 27.9%
Epoch 25 | Loss: 31.2817 | RMSE: 0.6242 | LR: 0.000250 | Time: 0.38s | CPU: 17.9%
Epoch 30 | Loss: 30.1810 | RMSE: 0.6188 | LR: 0.000125 | Time: 0.35s | CPU: 23.6%


HIGGS_node_model

In [None]:
# read wine data
HIGGS_data = pd.read_csv('../data/HIGGS/HIGGS.csv', header=None)

X = HIGGS_data.iloc[:, 1:].values  
y = HIGGS_data.iloc[:, 0].values   

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# transform to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# create dataset and dataloader
BATCH_SIZE = 64
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=BATCH_SIZE, shuffle=True)

# define the model (for binary classification)
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.model(x)

# initialize the model
input_dim = X_train_tensor.shape[1]
model = NodeModel(input_dim=input_dim)

# initialize the model, loss function, optimizer
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# train the model
EPOCHS = 30
for epoch in range(EPOCHS):
    start_time = time.time()
    model.train()
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    scheduler.step()

    # 评估
    model.eval()
    with torch.no_grad():
        pred = model(X_test_tensor)
        pred_label = torch.argmax(pred, dim=1)
        acc = accuracy_score(y_test_tensor, pred_label)
        f1 = f1_score(y_test_tensor, pred_label)

    cpu = psutil.cpu_percent(interval=0.1)
    elapsed = time.time() - start_time

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")


Epoch  5 | Loss: 73650.1854 | Acc: 0.7413 | F1: 0.7547 | LR: 0.001000 | Time: 296.09s | CPU: 1.8%
Epoch 10 | Loss: 73509.3767 | Acc: 0.7433 | F1: 0.7557 | LR: 0.000500 | Time: 311.60s | CPU: 0.0%
Epoch 15 | Loss: 72969.6374 | Acc: 0.7452 | F1: 0.7653 | LR: 0.000500 | Time: 307.21s | CPU: 0.0%
Epoch 20 | Loss: 72931.0982 | Acc: 0.7454 | F1: 0.7617 | LR: 0.000250 | Time: 307.54s | CPU: 0.0%
Epoch 25 | Loss: 72668.1137 | Acc: 0.7461 | F1: 0.7667 | LR: 0.000250 | Time: 337.17s | CPU: 3.3%


KeyboardInterrupt: 

Telco Customer Churn

In [5]:
# Load Telco Customer Churn dataset
TC_data = pd.read_csv('../data/Telco Customer Churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Inspect and clean the dataset (handle missing values, encode categorical variables)
# Drop any unnecessary columns if needed (e.g., customerID) and convert to appropriate types
TC_data = TC_data.drop(columns=['customerID'])

# Convert categorical columns using label encoding or one-hot encoding
categorical_cols = TC_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    TC_data[col] = TC_data[col].astype('category').cat.codes  # Label encoding

# Split the data into features and target
X_TC = TC_data.drop(columns=['Churn'])
y_TC = TC_data['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)  # Convert Churn to binary

# Standardize the features
scaler_TC = StandardScaler()
X_TC_scaled = scaler_TC.fit_transform(X_TC)

# Convert to tensors
X_TC_tensor = torch.tensor(X_TC_scaled, dtype=torch.float32)
y_TC_tensor = torch.tensor(y_TC.values, dtype=torch.long)

# Create a DataLoader
BATCH_SIZE = 256
train_loader_TC = DataLoader(TensorDataset(X_TC_tensor, y_TC_tensor), batch_size=BATCH_SIZE, shuffle=True)

# Define the model (same as the NodeModel used for the wine dataset)
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.model(x)

# Train the model on the Telco Customer Churn dataset
def train_model(X_tensor, y_tensor, loader, label='Telco Customer Churn'):
    input_dim = X_tensor.shape[1]
    model = NodeModel(input_dim=input_dim, output_dim=2)  # Binary classification
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    EPOCHS = 30
    print(f"\n--- Training on {label} Dataset ---")
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        start_time = time.time()
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        scheduler.step()

        model.eval()
        with torch.no_grad():
            pred = model(X_tensor)
            pred_label = torch.argmax(pred, dim=1)
            acc = accuracy_score(y_tensor, pred_label)
            f1 = f1_score(y_tensor, pred_label, average='weighted')

        cpu = psutil.cpu_percent(interval=0.1)
        elapsed = time.time() - start_time

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")

# Train the model on the Telco dataset
train_model(X_TC_tensor, y_TC_tensor, train_loader_TC, label='Telco Customer Churn')



--- Training on Telco Customer Churn Dataset ---
Epoch  5 | Loss: 0.0384 | Acc: 1.0000 | F1: 1.0000 | LR: 0.001000 | Time: 0.28s | CPU: 26.3%
Epoch 10 | Loss: 0.0095 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000500 | Time: 0.26s | CPU: 41.1%
Epoch 15 | Loss: 0.0063 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000500 | Time: 0.25s | CPU: 36.5%
Epoch 20 | Loss: 0.0044 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000250 | Time: 0.23s | CPU: 28.6%
Epoch 25 | Loss: 0.0034 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000250 | Time: 0.35s | CPU: 32.6%
Epoch 30 | Loss: 0.0031 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000125 | Time: 0.23s | CPU: 21.1%


In [None]:
# read wine data
TC_data = pd.read_csv('../data/Telco Customer Churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Inspect and clean the dataset (handle missing values, encode categorical variables)
# Drop any unnecessary columns if needed (e.g., customerID) and convert to appropriate types
numerical_columns = TC_data.select_dtypes(include=['float64', 'int64']).columns
TC_data[numerical_columns] = TC_data[numerical_columns].fillna(TC_data[numerical_columns].mean())

# drop any rows with missing values in the target column
categorical_columns = TC_data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    TC_data[column] = TC_data[column].fillna(TC_data[column].mode()[0])

if 'Churn' in TC_data.columns:
    TC_data['Churn'] = TC_data['Churn'].fillna(TC_data['Churn'].mode()[0])

if TC_data['Churn'].dtype == 'object':
    TC_data['Churn'] = TC_data['Churn'].map({'No': 0, 'Yes': 1})


X_TC = TC_data.drop(columns=['Churn'])
y_TC = TC_data['Churn']

X_TC = pd.get_dummies(X_TC, drop_first=True)  # One-hot encoding
X_TC = X_TC.astype('float32')

scaler = StandardScaler()
X_TC_scaled = scaler.fit_transform(X_TC)


if not np.issubdtype(X_TC_scaled.dtype, np.number):
    raise ValueError("X_TC_scaled contains non-numeric values. Please check the preprocessing steps.")


if not np.issubdtype(y_TC.values.dtype, np.number):
    raise ValueError("y_TC contains non-numeric values. Please check the preprocessing steps.")

X_TC_tensor = torch.tensor(X_TC_scaled, dtype=torch.float32)
y_TC_tensor = torch.tensor(y_TC.values, dtype=torch.long)

BATCH_SIZE = 256
train_loader = DataLoader(TensorDataset(X_TC_tensor, y_TC_tensor), batch_size=BATCH_SIZE, shuffle=True)

class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )
    
    def forward(self, x):
        return self.model(x)


def train_model(X_tensor, y_tensor, loader, label='Telco Customer Churn'):
    input_dim = X_tensor.shape[1]
    model = NodeModel(input_dim=input_dim, output_dim=2)  # 2类（Churn or Not Churn）
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    EPOCHS = 30
    print(f"\n--- Training on {label} Dataset ---")
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        start_time = time.time()
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        scheduler.step()

        
        model.eval()
        with torch.no_grad():
            pred = model(X_tensor)
            pred_label = torch.argmax(pred, dim=1)
            acc = accuracy_score(y_tensor, pred_label)
            f1 = f1_score(y_tensor, pred_label, average='weighted')

        cpu = psutil.cpu_percent(interval=0.1)
        elapsed = time.time() - start_time

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")

train_model(X_TC_tensor, y_TC_tensor, train_loader, label='Telco Customer Churn')



--- Training on Telco Customer Churn Dataset ---
Epoch  5 | Loss: 0.7081 | Acc: 1.0000 | F1: 1.0000 | LR: 0.001000 | Time: 1.85s | CPU: 25.0%
Epoch 10 | Loss: 0.0874 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000500 | Time: 1.73s | CPU: 30.9%
Epoch 15 | Loss: 0.0623 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000500 | Time: 1.95s | CPU: 22.2%
Epoch 20 | Loss: 0.0261 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000250 | Time: 2.15s | CPU: 24.4%
Epoch 25 | Loss: 0.0237 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000250 | Time: 1.83s | CPU: 36.8%
Epoch 30 | Loss: 0.0219 | Acc: 1.0000 | F1: 1.0000 | LR: 0.000125 | Time: 1.93s | CPU: 39.7%


CreditcCard Fraud Detection

In [7]:
# Load Credit Card Fraud Detection dataset
CC_data = pd.read_csv('../data/Credit Card Fraud Detection/creditcard.csv')

# Inspect the data (e.g., check for missing values, imbalanced classes)
print(CC_data.info())
print(CC_data.describe())

# Drop the 'Time' column and separate features and target variable
X_CC = CC_data.drop(columns=['Class', 'Time'])
y_CC = CC_data['Class']  # 'Class' is the target variable (fraud = 1, non-fraud = 0)

# Standardize the features
scaler_CC = StandardScaler()
X_CC_scaled = scaler_CC.fit_transform(X_CC)

# Convert to tensors
X_CC_tensor = torch.tensor(X_CC_scaled, dtype=torch.float32)
y_CC_tensor = torch.tensor(y_CC.values, dtype=torch.long)

# Create a DataLoader
BATCH_SIZE = 256
train_loader_CC = DataLoader(TensorDataset(X_CC_tensor, y_CC_tensor), batch_size=BATCH_SIZE, shuffle=True)

# Define the model (same as the NodeModel used for the wine dataset)
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.model(x)

# Train the model on the Credit Card Fraud dataset
def train_model(X_tensor, y_tensor, loader, label='Credit Card Fraud Detection'):
    input_dim = X_tensor.shape[1]
    model = NodeModel(input_dim=input_dim, output_dim=2)  # Binary classification (fraud = 1, non-fraud = 0)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    EPOCHS = 30
    print(f"\n--- Training on {label} Dataset ---")
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        start_time = time.time()
        for batch_X, batch_y in loader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        scheduler.step()

        model.eval()
        with torch.no_grad():
            pred = model(X_tensor)
            pred_label = torch.argmax(pred, dim=1)
            acc = accuracy_score(y_tensor, pred_label)
            f1 = f1_score(y_tensor, pred_label, average='weighted')

        cpu = psutil.cpu_percent(interval=0.1)
        elapsed = time.time() - start_time

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1:2d} | Loss: {epoch_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {scheduler.get_last_lr()[0]:.6f} | Time: {elapsed:.2f}s | CPU: {cpu}%")

# Train the model on the Credit Card Fraud dataset
train_model(X_CC_tensor, y_CC_tensor, train_loader_CC, label='Credit Card Fraud Detection')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

seed

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import time
import psutil
import os
import random

# Set random seed for reproducibility
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # for multi-GPU

# read data
train_data = pd.read_csv('../data/adult/adult.data', header=None)
test_data = pd.read_csv('../data/adult/adult.test', header=None, skiprows=1)

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
train_data.columns = test_data.columns = columns

# cleaning data
def clean_data(df):
    df = df.replace('?', np.nan)
    df = df.dropna()
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df["income"] = df["income"].replace({">50K.": 1, "<=50K.": 0, ">50K": 1, "<=50K": 0})
    return df

train_data = clean_data(train_data)
test_data = clean_data(test_data)

# label encoding
X_train = train_data.drop(columns=["income"])
y_train = train_data["income"]
X_test = test_data.drop(columns=["income"])
y_test = test_data["income"]

# One-Hot Encoding 
X_combined = pd.get_dummies(pd.concat([X_train, X_test], axis=0))
X_train = X_combined.iloc[:len(X_train)].copy().astype(np.float32)
X_test = X_combined.iloc[len(X_train):].copy().astype(np.float32)

# standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# transform to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# create dataset and dataloader
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# define the model
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.model(x)

# Train the model for a given seed
def train_model_with_seed(seed):
    # Set random seed for reproducibility
    set_random_seed(seed)

    # Re-initialize model, optimizer, and scheduler
    model = NodeModel(input_dim=X_train_tensor.shape[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    start_time = time.time()
    cpu_usage_start = psutil.cpu_percent(interval=1)

    EPOCHS = 30
    for epoch in range(EPOCHS):
        model.train()
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
        scheduler.step()

        if (epoch + 1) % 5 == 0:
            model.eval()
            all_preds, all_labels = [], []
            with torch.no_grad():
                for batch_x, batch_y in test_loader:
                    output = model(batch_x)
                    pred_label = torch.argmax(output, dim=1)
                    all_preds.append(pred_label)
                    all_labels.append(batch_y)

            all_preds = torch.cat(all_preds)
            all_labels = torch.cat(all_labels)
            acc = accuracy_score(all_labels, all_preds)
            f1 = f1_score(all_labels, all_preds)
            print(f"Seed {seed} | Epoch {epoch+1} | Loss: {loss.item():.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

    # Evaluate the model
    end_time = time.time()
    cpu_usage_end = psutil.cpu_percent(interval=1)
    duration = end_time - start_time
    avg_cpu = (cpu_usage_start + cpu_usage_end) / 2

    print(f"\nSeed {seed} | train_time: {duration:.2f} seconds")
    print(f"Seed {seed} | avg_cpu: {avg_cpu:.2f}%")

# Run the training for different seeds
for seed_value in [999, 888, 777]:
    train_model_with_seed(seed_value)


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import time
import psutil
import os
import random

# Set random seed for reproducibility
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # for multi-GPU

# read data
train_data = pd.read_csv('../data/adult/adult.data', header=None)
test_data = pd.read_csv('../data/adult/adult.test', header=None, skiprows=1)

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
train_data.columns = test_data.columns = columns

# cleaning data
def clean_data(df):
    df = df.replace('?', np.nan)
    df = df.dropna()
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df["income"] = df["income"].replace({">50K.": 1, "<=50K.": 0, ">50K": 1, "<=50K": 0})
    return df

train_data = clean_data(train_data)
test_data = clean_data(test_data)

# label encoding
X_train = train_data.drop(columns=["income"])
y_train = train_data["income"]
X_test = test_data.drop(columns=["income"])
y_test = test_data["income"]

# One-Hot Encoding 
X_combined = pd.get_dummies(pd.concat([X_train, X_test], axis=0))
X_train = X_combined.iloc[:len(X_train)].copy().astype(np.float32)
X_test = X_combined.iloc[len(X_train):].copy().astype(np.float32)

# standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# transform to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# create dataset and dataloader
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# define the model
class NodeModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=2, dropout=0.3):
        super(NodeModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.model(x)

# Train the model for a given seed
def train_model_with_seed(seed):
    # Set random seed for reproducibility
    set_random_seed(seed)

    # Re-initialize model, optimizer, and scheduler
    model = NodeModel(input_dim=X_train_tensor.shape[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    start_time = time.time()
    cpu_usage_start = psutil.cpu_percent(interval=1)

    EPOCHS = 30
    for epoch in range(EPOCHS):
        model.train()
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
        scheduler.step()

        if (epoch + 1) % 5 == 0:
            model.eval()
            all_preds, all_labels = [], []
            with torch.no_grad():
                for batch_x, batch_y in test_loader:
                    output = model(batch_x)
                    pred_label = torch.argmax(output, dim=1)
                    all_preds.append(pred_label)
                    all_labels.append(batch_y)

            all_preds = torch.cat(all_preds)
            all_labels = torch.cat(all_labels)
            acc = accuracy_score(all_labels, all_preds)
            f1 = f1_score(all_labels, all_preds)
            print(f"Seed {seed} | Epoch {epoch+1} | Loss: {loss.item():.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

    # Evaluate the model
    end_time = time.time()
    cpu_usage_end = psutil.cpu_percent(interval=1)
    duration = end_time - start_time
    avg_cpu = (cpu_usage_start + cpu_usage_end) / 2

    print(f"\nSeed {seed} | train_time: {duration:.2f} seconds")
    print(f"Seed {seed} | avg_cpu: {avg_cpu:.2f}%")

# Run the training for different seeds
for seed_value in [999, 888, 777]:
    train_model_with_seed(seed_value)
