### 导入必要的包

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # Explicitly import sklearn's LR
from scipy.io import loadmat # To load .mat files
import os # For file path handling


### 导入数据集

In [2]:
LogisticRegressionModel = LogisticRegression
DATA_DIR = './data'
SFFSD_PATH = os.path.join(DATA_DIR, 'S-FFSD.csv')
SFFSD_TIME_COLUMN = 'Time'
SFFSD_LABEL_COLUMN = 'Labels'
SFFSD_CATEGORICAL_COLUMNS = ['Source', 'Target', 'Location', 'Type']
SFFSD_NUMERIC_COLUMNS = ['Time', 'Amount']

AMAZON_PATH = os.path.join(DATA_DIR, 'Amazon.mat')
YELPCHI_PATH = os.path.join(DATA_DIR, 'YelpChi.mat')

### 设置训练S-FFSD数据集的超参数
##### S-FFSD为半监督学习，训练时需要设置超参数。

In [3]:
CONFIDENCE_THRESHOLD = 0.9
MAX_ITERATIONS = 10
MIN_SAMPLES_PER_ITERATION = 100

### 对S-FFSD数据集进行预处理，以便进行半监督学习。

In [4]:
def preprocess_sffsd():
    print(f"\n--- Preprocessing S-FFSD data from: {SFFSD_PATH} ---")
    data = pd.read_csv(SFFSD_PATH)

    labelled_data = data[data[SFFSD_LABEL_COLUMN].isin([0, 1])].copy()
    unlabelled_data = data[data[SFFSD_LABEL_COLUMN] == 2].copy()

    print(f"S-FFSD total: {len(data)}, labelled: {len(labelled_data)}, unlabelled: {len(unlabelled_data)}")

    data.sort_values(SFFSD_TIME_COLUMN, inplace=True)
    labelled_data.sort_values(SFFSD_TIME_COLUMN, inplace=True)
    unlabelled_data.sort_values(SFFSD_TIME_COLUMN, inplace=True)

    all_data_for_preprocessing = pd.concat([labelled_data.drop(columns=[SFFSD_LABEL_COLUMN]),
                                            unlabelled_data.drop(columns=[SFFSD_LABEL_COLUMN])], ignore_index=True)

    high_card_cols = ['Source', 'Target']
    for col in high_card_cols:
        all_categories = pd.concat([labelled_data[col], unlabelled_data[col]]).unique()
        le = LabelEncoder()
        le.fit(all_categories)
        all_data_for_preprocessing[col] = le.transform(all_data_for_preprocessing[col])

    low_card_cols = ['Location', 'Type']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_features = encoder.fit_transform(all_data_for_preprocessing[low_card_cols])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(low_card_cols), index=all_data_for_preprocessing.index)

    processed_data_temp = all_data_for_preprocessing.drop(columns=SFFSD_CATEGORICAL_COLUMNS)
    processed_data_full = pd.concat([processed_data_temp, encoded_df], axis=1)

    scaler = StandardScaler()
    processed_data_full[SFFSD_NUMERIC_COLUMNS] = scaler.fit_transform(processed_data_full[SFFSD_NUMERIC_COLUMNS])

    processed_labelled_X = processed_data_full.iloc[:len(labelled_data)]
    processed_unlabelled_X = processed_data_full.iloc[len(labelled_data):]
    y_labelled = labelled_data[SFFSD_LABEL_COLUMN]

    print(f"Processed S-FFSD labelled X shape: {processed_labelled_X.shape}, y shape: {y_labelled.shape}")
    print(f"Processed S-FFSD unlabelled X shape: {processed_unlabelled_X.shape}")

    return processed_labelled_X, y_labelled, processed_unlabelled_X


### 加载和预处理 Amazon.mat与YelpChi.mat
#### 这些文件被假设为完全监督，并具有 'features' 和 'label' 键。将特征转换为密集的 NumPy 数组并进行标准化。

In [5]:
def preprocess_mat_data(file_path):
    print(f"\n--- Preprocessing .mat data from: {file_path} ---")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    mat_data = loadmat(file_path)
    features_sparse = mat_data['features']
    labels_raw = mat_data['label'].flatten()

    if hasattr(features_sparse, 'todense'):
        features = features_sparse.todense().A
    else:
        features = features_sparse

    labels = labels_raw.astype(np.int32)

    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    features_df = pd.DataFrame(features_scaled)

    print(f"Processed data shape: Features={features_df.shape}, Labels={labels.shape}")
    return features_df, pd.Series(labels)


### 实现基于逻辑回归的半监督学习中的自训练算法（Self-Training），并评估其性能

In [6]:
def train_and_evaluate_semisupervised_logistic_regression(
    X_labelled, y_labelled, X_unlabelled,
    confidence_threshold=CONFIDENCE_THRESHOLD, max_iterations=MAX_ITERATIONS,
    min_samples_per_iteration=MIN_SAMPLES_PER_ITERATION):
    print("\n=== Running Semi-Supervised Logistic Regression (S-FFSD) ===")
    X_train_initial, X_test_eval, y_train_initial, y_test_eval = train_test_split(
        X_labelled, y_labelled, test_size=0.4, random_state=42, stratify=y_labelled
    )
    print(f"Initial labelled train set: {X_train_initial.shape}, test set: {X_test_eval.shape}")

    X_train_self_training = X_train_initial.copy()
    y_train_self_training = y_train_initial.copy()
    current_unlabelled_X = X_unlabelled.copy()

    for iteration in range(max_iterations):
        print(f"\n--- Self-training iteration {iteration + 1}/{max_iterations} ---")
        if len(current_unlabelled_X) == 0:
            print("All unlabelled data processed. Stopping self-training.")
            break

        model = LogisticRegressionModel(max_iter=2000, solver='liblinear', random_state=42, n_jobs=-1)
        print(f"Current training set size: {len(X_train_self_training)}")
        model.fit(X_train_self_training, y_train_self_training)

        if len(current_unlabelled_X) > 0:
            unlabelled_probs = model.predict_proba(current_unlabelled_X)

            confident_indices = (unlabelled_probs[:, 0] > confidence_threshold) | \
                                (unlabelled_probs[:, 1] > confidence_threshold)

            confident_X = current_unlabelled_X[confident_indices]
            confident_pseudo_labels = (unlabelled_probs[confident_indices, 1] > 0.5).astype(int)

            print(f"Iteration {iteration + 1}: Found {len(confident_X)} confident pseudo-labels.")

            if len(confident_X) < min_samples_per_iteration and iteration < max_iterations - 1:
                print(f"Too few confident pseudo-labels ({len(confident_X)} < {min_samples_per_iteration}), skipping this iteration.")
                continue

            if len(confident_X) > 0:
                X_train_self_training = pd.concat([X_train_self_training, confident_X], ignore_index=True)
                y_train_self_training = pd.concat([y_train_self_training, pd.Series(confident_pseudo_labels)], ignore_index=True)
                current_unlabelled_X = current_unlabelled_X[~confident_indices]
                print(f"New training set size: {len(X_train_self_training)}")
                print(f"Remaining unlabelled pool size: {len(current_unlabelled_X)}")
            else:
                print("No new confident pseudo-labels found this iteration. Stopping self-training.")
                break
        else:
            print("Unlabelled data pool is empty. Stopping self-training.")
            break

    print("\n--- Final model training on complete semi-supervised set ---")
    final_model = LogisticRegressionModel(max_iter=2000, solver='liblinear', random_state=42, n_jobs=-1)
    final_model.fit(X_train_self_training, y_train_self_training)
    print("Final model training complete.")

    print("\n--- Evaluating S-FFSD model on original test set ---")
    y_pred_eval = final_model.predict(X_test_eval)
    y_prob_eval = final_model.predict_proba(X_test_eval)[:, 1]

    cm = confusion_matrix(y_test_eval, y_pred_eval)
    accuracy = accuracy_score(y_test_eval, y_pred_eval)
    
    f1 = f1_score(y_test_eval, y_pred_eval)
    
    avg_precision = average_precision_score(y_test_eval, y_prob_eval)

    print("\nS-FFSD Evaluation Metrics:")
    print("Confusion Matrix:\n", cm)
    print(f'Accuracy: {accuracy:.4f}')
   
    print(f'F1 Score: {f1:.4f}')
   
    print(f'Average Precision: {avg_precision:.4f}')

    return {'accuracy': accuracy,  'f1': f1, 'avg_precision': avg_precision}


### 执行标准的有监督逻辑回归模型的训练和评估过程

In [7]:
def train_and_evaluate_supervised_logistic_regression(X, y, dataset_name):
    print(f"\n=== Running Supervised Logistic Regression ({dataset_name}) ===")
    print(f"Data shape: X={X.shape}, y={y.shape}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

    model = LogisticRegressionModel(max_iter=2000, solver='liblinear', random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    print("Model training complete.")

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    avg_precision = average_precision_score(y_test, y_prob)

    print(f"\n{dataset_name} Evaluation Metrics:")
    print("Confusion Matrix:\n", cm)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'Average Precision: {avg_precision:.4f}')

    return {'accuracy': accuracy, 'f1': f1, 'avg_precision': avg_precision}


### S-FFSD训练结果

In [8]:
all_results = {}
sffsd_X_labelled, sffsd_y_labelled, sffsd_X_unlabelled = preprocess_sffsd()
sffsd_results = train_and_evaluate_semisupervised_logistic_regression(
    sffsd_X_labelled, sffsd_y_labelled, sffsd_X_unlabelled
)
all_results['S-FFSD_SemiSupervised_LR'] = sffsd_results


--- Preprocessing S-FFSD data from: ./data\S-FFSD.csv ---
S-FFSD total: 77881, labelled: 29643, unlabelled: 48238
Processed S-FFSD labelled X shape: (29643, 464), y shape: (29643,)
Processed S-FFSD unlabelled X shape: (48238, 464)

=== Running Semi-Supervised Logistic Regression (S-FFSD) ===
Initial labelled train set: (17785, 464), test set: (11858, 464)

--- Self-training iteration 1/10 ---
Current training set size: 17785
Iteration 1: Found 9691 confident pseudo-labels.




New training set size: 27476
Remaining unlabelled pool size: 38547

--- Self-training iteration 2/10 ---
Current training set size: 27476




Iteration 2: Found 7397 confident pseudo-labels.
New training set size: 34873
Remaining unlabelled pool size: 31150

--- Self-training iteration 3/10 ---
Current training set size: 34873




Iteration 3: Found 10848 confident pseudo-labels.
New training set size: 45721
Remaining unlabelled pool size: 20302

--- Self-training iteration 4/10 ---
Current training set size: 45721




Iteration 4: Found 11713 confident pseudo-labels.
New training set size: 57434
Remaining unlabelled pool size: 8589

--- Self-training iteration 5/10 ---
Current training set size: 57434




Iteration 5: Found 2836 confident pseudo-labels.
New training set size: 60270
Remaining unlabelled pool size: 5753

--- Self-training iteration 6/10 ---
Current training set size: 60270




Iteration 6: Found 1550 confident pseudo-labels.
New training set size: 61820
Remaining unlabelled pool size: 4203

--- Self-training iteration 7/10 ---
Current training set size: 61820




Iteration 7: Found 1586 confident pseudo-labels.
New training set size: 63406
Remaining unlabelled pool size: 2617

--- Self-training iteration 8/10 ---
Current training set size: 63406




Iteration 8: Found 262 confident pseudo-labels.
New training set size: 63668
Remaining unlabelled pool size: 2355

--- Self-training iteration 9/10 ---
Current training set size: 63668




Iteration 9: Found 32 confident pseudo-labels.
Too few confident pseudo-labels (32 < 100), skipping this iteration.

--- Self-training iteration 10/10 ---
Current training set size: 63668




Iteration 10: Found 32 confident pseudo-labels.
New training set size: 63700
Remaining unlabelled pool size: 2323

--- Final model training on complete semi-supervised set ---




Final model training complete.

--- Evaluating S-FFSD model on original test set ---

S-FFSD Evaluation Metrics:
Confusion Matrix:
 [[9637  118]
 [1196  907]]
Accuracy: 0.8892
F1 Score: 0.5799
Average Precision: 0.5441


### Amazon训练结果

In [9]:
amazon_features, amazon_labels = preprocess_mat_data(AMAZON_PATH)
amazon_results = train_and_evaluate_supervised_logistic_regression(
    amazon_features, amazon_labels, "Amazon"
)
all_results['Amazon_Supervised_LR'] = amazon_results




--- Preprocessing .mat data from: ./data\Amazon.mat ---
Processed data shape: Features=(11944, 25), Labels=(11944,)

=== Running Supervised Logistic Regression (Amazon) ===
Data shape: X=(11944, 25), y=(11944,)
Train set: (8360, 25), Test set: (3584, 25)
Model training complete.

Amazon Evaluation Metrics:
Confusion Matrix:
 [[3323   15]
 [  58  188]]
Accuracy: 0.9796
F1 Score: 0.8374
Average Precision: 0.8486




### YelpChi训练结果

In [10]:
yelpchi_features, yelpchi_labels = preprocess_mat_data(YELPCHI_PATH)
yelpchi_results = train_and_evaluate_supervised_logistic_regression(
    yelpchi_features, yelpchi_labels, "YelpChi"
)
all_results['YelpChi_Supervised_LR'] = yelpchi_results




--- Preprocessing .mat data from: ./data\YelpChi.mat ---
Processed data shape: Features=(45954, 32), Labels=(45954,)

=== Running Supervised Logistic Regression (YelpChi) ===
Data shape: X=(45954, 32), y=(45954,)
Train set: (32167, 32), Test set: (13787, 32)




Model training complete.

YelpChi Evaluation Metrics:
Confusion Matrix:
 [[11634   150]
 [ 1745   258]]
Accuracy: 0.8626
F1 Score: 0.2140
Average Precision: 0.4042


### 汇总

In [11]:
for dataset_name, metrics in all_results.items():
    print(f"\nDataset: {dataset_name}")
    for metric, value in metrics.items():
        print(f"  {metric.replace('_', ' ').title()}: {value:.4f}")


Dataset: S-FFSD_SemiSupervised_LR
  Accuracy: 0.8892
  F1: 0.5799
  Avg Precision: 0.5441

Dataset: Amazon_Supervised_LR
  Accuracy: 0.9796
  F1: 0.8374
  Avg Precision: 0.8486

Dataset: YelpChi_Supervised_LR
  Accuracy: 0.8626
  F1: 0.2140
  Avg Precision: 0.4042


In [13]:
from nbconvert import HTMLExporter
import nbformat
 
# 加载notebook文件
with open('LogisticRegression_liuxiyuan.ipynb') as f:
    nb = nbformat.read(f, as_version=4)
 
# 创建HTML导出器实例
html_exporter = HTMLExporter()
html, resources = html_exporter.from_notebook_node(nb)
 
# 写入HTML文件
with open('LogisticRegression_lxy.html', 'w') as f:
    f.write(html)