In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Read the filled student-behaviour workbook and drop the 'xh' column in one
# chained step (NOTE(review): 'xh' is presumably the student ID - confirm).
file_path = 'D:/EDM/数据集相关/22级学生行为数据集/DataSet_V5_new_filled.xlsx'
df = pd.read_excel(file_path).drop(columns=['xh'])

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance-threshold feature selection.
# Only predictor columns are passed through the selector. The target column
# 'label' is set aside first so that it can never be filtered out and keeps
# its original dtype instead of being cast to float by fit_transform().
feature_frame = df.drop(columns=['label'])
selector = VarianceThreshold(threshold=0.01)
selected_values = selector.fit_transform(feature_frame)

# Report how many predictors survived the filter.
print("原始特征数：", feature_frame.shape[1])
print("过滤后的特征数：", selected_values.shape[1])
selected_features = feature_frame.columns[selector.get_support()]
print("选择的特征：", selected_features)

# Rebuild a DataFrame with the surviving column names, keep the original row
# index, and re-attach the untouched target column.
df_new = pd.DataFrame(selected_values, columns=selected_features, index=df.index)
df_new['label'] = df['label']

In [3]:
# Separate predictors from the target. X stays an unscaled DataFrame; only
# the three split matrices below are scaled.
X = df_new.drop(columns=['label'])
y = df_new['label']

# First split: 70% training, 30% hold-out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=42
)

# Second split: the hold-out becomes the test set (20% of all rows) and the
# validation set (10% of all rows). One third of the 30% hold-out is 10%
# overall; the first returned part is the larger one, i.e. the test set.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=1/3, stratify=y_temp, random_state=42
)

# Fit both scalers on the training rows only, then apply the fitted
# transforms to validation and test rows. Fitting on the full dataset would
# leak validation/test statistics into training.
scaler = StandardScaler()
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(scaler.fit_transform(X_train))
X_val = min_max_scaler.transform(scaler.transform(X_val))
X_test = min_max_scaler.transform(scaler.transform(X_test))

数据生成：使用 SMOTE 系列方法对训练集中的少数类进行过采样

In [5]:
import re
import warnings

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, RandomOverSampler
# TabNetClassifier is used by the training and optimisation cells below, so
# it has to be imported for the notebook to run on a fresh kernel.
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Suppress two specific warnings by message text. `message` is interpreted as
# a regular expression, so the text is escaped: unescaped, "step()" would be
# read as "step" followed by an empty group and never match the real message.
warnings.filterwarnings("ignore", message=re.escape("Best weights from best epoch are automatically used!"))
warnings.filterwarnings("ignore", message=re.escape("Detected call of `lr_scheduler.step()` before `optimizer.step()`"))

# Number of samples each listed class should have after oversampling.
sampling_strategy = {1:300, 2:300, 3:300,4:300}

# Three interchangeable oversamplers with identical settings. A fixed
# random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)
bsmote = BorderlineSMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)
svmsmote = SVMSMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)

# Borderline-SMOTE is the variant actually used; swap in `smote` or
# `svmsmote` here to compare. Only the training split is resampled.
X_smote, y_smote = bsmote.fit_resample(X_train, y_train)
print("数据生成完毕")

数据生成完毕


定义FocalLoss

In [None]:
from sklearn.utils import compute_class_weight
from torch import nn
import torch

# Balanced class weights computed over every label in df_new. The weights
# follow the order of `classes`, i.e. ascending class value.
labels = df_new['label'].values
classes = torch.tensor(labels).unique()
class_weights = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=classes.numpy(), y=labels),
    dtype=torch.float,
)
# Focal loss used as the training criterion for the TabNet classifier.
class FocalLoss(nn.Module):
    """Multi-class focal loss: ``alpha_t * (1 - p_t) ** gamma * CE``.

    The focal factor is computed per sample from the unreduced cross-entropy
    and the requested reduction is applied last, so the result is a scalar
    for ``'mean'`` / ``'sum'`` even when ``alpha`` holds one weight per class.

    Args:
        alpha: ``None`` for no weighting, a scalar weight, or a 1-D sequence /
            tensor with one weight per class, indexed by class id.
        gamma: Focusing exponent; ``0`` reduces the loss to cross-entropy.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``.
    """

    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        """Return the focal loss for raw logits and integer class targets.

        Args:
            input: Logits of shape ``(batch, n_classes)``.
            target: Class indices of shape ``(batch,)``.

        Raises:
            ValueError: If ``reduction`` is not ``'mean'``, ``'sum'`` or ``'none'``.
        """
        # Per-sample cross-entropy; exp(-CE) is the probability of the true class.
        ce_loss = nn.functional.cross_entropy(input, target, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            alpha = torch.as_tensor(self.alpha, dtype=input.dtype, device=input.device)
            if alpha.dim() > 0:
                # Per-class weights: pick the weight of each sample's true class.
                alpha = alpha[target]
            focal_loss = alpha * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        if self.reduction == 'none':
            return focal_loss
        raise ValueError(f"{self.reduction} is not a valid value for reduction")

采用过采样后的新数据训练 TabNet 模型

In [None]:
from sklearn.utils import compute_class_weight
from torch import nn
import torch



# Class-weighted focal loss used as the training objective.
focal_criterion = FocalLoss(gamma=2.0, alpha=class_weights, reduction='mean')

tabnet_model = TabNetClassifier()
tabnet_model.fit(
    X_train=X_smote,
    y_train=y_smote,
    # Names now match their data. The last eval set is the one TabNet
    # monitors for early stopping / best-epoch selection, so it must be the
    # validation split, not the test split that is scored below.
    eval_set=[(X_smote, y_smote), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    # The loss has to be passed to fit(): fit() resets self.loss_fn to the
    # default cross-entropy whenever loss_fn is None, so assigning
    # tabnet_model.loss_fn beforehand has no effect. `.mean()` is a no-op for
    # a scalar loss and guarantees the scalar that backward() requires.
    loss_fn=lambda y_pred, y_true: focal_criterion(y_pred, y_true).mean(),
    max_epochs=100
)

# Predict on the held-out test split, which was not seen during fitting.
test_preds = tabnet_model.predict(X_test)

# Accuracy; avg_accuracy equals acc for this single run.
acc = accuracy_score(y_test, test_preds)
avg_accuracy = acc

# Confusion matrix is kept in `result` for inspection; only the report is printed.
result = confusion_matrix(y_test, test_preds)
result1 = classification_report(y_test, test_preds)
print("Classification Report:", )
print(result1)
print(f"Test accuracy score: {acc}")

加入群优化算法：使用粒子群优化（PSO）搜索 TabNet 超参数

In [None]:
from sklearn import datasets, metrics
from mealpy import Problem, FloatVar, IntegerVar

# Bundle every split the optimisation problem needs: the oversampled
# training data plus the untouched test and validation splits.
data = dict(
    X_train=X_smote,
    X_test=X_test,
    y_train=y_smote,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
)

# Optimisation problem: tune TabNet hyper-parameters.
class TabNetOptimizedProblem(Problem):
    """mealpy problem whose fitness is the accuracy of a freshly trained TabNet.

    Args:
        bounds: Search-space variables; their names must include ``n_d``,
            ``n_a``, ``n_steps``, ``n_independent``, ``n_shared``, ``gamma``
            and ``momentum``.
        minmax: ``"max"`` to maximise the returned accuracy.
        data: Dict with ``X_train`` / ``y_train`` and, for scoring,
            ``X_val`` / ``y_val`` (preferred) or ``X_test`` / ``y_test``.
    """

    def __init__(self, bounds=None, minmax="max", data=None, **kwargs):
        # Stored before the base constructor runs, as in the original code.
        # NOTE(review): presumably the base __init__ may already call
        # obj_func, which reads self.data - confirm against mealpy.
        self.data = data
        super().__init__(bounds, minmax, **kwargs)

    def obj_func(self, x):
        """Train TabNet with the decoded hyper-parameters and return its accuracy."""
        params = self.decode_solution(x)
        tabnet_model = TabNetClassifier(
            n_d=params["n_d"],
            n_a=params["n_a"],
            n_steps=params["n_steps"],
            gamma=params["gamma"],
            n_independent=params["n_independent"],
            n_shared=params["n_shared"],
            momentum=params["momentum"],
            verbose=0,
        )
        focal_criterion = FocalLoss(gamma=2.0, alpha=class_weights, reduction='mean')

        # The loss has to be passed to fit(): fit() resets self.loss_fn to the
        # default cross-entropy whenever loss_fn is None, so assigning the
        # attribute beforehand has no effect. `.mean()` is a no-op for a
        # scalar loss and guarantees the scalar that backward() requires.
        tabnet_model.fit(
            self.data["X_train"],
            self.data["y_train"],
            loss_fn=lambda y_pred, y_true: focal_criterion(y_pred, y_true).mean(),
        )

        # Score candidates on the validation split when it is supplied, so
        # the test split stays untouched by hyper-parameter selection. Fall
        # back to the test split only if no validation data was given.
        if "X_val" in self.data and "y_val" in self.data:
            X_eval, y_eval = self.data["X_val"], self.data["y_val"]
        else:
            X_eval, y_eval = self.data["X_test"], self.data["y_test"]

        y_predict = tabnet_model.predict(X_eval)
        return metrics.accuracy_score(y_eval, y_predict)


# Search space for the optimiser: one variable per tuned TabNet argument.
# The names are the keys read back from the decoded solution.
my_bounds = [
    # continuous hyper-parameters
    FloatVar(name="gamma", lb=1.0, ub=2.0),
    FloatVar(name="momentum", lb=0.01, ub=0.4),
    # layer widths
    IntegerVar(name="n_d", lb=8, ub=64),
    IntegerVar(name="n_a", lb=8, ub=64),
    # step and layer counts
    IntegerVar(name="n_steps", lb=3, ub=10),
    IntegerVar(name="n_independent", lb=1, ub=5),
    IntegerVar(name="n_shared", lb=1, ub=5),
]



In [None]:
# PSO is a swarm-based optimiser: it lives in mealpy.swarm_based, not in
# mealpy.system_based, so the original import raised ImportError.
from mealpy.swarm_based import PSO
import warnings

# Every candidate trains a model, so silence all UserWarnings for this run.
warnings.filterwarnings("ignore", category=UserWarning)

# Maximise the problem's accuracy over the hyper-parameter search space.
problem = TabNetOptimizedProblem(bounds=my_bounds, minmax="max", data=data)
model = PSO.OriginalPSO(epoch=10, pop_size=20,verbose=True)
model.solve(problem)

# Report the best agent, its raw solution vector, its fitness and the
# decoded hyper-parameters.
print(f"Best agent: {model.g_best}")
print(f"Best solution: {model.g_best.solution}")
print(f"Best accuracy: {model.g_best.target.fitness}")
print(f"Best parameters: {model.problem.decode_solution(model.g_best.solution)}")