In [18]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from load_data import *
from config import *
from cal_accuracy import *

In [8]:
path = r'./input/train_features_100_columns.xlsx'
data = pd.read_excel(path)

# 这一行记得删掉
data = data[data['category'] != 80]

X = data[used_feature_columns].values
y = data['score'].values
X

array([[-2.52127609e+02,  8.61285858e+01,  1.04619980e+01, ...,
        -8.59494486e-02, -1.58480361e-01,  2.76203319e-01],
       [-2.79456085e+02,  1.08127800e+02, -1.38982105e+01, ...,
        -6.24734813e-02, -5.12937559e-01,  8.82252009e-02],
       [-3.05366821e+02,  1.48540115e+02, -3.92370453e+01, ...,
         1.26810337e-01, -4.24679665e-01, -1.44282138e-01],
       ...,
       [-4.16110138e+02,  1.76578857e+02, -5.46953316e+01, ...,
        -8.75555227e-02,  3.62524560e-02,  1.87776582e-01],
       [-3.92867310e+02,  1.72415955e+02, -4.48302155e+01, ...,
        -1.61726862e-01,  2.60044699e-01,  3.11749954e-01],
       [-3.96851105e+02,  1.76097061e+02, -3.97865868e+01, ...,
         1.69581815e-03,  9.68816644e-02, -4.63630866e-02]])

In [9]:
# 将数据集划分为训练集、验证集和测试集
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42)

In [10]:
# 2. 定义模型并进行交叉验证
params = {
    "objective": "reg:squarederror",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}
num_rounds = 100

In [11]:
# 使用交叉验证进行模型选择
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5折交叉验证
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val)):
    print(f"Training fold {fold}...")
    X_fold_train, y_fold_train = X_train_val[train_idx], y_train_val[train_idx]
    X_fold_val, y_fold_val = X_train_val[val_idx], y_train_val[val_idx]

    dtrain = xgb.DMatrix(X_fold_train, label=y_fold_train)
    dval = xgb.DMatrix(X_fold_val, label=y_fold_val)

    bst = xgb.train(params, dtrain, num_rounds, evals=[(dval, "val")], verbose_eval=10)
    # print(f"Fold {fold} best score: {bst.best_score}\n")

# 3. 在全量数据上重新训练模型，并在测试集上评估模型表现
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

watchlist = [(dtrain, "train"), (dval, "val")]
bst = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10, verbose_eval=10)

# 在测试集上评估模型表现
y_pred = bst.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
print(f"Test set MSE: {mse:.4f}")

Training fold 0...
[0]	val-rmse:4.47665
[10]	val-rmse:1.76476
[20]	val-rmse:0.90167
[30]	val-rmse:0.65854
[40]	val-rmse:0.60689
[50]	val-rmse:0.59181
[60]	val-rmse:0.58593
[70]	val-rmse:0.58183
[80]	val-rmse:0.57926
[90]	val-rmse:0.57879
[99]	val-rmse:0.57944
Training fold 1...
[0]	val-rmse:4.34722
[10]	val-rmse:1.62225
[20]	val-rmse:0.76968
[30]	val-rmse:0.56556
[40]	val-rmse:0.53033
[50]	val-rmse:0.52605
[60]	val-rmse:0.52346
[70]	val-rmse:0.52229
[80]	val-rmse:0.52174
[90]	val-rmse:0.52004
[99]	val-rmse:0.51951
Training fold 2...
[0]	val-rmse:4.45845
[10]	val-rmse:1.78589
[20]	val-rmse:0.93723
[30]	val-rmse:0.71047
[40]	val-rmse:0.65558
[50]	val-rmse:0.64743
[60]	val-rmse:0.64239
[70]	val-rmse:0.64186
[80]	val-rmse:0.64234
[90]	val-rmse:0.64238
[99]	val-rmse:0.64065
Training fold 3...
[0]	val-rmse:4.42467
[10]	val-rmse:1.68188
[20]	val-rmse:0.83505
[30]	val-rmse:0.62743
[40]	val-rmse:0.58412
[50]	val-rmse:0.57288
[60]	val-rmse:0.57169
[70]	val-rmse:0.56690
[80]	val-rmse:0.56697
[90]

In [12]:
# 保存模型
bst.save_model("xgboost_model_test.model")
# bst.save_model("xgboost_model.model")

In [26]:
a = pd.DataFrame()
a['true'] = y_test
a['predict'] = y_pred
a['difference'] = abs(y_pred - y_test)
a

Unnamed: 0,true,predict,difference
0,4.2,4.360538,0.160538
1,7.2,5.849472,1.350528
2,4.6,4.265465,0.334535
3,5.4,4.824483,0.575517
4,5.2,4.769087,0.430913
...,...,...,...
102,4.7,4.341842,0.358158
103,4.7,4.552692,0.147308
104,4.1,4.934181,0.834181
105,4.3,4.459157,0.159157


In [19]:
predict_class_list = []
true_class_list = []
for i in range(len(a)):
    predict_class, true_class = classify(a.predict[i], a.true[i])
    predict_class_list.append(predict_class)
    true_class_list.append(true_class)

acc = accuracy(predict_class_list, true_class_list)
acc

0.9532710280373832

In [27]:
from sklearn.metrics import r2_score
r2 = r2_score(a['true'], a['predict'])
print('r2:', r2)

r2: 0.8286441202481112
