In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from load_data import *
from config import *
from cal_accuracy import *

In [2]:
path = r'./input/train_mfcc_features.xlsx'
data = pd.read_excel(path)

# 这一行记得删掉
# data = data[data['category'] != 80]

X = data[used_feature_columns].values
y = data['score'].values
X

array([[-3.45536407e+02,  1.53117401e+02, -3.95242233e+01, ...,
        -2.61893914e-02,  1.44600853e-01, -1.22696232e-01],
       [-3.12047394e+02,  1.72199463e+02, -4.61398582e+01, ...,
         1.45827789e-01, -3.15922471e-01, -1.31190235e-01],
       [-4.32839844e+02,  2.02102585e+02, -4.00666580e+01, ...,
         3.76228370e-01,  8.28478671e-01,  2.99108658e-01],
       ...,
       [-4.00168701e+02,  2.18579895e+02, -2.46945438e+01, ...,
         2.60418874e-02, -6.55715518e-02,  1.60245924e-01],
       [-4.16314362e+02,  2.12160629e+02, -2.31564560e+01, ...,
         2.36106313e-01,  1.74130710e-01,  1.19963453e-01],
       [-4.14621216e+02,  2.15601944e+02, -1.86578865e+01, ...,
         1.88304041e-01, -5.55893974e-02,  1.49449814e-01]])

In [3]:
# 将数据集划分为训练集、验证集和测试集
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42)

In [4]:
# 2. 定义模型并进行交叉验证
params = {
    "objective": "reg:squarederror",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    "subsample": 0.8,
    # "colsample_bytree": 0.8,
    "colsample_bytree": 0.3,
    "seed": 42,
}
num_rounds = 100

In [5]:
# 使用交叉验证进行模型选择
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5折交叉验证
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val)):
    print(f"Training fold {fold}...")
    X_fold_train, y_fold_train = X_train_val[train_idx], y_train_val[train_idx]
    X_fold_val, y_fold_val = X_train_val[val_idx], y_train_val[val_idx]

    dtrain = xgb.DMatrix(X_fold_train, label=y_fold_train)
    dval = xgb.DMatrix(X_fold_val, label=y_fold_val)

    bst = xgb.train(params, dtrain, num_rounds, evals=[(dval, "val")], verbose_eval=10)
    # print(f"Fold {fold} best score: {bst.best_score}\n")

# 3. 在全量数据上重新训练模型，并在测试集上评估模型表现
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

watchlist = [(dtrain, "train"), (dval, "val")]
bst = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10, verbose_eval=10)

# 在测试集上评估模型表现
y_pred = bst.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
print(f"Test set MSE: {mse:.4f}")

Training fold 0...
[0]	val-rmse:4.28542
[10]	val-rmse:1.73877
[20]	val-rmse:1.01883
[30]	val-rmse:0.86047
[40]	val-rmse:0.83388
[50]	val-rmse:0.82781
[60]	val-rmse:0.82600
[70]	val-rmse:0.82686
[80]	val-rmse:0.82529
[90]	val-rmse:0.82322
[99]	val-rmse:0.82067
Training fold 1...
[0]	val-rmse:4.24544
[10]	val-rmse:1.74116
[20]	val-rmse:0.97714
[30]	val-rmse:0.78369
[40]	val-rmse:0.74098
[50]	val-rmse:0.72644
[60]	val-rmse:0.72610
[70]	val-rmse:0.72294
[80]	val-rmse:0.71954
[90]	val-rmse:0.71818
[99]	val-rmse:0.71790
Training fold 2...
[0]	val-rmse:4.26036
[10]	val-rmse:1.71157
[20]	val-rmse:0.95130
[30]	val-rmse:0.77950
[40]	val-rmse:0.73736
[50]	val-rmse:0.72993
[60]	val-rmse:0.73004
[70]	val-rmse:0.72866
[80]	val-rmse:0.72860
[90]	val-rmse:0.73024
[99]	val-rmse:0.72993
Training fold 3...
[0]	val-rmse:4.33593
[10]	val-rmse:1.75961
[20]	val-rmse:0.99700
[30]	val-rmse:0.81242
[40]	val-rmse:0.77477
[50]	val-rmse:0.75819
[60]	val-rmse:0.75091
[70]	val-rmse:0.74220
[80]	val-rmse:0.73793
[90]

In [6]:
# 保存模型
bst.save_model("xgboost_model.model")
# bst.save_model("xgboost_model.model")

In [7]:
a = pd.DataFrame()
a['true'] = y_test
a['predict'] = y_pred
a['difference'] = abs(y_pred - y_test)
a

Unnamed: 0,true,predict,difference
0,3.8,3.802764,0.002764
1,5.8,6.107949,0.307949
2,4.3,4.360485,0.060485
3,4.8,5.327379,0.527379
4,7.0,6.147086,0.852914
...,...,...,...
134,4.7,5.036708,0.336708
135,7.3,5.626446,1.673554
136,5.0,5.067968,0.067968
137,7.0,6.207727,0.792273


In [8]:
# predict_class_list = []
# true_class_list = []
# for i in range(len(a)):
#     predict_class, true_class = classify(a.predict[i], a.true[i])
#     predict_class_list.append(predict_class)
#     true_class_list.append(true_class)
#
# acc = accuracy(predict_class_list, true_class_list)
# acc

In [9]:
from sklearn.metrics import r2_score

r2 = r2_score(a['true'], a['predict'])
print('r2:', r2)

r2: 0.745841385390802
