In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from load_data import *
from config import *
from cal_accuracy import *

In [3]:
# path = r'./input/train_features_100_columns.xlsx'
path = r'./input/train_mfcc_features.xlsx'
data = pd.read_excel(path)

# 这一行记得删掉
# data = data[data['category'] != 80]

X = data[used_feature_columns].values
y = data['score'].values
X

array([[-2.71991577e+02,  8.61285858e+01,  1.04619980e+01, ...,
        -8.59493083e-02, -1.58480387e-01,  2.76203179e-01],
       [-2.37500397e+02,  1.08127800e+02, -1.38982105e+01, ...,
        -6.24737520e-02, -5.12937490e-01,  8.82254842e-02],
       [-3.57435699e+02,  1.48540115e+02, -3.92370453e+01, ...,
         1.26810181e-01, -4.24679764e-01, -1.44282221e-01],
       ...,
       [-3.33466522e+02,  1.76578857e+02, -5.46953354e+01, ...,
        -8.75556170e-02,  3.62523379e-02,  1.87776623e-01],
       [-3.42536560e+02,  1.72415955e+02, -4.48302116e+01, ...,
        -1.61726880e-01,  2.60044641e-01,  3.11749669e-01],
       [-3.50182678e+02,  1.76097061e+02, -3.97865868e+01, ...,
         1.69565890e-03,  9.68818259e-02, -4.63632737e-02]])

In [4]:
# 将数据集划分为训练集、验证集和测试集
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42)

In [5]:
# 2. 定义模型并进行交叉验证
params = {
    "objective": "reg:squarederror",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}
num_rounds = 100

In [6]:
# 使用交叉验证进行模型选择
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5折交叉验证
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val)):
    print(f"Training fold {fold}...")
    X_fold_train, y_fold_train = X_train_val[train_idx], y_train_val[train_idx]
    X_fold_val, y_fold_val = X_train_val[val_idx], y_train_val[val_idx]

    dtrain = xgb.DMatrix(X_fold_train, label=y_fold_train)
    dval = xgb.DMatrix(X_fold_val, label=y_fold_val)

    bst = xgb.train(params, dtrain, num_rounds, evals=[(dval, "val")], verbose_eval=10)
    # print(f"Fold {fold} best score: {bst.best_score}\n")

# 3. 在全量数据上重新训练模型，并在测试集上评估模型表现
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

watchlist = [(dtrain, "train"), (dval, "val")]
bst = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10, verbose_eval=10)

# 在测试集上评估模型表现
y_pred = bst.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
print(f"Test set MSE: {mse:.4f}")

Training fold 0...
[0]	val-rmse:4.28218
[10]	val-rmse:1.71807
[20]	val-rmse:0.92561
[30]	val-rmse:0.75091
[40]	val-rmse:0.71707
[50]	val-rmse:0.70460
[60]	val-rmse:0.70079
[70]	val-rmse:0.69941
[80]	val-rmse:0.69576
[90]	val-rmse:0.69601
[99]	val-rmse:0.69297
Training fold 1...
[0]	val-rmse:4.24342
[10]	val-rmse:1.70257
[20]	val-rmse:0.89853
[30]	val-rmse:0.67666
[40]	val-rmse:0.62805
[50]	val-rmse:0.61387
[60]	val-rmse:0.60907
[70]	val-rmse:0.60831
[80]	val-rmse:0.60779
[90]	val-rmse:0.60784
[99]	val-rmse:0.60688
Training fold 2...
[0]	val-rmse:4.25717
[10]	val-rmse:1.69740
[20]	val-rmse:0.93104
[30]	val-rmse:0.75824
[40]	val-rmse:0.72060
[50]	val-rmse:0.70921
[60]	val-rmse:0.70260
[70]	val-rmse:0.69812
[80]	val-rmse:0.69768
[90]	val-rmse:0.69980
[99]	val-rmse:0.70020
Training fold 3...
[0]	val-rmse:4.34271
[10]	val-rmse:1.74105
[20]	val-rmse:0.92668
[30]	val-rmse:0.71792
[40]	val-rmse:0.67429
[50]	val-rmse:0.65511
[60]	val-rmse:0.65136
[70]	val-rmse:0.64850
[80]	val-rmse:0.64598
[90]

In [7]:
# 保存模型
bst.save_model("xgboost_model.model")
# bst.save_model("xgboost_model.model")

In [8]:
a = pd.DataFrame()
a['true'] = y_test
a['predict'] = y_pred
a['difference'] = abs(y_pred - y_test)
a

Unnamed: 0,true,predict,difference
0,3.8,3.917978,0.117978
1,5.8,6.150219,0.350219
2,4.3,4.383216,0.083216
3,4.8,5.374737,0.574737
4,7.0,6.212053,0.787947
...,...,...,...
134,4.7,4.918137,0.218137
135,7.3,5.626795,1.673205
136,5.0,5.326736,0.326736
137,7.0,6.802062,0.197938


In [9]:
# predict_class_list = []
# true_class_list = []
# for i in range(len(a)):
#     predict_class, true_class = classify(a.predict[i], a.true[i])
#     predict_class_list.append(predict_class)
#     true_class_list.append(true_class)
#
# acc = accuracy(predict_class_list, true_class_list)
# acc

0.9424460431654677

In [10]:
from sklearn.metrics import r2_score
r2 = r2_score(a['true'], a['predict'])
print('r2:', r2)

r2: 0.814757195762998
