In [3]:

# 导入Pandas库（Pandas常用于数据清洗、特征工程、数据探索）
import pandas as pd

# 1、加载数据
data = pd.read_csv('../file/红酒品质分类.csv')

'''
    名称：红酒品质分类
    数据量：1599条
    字段：12个 存在空字段：无
    字段类型：浮点型、整型
    占用内存： 150.0 KB
'''
print(f'数据信息：{data.info()}')
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
数据信息：None


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [13]:
# 导入sklearn.model_selection数据集分割模块（model_selection提供了数据分割和交叉验证工具）
from sklearn.model_selection import train_test_split

def split_and_save_train_valid(d):
    '''
    数据集划分并保存
    :param d: 
    :return: 
    '''
    x = d.iloc[:, :-1]
    y = d.iloc[:, -1] - 3
    
    # 2、训练集测试集划分
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.2, random_state = 22, stratify = y)
    
    pd.concat([x_train, y_train], axis = 1).to_csv('../file/红酒品质分类-train.csv')
    pd.concat([x_valid, y_valid], axis = 1).to_csv('../file/红酒品质分类-valid.csv')

In [16]:
# 导入XBBoost包
import xgboost as xgb
# 导入sklearn.metrics中分类评估模型（其中包含分类指标、精确度、召回率、F1分出）
from sklearn.metrics import classification_report
# 导入joblib模块（序列化Python对象的工具库：适合于大数据集和模型的高效存储和加载）
import joblib

def model_training():
    '''
    模型训练并保存
    :return: 
    '''
    
    # 1、数据集加载并划分
    train_data = pd.read_csv('../file/红酒品质分类-train.csv')
    valid_data = pd.read_csv('../file/红酒品质分类-valid.csv')

    # 训练集
    x_train = train_data.iloc[:, :-1]
    y_train = train_data.iloc[:, -1]

    # 测试集
    x_valid = valid_data.iloc[:, :-1]
    y_valid = valid_data.iloc[:, -1]
    
    # 2、XGBoost模型训练
    '''
        n_estimators:构建树的数量
        objective:定义学习任务和学习目标（multi:softmax 表示使用softmax目标进行多分类）
        eval_metri:模型评估标准（merror 表示分类错误率）
        eta：学习率/步长
        use_label_encoder：是否需要对标签编码
        random_state:随机种子，确保每次运行代码结果的可重复性
    '''
    estimator = xgb.XGBClassifier(n_estimators = 100,
                                  objective = 'multi:softmax',
                                  eval_metric = 'merror',
                                  eta = 0.1,
                                  use_label_encoder = False,
                                  random_state = 22)
    
    estimator.fit(x_train, y_train)
    
    # 3、模型评估
    y_predict = estimator.predict(x_valid)
    report = classification_report(y_true = y_valid, y_pred = y_predict)
    print(f'模型评估报告：{report}')
    
    # 4、模型保存
    estimator_path = '../file/红酒品质分类XGBoost模型.pth'
    joblib.dump(estimator, estimator_path)
    

In [47]:
# 样本不均衡问题处理
import numpy as np
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

def parameters_tune():
    '''
    模型参数调优（样本不均衡问题处理）
    :return: 
    '''

    # 1、数据加载
    train_data = pd.read_csv('../file/红酒品质分类-train.csv')
    valid_data = pd.read_csv('../file/红酒品质分类-valid.csv')

    # 训练集
    x_train = train_data.iloc[:, :-1]
    y_train = train_data.iloc[:, -1]

    # 测试集
    x_valid = valid_data.iloc[:, :-1]
    y_valid = valid_data.iloc[:, -1]

    # 2、训练的时候、指定样本的权重
    classes_weights = class_weight.compute_sample_weight(class_weight = 'balanced', y = y_train)
    # 加载模型
    estimator = joblib.load('../file/红酒品质分类XGBoost模型.pth')
    estimator.fit(x_train, y_train, sample_weight = classes_weights)

    y_predict = estimator.predict(x_valid)
    report = classification_report(y_true = y_valid, y_pred = y_predict)
    print(f'重新定义权重模型评估报告：{report}')



    # 3、 增加超参数和分层校验
    estimator = xgb.XGBClassifier(n_estimators = 100,
                                  objective = 'multi:softmax',
                                  eval_metric = 'merror',
                                  eta = 0.1,
                                  use_label_encoder = False,
                                  random_state = 22)
    # 定义超参数
    param_grid = {
        'max_depth': np.arange(3, 5, 1),
        'n_estimators': np.arange(50, 150, 50),
        'eta': np.arange(0.1, 1, 0.3)
    }
    # 分层交叉验证
    spliter = StratifiedKFold(n_splits=5, shuffle=True)

    # 交叉验证，网格搜索
    cv = GridSearchCV(estimator, param_grid, cv = spliter)
    cv.fit(x_train, y_train, sample_weight = classes_weights)
    cv_y_predict = cv.predict(x_valid)
    
    report = classification_report(y_true = y_valid, y_pred = cv_y_predict)
    print(f'调优模型评估报告：{report}')

In [50]:
parameters_tune()

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



重新定义权重模型评估报告：              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.25      0.18      0.21        11
           2       0.71      0.84      0.77       136
           3       0.67      0.55      0.60       128
           4       0.46      0.47      0.47        40
           5       0.17      0.33      0.22         3

    accuracy                           0.64       320
   macro avg       0.38      0.40      0.38       320
weighted avg       0.64      0.64      0.64       320



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

调优模型评估报告：              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00        11
           2       0.68      0.82      0.74       136
           3       0.68      0.57      0.62       128
           4       0.56      0.57      0.57        40
           5       0.25      0.33      0.29         3

    accuracy                           0.65       320
   macro avg       0.36      0.38      0.37       320
weighted avg       0.63      0.65      0.64       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
split_and_save_train_valid(data)


In [17]:
model_training()

Parameters: { "use_label_encoder" } are not used.



模型评估报告：              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00        11
           2       0.70      0.83      0.76       136
           3       0.62      0.57      0.60       128
           4       0.57      0.50      0.53        40
           5       0.33      0.33      0.33         3

    accuracy                           0.65       320
   macro avg       0.37      0.37      0.37       320
weighted avg       0.62      0.65      0.63       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
!pip install xgboost==1.2.0

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting xgboost==1.2.0
  Downloading http://mirrors.tencentyun.com/pypi/packages/f6/5c/1133b5b8f4f2fa740ff27abdd35b8e79ce6e1f8d6480a07e9bce1cdafea2/xgboost-1.2.0-py3-none-manylinux2010_x86_64.whl (148.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.9/148.9 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.1
    Uninstalling xgboost-2.1.1:
      Successfully uninstalled xgboost-2.1.1
Successfully installed xgboost-1.2.0
[0m