In [17]:
import pandas as pd
import datetime
import warnings
import numpy as np
import pickle as pkl
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error
from catboost import Pool, cv
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTENC
from xgboost import XGBRegressor
from pandas_profiling import ProfileReport
import xgboost as xgb
from chinese_calendar import is_workday
import lightgbm as lgb
import optuna
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  #用来正常显示负号

In [18]:
PATH = os.path.realpath(r"E:\document\code\python\02-competition\E-commerce-Sales-Forecast")
dataset_path = os.path.join(PATH, "dataset")

In [19]:
def generate_dataset(data_df: pd.DataFrame, cols=None):
    if cols is None:
        cols = ['总销量', '直播个数', '直播销量', '视频销量', '直播达人', '直播销量占比', '视频销量占比']
    remove_cols = list()
    data_df["时间"] = data_df["时间"].apply(lambda x: str(x).split(" ")[0])
    data_df["时间"] = pd.to_datetime(data_df["时间"])
    data_df["星期数"] = data_df["时间"].dt.isocalendar().week
    data_df["星期数"] = data_df["星期数"].astype(int)
    data_df["直播销量占比"] = data_df.apply(lambda x: 0.0 if x["总销量"] == 0 else x["直播销量"] / x["总销量"], axis=1)
    data_df["视频销量占比"] = data_df.apply(lambda x: 0.0 if x["总销量"] == 0 else x["视频销量"] / x["总销量"], axis=1)
    backward_list = list()
    for name in cols:
        forward_list = list()
        for _idx in range(1, 15):
            data_df[name + "_forward_" + str(_idx)] = data_df[name].shift(periods=_idx, fill_value=0)
            remove_cols.append(name + "_forward_" + str(_idx))
            forward_list.append(name + "_forward_" + str(_idx))
        forward_list.append(name)
        tmp_df_1 = data_df[forward_list[:7]]
        tmp_df_2 = data_df[forward_list[7:]]
        tmp_df = data_df[forward_list]
        data_df[name + "_前第一周天平均值"] = tmp_df_1.mean(axis=1)
        data_df[name + "_前第二周天平均值"] = tmp_df_2.mean(axis=1)
        data_df[name + "_前第一周天方差值"] = tmp_df_1.mean(axis=1)
        data_df[name + "_前第二周天方差值"] = tmp_df_2.mean(axis=1)
        data_df[name + "_平均值增量"] = data_df[name + "_前第一周天平均值"] - data_df[name + "_前第二周天平均值"]
        data_df[name + "_方差值增量"] = data_df[name + "_前第一周天方差值"] - data_df[name + "_前第二周天方差值"]
        data_df[name + "_前两周最大值"] = tmp_df.max(axis=1)
        data_df[name + "_前两周最小值"] = tmp_df.min(axis=1)
        if name == "总销量":
            for j in range(1, 8):
                data_df[name + "_backward_" + str(j)] = data_df[name].shift(periods=j * (-1), fill_value=0)
                backward_list.append(name + "_backward_" + str(j))
                remove_cols.append(name + "_backward_" + str(j))
            tmp_df = data_df[backward_list]
            data_df["后七天平均销量"] = tmp_df.mean(axis=1)
        # remove_cols.append(name)
    for _idx, date_time in enumerate(
            ["2021-12-26", "2021-12-25", "2021-12-24", "2021-12-23", "2021-12-22", "2021-12-21"]):
        data_df.loc[data_df[data_df["时间"] == date_time].index, "后七天平均销量"] = data_df.loc[
            data_df[data_df["时间"] == date_time].index, backward_list[:_idx + 1]].mean(axis=1)
    data_df.drop(columns=remove_cols, inplace=True)

In [20]:
df_time = pd.DataFrame()
df_time["时间"] = pd.date_range("2021-07-08", "2022-01-03")
date_lst = list()
date_lst.extend(pd.date_range("2021-08-01", "2021-08-18"))
date_lst.extend(pd.date_range("2021-08-25", "2021-09-01"))
date_lst.extend(pd.date_range("2021-09-10", "2021-09-10"))
date_lst.extend(pd.date_range("2021-09-29", "2021-10-10"))
date_lst.extend(pd.date_range("2021-10-27", "2021-11-11"))
date_lst.extend(pd.date_range("2021-12-01", "2021-12-12"))
date_lst.extend(pd.date_range("2021-12-31", "2022-01-01"))
lst_date = list()
for str_date in date_lst:
    lst_date.append(str(str_date).split(" ")[0])
df_time["时间"] = df_time["时间"].apply(lambda x: str(x).split(" ")[0])
df_time["是否为购物节"] = df_time["时间"].apply(lambda x: 1 if x in lst_date else 0)
backward_dates = list()
forward_dates = list()
for dt_j in range(1, 8):
    df_time["购物节_backward_" + str(dt_j)] = df_time["是否为购物节"].shift(periods=dt_j * (-1), fill_value=0)
    df_time["购物节_forward_" + str(dt_j)] = df_time["是否为购物节"].shift(periods=dt_j, fill_value=0)
    backward_dates.append("购物节_backward_" + str(dt_j))
    forward_dates.append("购物节_forward_" + str(dt_j))
tmp_backward_time_df = df_time[backward_dates]
tmp_forward_time_df = df_time[forward_dates]
df_time["未来一周购物节天数"] = tmp_backward_time_df.sum(axis=1)
df_time["前一周购物节天数"] = tmp_forward_time_df.sum(axis=1)
df_time = df_time[df_time["时间"] <= "2021-12-27"]
df_time = df_time[df_time["时间"] >= "2021-07-08"]

In [13]:
df_time.tail(20)

Unnamed: 0,时间,是否为购物节,购物节_backward_1,购物节_forward_1,购物节_backward_2,购物节_forward_2,购物节_backward_3,购物节_forward_3,购物节_backward_4,购物节_forward_4,购物节_backward_5,购物节_forward_5,购物节_backward_6,购物节_forward_6,购物节_backward_7,购物节_forward_7,未来一周购物节天数,前一周购物节天数
153,2021-12-08,1,1,1,1,1,1,1,1,1,0,1,0,1,0,1,4,7
154,2021-12-09,1,1,1,1,1,1,1,0,1,0,1,0,1,0,1,3,7
155,2021-12-10,1,1,1,1,1,0,1,0,1,0,1,0,1,0,1,2,7
156,2021-12-11,1,1,1,0,1,0,1,0,1,0,1,0,1,0,1,1,7
157,2021-12-12,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,7
158,2021-12-13,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,7
159,2021-12-14,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,6
160,2021-12-15,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,5
161,2021-12-16,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,4
162,2021-12-17,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,3


In [21]:
path_list = os.listdir(os.path.join(PATH, "dataset"))
csv_list = list()
for file_name in path_list:
    df = pd.read_csv(os.path.join(PATH, "dataset", file_name))
    df["是否为购物节"] = df_time["是否为购物节"].copy(deep=True)
    df["前一周购物节天数"] = df_time["前一周购物节天数"].copy(deep=True)
    df["未来一周购物节天数"] = df_time["未来一周购物节天数"].copy(deep=True)
    generate_dataset(df)
    csv_list.append(df)
dataset = pd.concat(csv_list, axis=0)
dataset_goods = list(dataset.商品id.unique())

In [26]:
dataset[["商品id", "时间", "总销量", "后七天平均销量"]].tail(12)

Unnamed: 0,商品id,时间,总销量,后七天平均销量
161,1449,2021-12-16,1700,1291.0
162,1449,2021-12-17,1101,1239.571429
163,1449,2021-12-18,865,1268.285714
164,1449,2021-12-19,1582,1402.285714
165,1449,2021-12-20,1168,1664.285714
166,1449,2021-12-21,1103,1757.833333
167,1449,2021-12-22,1706,1768.2
168,1449,2021-12-23,1512,1832.25
169,1449,2021-12-24,741,2196.0
170,1449,2021-12-25,1066,2761.0


In [23]:
dic_goods = dict()
for i, goods in enumerate(dataset.商品id.unique()):
    dic_goods.setdefault(goods, i + 1)
dataset["商品id"] = dataset["商品id"].map(dic_goods)

In [31]:
df_group_by_day_sum = dataset.groupby(by=["时间"], as_index=False).sum()

In [33]:
df_day_sum = df_group_by_day_sum[["时间", "总销量", "浏览量", "直播销量", "视频销量"]].copy(deep=True)
df_day_sum.rename(columns={"总销量": "总销量_all", "浏览量": "浏览量_all", "直播销量": "直播销量_all", "视频销量": "视频销量_all"}, inplace=True)

In [34]:
dataset = pd.merge(left=dataset, right=df_day_sum, how="inner")

In [36]:
dataset["总销量总比"] = dataset.apply(lambda x: 0.0 if x["总销量_all"] == 0 else x["总销量"] / x["总销量_all"], axis=1)
dataset["浏览量总比"] = dataset.apply(lambda x: 0.0 if x["浏览量_all"] == 0 else x["浏览量"] / x["浏览量_all"], axis=1)
dataset["直播销量总比_all"] = dataset.apply(lambda x: 0.0 if x["直播销量_all"] == 0 else x["直播销量"] / x["直播销量_all"], axis=1)
dataset["视频销量总比"] = dataset.apply(lambda x: 0.0 if x["视频销量_all"] == 0 else x["视频销量"] / x["视频销量_all"], axis=1)

In [37]:
start_time = "2021-12-21"  # datetime.datetime.strptime("2021-12-21", "%Y-%m-%d").date()
end_time = "2021-12-27"  # datetime.datetime.strptime("2021-12-27", "%Y-%m-%d").date()
dataset_train = dataset[dataset["时间"] < "2021-12-25"]
dataset_test = dataset[dataset["时间"] == end_time]

In [38]:
list_sum_columns = list()
for col in dataset_train.columns:
    if col.endswith("_前第一周天平均值") or col in ['总销量', '浏览量', '抖音转化率', '视频个数', '直播个数', '直播销量', '视频销量', '视频达人', '直播达人']:
        list_sum_columns.append(col)
df_tmp_ = dataset_train[list_sum_columns]
dataset_train["是否为0"] = df_tmp_.sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [39]:
dataset_train = dataset_train[dataset_train["是否为0"] >= 1e-6]

In [40]:
test_dataset = dataset_test.drop(columns=["后七天平均销量", "时间"])
dataset_train.drop(columns=["时间", "是否为0"], inplace=True)
train_dataset_x = dataset_train.drop(columns=["后七天平均销量"])
train_dataset_y = dataset_train["后七天平均销量"].copy(deep=True)

In [41]:
train_columns = train_dataset_x.columns
train_goods = list(train_dataset_x.商品id.unique())
test_goods = list(test_dataset.商品id.unique())

In [42]:
# 构建模型
params_catboost = {"iterations": 1000, "learning_rate": 0.03, "l2_leaf_reg": 3, "bagging_temperature": 1,
                   "subsample": 0.66, "random_strength": 1, "depth": 6,
                   "rsm": 1, "one_hot_max_size": 2, "leaf_estimation_method": "Gradient", "fold_len_multiplier": 2,
                   "border_count": 128, "random_seed": 2022, "loss_function": "RMSE", "eval_metric": "RMSE",
                   "od_type": "Iter", "od_wait": 50}

params_lightgbm = {"n_estimators": 300,
                   "learning_rate": 0.1,
                   "num_leaves": 500,
                   "max_depth": 6,
                   "min_data_in_leaf": 1000,
                   "lambda_l1": 1,
                   "lambda_l2": 1,
                   "min_gain_to_split": 1,
                   "bagging_fraction": 0.8,
                   "feature_fraction": 0.8,
                   "random_state": 2022,
                   "metric": "rmse"}

params_xgboost = {'booster': 'gbtree',
                  'objective': 'reg:squarederror',
                  'eval_metric': 'rmse',
                  'seed': 2022,
                  'learning_rate': 0.01,
                  'gamma': 0.1,
                  'min_child_weight': 1.1,
                  'max_depth': 5,
                  'lambda': 10,
                  'subsample': 0.7,
                  'colsample_bytree': 0.7,
                  'colsample_bylevel': 0.7,
                  'tree_method': 'exact'
                  }

In [43]:
# 构建模型
params_catboost = {"iterations": 1000, "learning_rate": 0.03, "l2_leaf_reg": 3, "bagging_temperature": 1,
                   "subsample": 0.66, "random_strength": 1, "depth": 6,
                   "rsm": 1, "one_hot_max_size": 2, "leaf_estimation_method": "Gradient", "fold_len_multiplier": 2,
                   "border_count": 128, "random_seed": 2022, "loss_function": "RMSE", "eval_metric": "RMSE",
                   "od_type": "Iter", "od_wait": 50}

params_lightgbm = {"n_estimators": 300,
                   "learning_rate": 0.1,
                   "num_leaves": 500,
                   "max_depth": 6,
                   "min_data_in_leaf": 1000,
                   "lambda_l1": 1,
                   "lambda_l2": 1,
                   "min_gain_to_split": 1,
                   "bagging_fraction": 0.8,
                   "feature_fraction": 0.8,
                   "random_state": 2022,
                   "metric": "rmse"}

params_xgboost = {'booster': 'gbtree',
                  'objective': 'reg:squarederror',
                  'eval_metric': 'rmse',
                  'seed': 2022,
                  'learning_rate': 0.01,
                  'gamma': 0.1,
                  'min_child_weight': 1.1,
                  'max_depth': 5,
                  'lambda': 10,
                  'subsample': 0.7,
                  'colsample_bytree': 0.7,
                  'colsample_bylevel': 0.7,
                  'tree_method': 'exact'
                  }

In [44]:
def model_fun(train_sample, train_label, params, model, model_name="catboost"):
    print("------------------- 训练 {} 模型 -------------------".format(model_name))
    x_train, x_test, y_train, y_test = train_test_split(train_sample, train_label, test_size=0.2, random_state=2022)
    _model = model(**params)

    print("x_train.shape : {}".format(x_train.shape))
    print("y_train.shape : {}".format(y_train.shape))
    print("x_test.shape : {}".format(x_test.shape))
    print("y_test.shape : {}".format(y_test.shape))
    _model.fit(x_train, y_train)
    if model_name.lower() == "catboost":
        _model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], cat_features=["商品id"], verbose=50)
    elif model_name.lower() == "lightgbm":
        _model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], eval_metric='rmse', verbose=50)
    else:
        _model.fit(x_train, y_train)

    train_pred = _model.predict(x_train)
    train_mse = mean_squared_error(y_train, train_pred)

    test_pred = _model.predict(x_test)
    test_mse = mean_squared_error(y_test, test_pred)

    result_dic = {"train_mse": train_mse, "test_mse": test_mse}
    return _model, result_dic

In [45]:
def feature_importance_model(model, model_name="catboost", cols=None):
    print("------------------------- feature_importance_model -------------------------")
    if model_name.lower() == "lightgbm":
        booster = model.booster_
        importance = booster.feature_importance(importance_type='split')
        feature_name = booster.feature_name()
    elif model_name.lower() == "catboost":
        importance = model.get_feature_importance()
        feature_name = model.feature_names_
    elif model_name == "xgboost":
        importance = model.feature_importances_
        feature_name = cols
    else:
        print("请选择正确的训练模型：[catboost, lightgbm, xgboost]")
        return pd.DataFrame()
    result = pd.DataFrame(importance, index=feature_name, columns=["FeatureImportance"])
    return result.sort_values("FeatureImportance", ascending=False)

In [46]:
# Xgboost
model_tree_xgb, model_result_xgb = model_fun(train_dataset_x, train_dataset_y, params_xgboost, xgb.XGBRegressor,
                                             "xgboost")
feature_importance_xgb = feature_importance_model(model_tree_xgb, "xgboost", train_columns)
feature_importance_xgb.head(20)

------------------- 训练 xgboost 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)
------------------------- feature_importance_model -------------------------


Unnamed: 0,FeatureImportance
总销量_前第一周天平均值,0.105982
总销量_前第一周天方差值,0.103502
总销量_前第二周天平均值,0.100102
总销量_前第二周天方差值,0.078535
总销量_前两周最小值,0.062634
直播销量_前两周最小值,0.060711
直播销量_前第一周天方差值,0.042956
总销量,0.041661
视频销量_前第一周天方差值,0.032761
直播个数_前两周最小值,0.032591


In [52]:
def transfer_result(result_score_model, model_name: str, data_goods, _train_goods, _dic_goods):
    print("------------------------- transfer_result -------------------------")
    # res_arr = result_score_model.reshape([-1, 7])
    # lst = list()
    # for val in res_arr:
    #     lst.append(val[-1])
    lst = list(result_score_model)
    dic_col = dict(zip(data_goods, lst))
    dic_res = dict()
    for col_name in data_goods:
        if col_name in _train_goods:
            dic_res.setdefault(col_name, dic_col.get(col_name))
        else:
            dic_res.setdefault(col_name, 0.0)

    submission = pd.read_csv(os.path.join(PATH, "提交示例.csv"))
    submission["编码id"] = submission["商品id"].map(_dic_goods)
    submission["未来一周天均销量"] = submission["编码id"].map(dic_res)
    submission.drop(columns=["编码id"], inplace=True)
    submission.to_csv(os.path.join(PATH, "result_" + model_name + ".csv"), encoding="utf_8_sig", index=False)
    return submission

In [53]:

# def get_params_dic(model_name: str):
#     print("------------------------- get_params_dic -------------------------")
#
#     params_catboost = {"iterations": 1000, "learning_rate": 0.03, "l2_leaf_reg": 3, "bagging_temperature": 1,
#                        "subsample": 0.66, "random_strength": 1, "depth": 6,
#                        "rsm": 1, "one_hot_max_size": 2, "leaf_estimation_method": "Gradient", "fold_len_multiplier": 2,
#                        "border_count": 128, "random_seed": 2022, "loss_function": "RMSE", "eval_metric": "RMSE",
#                        "od_type": "Iter", "od_wait": 50}
#
#     params_lightgbm = {"n_estimators": 300,
#                        "learning_rate": 0.1,
#                        "num_leaves": 500,
#                        "max_depth": 6,
#                        "min_data_in_leaf": 1000,
#                        "lambda_l1": 1,
#                        "lambda_l2": 1,
#                        "min_gain_to_split": 1,
#                        "bagging_fraction": 0.8,
#                        "feature_fraction": 0.8,
#                        "random_state": 2022,
#                        "metric": "rmse"}
#
#     params_xgboost = {'booster': 'gbtree',
#                       'objective': 'reg:squarederror',
#                       'eval_metric': 'rmse',
#                       'seed': 2022,
#                       'learning_rate': 0.01,
#                       'gamma': 0.1,
#                       'min_child_weight': 1.1,
#                       'max_depth': 5,
#                       'lambda': 10,
#                       'subsample': 0.7,
#                       'colsample_bytree': 0.7,
#                       'colsample_bylevel': 0.7,
#                       'tree_method': 'exact'}
#
#     if model_name.lower() == "catboost":
#         if os.path.exists(os.path.join(path, "catboost_params.pkl")):
#             with open(os.path.join(path, "catboost_params.pkl"), "rb") as f:
#                 cat_params = pkl.load(f)
#             return cat_params
#         with open(os.path.join(path, "catboost_params.pkl"), "wb") as f:
#             pkl.dump(params_catboost, f)
#         return params_catboost
#     elif model_name.lower() == "lightgbm":
#         if os.path.exists(os.path.join(path, "lightgbm_params.pkl")):
#             with open(os.path.join(path, "lightgbm_params.pkl"), "rb") as f:
#                 lgbm_params = pkl.load(f)
#             return lgbm_params
#         with open(os.path.join(path, "params_lightgbm.pkl"), "wb") as f:
#             pkl.dump(params_catboost, f)
#         return params_lightgbm
#     elif model_name.lower() == "xgboost":
#         if os.path.exists(os.path.join(path, "params_xgboost.pkl")):
#             with open(os.path.join(path, "params_xgboost.pkl"), "rb") as f:
#                 params_xgboost = pkl.load(f)
#             return params_xgboost
#         with open(os.path.join(path, "params_xgboost.pkl"), "wb") as f:
#             pkl.dump(params_xgboost, f)
#         return params_xgboost
#     else:
#         print("-----------> 数据的模型名称不在 {catboost, lightgbm, xgboost}, 请重新输入")
#         return dict()


# catboost模型调参
def model_catboost(dic_param):
    print("------------------------- model_catboost -------------------------")

    def catboost_cv(iterations, learning_rate, depth, subsample, l2_leaf_reg):
        cat_params = {"iterations": 1000, "learning_rate": 0.03, "l2_leaf_reg": 3, "bagging_temperature": 1,
                      "subsample": 0.66, "random_strength": 1, "depth": 6,
                      "rsm": 1, "one_hot_max_size": 2, "leaf_estimation_method": "Gradient", "fold_len_multiplier": 2,
                      "border_count": 128, "random_seed": 2022, "loss_function": "RMSE", "eval_metric": "RMSE",
                      "od_type": "Iter", "od_wait": 50}
        cat_params.update(
            {"iterations": int(iterations), "depth": int(depth), "learning_rate": learning_rate, "subsample": subsample,
             "l2_leaf_reg": int(l2_leaf_reg)})
        _, result = model_fun(dic_param["train_sample"], dic_param["train_label"], cat_params, dic_param["model"],
                              dic_param["model_name"])
        return result.get("test_mse")

    cat_bayes = BayesianOptimization(catboost_cv, dic_param["param_value_dics"], random_state=2022)
    cat_bayes.maximize(init_points=1, n_iter=25)
    return cat_bayes.max.get("params")


# lightgbm模型调参
def model_lightgbm(dic_param):
    print("------------------------- model_lightgbm -------------------------")

    def lightgbm_cv(n_estimators, learning_rate, max_depth, feature_fraction, bagging_fraction):
        lgbm_params = {"n_estimators": 300,
                       "learning_rate": 0.1,
                       "num_leaves": 500,
                       "max_depth": 6,
                       "min_data_in_leaf": 1000,
                       "lambda_l1": 1,
                       "lambda_l2": 1,
                       "min_gain_to_split": 1,
                       "bagging_fraction": 0.8,
                       "feature_fraction": 0.8,
                       "random_state": 2022,
                       "metric": "rmse"}
        lgbm_params.update(
            {"n_estimators": int(n_estimators), "max_depth": int(max_depth), "learning_rate": learning_rate,
             "feature_fraction": feature_fraction,
             "bagging_fraction": bagging_fraction})
        _, result = model_fun(dic_param["train_sample"], dic_param["train_label"], lgbm_params, dic_param["model"],
                              dic_param["model_name"])
        return result.get("test_mse")

    lightgbm_bayes = BayesianOptimization(lightgbm_cv, dic_param["param_value_dics"], random_state=2022)
    lightgbm_bayes.maximize(init_points=1, n_iter=25)
    return lightgbm_bayes.max.get("params")


# xgboost模型调参
def model_xgboost(dic_param):
    print("------------------------- model_xgboost -------------------------")

    def xgboost_cv(n_estimators, learning_rate, max_depth, colsample_bytree, colsample_bylevel, gamma):
        xgb_params = {'booster': 'gbtree',
                      'objective': 'reg:squarederror',
                      'eval_metric': 'rmse',
                      'seed': 2022,
                      'learning_rate': 0.01,
                      'gamma': 0.1,
                      'min_child_weight': 1.1,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'tree_method': 'exact'
                      }
        xgb_params.update(
            {"n_estimators": int(n_estimators), "max_depth": int(max_depth), "learning_rate": learning_rate,
             "colsample_bytree": colsample_bytree,
             "colsample_bylevel": colsample_bylevel,
             "gamma": gamma})
        _, result = model_fun(dic_param["train_sample"], dic_param["train_label"], xgb_params, dic_param["model"],
                              dic_param["model_name"])
        return result.get("test_mse")

    xgboost_bayes = BayesianOptimization(xgboost_cv, dic_param["param_value_dics"], random_state=2022)
    xgboost_bayes.maximize(init_points=1, n_iter=25)
    return xgboost_bayes.max.get("params")


# 模型预测
def get_result(params_dic, model, model_name: str, x_data, y_data, test_data):
    print("------------------------- get_result -------------------------")
    ans = []
    mean_score = 0
    sk = KFold(n_splits=10, shuffle=True, random_state=2022)
    _model = model(**params_dic)
    for train_index, test_index in sk.split(x_data, y_data):
        x_train = x_data.iloc[train_index]
        y_train = y_data.iloc[train_index]
        x_test = x_data.iloc[test_index]
        y_test = y_data.iloc[test_index]
        if model_name.lower() == "catboost":
            regressor_model = _model.fit(x_train, y_train, eval_set=(x_test, y_test), verbose=500,
                                         cat_features=["商品id"])
        elif model_name.lower() == "lightgbm":
            regressor_model = _model.fit(x_train, y_train, eval_set=(x_test, y_test), verbose=500)
        else:
            regressor_model = _model.fit(x_train, y_train)
        y_pred = regressor_model.predict(x_test)
        test_mse = mean_squared_error(y_test, y_pred)
        print("model 验证MSE：{}".format(test_mse))
        mean_score += test_mse / 10.
        y_test_pred = regressor_model.predict(test_data)
        ans.append(y_test_pred)
    print("10折平均MSE：{}".format(mean_score))
    model_pred = sum(ans) / 10.
    return model_pred

In [54]:
# catboost
catboost_params_dic = {"iterations": (50, 600), "depth": (3, 10), "learning_rate": (0.02, 0.3),
                       "subsample": (0.6, 1.0),
                       "l2_leaf_reg": (1, 10)}
catboost_bayes_dic = {"param_value_dics": catboost_params_dic, "train_sample": train_dataset_x,
                      "train_label": train_dataset_y, "model": CatBoostRegressor, "model_name": "catboost"}
catboost_best_params = model_catboost(catboost_bayes_dic)

------------------------- model_catboost -------------------------
|   iter    |  target   |   depth   | iterat... | l2_lea... | learni... | subsample |
-------------------------------------------------------------------------------------
------------------- 训练 catboost 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)
0:	learn: 4808.5461907	total: 13.1ms	remaining: 4.25s
1:	learn: 4728.8647214	total: 19.2ms	remaining: 3.08s
2:	learn: 4649.2536341	total: 25.3ms	remaining: 2.71s
3:	learn: 4578.8758657	total: 32.1ms	remaining: 2.57s
4:	learn: 4507.2355308	total: 38.8ms	remaining: 2.47s
5:	learn: 4435.6003636	total: 44.7ms	remaining: 2.37s
6:	learn: 4369.4960157	total: 51.1ms	remaining: 2.31s
7:	learn: 4304.9433153	total: 58.1ms	remaining: 2.29s
8:	learn: 4245.3337884	total: 64.8ms	remaining: 2.27s
9:	learn: 4182.0474844	total: 70.5ms	remaining: 2.21s
10:	learn: 4131.4648042	total: 76.9ms	remaining: 2.19s
11:	lea

In [55]:
catboost_best_params["depth"] = round(catboost_best_params["depth"])
catboost_best_params["iterations"] = round(catboost_best_params["iterations"])
catboost_best_params["learning_rate"] = round(catboost_best_params["learning_rate"], 4)
catboost_best_params["l2_leaf_reg"] = round(catboost_best_params["l2_leaf_reg"])
catboost_best_params["subsample"] = round(catboost_best_params["subsample"], 4)
params_catboost.update(catboost_best_params)

with open(os.path.join(PATH, "catboost_best_params.pkl"), "wb") as f:
    pkl.dump(params_catboost, f)
result_score_cat = get_result(params_catboost, CatBoostRegressor, "catboost", train_dataset_x, train_dataset_y,
                              test_dataset)
result_cat = transfer_result(result_score_cat, "catboost", test_goods, train_goods, dic_goods)
model_cat, result_dic_cat = model_fun(train_dataset_x, train_dataset_y, params_catboost, CatBoostRegressor, "catboost")
feature_importance_cat = feature_importance_model(model_cat, "catboost", train_columns)
feature_importance_cat.to_csv(os.path.join(PATH, "feature_importance_cat.csv"), encoding="utf_8_sig")

------------------------- get_result -------------------------
0:	learn: 5045.7292061	test: 5090.1524895	best: 5090.1524895 (0)	total: 33.9ms	remaining: 2.1s
62:	learn: 2563.8568954	test: 2490.5543046	best: 2490.5543046 (62)	total: 1.75s	remaining: 0us

bestTest = 2490.554305
bestIteration = 62

model 验证MSE：6202860.530715625
0:	learn: 4824.6648686	test: 6709.7790080	best: 6709.7790080 (0)	total: 48.4ms	remaining: 3s
62:	learn: 2532.6583829	test: 3369.3166104	best: 3369.3166104 (62)	total: 1.73s	remaining: 0us

bestTest = 3369.31661
bestIteration = 62

model 验证MSE：11352294.394434473
0:	learn: 5068.2218642	test: 4867.8911375	best: 4867.8911375 (0)	total: 85.3ms	remaining: 5.29s
62:	learn: 2547.0642248	test: 2852.8037506	best: 2852.8037506 (62)	total: 1.76s	remaining: 0us

bestTest = 2852.803751
bestIteration = 62

model 验证MSE：8138489.02570332
0:	learn: 5159.7932697	test: 3867.8479420	best: 3867.8479420 (0)	total: 91ms	remaining: 5.64s
62:	learn: 2557.9801531	test: 2324.4705806	best: 2324

In [56]:
# lightgbm
lightgbm_params_dic = {"n_estimators": (50, 2000), "max_depth": (3, 10), "learning_rate": (0.02, 0.3),
                       "feature_fraction": (0.6, 1.0),
                       "bagging_fraction": (0.6, 1.)}
lightgbm_bayes_dic = {"param_value_dics": lightgbm_params_dic, "train_sample": train_dataset_x,
                      "train_label": train_dataset_y, "model": lgb.LGBMRegressor, "model_name": "lightgbm"}
lightgbm_best_params = model_lightgbm(lightgbm_bayes_dic)

lightgbm_best_params["max_depth"] = round(lightgbm_best_params["max_depth"])
lightgbm_best_params["n_estimators"] = round(lightgbm_best_params["n_estimators"])
lightgbm_best_params["learning_rate"] = round(lightgbm_best_params["learning_rate"], 4)
lightgbm_best_params["feature_fraction"] = round(lightgbm_best_params["feature_fraction"], 4)
lightgbm_best_params["bagging_fraction"] = round(lightgbm_best_params["bagging_fraction"], 4)
params_lightgbm.update(lightgbm_best_params)
with open(os.path.join(PATH, "lightgbm_best_params.pkl"), "wb") as f:
    pkl.dump(params_lightgbm, f)
result_score_lgbm = get_result(params_lightgbm, lgb.LGBMRegressor, "lightgbm", train_dataset_x, train_dataset_y,
                               test_dataset)
result_lgbm = transfer_result(result_score_lgbm, "lightgbm", test_goods, train_goods, dic_goods)

model_lgbm, result_dic_lgbm = model_fun(train_dataset_x, train_dataset_y, params_lightgbm, lgb.LGBMRegressor,
                                        "lightgbm")
feature_importance_lgbm = feature_importance_model(model_lgbm, "lightgbm", train_columns)
feature_importance_lgbm.to_csv(os.path.join(PATH, "feature_importance_lgbm.csv"), encoding="utf_8_sig")

------------------------- model_lightgbm -------------------------
|   iter    |  target   | baggin... | featur... | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3569.17	valid_1's rmse: 4795.13
[100]	training's rmse: 3405.95	valid_1's rmse: 4571.44
[150]	training's rmse: 3305.31	valid_1's rmse: 4425.56
[200]	training's rmse: 3228.96	valid_1's rmse: 4312.71
[250]	training's rmse: 3172.16	valid_1's rmse: 4227.71
[300]	training's rmse: 3127.92	valid_1's rmse: 4156.83
[350]	training's rmse: 3090.56	valid_1's rmse: 4094.83
[400]	training's rmse: 3053.9	valid_1's rmse: 4033.31
[450]	training's rmse: 3025.94	valid_1's rmse: 3986.24
[500]	training's rmse: 2999.49	valid_1's rmse: 3941.35
[550]	training's rmse: 2973.76	valid_1's rmse: 3898.34
[600]	training's rmse: 2953.74	valid_1's rmse: 3861.09
[650]	training's rmse: 2931.05	valid_1's rmse: 3822.47
[700]	training's rmse: 2911.16	valid_1's rmse: 3786.99
[750]	training's rmse: 2892.4	valid_1's rmse: 3754.03
[800]	training's rmse: 2875.3	valid_1's rmse: 3724.1
[850]	training's rmse: 2859.42	valid_1's rmse: 3695.24
[900]	training's rmse: 2843.69	valid_1's rmse: 3667.35
[950]	training'



[50]	training's rmse: 3134.07	valid_1's rmse: 4172.3
[100]	training's rmse: 2948.37	valid_1's rmse: 3872.18
[150]	training's rmse: 2836.55	valid_1's rmse: 3680.5
[200]	training's rmse: 2744.01	valid_1's rmse: 3523.8
[250]	training's rmse: 2660.97	valid_1's rmse: 3395.08
[300]	training's rmse: 2593.42	valid_1's rmse: 3284.32
[350]	training's rmse: 2525.24	valid_1's rmse: 3182.2
[400]	training's rmse: 2469.05	valid_1's rmse: 3095.45
[450]	training's rmse: 2420.08	valid_1's rmse: 3014.69
[500]	training's rmse: 2376.13	valid_1's rmse: 2949.55
[550]	training's rmse: 2337.05	valid_1's rmse: 2889
[600]	training's rmse: 2294.63	valid_1's rmse: 2829.12
[650]	training's rmse: 2259.14	valid_1's rmse: 2771.24
[700]	training's rmse: 2224.06	valid_1's rmse: 2717.42
[750]	training's rmse: 2192.1	valid_1's rmse: 2669.17
[800]	training's rmse: 2160	valid_1's rmse: 2622.6
[850]	training's rmse: 2133.73	valid_1's rmse: 2581.02
[900]	training's rmse: 2106.35	valid_1's rmse: 2538.56
[950]	training's rmse: 



[50]	training's rmse: 3182.69	valid_1's rmse: 4259.98
[100]	training's rmse: 2998.32	valid_1's rmse: 3966.96
[150]	training's rmse: 2885.2	valid_1's rmse: 3774.21
[200]	training's rmse: 2798.35	valid_1's rmse: 3636.63
[250]	training's rmse: 2723.75	valid_1's rmse: 3518.72
[300]	training's rmse: 2658.83	valid_1's rmse: 3410.65
[350]	training's rmse: 2600.98	valid_1's rmse: 3319
[400]	training's rmse: 2542.28	valid_1's rmse: 3225.87
[450]	training's rmse: 2491.44	valid_1's rmse: 3146.66
[500]	training's rmse: 2447.75	valid_1's rmse: 3075.82
[550]	training's rmse: 2407.45	valid_1's rmse: 3012.83
[600]	training's rmse: 2366.69	valid_1's rmse: 2949.48
[650]	training's rmse: 2332.38	valid_1's rmse: 2898.5
[700]	training's rmse: 2300.26	valid_1's rmse: 2849.36
[750]	training's rmse: 2271.13	valid_1's rmse: 2805.73
[800]	training's rmse: 2245.82	valid_1's rmse: 2762.17
[850]	training's rmse: 2217.33	valid_1's rmse: 2720.46
[900]	training's rmse: 2191.27	valid_1's rmse: 2681.08
[950]	training's



[50]	training's rmse: 3496.76	valid_1's rmse: 4713.12
[100]	training's rmse: 3321.12	valid_1's rmse: 4471.29
| [95m 4       [0m | [95m 1.999e+0[0m | [95m 0.7696  [0m | [95m 0.6704  [0m | [95m 0.06562 [0m | [95m 5.89    [0m | [95m 100.6   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3175.15	valid_1's rmse: 4267.2
[100]	training's rmse: 2967.8	valid_1's rmse: 3965.58
[150]	training's rmse: 2832.51	valid_1's rmse: 3759.09
[200]	training's rmse: 2728.89	valid_1's rmse: 3601.06
[250]	training's rmse: 2642.12	valid_1's rmse: 3475.06
[300]	training's rmse: 2563.56	valid_1's rmse: 3354.47
[350]	training's rmse: 2491.37	valid_1's rmse: 3252.56
[400]	training's rmse: 2431.88	valid_1's rmse: 3171.3
[450]	training's rmse: 2373.26	valid_1's rmse: 3090.94
[500]	training's rmse: 2320.2	valid_1's rmse: 3019.21
[550]	training's rmse: 2276.43	valid_1's rmse: 2958.76
[600]	training's rmse: 2231.94	valid_1's rmse: 2897.25
[650]	training's rmse: 2192.79	valid_1's rmse: 2845.39
[700]	training's rmse: 2150.84	valid_1's rmse: 2788.57
[750]	training's rmse: 2113.56	valid_1's rmse: 2742.85
[800]	training's rmse: 2078.92	valid_1's rmse: 2692.68
| [0m 5       [0m | [0m 7.189e+0[0m | [0m 0.8902  [0m | [0m 0.9713  [0m | [0m 0.2057  [0m | [0m 9.833   [0m | [0



[50]	training's rmse: 3229.28	valid_1's rmse: 4339.42
[100]	training's rmse: 3036.56	valid_1's rmse: 4054.86
| [0m 6       [0m | [0m 1.508e+0[0m | [0m 0.7766  [0m | [0m 0.8807  [0m | [0m 0.1756  [0m | [0m 6.157   [0m | [0m 147.9   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3218.45	valid_1's rmse: 4311.12
[100]	training's rmse: 3039.32	valid_1's rmse: 4030.3
[150]	training's rmse: 2934.26	valid_1's rmse: 3848.46
[200]	training's rmse: 2845.75	valid_1's rmse: 3702.22
[250]	training's rmse: 2776.25	valid_1's rmse: 3588.95
[300]	training's rmse: 2712.24	valid_1's rmse: 3490.58
[350]	training's rmse: 2655.32	valid_1's rmse: 3398.83
[400]	training's rmse: 2601.71	valid_1's rmse: 3316.12
[450]	training's rmse: 2556.8	valid_1's rmse: 3240.36
[500]	training's rmse: 2510.15	valid_1's rmse: 3166.71
[550]	training's rmse: 2470.48	valid_1's rmse: 3106.17
[600]	training's rmse: 2435.45	valid_1's rmse: 3047.27
[650]	training's rmse: 2394.34	valid_1's rmse: 2985.86
[700]	training's rmse: 2363.04	valid_1's rmse: 2938.93
[750]	training's rmse: 2333.82	valid_1's rmse: 2895.39
[800]	training's rmse: 2303.93	valid_1's rmse: 2851.43
[850]	training's rmse: 2279.13	valid_1's rmse: 2812.12
[900]	training's rmse: 2251.58	valid_1's rmse: 2771.2
[950]	training



[50]	training's rmse: 3139.28	valid_1's rmse: 4224.24
[100]	training's rmse: 2928.36	valid_1's rmse: 3906.57
[150]	training's rmse: 2784.31	valid_1's rmse: 3691.27
[200]	training's rmse: 2682.2	valid_1's rmse: 3545.04
[250]	training's rmse: 2590.76	valid_1's rmse: 3412.16
[300]	training's rmse: 2513.07	valid_1's rmse: 3303.78
[350]	training's rmse: 2436.45	valid_1's rmse: 3196.69
[400]	training's rmse: 2371	valid_1's rmse: 3106.87
[450]	training's rmse: 2312.25	valid_1's rmse: 3028.08
[500]	training's rmse: 2257.54	valid_1's rmse: 2954.45
[550]	training's rmse: 2208.62	valid_1's rmse: 2890.81
[600]	training's rmse: 2163.82	valid_1's rmse: 2832.28
[650]	training's rmse: 2122.66	valid_1's rmse: 2781.61
[700]	training's rmse: 2081.76	valid_1's rmse: 2727.92
[750]	training's rmse: 2043.68	valid_1's rmse: 2681.45
[800]	training's rmse: 2008.66	valid_1's rmse: 2633.67
[850]	training's rmse: 1973.42	valid_1's rmse: 2589.12
[900]	training's rmse: 1941.61	valid_1's rmse: 2548.24
[950]	training'



[50]	training's rmse: 3094.95	valid_1's rmse: 4135.14
[100]	training's rmse: 2894.34	valid_1's rmse: 3820.72
| [0m 9       [0m | [0m 1.46e+07[0m | [0m 0.9129  [0m | [0m 0.9558  [0m | [0m 0.2781  [0m | [0m 6.485   [0m | [0m 100.7   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3213.5	valid_1's rmse: 4310.97
[100]	training's rmse: 3028.48	valid_1's rmse: 4031.82
| [0m 10      [0m | [0m 1.626e+0[0m | [0m 0.6414  [0m | [0m 0.8017  [0m | [0m 0.1936  [0m | [0m 5.413   [0m | [0m 100.9   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3184.79	valid_1's rmse: 4266.89
| [0m 11      [0m | [0m 1.576e+0[0m | [0m 0.7524  [0m | [0m 0.6909  [0m | [0m 0.2183  [0m | [0m 5.802   [0m | [0m 99.77   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3634.81	valid_1's rmse: 4891.06
[100]	training's rmse: 3458.76	valid_1's rmse: 4667.38
| [95m 12      [0m | [95m 2.054e+0[0m | [95m 0.9898  [0m | [95m 0.9821  [0m | [95m 0.03804 [0m | [95m 7.094   [0m | [95m 147.2   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3199.49	valid_1's rmse: 4298.73
[100]	training's rmse: 2993.37	valid_1's rmse: 3999
| [0m 13      [0m | [0m 1.455e+0[0m | [0m 0.6454  [0m | [0m 0.8365  [0m | [0m 0.194   [0m | [0m 8.347   [0m | [0m 147.3   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3704.95	valid_1's rmse: 4967.33
[100]	training's rmse: 3518.08	valid_1's rmse: 4739.5
| [95m 14      [0m | [95m 2.13e+07[0m | [95m 0.6631  [0m | [95m 0.755   [0m | [95m 0.03091 [0m | [95m 5.987   [0m | [95m 146.3   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3123.22	valid_1's rmse: 4181.53
[100]	training's rmse: 2923.17	valid_1's rmse: 3868.46
| [0m 15      [0m | [0m 1.347e+0[0m | [0m 0.7748  [0m | [0m 0.9558  [0m | [0m 0.2516  [0m | [0m 7.207   [0m | [0m 145.6   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3125.62	valid_1's rmse: 4182.37
[100]	training's rmse: 2921.8	valid_1's rmse: 3866.25
| [0m 16      [0m | [0m 1.34e+07[0m | [0m 0.7542  [0m | [0m 0.9566  [0m | [0m 0.2568  [0m | [0m 6.232   [0m | [0m 147.0   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3334.94	valid_1's rmse: 4495.37
[100]	training's rmse: 3142.74	valid_1's rmse: 4217.4
| [0m 17      [0m | [0m 1.648e+0[0m | [0m 0.9135  [0m | [0m 0.9464  [0m | [0m 0.1204  [0m | [0m 6.216   [0m | [0m 145.6   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3111.65	valid_1's rmse: 4164.19
[100]	training's rmse: 2907.21	valid_1's rmse: 3848.94
| [0m 18      [0m | [0m 1.329e+0[0m | [0m 0.7725  [0m | [0m 0.7503  [0m | [0m 0.267   [0m | [0m 6.246   [0m | [0m 145.7   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3360.45	valid_1's rmse: 4530.58
[100]	training's rmse: 3167.61	valid_1's rmse: 4256.69
| [0m 19      [0m | [0m 1.812e+0[0m | [0m 0.7658  [0m | [0m 0.6955  [0m | [0m 0.1107  [0m | [0m 6.247   [0m | [0m 100.4   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3208.98	valid_1's rmse: 4290.46
[100]	training's rmse: 3034.75	valid_1's rmse: 4016.89
[150]	training's rmse: 2921.75	valid_1's rmse: 3826.33
[200]	training's rmse: 2828.21	valid_1's rmse: 3677
[250]	training's rmse: 2755.46	valid_1's rmse: 3556.57
[300]	training's rmse: 2694.15	valid_1's rmse: 3454.67
[350]	training's rmse: 2633.86	valid_1's rmse: 3358.87
[400]	training's rmse: 2581.91	valid_1's rmse: 3280.1
[450]	training's rmse: 2537.04	valid_1's rmse: 3201.73
[500]	training's rmse: 2489.77	valid_1's rmse: 3130.18
[550]	training's rmse: 2449.2	valid_1's rmse: 3064.35
[600]	training's rmse: 2416.33	valid_1's rmse: 3011.31
[650]	training's rmse: 2382.61	valid_1's rmse: 2957.9
[700]	training's rmse: 2347.42	valid_1's rmse: 2906.81
[750]	training's rmse: 2315.81	valid_1's rmse: 2858.97
[800]	training's rmse: 2287.53	valid_1's rmse: 2813.94
[850]	training's rmse: 2260.79	valid_1's rmse: 2774.79
[900]	training's rmse: 2235.09	valid_1's rmse: 2738.26
[950]	training's 



[50]	training's rmse: 3122.36	valid_1's rmse: 4198.39
[100]	training's rmse: 2916.33	valid_1's rmse: 3888.91
| [0m 21      [0m | [0m 1.361e+0[0m | [0m 0.9913  [0m | [0m 0.722   [0m | [0m 0.245   [0m | [0m 7.444   [0m | [0m 147.7   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3137.31	valid_1's rmse: 4203.85
[100]	training's rmse: 2925.37	valid_1's rmse: 3889.69
| [0m 22      [0m | [0m 1.364e+0[0m | [0m 0.8566  [0m | [0m 0.7971  [0m | [0m 0.2453  [0m | [0m 7.431   [0m | [0m 146.8   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3141.08	valid_1's rmse: 4187.28
[100]	training's rmse: 2948.8	valid_1's rmse: 3888.51
| [0m 23      [0m | [0m 1.363e+0[0m | [0m 0.9488  [0m | [0m 0.9879  [0m | [0m 0.2543  [0m | [0m 5.965   [0m | [0m 145.7   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3144.87	valid_1's rmse: 4220.73
[100]	training's rmse: 2947.3	valid_1's rmse: 3921.76
[150]	training's rmse: 2813.79	valid_1's rmse: 3711.37
[200]	training's rmse: 2706.24	valid_1's rmse: 3548.48
[250]	training's rmse: 2614.82	valid_1's rmse: 3414.25
[300]	training's rmse: 2538.56	valid_1's rmse: 3304.69
[350]	training's rmse: 2466.64	valid_1's rmse: 3197.57
[400]	training's rmse: 2403.97	valid_1's rmse: 3114.36
[450]	training's rmse: 2347.71	valid_1's rmse: 3038.95
[500]	training's rmse: 2297.15	valid_1's rmse: 2966.83
[550]	training's rmse: 2254.53	valid_1's rmse: 2904.24
[600]	training's rmse: 2211.29	valid_1's rmse: 2848.02
[650]	training's rmse: 2170.56	valid_1's rmse: 2792.5
[700]	training's rmse: 2130.77	valid_1's rmse: 2741.51
[750]	training's rmse: 2093.24	valid_1's rmse: 2689.89
[800]	training's rmse: 2059.23	valid_1's rmse: 2641.94
[850]	training's rmse: 2027.61	valid_1's rmse: 2599.42
[900]	training's rmse: 1992.24	valid_1's rmse: 2555.69
| [0m 24    



[50]	training's rmse: 3181.25	valid_1's rmse: 4272.26
[100]	training's rmse: 2979.46	valid_1's rmse: 3973.28
| [0m 25      [0m | [0m 1.441e+0[0m | [0m 0.8587  [0m | [0m 0.8598  [0m | [0m 0.2088  [0m | [0m 6.163   [0m | [0m 146.0   [0m |
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3424.57	valid_1's rmse: 4597.13
[100]	training's rmse: 3255.58	valid_1's rmse: 4352.86
[150]	training's rmse: 3152.9	valid_1's rmse: 4197.49
[200]	training's rmse: 3079.42	valid_1's rmse: 4078.66
[250]	training's rmse: 3023.56	valid_1's rmse: 3984.82
[300]	training's rmse: 2979.71	valid_1's rmse: 3909.7
[350]	training's rmse: 2937.58	valid_1's rmse: 3834.81
[400]	training's rmse: 2902.75	valid_1's rmse: 3770.88
[450]	training's rmse: 2868.65	valid_1's rmse: 3710.61
| [0m 26      [0m | [0m 1.359e+0[0m | [0m 0.6723  [0m | [0m 0.6342  [0m | [0m 0.09574 [0m | [0m 3.828   [0m | [0m 475.5   [0m |
------------------------- get_result -------------------------




model 验证MSE：12602432.2608983




model 验证MSE：29031025.97795085




model 验证MSE：12447904.512637623




model 验证MSE：5549524.418225511




model 验证MSE：7237132.3991170665




model 验证MSE：7802630.967199808




model 验证MSE：8950284.466018125




model 验证MSE：21279720.99984263




model 验证MSE：8560407.355402825




model 验证MSE：19691526.844549764
10折平均MSE：13315259.02018425
------------------------- transfer_result -------------------------
------------------- 训练 lightgbm 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)




[50]	training's rmse: 3701.77	valid_1's rmse: 4965.74
[100]	training's rmse: 3511.32	valid_1's rmse: 4735.94
------------------------- feature_importance_model -------------------------


In [57]:
 # xgboost
xgboost_params_dic = {"n_estimators": (50, 2000), "max_depth": (3, 10), "learning_rate": (0.02, 0.3),
                      "colsample_bytree": (0.5, 1.0),
                      "colsample_bylevel": (0.6, 1.),
                      "gamma": (0.01, 1.0)}
xgboost_bayes_dic = {"param_value_dics": xgboost_params_dic, "train_sample": train_dataset_x,
                     "train_label": train_dataset_y, "model": xgb.XGBRegressor, "model_name": "xgboost"}
xgboost_best_params = model_xgboost(xgboost_bayes_dic)

xgboost_best_params["max_depth"] = round(xgboost_best_params["max_depth"])
xgboost_best_params["n_estimators"] = round(xgboost_best_params["n_estimators"])
xgboost_best_params["learning_rate"] = round(xgboost_best_params["learning_rate"], 4)
xgboost_best_params["colsample_bytree"] = round(xgboost_best_params["colsample_bytree"], 4)
xgboost_best_params["colsample_bylevel"] = round(xgboost_best_params["colsample_bylevel"], 4)
xgboost_best_params["gamma"] = round(xgboost_best_params["gamma"], 4)

params_xgboost.update(xgboost_best_params)
with open(os.path.join(PATH, "xgboost_best_params.pkl"), "wb") as f:
    pkl.dump(params_xgboost, f)

result_score_xgb = get_result(params_xgboost, xgb.XGBRegressor, "xgboost", train_dataset_x, train_dataset_y,
                              test_dataset)
result_xgb = transfer_result(result_score_xgb, "xgboost", test_goods, train_goods, dic_goods)

model_xgb, result_dic_xgb = model_fun(train_dataset_x, train_dataset_y, params_xgboost, xgb.XGBRegressor,
                                      "xgboost")
feature_importance_xgb = feature_importance_model(model_xgb, "xgboost", train_columns)
feature_importance_xgb.to_csv(os.path.join(PATH, "feature_importance_xgb.csv"), encoding="utf_8_sig")

------------------------- model_xgboost -------------------------
|   iter    |  target   | colsam... | colsam... |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------------------
------------------- 训练 xgboost 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)
| [0m 1       [0m | [0m 2.631e+0[0m | [0m 0.6037  [0m | [0m 0.7495  [0m | [0m 0.1222  [0m | [0m 0.03399 [0m | [0m 7.798   [0m | [0m 999.6   [0m |
------------------- 训练 xgboost 模型 -------------------
x_train.shape : (84495, 80)
y_train.shape : (84495,)
x_test.shape : (21124, 80)
y_test.shape : (21124,)
| [95m 2       [0m | [95m 2.642e+0[0m | [95m 0.9914  [0m | [95m 0.7455  [0m | [95m 0.102   [0m | [95m 0.07926 [0m | [95m 9.066   [0m | [95m 238.1   [0m |
------------------- 训练 xgboost 模型 -------------------
x_train.shape : (84495, 80)
y_train.

In [67]:
result_fuse = pd.read_csv(os.path.join(PATH, "提交示例.csv"))
result_fuse["未来一周天均销量"] = (result_cat["未来一周天均销量"] + result_lgbm["未来一周天均销量"] + result_xgb["未来一周天均销量"]) / 3.8
result_fuse.to_csv(os.path.join(PATH, "result_fuse.csv"), encoding="utf_8_sig", index=False)

In [68]:
result_1764 = pd.read_csv(os.path.join(PATH, "1764.csv"))
np.sqrt(mean_squared_error(result_fuse["未来一周天均销量"], result_1764["未来一周天均销量"]))

518.9771338752354