In [9]:
import numpy as np
import pandas as pd
from math import *
import matplotlib.pyplot as plt
import cmath

def read_excel(filepath,kind):
	"""Read the first worksheet of an Excel workbook into a dict of lists.

	Parameters
	----------
	filepath : str or path-like
		Location of the workbook; handed to ``pd.ExcelFile``.
	kind : int
		Orientation of the returned mapping.

		* ``1`` - column-oriented: every header maps to the list of values
		  found in that column.
		* ``0`` - row-oriented: the first cell of each row is the key and the
		  remaining cells are the value list. The header row is treated as
		  a record too, i.e. ``headers[0] -> headers[1:]``.

	Returns
	-------
	dict
		Mapping from key to list of cell values, as described above.

	Raises
	------
	ValueError
		If ``kind`` is neither 0 nor 1 (previously this silently returned
		``None`` and only failed later, at the first subscript).
	"""
	if kind not in (0, 1):
		raise ValueError(
			"kind must be 1 (column-oriented) or 0 (row-oriented), got {!r}".format(kind)
		)

	# Open the workbook once and close it deterministically. The sheet name is
	# resolved and the data parsed from the same handle instead of re-opening
	# the file through pd.read_excel.
	with pd.ExcelFile(filepath) as excel_file:
		active_sheet = excel_file.sheet_names[0]
		df = excel_file.parse(sheet_name=active_sheet)

	headers = df.columns.tolist()
	if kind == 1:  # column-oriented
		return {header: df[header].tolist() for header in headers}

	# Row-oriented: the header row becomes the first record.
	columns_data = {headers[0]: headers[1:]}
	for _, row in df.iterrows():
		cells = row.tolist()
		columns_data[cells[0]] = cells[1:]
	return columns_data
# Load the first worksheet column-wise (kind=1): header -> list of cell values.
df = read_excel('./HANDLE_data/ThetaData.xlsx', 1)

# Pull the five columns used by the later cells out of the mapping in one go;
# the targets on the left line up positionally with the headers on the right.
DELTA_THETA, M, OMEGA, TR, PR = (
    df[header] for header in ('θ/θideal', 'M', 'w', 'Tr', 'pr')
)

In [3]:
from xgboost import XGBRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Assemble the design matrix (one column per input list, in the order
# M, OMEGA, TR, PR) and the target vector from the lists loaded earlier.
X = np.column_stack((M, OMEGA, TR, PR))
y = np.array(DELTA_THETA)

# Feature engineering: standardise every raw column, then expand to all
# degree-2 polynomial terms without the constant column
# (4 linear + 4 squared + 6 pairwise products = 14 features).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split done in a later cell, so hold-out statistics
# leak into the scaling - consider fitting it on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
poly = PolynomialFeatures(degree=2,include_bias=False)
X_poly = poly.fit_transform(X_scaled)
# Show the expanded matrix and the feature counts before/after expansion.
print(X_poly)
print(f"原始特征数量: {X.shape[1]}")
print(f"特征工程后的特征数量: {X_poly.shape[1]}")

[[ 5.02026555e-01 -1.45417130e+00 -1.88084135e+00 ...  3.53756418e+00
  -4.63387142e+00  6.06992925e+00]
 [ 5.02026555e-01 -1.45417130e+00 -1.88084135e+00 ...  3.53756418e+00
  -3.67610101e-01  3.82006316e-02]
 [ 5.02026555e-01 -1.45417130e+00 -1.88084135e+00 ...  3.53756418e+00
   1.16443792e-01  3.83290764e-03]
 ...
 [ 4.17045587e-01  2.08303922e+00  5.08053909e-01 ...  2.58118774e-01
  -1.58567743e-01  9.74114690e-02]
 [ 4.17045587e-01  2.08303922e+00  5.08053909e-01 ...  2.58118774e-01
  -1.60533704e-01  9.98419054e-02]
 [ 4.17045587e-01  2.08303922e+00  5.08053909e-01 ...  2.58118774e-01
  -1.62294901e-01  1.02044631e-01]]
原始特征数量: 4
特征工程后的特征数量: 14


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
# Split the data: 70% goes to training, the remaining 30% is held out ...
X_train, X_temp, y_train, y_temp = train_test_split(X_poly, y, test_size=0.3, random_state=42)
# ... and that held-out part is divided 1/3 validation : 2/3 test, i.e. about
# 10% / 20% of the full dataset. Both splits use a fixed seed (42).
# NOTE(review): X_val / y_val are not referenced by any cell visible here.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

def eval_model(model, X_eval=None, y_eval=None, tol=0.2):
	"""Evaluate a fitted regressor on a hold-out set and print a short report.

	The report lists every prediction whose relative deviation from the true
	value exceeds ``tol`` (or a single line saying none does), followed by the
	R^2 score and the mean squared error.

	Parameters
	----------
	model : object
		Fitted estimator exposing ``predict(X)``.
	X_eval, y_eval : array-like, optional
		Features and true targets to evaluate on. When omitted, the
		notebook-level ``X_test`` / ``y_test`` are used, so existing
		``eval_model(model)`` calls behave as before.
	tol : float, default 0.2
		Relative-deviation threshold (0.2 means 20%).

	Returns
	-------
	The predictions exactly as returned by ``model.predict``.
	"""
	# Fall back to the global hold-out split only when no data is supplied.
	if X_eval is None:
		X_eval = X_test
	if y_eval is None:
		y_eval = y_test

	y_pred = model.predict(X_eval)
	pred_flat = np.asarray(y_pred).ravel()
	true_flat = np.asarray(y_eval).ravel()

	# |pred / true - 1| > tol, rewritten as |pred - true| > tol * |true|.
	# Same decision for every input, but without a division, so a true value
	# of zero no longer yields inf/nan and a RuntimeWarning.
	too_far = np.abs(pred_flat - true_flat) > tol * np.abs(true_flat)

	tol_pct = '{:g}'.format(tol * 100)
	for m, n in zip(pred_flat[too_far], true_flat[too_far]):
		print('预测值为{0}, 真实结果为{1}, 预测结果偏差大于{2}%'.format(m, n, tol_pct))
	if not too_far.any():
		print("预测值与真实值差距均在{0}%内部".format(tol_pct))

	# Aggregate quality metrics on the same hold-out data.
	r2 = r2_score(y_eval, y_pred)
	print('r2_score:{0}'.format(r2))
	mse = mean_squared_error(y_eval, y_pred)
	print('mse:{0}'.format(mse))
	return y_pred

In [8]:
# Train an XGBoost regressor with default hyper-parameters on the training
# split, print its hold-out report, then show the predictions it returned.
model_xbg = XGBRegressor().fit(X_train, y_train, verbose=True)
c = eval_model(model_xbg)
print(c)

预测值与真实值差距均在20%内部
r2_score:0.9999676563807013
mse:1.0100737886650835e-07
[1.1928681 1.1979465 1.3207273 ... 1.2809255 1.1581937 1.198114 ]


In [37]:
# Hyper-parameter search for the XGBoost regressor
def adj_params():
    """Grid-search ``n_estimators`` and ``learning_rate`` for XGBoost and print the results.

    Every combination in the grid is scored by 5-fold cross-validated R^2 on
    the notebook-level ``X_train`` / ``y_train``. The mean score of each
    candidate is printed, followed by the best parameter set and its score.
    Nothing is returned.
    """
    # Settings held fixed for every candidate in the search.
    base_estimator = XGBRegressor(subsample=0.8, colsample_bytree=0.8, seed=123)

    # Only these two hyper-parameters are varied.
    search_space = {
        'n_estimators': [75, 125, 200, 250, 300],
        'learning_rate': [0.01, 0.03, 0.05, 0.1],
    }

    # K-fold cross-validation on the training data evens out the effect of
    # any single unlucky split.
    search = GridSearchCV(
        estimator=base_estimator,
        param_grid=search_space,
        scoring='r2',
        cv=5,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Mean cross-validated score for each candidate parameter set.
    results = search.cv_results_
    for idx, candidate in enumerate(results['params']):
        print("mean_score: %f,  params: %r" % (results['mean_test_score'][idx], candidate))

    # Best parameter combination and the score it achieved.
    print('参数的最佳取值：{0}'.format(search.best_params_))
    print('最佳模型得分:{0}'.format(search.best_score_))
adj_params()


Fitting 5 folds for each of 20 candidates, totalling 100 fits
mean_score: 0.773614,  params: {'learning_rate': 0.01, 'n_estimators': 75}
mean_score: 0.915615,  params: {'learning_rate': 0.01, 'n_estimators': 125}
mean_score: 0.980590,  params: {'learning_rate': 0.01, 'n_estimators': 200}
mean_score: 0.992595,  params: {'learning_rate': 0.01, 'n_estimators': 250}
mean_score: 0.997096,  params: {'learning_rate': 0.01, 'n_estimators': 300}
mean_score: 0.988535,  params: {'learning_rate': 0.03, 'n_estimators': 75}
mean_score: 0.999238,  params: {'learning_rate': 0.03, 'n_estimators': 125}
mean_score: 0.999871,  params: {'learning_rate': 0.03, 'n_estimators': 200}
mean_score: 0.999890,  params: {'learning_rate': 0.03, 'n_estimators': 250}
mean_score: 0.999893,  params: {'learning_rate': 0.03, 'n_estimators': 300}
mean_score: 0.999288,  params: {'learning_rate': 0.05, 'n_estimators': 75}
mean_score: 0.999882,  params: {'learning_rate': 0.05, 'n_estimators': 125}
mean_score: 0.999896,  params

In [31]:
# Train a LightGBM regressor with default settings on the training split and
# print its hold-out report.
model_lgb = LGBMRegressor().fit(X_train, y_train)
eval_model(model_lgb)

预测值与真实值差距均在20%内部
r2_score:0.9998783185475928
mse:3.8000461391798495e-07


In [33]:
# Train scikit-learn's gradient-boosted trees with default settings on the
# training split and print its hold-out report.
model_gbdt = GradientBoostingRegressor().fit(X_train, y_train)
eval_model(model_gbdt)

预测值与真实值差距均在20%内部
r2_score:0.9998934877611201
mse:3.3263197810660064e-07


In [25]:
"""训练模型"""
# Three fresh, default-configured base regressors (separate instances from
# the models fitted in the earlier cells).
xgb_model = XGBRegressor()
gbdt_model = GradientBoostingRegressor()
lgb_model = LGBMRegressor()
# Combine them in a VotingRegressor; each entry is a (name, estimator) pair.
voting_model = VotingRegressor(estimators=[
    ('xgb', xgb_model),
    ('gbdt', gbdt_model),
    ('lgb', lgb_model)
])
# Fit the ensemble on the training split.
# NOTE(review): the output recorded under this cell has the format printed by
# eval_model, but no eval_model(voting_model) call is visible here - confirm
# whether that line was lost from the cell.
voting_model.fit(X_train, y_train)

预测值与真实值差距均在20%内部
r2_score:0.9999626549494048
mse:1.1662657909186546e-07
