# 使用网格搜索优化钻石价格模型
钻石的定价对于钻石商家是重要的问题。构建多个回归模型预测钻石价格。使用K-Fold交叉验证法选择最优模型。基于选择出来的最优模型，使用网格搜索优化模型。

数据集包含 53940 条钻石记录，共 10 个字段：9 个特征（carat、cut、color、clarity、depth、table、x、y、z）和目标变量 price。

In [1]:
## 导入包
import numpy as np
import pandas as pd 
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
## Load the diamonds dataset from CSV
data=pd.read_csv("./data/diamonds.csv")
# Print the column dtypes and non-null counts
data.info()
# Display the first rows
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   x        53940 non-null  float64
 7   y        53940 non-null  float64
 8   z        53940 non-null  float64
 9   price    53940 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


In [3]:
# Count missing values per column
def calcNull(data):
    """Return the missing-value count of every column that has missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; columns without missing values are left out,
        so the result is empty when nothing is missing.
    """
    null_counts = data.isnull().sum()
    # Filter with a positional boolean mask rather than dropping by label:
    # dropping by label would also discard a non-zero entry that happens to
    # share its column name with a zero-count column.
    return null_counts[null_counts.values != 0]
# Collect the per-column missing-value counts and print them
missing_data=calcNull(data)
print(missing_data)

Series([], dtype: int64)


In [4]:
# Separate predictors and target: the last column is the target,
# every column before it is a predictor
X, Y = data.iloc[:, :-1], data.iloc[:, -1]
print(Y)
X.head()

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [5]:
# Convert the categorical columns to dummy (one-hot) variables;
# drop_first=True omits the first level of each category
X=pd.get_dummies(X,drop_first=True)
X.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,3.95,3.98,2.43,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,3.89,3.84,2.31,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,4.05,4.07,2.31,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,4.2,4.23,2.63,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,4.34,4.35,2.75,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [34]:
# Split the data into a training set and a test set (test_size=0.2, fixed seed)
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=12)

In [35]:
# Feature scaling
# Fit the feature scaler on the training split only, then apply it to both splits
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
# Standardise the training target too: the scaler needs a 2-D column,
# and the result is flattened back to 1-D afterwards
sc_Y = StandardScaler()
Y_train = sc_Y.fit_transform(Y_train.to_numpy().reshape(-1, 1)).ravel()

In [8]:
# Lasso regression model
from sklearn.linear_model import Lasso
lasso= Lasso(alpha=0.0005,random_state=1)


In [9]:
# Elastic net regression model
from sklearn.linear_model import ElasticNet
enet=ElasticNet(alpha=0.0005,l1_ratio=0.9,random_state=3)

In [10]:
# Kernel ridge regression (disabled; kept commented out)
# from sklearn.kernel_ridge import KernelRidge
# krr=KernelRidge(alpha=0.6,kernel="polynomial",degree=2,coef0=2.5)

In [11]:
# Gradient boosting regression (disabled; kept commented out)
# from sklearn.ensemble import GradientBoostingRegressor
# gboost =GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
#                                   max_depth=4, max_features='sqrt', min_samples_leaf=15,
#                                   min_samples_split=10, loss='huber', random_state =5)


In [12]:
# Random forest regression model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 500, random_state =0)

In [13]:
# XGBoost regression model
# Import the estimator class directly so that the name `xgb` can hold the model
# without overwriting a module alias; this keeps the cell safe to re-run.
from xgboost import XGBRegressor

# verbosity=0 silences training messages and n_jobs=-1 uses all cores; these
# replace the deprecated `silent` and `nthread` arguments.
xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
                   max_depth=3, min_child_weight=1.7817, n_estimators=2200,
                   reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213,
                   verbosity=0, random_state=7, n_jobs=-1)

In [14]:
# LightGBM regression model
# Import the estimator class directly so that the name `lgb` can hold the model
# without overwriting a module alias; this keeps the cell safe to re-run.
from lightgbm import LGBMRegressor

lgb = LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05,
                    n_estimators=720, max_bin=55, bagging_fraction=0.8,
                    bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9,
                    bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

In [19]:
# Define the k-fold cross-validation helper
from sklearn.model_selection import cross_val_score, cross_validate


def evaluation_model(model, X=None, y=None, cv=10):
    """Cross-validate ``model`` and return its per-fold R^2 and RMSE.

    Both metrics are computed in a single cross-validation pass, so each
    fold is fitted once rather than once per metric.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    X, y : array-like, optional
        Training features and target. When omitted, the module-level
        ``X_train`` / ``Y_train`` are used.
    cv : int, default 10
        Number of folds.

    Returns
    -------
    tuple of numpy.ndarray
        ``(r2_scores, rmse)``, one value per fold each.
    """
    features = X_train if X is None else X
    target = Y_train if y is None else y
    cv_results = cross_validate(
        estimator=model,
        X=features,
        y=target,
        scoring=("r2", "neg_mean_squared_error"),
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    r2_scores = cv_results["test_r2"]
    # The scorer reports negated MSE; flip the sign before taking the root
    rmse = np.sqrt(-cv_results["test_neg_mean_squared_error"])
    return (r2_scores, rmse)

def print_result(r2_score,rmse,model_name):
    """Print the mean and standard deviation of cross-validation scores.

    Parameters
    ----------
    r2_score : numpy.ndarray
        Per-fold R^2 values.
    rmse : numpy.ndarray
        Per-fold RMSE values.
    model_name : str
        Label printed at the start of the line.
    """
    print(
        f"{model_name} evaluation: "
        f"r2={r2_score.mean():.4f},[std={r2_score.std():.4f}] ,"
        f"rmse={rmse.mean():.4f},[std={rmse.std():.4f}]"
    )

In [16]:
# Cross-validate the Lasso model and print its summary scores
lasso_r2_score, lasso_rmse = evaluation_model(model=lasso)
print_result(r2_score=lasso_r2_score, rmse=lasso_rmse, model_name='Lasso')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    2.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Lasso evalustion: r2=0.9190,[std=0.0053] ,rmse=0.2843,[std=0.0088]


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.7s finished


In [17]:
# Cross-validate the elastic net model and print its summary scores
enet_r2_score, enet_rmse = evaluation_model(model=enet)
print_result(r2_score=enet_r2_score, rmse=enet_rmse, model_name='Elastic Net')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Elastic Net evalustion: r2=0.9190,[std=0.0053] ,rmse=0.2843,[std=0.0088]


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.8s finished


In [None]:
# krr_r2_score, krr_rmse = evaluation_model(krr)
# print_result(krr_r2_score, krr_rmse, 'Kernel Ridge')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
# gboost_r2_score, gboost_rmse = evaluation_model(gboost)
# print_result(gboost_r2_score, gboost_rmse, 'Gradient Boost')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


In [17]:
# Cross-validate the random forest model and print its summary scores
rf_r2_score, rf_rmse = evaluation_model(model=rf)
print_result(r2_score=rf_r2_score, rmse=rf_rmse, model_name='Random Forest')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  8.0min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Random Forest evalustion: r2=0.9747,[std=0.0018] ,rmse=0.1590,[std=0.0063]


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  7.7min finished


In [18]:
# Cross-validate the XGBoost model and print its summary scores
xgb_r2_score, xgb_rmse = evaluation_model(model=xgb)
print_result(r2_score=xgb_r2_score, rmse=xgb_rmse, model_name='XG Boost')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  1.6min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


XG Boost evalustion: r2=0.9734,[std=0.0014] ,rmse=0.1629,[std=0.0044]


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  1.6min finished


In [21]:
# Cross-validate the LightGBM model and print its summary scores
lgb_r2_score, lgb_rmse = evaluation_model(model=lgb)
print_result(r2_score=lgb_r2_score, rmse=lgb_rmse, model_name='Lightgbm')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Lightgbm evalustion: r2=0.9580,[std=0.0036] ,rmse=0.2046,[std=0.0065]


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.9s finished


In [26]:
## Random Forest was selected; grid-search its hyper-parameters
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each hyper-parameter
para = [
    {
        "n_estimators": [500, 600],
        "max_depth": [30, 50, None],
    }
]
# 10-fold cross-validated search scored by R^2
grid = GridSearchCV(
    estimator=rf,
    param_grid=para,
    scoring="r2",
    n_jobs=-1,
    cv=10,
    verbose=1,
).fit(X_train, Y_train)
best_ac, best_para = grid.best_score_, grid.best_params_
print('best_accuracy is: %.4f' %(best_ac))
print('best_parameters is: %s' %(best_para))

Fitting 10 folds for each of 6 candidates, totalling 60 fits
best_accuracy is: 0.9747
best_parameters is: {'max_depth': 50, 'n_estimators': 500}


In [None]:
# Refit the random forest with the parameters picked by the grid search:
# {'max_depth': 50, 'n_estimators': 500}
# random_state=0 matches the estimator that was tuned and makes the fit reproducible
rf = RandomForestRegressor(n_estimators = 500, max_depth=50, random_state=0)
rf.fit(X_train,Y_train)
Y_pre=rf.predict(X_test)
# The model predicts the standardised target. StandardScaler expects a 2-D
# array, so reshape to one column before undoing the scaling, then flatten.
y_pre=sc_Y.inverse_transform(Y_pre.reshape(-1, 1)).ravel()
# R^2 on the held-out test set, in original price units
r2_score(Y_test,y_pre)

In [36]:
# Map the standardised predictions back to price units. StandardScaler expects
# a 2-D array, so reshape to one column first and flatten the result.
y_pre=sc_Y.inverse_transform(Y_pre.reshape(-1, 1)).ravel()
# R^2 on the held-out test set
r2_score(Y_test,y_pre)

0.9757473341104725