# 项目-基于随机森林的温度预测(模型调参)

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sys
# Report the interpreter and library versions this notebook was run with.
print(f'Python version: {sys.version}')
print(f'Pandas version: {pd.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Matplotlib version: {matplotlib.__version__}')

Python version: 3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]
Pandas version: 2.2.2
Numpy version: 1.26.4
Matplotlib version: 3.8.4


## 数据预处理

### 数据加载

In [6]:
# Read the raw records, then one-hot encode the categorical columns.
data = pd.get_dummies(pd.read_csv('./data/temps_extended.csv'))

# `actual` is the regression target.
target = data['actual']

# Everything except `actual` and `friend` is kept as a feature.
features = data.drop(columns=['actual', 'friend'])

### 划分数据集

In [9]:
from sklearn.model_selection import train_test_split
from rich import print

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1210
)

In [10]:
# Show the shapes of the four pieces produced by the train/test split:
# training features, test features, training labels, test labels.
print(f"训练集数据大小为: {X_train.shape}")
print(f"测试集数据集大小: {X_test.shape}")
print(f"训练集标签集大小为: {y_train.shape}")
print(f"测试集标签集大小: {y_test.shape}")

### 选择合适的特征进行建模

In [13]:
# Columns retained for modelling; every model below is fitted on this subset.
important_feature_name = [
    'temp_1',
    'average',
    'ws_1',
    'day',
    'temp_2',
    'year',
    'prcp_1',
]

# Apply the same column selection to both halves of the split.
X_train_important = X_train.loc[:, important_feature_name]
X_test_important = X_test.loc[:, important_feature_name]

## 基本模型

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Build a forest with library defaults (only the seed is set) so its
# hyperparameters can be listed before any tuning is done.
rfg = RandomForestRegressor(random_state=1184)

# Print every hyperparameter name together with its current value.
print(rfg.get_params())

### 模型调参之RandomizedSearchCV

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [25]:
# Number of trees in the forest: 200, 400, ..., 2000.
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
# How many features are considered when looking for the best split.
max_features = ['sqrt', 'log2']
# Maximum depth of each tree (None lets the trees grow without a depth limit).
max_depth = [10, 20, None]
# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10, 20]
# Minimum number of samples required at a leaf node.
min_samples_leaf = [2, 4, 6, 8]
# Whether each tree is built on a bootstrap sample of the training data.
bootstrap = [True, False]

# Candidate values for the randomized search. The ranges are a judgement call:
# keep them wide here and narrow them down in the follow-up grid search.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    # Was `boostrap`, an undefined name that raised NameError when the cell ran.
    'bootstrap': bootstrap,
}


In [30]:
%time

rfg = RandomForestRegressor()
rfg_random = RandomizedSearchCV(
    estimator=rfg,   # 你指定的算法是什么,
    param_distributions=random_grid,  # 参数的候选空间
    n_iter=500,  # 寻找参数组合的个数.
    scoring='neg_mean_absolute_error', # 评估方法
    cv=5,   # 交叉验证,
    verbose=2,  # 打印信息都数量
    random_state=1210,  #  随机种子
    n_jobs=-1,  #让所有的内核都参与计算
    )
rfg_random.fit(X_train_important,y_train)

CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [31]:
rfg_random.best_params_  # best parameter combination found by the randomized search

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

### 新旧模型对比

In [33]:
def evaluate(model, X_test=None, target=None):
    """Print the mean absolute error and a MAPE-based accuracy for ``model``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Features to predict on. When omitted, the notebook-level
        ``X_test_important`` is used.
    target
        True values aligned with ``X_test``. When omitted, the notebook-level
        ``y_test`` is used.

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    # Look the defaults up at call time. Binding them in the signature would
    # freeze whatever data existed when this cell ran, so a re-run of the
    # split cell would silently be ignored.
    if X_test is None:
        X_test = X_test_important
    if target is None:
        target = y_test

    predict = model.predict(X_test)
    error = np.abs(predict - target)
    # Mean absolute percentage error; not meaningful if `target` contains zeros.
    mape = 100 * np.mean(error / target)
    accuracy = 100 - mape
    print(f"平均的气候误差: {np.mean(error)}")
    print(f"准确率: {accuracy:.2f}%")

In [34]:
# Baseline: a forest with default hyperparameters, fitted on the selected features.
base_model = RandomForestRegressor(random_state=1210).fit(X_train_important, y_train)

# Score the baseline on the held-out test split.
evaluate(base_model, X_test_important, y_test)

In [36]:
# Model after hyperparameter tuning

best_random = rfg_random.best_estimator_ # take the best model found by the randomized search
evaluate(best_random, X_test=X_test_important, target=y_test)

In [37]:
# Show the randomized-search winner again; the fine-tuning grid in the next
# cell uses ranges around these values.
rfg_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

### 微调

In [45]:
# Narrow grid for the exhaustive search (8 * 5 * 5 * 5 * 5 * 1 = 5000 combinations).
params_grid = dict(
    n_estimators=range(380, 420, 5),   # 380, 385, ..., 415
    max_features=list(range(3, 8)),    # 3 through 7
    max_depth=[None, *range(3, 7)],    # unlimited, then 3 through 6
    min_samples_split=list(range(3, 8)),
    min_samples_leaf=list(range(2, 7)),
    bootstrap=[True],
)

In [46]:
%time
from sklearn.model_selection import GridSearchCV

rfg = RandomForestRegressor(random_state=1210)
rfg_gs = GridSearchCV(estimator=rfg,
                    param_grid=params_grid,
                    scoring='neg_mean_absolute_error', # 评估方法
                    cv=5,
                      n_jobs=-1,
                      verbose=2, 
                     )
rfg_gs.fit(X_train_important, y_train)

CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 5000 candidates, totalling 25000 fits


In [47]:
# Best parameter combination found by the grid search.
rfg_gs.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 3,
 'n_estimators': 415}

In [48]:
# Keep the best estimator selected by the grid search.
best_grid_ad = rfg_gs.best_estimator_

In [49]:
# Score the grid-search winner on the held-out test split, using the same
# helper as the baseline and the randomized-search model.
evaluate(best_grid_ad, X_test_important, y_test)

In [52]:
print(best_grid_ad.get_params()) # final parameter combination

## 调参任务总结:

1. 参数空间非常重要,它会对结果产生决定性影响,所以在任务开始前,需要先选定一个合适的取值区间.
2. 采用随机搜索可以更节约时间,尤其是在任务刚开始的时候:我们并不知道各个参数取什么值效果更好,可以先把参数的范围与间隔设定得大一些,用随机搜索确定最优参数的大致位置.
3. 网格搜索相当于地毯式搜索:当我们通过随机搜索拿到大致位置之后,想要在其附近寻找最优参数时,它就可以派上用场.两种方法搭配使用效果最好.
