In [1]:
import pandas as pd

# Load the pollutant concentration workbook (attachment 1) and the
# meteorological workbook (attachment 2).
pollution_data = pd.read_excel("附件1：污染物浓度数据.xlsx", engine="openpyxl")
weather_data = pd.read_excel("附件2：气象数据.xlsx", engine="openpyxl")

# NOTE: a one-hot encoding of the "质量等级" column was drafted here earlier but
# is disabled; that column is dropped from the merged frame below instead.

# Build the modelling table in a single pipeline:
#   1. join pollution rows to weather rows on the date keys
#      (年/月/日 on the left against V04001/V04002/V04003 on the right),
#   2. discard the join keys plus V01301, 质量等级 and AQI,
#   3. keep only rows without any missing value.
merged_data = (
    pollution_data
    .merge(
        weather_data,
        left_on=["年", "月", "日"],
        right_on=["V04001", "V04002", "V04003"],
    )
    .drop(columns=["年", "月", "日", "V04001", "V04002", "V04003", "V01301", "质量等级", "AQI"])
    .dropna(axis=0)
)

# 80/20 split: sample the training rows with a fixed seed, then take every
# remaining row (by index label) as the test set.
train_data = merged_data.sample(frac=0.8, random_state=1)
test_data = merged_data.drop(index=train_data.index)

# Persist both partitions as UTF-8 CSV files without the index column.
for frame, path in ((train_data, "train_data1.csv"), (test_data, "test_data1.csv")):
    frame.to_csv(path, index=False, encoding="utf-8")


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Separate predictors from the target: PM2.5 is the value to predict, every
# other remaining column is used as a feature.
target_col = "PM2.5"
X_train, y_train = train_data.drop(columns=target_col), train_data[target_col]
X_test, y_test = test_data.drop(columns=target_col), test_data[target_col]

# Build and fit the random forest (100 trees, fixed seed); fit() returns the
# estimator itself, so construction and training are chained.
rf = RandomForestRegressor(n_estimators=100, random_state=43).fit(X_train, y_train)

# Predict on the held-out rows and report the root mean squared error (RMSE).
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

# Print each feature's importance in the original column order.
feature_importances = rf.feature_importances_
for position, feature in enumerate(X_train.columns):
    importance = feature_importances[position]
    print(f"{feature}: {importance}")


RMSE: 11.355211237160107
PM10: 0.7021186288751761
O3: 0.009863808043710785
SO2: 0.013809234740810961
NO2: 0.016233448202952534
CO: 0.1154627604332011
V13305: 0.005273244505890729
V10004_700: 0.01157091697600541
V11291_700: 0.02166088345110405
V12001_700: 0.08899860689584263
V13003_700: 0.015008467875305699


In [12]:
# Column labels of the training feature matrix, in model input order.
feature_names = X_train.columns.tolist()

# Pair every feature name with the importance the fitted forest assigned to it.
feature_importances = [
    (name, score) for name, score in zip(feature_names, rf.feature_importances_)
]

# Rank the pairs from most to least important (stable sort on a copy, so the
# unsorted list above is left untouched).
sorted_feature_importances = list(feature_importances)
sorted_feature_importances.sort(key=lambda pair: pair[1], reverse=True)

# Report the ranking, one feature per line.
for entry in sorted_feature_importances:
    feature, importance = entry
    print(f"Feature: {feature}, Importance: {importance}")


Feature: PM10, Importance: 0.7021186288751761
Feature: CO, Importance: 0.1154627604332011
Feature: V12001_700, Importance: 0.08899860689584263
Feature: V11291_700, Importance: 0.02166088345110405
Feature: NO2, Importance: 0.016233448202952534
Feature: V13003_700, Importance: 0.015008467875305699
Feature: SO2, Importance: 0.013809234740810961
Feature: V10004_700, Importance: 0.01157091697600541
Feature: O3, Importance: 0.009863808043710785
Feature: V13305, Importance: 0.005273244505890729
