In [63]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import pylab as plt
import joblib


path = r"D:\ALL\AIoT\GroupProject\data.xlsx"
df = pd.read_excel(path)
df1 = df['天气状况']

# 转换时间列为datetime格式
time_data = df["当地时间"]
time_data = pd.to_datetime(time_data)
df["年"] = time_data.dt.year
df["月"] = time_data.dt.month
df["日"] = time_data.dt.day
df["小时"] = time_data.dt.hour

df = df.drop('当地时间', axis=1)
df = df.drop('年', axis=1)
df = df.drop('Pa', axis=1)
df = df.drop('前一小时天气', axis=1)

# 选择特征列和目标列
target = '天气状况'
df = df.drop('天气状况', axis=1)
print(df)

# 计算过去一小时的气温、气压、湿度的变化值
# data['Temp_change_1'] = data['Temp'] - data['Temp'].shift(1)
# data['Pressure_change_1'] = data['Pressure'] - data['Pressure'].shift(1)
# data['humidity_change_1'] = data['humidity'] - data['humidity'].shift(1)

# 划分训练集和测试集

X_train, X_test, y_train, y_test = train_test_split(
    df, df1, test_size=0.15,
    random_state=42)



# 数据归一化
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# 定义随机森林模型
rf_model = RandomForestClassifier(max_depth=20, max_features='log2', min_samples_leaf=1, n_estimators=200)

# 训练随机森林模型
joblib.dump(rf_model, 'D:\ALL\AIoT\GroupProject/DN.pkl')
rf_model.fit(X_train_scaled, y_train)

# 在训练集上进行预测
y_train_pred = rf_model.predict(X_train_scaled)

# 计算训练集准确度
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Train accuracy:', train_accuracy)

# 在测试集上进行预测
y_test_pred = rf_model.predict(X_test_scaled)

# 计算测试集准确度
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy:', test_accuracy)

weather_labels = ['晴朗', '多云', '阴天', '雾天', '下雨']
predicted_weather = [weather_labels[i] for i in y_test_pred]
print("测试集预测结果：", predicted_weather)

weather_labels = ['晴朗', '多云', '阴天', '雾天', '下雨']
actual_weather = [weather_labels[i] for i in y_test]
print("测试集实际结果：", actual_weather)

print(predicted_weather[-1])

      T（大气温度）  Po（气压）  相对湿度  降水   月  日  小时
0        25.7   756.6    86   1  11  7  17
1        25.5   758.0    88   1  11  7  14
2        25.9   758.2    88   1  11  7  11
3        26.6   757.3    83   1  11  7   8
4        27.0   756.0    82   0  11  7   5
...       ...     ...   ...  ..  .. ..  ..
2919     28.6   755.5    78   0  11  7  20
2920     30.3   754.3    67   0  11  7  17
2921     31.1   755.2    61   0  11  7  14
2922     29.3   757.0    72   0  11  7  11
2923     26.6   756.4    88   1  11  7   8

[2924 rows x 7 columns]
Train accuracy: 1.0
Test accuracy: 0.7927107061503417
测试集预测结果： ['下雨', '晴朗', '下雨', '多云', '下雨', '多云', '晴朗', '晴朗', '下雨', '晴朗', '多云', '晴朗', '晴朗', '晴朗', '下雨', '下雨', '多云', '下雨', '多云', '多云', '晴朗', '下雨', '晴朗', '晴朗', '晴朗', '多云', '下雨', '晴朗', '下雨', '晴朗', '晴朗', '晴朗', '晴朗', '多云', '晴朗', '晴朗', '下雨', '晴朗', '多云', '下雨', '晴朗', '晴朗', '多云', '晴朗', '晴朗', '多云', '晴朗', '下雨', '晴朗', '晴朗', '晴朗', '晴朗', '多云', '晴朗', '晴朗', '晴朗', '晴朗', '晴朗', '晴朗', '晴朗', '下雨', '晴朗', '多云', '晴朗', '晴朗', '晴朗',

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

path = r"D:\ALL\AIoT\GroupProject\data.xlsx"
df = pd.read_excel(path)
df1 = df['天气状况']

# 转换时间列为datetime格式
time_data = df["当地时间"]
time_data = pd.to_datetime(time_data)
df["年"] = time_data.dt.year
df["月"] = time_data.dt.month
df["日"] = time_data.dt.day
df["小时"] = time_data.dt.hour

print(df["小时"])
df = df.drop('当地时间', axis=1)
df = df.drop('年', axis=1)
df = df.drop('Pa', axis=1)
df = df.drop('前一小时天气', axis=1)

# 准备数据和目标变量
X = df
y = df1

# 定义要调优的参数网格
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 5],
    'n_estimators': [100, 150, 200, 250, 300, 350, 400]
}

# 创建随机森林分类器
rf_model = RandomForestClassifier()

# 创建GridSearchCV对象
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)

# 执行网格搜索
grid_search.fit(X, y)

# 输出最佳参数组合和对应的模型性能
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)


0       17
1       14
2       11
3        8
4        5
        ..
2919    20
2920    17
2921    14
2922    11
2923     8
Name: 小时, Length: 2924, dtype: int64
Best Parameters:  {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 200}
Best Score:  0.9993162393162394


In [61]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import pylab as plt


# 读取数据集
data = pd.read_excel('D:\ALL\AIoT\GroupProject\data - 副本.xlsx')

# 转换时间列为datetime格式
data['Time'] = pd.to_datetime(data['Time'])
data['Year'] = data['Time'].dt.year
data['Month'] = data['Time'].dt.month
data['Day'] = data['Time'].dt.day
data['Hour'] = data['Time'].dt.hour

# 选择特征列和目标列
features = ['Temp', 'Pressure', 'humidity', 'raindrop', 'Month', 'Day', 'Hour']
target = 'weather'

# 填补缺失值
data = data.fillna(data.mean())

# 计算过去一小时的气温、气压、湿度的变化值
# data['Temp_change_1'] = data['Temp'] - data['Temp'].shift(1)
# data['Pressure_change_1'] = data['Pressure'] - data['Pressure'].shift(1)
# data['humidity_change_1'] = data['humidity'] - data['humidity'].shift(1)

# 删除包含NaN值的行
data = data.dropna()

# 划分训练集和测试集
df = data[features]
df1 = data[target]


X_train, X_test, y_train, y_test = train_test_split(
    df,df1, test_size=0.15,
    random_state=42)



# 数据归一化
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 定义随机森林模型
rf_model = RandomForestClassifier(max_depth=20, max_features='log2', min_samples_leaf=1, n_estimators=200)

# 训练随机森林模型
rf_model.fit(X_train_scaled, y_train)

# 在训练集上进行预测
y_train_pred = rf_model.predict(X_train_scaled)

# 计算训练集准确度
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Train accuracy:', train_accuracy)

# 在测试集上进行预测
y_test_pred = rf_model.predict(X_test_scaled)

# 计算测试集准确度
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy:', test_accuracy)

  data = data.fillna(data.mean())


Train accuracy: 1.0
Test accuracy: 0.7813211845102506
