In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib.ticker import FuncFormatter, MaxNLocator, ScalarFormatter
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

plt.style.use('seaborn-v0_8-paper')
# # 设置全局参数
plt.rcParams['figure.facecolor'] = 'white'  # 设置图形的背景为透明
plt.rcParams['axes.facecolor'] = 'white'  # 设置轴域的背景为透明
plt.rcParams['savefig.facecolor'] = 'white'  # 保存图像时背景透明
plt.rcParams['axes.grid'] = False
import seaborn as sns
import joblib

# plt.rcParams['font.family']='Times New Roman,Microsoft YaHei'# 设置字体族，中文为微软雅黑，英文为Times New Roman
plt.rcParams['font.sans-serif'] = 'Times New Roman'
plt.rcParams['mathtext.fontset'] = 'stix'  # 设置数学公式字体为stix
plt.rcParams["text.usetex"] = False
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
import statsmodels.api as sm

In [2]:
data = pd.read_csv('dataebasnona.csv')


In [9]:
data.columns=['Date', 'Isoprene', 'Temperature', 'longitude', 'latitude', 'equipment',
       'station', 'id', 'resolution', 'Humidity', 'Radiation', 'NOx',
       'PM10', 'SO2', 'O3', 'Day', 'Hour']

In [3]:
# 确保 'Date' 列为 datetime 类型
data['Date'] = pd.to_datetime(data['Date'])
# 提取月份
data['Month'] = data['Date'].dt.month
# 选择夏季（6月、7月、8月）数据
data = data[data['Month'].isin([6, 7, 8])]
# 去掉辅助的 'Month' 列
data = data.drop(columns=['Month'])
data['isoprene'] = data['isoprene'] / 1000

In [11]:
# 初始化一个空的列表，用于存储每个 id 的处理结果
data_processed_list = []

# 遍历每个 id
for id_value in data['id'].unique():
    # 提取当前 id 的数据
    data_id = data[data['id'] == id_value].copy()
    
    # 获取当前 id 的分辨率
    resolution = data_id['resolution'].iloc[0]
    
    # 根据分辨率确定分组方式
    if resolution in ['1d', '35h', '5d']:
        # 按天（1号到31号）分组，不包含年和月
        data_id['Day'] = data_id['Date'].dt.day
        data_id['Hour'] = 0  # 没有小时信息，设为0
        group_cols = ['Day']
        # 重新构建 Date 列，使用虚拟的年份和月份
        data_id['Date'] = pd.to_datetime({'year': 2023, 'month': 1, 'day': data_id['Day']})
    elif resolution in ['1h', '2659s']:
        # 按小时（0点到24点）分组，不包含年、月和日
        data_id['Day'] = 1  # 没有日期信息，设为1
        data_id['Hour'] = data_id['Date'].dt.hour
        group_cols = ['Hour']
        # 重新构建 Date 列，使用虚拟的年份、月份和日期
        data_id['Date'] = pd.to_datetime({'year': 2023, 'month': 1, 'day': data_id['Day'], 'hour': data_id['Hour']})
    else:
        # 如果分辨率不在已知范围内，跳过该 id
        print(f"未知的分辨率：{resolution}，跳过 id：{id_value}")
        continue
    
 # 提取需要的列
    columns_to_keep = ['Date', 'Day', 'Hour', 'id', 'resolution', 'Isoprene', 'Temperature', 'Humidity', 'Radiation', 'NOx', 'PM10', 'SO2', 'O3', 'longitude', 'latitude']
    data_id = data_id[columns_to_keep]
    
    # 确保各列为数值型
    for col in ['Isoprene', 'Temperature', 'Humidity', 'Radiation', 'NOx', 'PM10', 'SO2', 'O3']:
        data_id[col] = pd.to_numeric(data_id[col], errors='coerce')
    
    # 按照分组列分组，计算均值和标准差
    grouped = data_id.groupby(group_cols)
    agg_dict = {
        'Isoprene': ['mean', 'std'],
        'Temperature': ['mean', 'std'],
        'Humidity': ['mean', 'std'],
        'Radiation': ['mean', 'std'],
        'NOx': ['mean', 'std'],
        'PM10': ['mean', 'std'],
        'SO2': ['mean', 'std'],
        'O3': ['mean', 'std'],
        'longitude': 'first',
        'latitude': 'first',
        'id': 'first',
        'resolution': 'first',
        'Day': 'first',
        'Hour': 'first'
    }
    data_id_agg = grouped.agg(agg_dict).reset_index()
    
    # 扁平化多级列名
    data_id_agg.columns = ['_'.join(col).strip('_') for col in data_id_agg.columns.values]
    
    # 重新构建 Date 列
    if resolution in ['1d', '35h', '5d']:
        data_id_agg['Date'] = pd.to_datetime({'year': 2023, 'month': 1, 'day': data_id_agg['Day_first']})
    elif resolution in ['1h', '2659s']:
        data_id_agg['Date'] = pd.to_datetime({'year': 2023, 'month': 1, 'day': data_id_agg['Day_first'], 'hour': data_id_agg['Hour_first']})
    
    # 选择需要的列
    columns_to_select = [
        'Date', 'Day_first', 'Hour_first', 'id_first', 'resolution_first',
        'Isoprene_mean', 'Isoprene_std', 'Temperature_mean', 'Temperature_std',
        'Humidity_mean', 'Humidity_std', 'Radiation_mean', 'Radiation_std',
        'NOx_mean', 'NOx_std', 'PM10_mean', 'PM10_std', 'SO2_mean', 'SO2_std', 'O3_mean', 'O3_std',
        'longitude_first', 'latitude_first'
    ]
    data_id_final = data_id_agg[columns_to_select]
    
    # 重命名列名
    data_id_final.rename(columns={
        'Day_first': 'Day',
        'Hour_first': 'Hour',
        'id_first': 'id',
        'resolution_first': 'resolution',
        'Isoprene_mean': 'Isoprene', 'Isoprene_std': 'Isoprene_std',
        'Temperature_mean': 'Temperature', 'Temperature_std': 'Temperature_std',
        'Humidity_mean': 'Humidity', 'Humidity_std': 'Humidity_std',
        'Radiation_mean': 'Radiation', 'Radiation_std': 'Radiation_std',
        'NOx_mean': 'NOx', 'NOx_std': 'NOx_std',
        'PM10_mean': 'PM10', 'PM10_std': 'PM10_std',
        'SO2_mean': 'SO2', 'SO2_std': 'SO2_std',
        'O3_mean': 'O3', 'O3_std': 'O3_std',
        'longitude_first': 'longitude',
        'latitude_first': 'latitude'
    }, inplace=True)
    
    # 将处理后的数据添加到列表中
    data_processed_list.append(data_id_final)
    
# 合并所有 id 的数据
data_final = pd.concat(data_processed_list, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_id_final.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_id_final.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_id_final.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_id_final.rename(columns={
A value is trying to be set on a copy of a s

In [5]:
data['Day'] = data['Date'].dt.day
data['Hour'] = data['Date'].dt.hour

In [12]:
data_final

Unnamed: 0,Date,Day,Hour,id,resolution,Isoprene,Isoprene_std,Temperature,Temperature_std,Humidity,...,NOx,NOx_std,PM10,PM10_std,SO2,SO2_std,O3,O3_std,longitude,latitude
0,2023-01-01 00:00:00,1,0,FR0030R,1d,0.064279,,13.392000,,64.583333,...,14.795833,,14.495833,,1.270619,,83.716667,,2.964886,45.772223
1,2023-01-02 00:00:00,2,0,FR0030R,5d,0.318000,0.149880,14.588361,3.854793,63.086905,...,14.760795,5.023284,12.095159,3.799197,1.270619,0.000000,49.251829,4.791676,2.964886,45.772223
2,2023-01-03 00:00:00,3,0,FR0030R,35h,0.063434,0.002485,14.837000,12.604885,54.816071,...,13.112597,11.415091,10.345760,9.626370,1.270619,0.000000,65.254351,10.149921,2.964886,45.772223
3,2023-01-04 00:00:00,4,0,FR0030R,5d,0.096000,0.003464,14.957300,8.456536,63.036905,...,17.669283,6.126421,14.641250,4.561708,1.270619,0.000000,54.301856,1.942638,2.964886,45.772223
4,2023-01-06 00:00:00,6,0,FR0030R,5d,0.253441,0.277977,18.878829,4.071279,55.462500,...,15.935697,7.752054,13.866124,4.177055,1.270619,0.000000,59.800035,12.032650,2.964886,45.772223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2023-01-01 19:00:00,1,19,CH0010U,1h,0.064134,0.024972,19.899242,3.061881,67.446970,...,18.919383,6.417386,13.333171,4.454515,1.254127,0.220969,70.331228,22.321877,8.530419,47.377586
126,2023-01-01 20:00:00,1,20,CH0010U,1h,0.069162,0.046761,18.419697,2.901205,73.530303,...,19.184646,7.740268,13.445946,4.369649,1.241901,0.195904,63.881207,21.676234,8.530419,47.377586
127,2023-01-01 21:00:00,1,21,CH0010U,1h,0.072903,0.070800,17.300000,2.633142,77.776923,...,19.181815,7.628538,13.376698,3.581281,1.218334,0.179213,62.075746,17.192250,8.530419,47.377586
128,2023-01-01 22:00:00,1,22,CH0010U,1h,0.068124,0.043187,16.891176,2.675691,79.580882,...,17.595481,6.042149,13.458098,3.241726,1.216651,0.175464,61.141420,15.067919,8.530419,47.377586
