## 第零步：分析库导入与分析目标设置
为了更方便地进行数据处理与分析，我们定义了一个基类`DataframeAnalysis`,将输入数据统一处理成`pandas.DataFrame`格式.

目前支持的数据格式有：
- `csv`
- `xlsx`
- `parquet`

输入参数有：
- `root_path`: 所有数据存放的根路径
- `data_path`: 各个数据集在根路径下的子路径

In [58]:
pwd

'/Users/yumeng/Desktop/Data-Process-Library/example/XiexinForecasting/PhotovoltaicPower'

In [59]:
from Analysis.DataframeAnalysis import DataframeAnalysis
import plotly.express as px
import pandas as pd
import os
import plotly.graph_objects as go
# 导入中冶赛迪高炉应用数据
root_path = '/Users/yumeng/Desktop/data/Xiexindata/photovoltaic_power_plant_data'
data_path = 'ps_id_7538_merged.csv'
df = pd.read_csv(os.path.join(root_path, data_path), encoding='utf-8')
DA = DataframeAnalysis(dataFrame=df)
DA.root_path = root_path
DA.data_path = data_path

DataAnalysis loading data from DataFrame with shape: (92362, 30)


In [60]:
# 获得所有列名
DA.df_raw.columns

Index(['sn', 'ts', 'er', 'pac', 'lng', 'lat', 'roof_type', 'angle',
       'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'pressure_msl', 'surface_pressure', 'precipitation_probability',
       'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'shortwave_radiation', 'direct_radiation', 'direct_normal_irradiance',
       'diffuse_radiation', 'wind_speed_10m', 'wind_speed_80m',
       'wind_speed_120m', 'wind_direction_10m', 'wind_direction_80m',
       'wind_direction_120m', 'variable_date', 'full_time'],
      dtype='object')

In [61]:
DA.df_raw

Unnamed: 0,sn,ts,er,pac,lng,lat,roof_type,angle,temperature_2m,relative_humidity_2m,...,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,variable_date,full_time
0,GBBZT01500A231107192,2024-01-26 15:45:00,,3.46,119.595375,33.599117,斜屋顶双坡,15.0,6.3235,43.8145,...,353.5400,63.6639,1.3823,2.3048,2.5679,317.3759,312.3407,310.9549,2024-01-26 15:45:00,2024-01-26 15:45:00
1,GBBZT01500A231107192,2024-01-26 15:50:00,,3.34,119.595375,33.599117,斜屋顶双坡,15.0,,,...,,,,,,,,,,2024-01-26 15:50:00
2,GBBZT01500A231107192,2024-01-26 15:55:00,,2.80,119.595375,33.599117,斜屋顶双坡,15.0,,,...,,,,,,,,,,2024-01-26 15:55:00
3,GBBZT01500A231107192,2024-01-26 16:00:00,,3.07,119.595375,33.599117,斜屋顶双坡,15.0,6.0880,45.2670,...,296.0178,52.0977,1.2385,2.1577,2.4244,323.0050,316.5961,314.6463,2024-01-26 16:00:00,2024-01-26 16:00:00
4,GBBZT01500A231107192,2024-01-26 16:05:00,,0.02,119.595375,33.599117,斜屋顶双坡,15.0,,,...,,,,,,,,,,2024-01-26 16:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92357,GBBZT01500A231107192,2025-05-10 19:15:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.7458,72.9361,...,0.4203,1.0361,2.4739,5.0567,5.8143,281.8197,288.4035,289.9008,2025-05-10 19:15:00,2025-05-10 19:15:00
92358,GBBZT01500A231107192,2025-05-10 19:20:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,,,...,,,,,,,,,,2025-05-10 19:20:00
92359,GBBZT01500A231107192,2025-05-10 19:25:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,,,...,,,,,,,,,,2025-05-10 19:25:00
92360,GBBZT01500A231107192,2025-05-10 19:30:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.5385,73.4619,...,0.0000,0.0000,2.4603,5.0890,5.8638,276.3420,284.3351,286.1410,2025-05-10 19:30:00,2025-05-10 19:30:00


In [62]:
DA.plot_column_plotly(columns=['pac'], start_point=0, length=100000)

## 第一步：对齐时间戳
按照功率数据的标准，插值为5min频率

In [63]:
#检查full_time是否全是5min间隔


import pandas as pd

# 确保 full_time 为 datetime 类型
df['full_time'] = pd.to_datetime(df['full_time'])

# 创建严格的 5 分钟间隔时间戳，起止时间取自原 full_time 范围
new_full_time = pd.date_range(
    start=df['full_time'].min(),
    end=df['full_time'].max(),
    freq='5min'  # 推荐用 '5min' 替代 '5T'
)

# 替换旧的 full_time 或创建一个新列
df = df.set_index('full_time')         # 将 full_time 设为索引
df = df.reindex(new_full_time)         # 按严格 5min 间隔重建索引
df.index.name = 'full_time'            # 恢复索引名
df = df.reset_index()                  # 如需使用为普通列

# 保存回 DA.df_raw
DA.df_raw = df



DA.df_raw

Unnamed: 0,full_time,sn,ts,er,pac,lng,lat,roof_type,angle,temperature_2m,...,direct_radiation,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,variable_date
0,2024-01-26 15:45:00,GBBZT01500A231107192,2024-01-26 15:45:00,,3.46,119.595375,33.599117,斜屋顶双坡,15.0,6.3235,...,84.5228,353.5400,63.6639,1.3823,2.3048,2.5679,317.3759,312.3407,310.9549,2024-01-26 15:45:00
1,2024-01-26 15:50:00,GBBZT01500A231107192,2024-01-26 15:50:00,,3.34,119.595375,33.599117,斜屋顶双坡,15.0,,...,,,,,,,,,,
2,2024-01-26 15:55:00,GBBZT01500A231107192,2024-01-26 15:55:00,,2.80,119.595375,33.599117,斜屋顶双坡,15.0,,...,,,,,,,,,,
3,2024-01-26 16:00:00,GBBZT01500A231107192,2024-01-26 16:00:00,,3.07,119.595375,33.599117,斜屋顶双坡,15.0,6.0880,...,62.0328,296.0178,52.0977,1.2385,2.1577,2.4244,323.0050,316.5961,314.6463,2024-01-26 16:00:00
4,2024-01-26 16:05:00,GBBZT01500A231107192,2024-01-26 16:05:00,,0.02,119.595375,33.599117,斜屋顶双坡,15.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135402,2025-05-10 19:15:00,GBBZT01500A231107192,2025-05-10 19:15:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.7458,...,0.0493,0.4203,1.0361,2.4739,5.0567,5.8143,281.8197,288.4035,289.9008,2025-05-10 19:15:00
135403,2025-05-10 19:20:00,GBBZT01500A231107192,2025-05-10 19:20:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,,...,,,,,,,,,,
135404,2025-05-10 19:25:00,GBBZT01500A231107192,2025-05-10 19:25:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,,...,,,,,,,,,,
135405,2025-05-10 19:30:00,GBBZT01500A231107192,2025-05-10 19:30:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.5385,...,0.0000,0.0000,0.0000,2.4603,5.0890,5.8638,276.3420,284.3351,286.1410,2025-05-10 19:30:00


## 第二步：缺失值分析

检查数据集中有无缺失值，包括：
- 缺失的特征值
- 缺失的时间戳

In [64]:
# 获得'sn'列包含缺失值的index条目
DA.getNanIndex(start_col='sn',end_col='sn')

(Index([     5,     29,     30,     31,     32,     33,     34,     35,     36,
            37,
        ...
        135223, 135224, 135225, 135226, 135227, 135228, 135229, 135230, 135231,
        135232],
       dtype='int64', length=64578),
 True)

In [65]:
# 'sn'列值都是一样的，用上一行值进行填充
DA.df_raw[['sn']] = DA.df_raw[['sn']].ffill()

In [66]:
# 检查'sn'列空值情况
DA.getNanIndex(start_col='sn',end_col='sn')

(Index([], dtype='int64'), False)

In [67]:
# 获得'ts'列包含缺失值的index条目
DA.getNanIndex(start_col='ts',end_col='ts')

(Index([     5,     29,     30,     31,     32,     33,     34,     35,     36,
            37,
        ...
        135223, 135224, 135225, 135226, 135227, 135228, 135229, 135230, 135231,
        135232],
       dtype='int64', length=64578),
 True)

In [68]:
DA.df_raw['ts'] = DA.df_raw['ts'].fillna(DA.df_raw['full_time'])

In [69]:
# 检查'ts'列空值情况
DA.getNanIndex(start_col='ts',end_col='ts')

(Index([], dtype='int64'), False)

In [70]:
# 获得'er'列包含缺失值的index条目
DA.getNanIndex(start_col='er',end_col='er')

(Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
             9,
        ...
        135223, 135224, 135225, 135226, 135227, 135228, 135229, 135230, 135231,
        135232],
       dtype='int64', length=65519),
 True)

In [71]:
# 'er'列空值和 0 值都代表无异常，将所有空值全部填充为 0 值
DA.df_raw.loc[:, 'er': 'er'] = DA.df_raw.loc[:, 'er': 'er'].fillna(0)

In [72]:
# 检查'er'列空值情况
DA.getNanIndex(start_col='er',end_col='er')

(Index([], dtype='int64'), False)

In [73]:
# 检查'pac'列空值情况
DA.getNanIndex(start_col='pac',end_col='pac')

(Index([     5,     29,     30,     31,     32,     33,     34,     35,     36,
            37,
        ...
        135223, 135224, 135225, 135226, 135227, 135228, 135229, 135230, 135231,
        135232],
       dtype='int64', length=64578),
 True)

In [74]:
# 'pac'列空值和 0 值都代表无异常，将所有空值全部填充为 0 值
DA.df_raw.loc[:, 'pac': 'pac'] = DA.df_raw.loc[:, 'pac': 'pac'].fillna(0)

In [75]:
# 检查'pac'列空值情况
DA.getNanIndex(start_col='pac',end_col='pac')

(Index([], dtype='int64'), False)

In [76]:
# 检查'lng', 'lat', 'roof_type', 'angle'列的空值情况
DA.getNanIndex(start_col='lng',end_col='angle')

(Index([     5,     29,     30,     31,     32,     33,     34,     35,     36,
            37,
        ...
        135223, 135224, 135225, 135226, 135227, 135228, 135229, 135230, 135231,
        135232],
       dtype='int64', length=64578),
 True)

In [77]:
# 'lng', 'lat', 'roof_type', 'angle'列值都是一样的，用上一行值进行填充
DA.df_raw[['lng', 'lat', 'roof_type', 'angle']] = DA.df_raw[['lng', 'lat', 'roof_type', 'angle']].ffill()

In [78]:
# 检查'lng', 'lat', 'roof_type', 'angle'列的空值情况
DA.getNanIndex(start_col='lng',end_col='angle')

(Index([], dtype='int64'), False)

In [79]:
# 检查气象数据列的空值情况
DA.getNanIndex(start_col='temperature_2m',end_col='wind_direction_120m')

(Index([     1,      2,      4,      5,      7,      8,     10,     11,     13,
            14,
        ...
        135392, 135394, 135395, 135397, 135398, 135400, 135401, 135403, 135404,
        135406],
       dtype='int64', length=90271),
 True)

In [80]:
# 对气象数据进行线性插值
DA.getInterpolate(start_col='temperature_2m', end_col='wind_direction_120m', method='linear')

kwargs: {'method': 'linear'}


Unnamed: 0,full_time,sn,ts,er,pac,lng,lat,roof_type,angle,temperature_2m,...,direct_radiation,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,variable_date
0,2024-01-26 15:45:00,GBBZT01500A231107192,2024-01-26 15:45:00,0.0,3.46,119.595375,33.599117,斜屋顶双坡,15.0,6.323500,...,84.522800,353.540000,63.663900,1.382300,2.304800,2.567900,317.375900,312.340700,310.954900,2024-01-26 15:45:00
1,2024-01-26 15:50:00,GBBZT01500A231107192,2024-01-26 15:50:00,0.0,3.34,119.595375,33.599117,斜屋顶双坡,15.0,6.245000,...,77.026133,334.365933,59.808500,1.334367,2.255767,2.520067,319.252267,313.759167,312.185367,
2,2024-01-26 15:55:00,GBBZT01500A231107192,2024-01-26 15:55:00,0.0,2.80,119.595375,33.599117,斜屋顶双坡,15.0,6.166500,...,69.529467,315.191867,55.953100,1.286433,2.206733,2.472233,321.128633,315.177633,313.415833,
3,2024-01-26 16:00:00,GBBZT01500A231107192,2024-01-26 16:00:00,0.0,3.07,119.595375,33.599117,斜屋顶双坡,15.0,6.088000,...,62.032800,296.017800,52.097700,1.238500,2.157700,2.424400,323.005000,316.596100,314.646300,2024-01-26 16:00:00
4,2024-01-26 16:05:00,GBBZT01500A231107192,2024-01-26 16:05:00,0.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,6.044467,...,55.607100,274.903267,48.209300,1.215867,2.108800,2.368067,325.385000,318.439200,316.222033,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135402,2025-05-10 19:15:00,GBBZT01500A231107192,2025-05-10 19:15:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.745800,...,0.049300,0.420300,1.036100,2.473900,5.056700,5.814300,281.819700,288.403500,289.900800,2025-05-10 19:15:00
135403,2025-05-10 19:20:00,GBBZT01500A231107192,2025-05-10 19:20:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.676700,...,0.032867,0.280200,0.690733,2.469367,5.067467,5.830800,279.993800,287.047367,288.647533,
135404,2025-05-10 19:25:00,GBBZT01500A231107192,2025-05-10 19:25:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.607600,...,0.016433,0.140100,0.345367,2.464833,5.078233,5.847300,278.167900,285.691233,287.394267,
135405,2025-05-10 19:30:00,GBBZT01500A231107192,2025-05-10 19:30:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.538500,...,0.000000,0.000000,0.000000,2.460300,5.089000,5.863800,276.342000,284.335100,286.141000,2025-05-10 19:30:00


In [81]:
# 检查气象数据列的空值情况
DA.getNanIndex(start_col='temperature_2m',end_col='wind_direction_120m')

(Index([], dtype='int64'), False)

In [82]:
DA.df_raw['variable_date'] = DA.df_raw['variable_date'].fillna(DA.df_raw['full_time'])

In [83]:
# 检查所有列的空值情况
DA.getNanIndex(start_col='sn',end_col='full_time')

(RangeIndex(start=0, stop=135407, step=1), True)

In [84]:
# 检查时间戳缺失情况
missing_dates, flag, timestamps, expected_range = DA.checkDateContinuity('ts', freq='5min')
print(f"Missing dates {flag}: {missing_dates}")

Missing dates True: DatetimeIndex([], dtype='datetime64[ns]', freq='5min')


In [85]:
DA.df_raw=DA.df_raw.set_index(timestamps)
DA.df_raw

Unnamed: 0_level_0,full_time,sn,ts,er,pac,lng,lat,roof_type,angle,temperature_2m,...,direct_radiation,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,variable_date
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-26 15:45:00,2024-01-26 15:45:00,GBBZT01500A231107192,2024-01-26 15:45:00,0.0,3.46,119.595375,33.599117,斜屋顶双坡,15.0,6.323500,...,84.522800,353.540000,63.663900,1.382300,2.304800,2.567900,317.375900,312.340700,310.954900,2024-01-26 15:45:00
2024-01-26 15:50:00,2024-01-26 15:50:00,GBBZT01500A231107192,2024-01-26 15:50:00,0.0,3.34,119.595375,33.599117,斜屋顶双坡,15.0,6.245000,...,77.026133,334.365933,59.808500,1.334367,2.255767,2.520067,319.252267,313.759167,312.185367,2024-01-26 15:50:00
2024-01-26 15:55:00,2024-01-26 15:55:00,GBBZT01500A231107192,2024-01-26 15:55:00,0.0,2.80,119.595375,33.599117,斜屋顶双坡,15.0,6.166500,...,69.529467,315.191867,55.953100,1.286433,2.206733,2.472233,321.128633,315.177633,313.415833,2024-01-26 15:55:00
2024-01-26 16:00:00,2024-01-26 16:00:00,GBBZT01500A231107192,2024-01-26 16:00:00,0.0,3.07,119.595375,33.599117,斜屋顶双坡,15.0,6.088000,...,62.032800,296.017800,52.097700,1.238500,2.157700,2.424400,323.005000,316.596100,314.646300,2024-01-26 16:00:00
2024-01-26 16:05:00,2024-01-26 16:05:00,GBBZT01500A231107192,2024-01-26 16:05:00,0.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,6.044467,...,55.607100,274.903267,48.209300,1.215867,2.108800,2.368067,325.385000,318.439200,316.222033,2024-01-26 16:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-10 19:15:00,2025-05-10 19:15:00,GBBZT01500A231107192,2025-05-10 19:15:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.745800,...,0.049300,0.420300,1.036100,2.473900,5.056700,5.814300,281.819700,288.403500,289.900800,2025-05-10 19:15:00
2025-05-10 19:20:00,2025-05-10 19:20:00,GBBZT01500A231107192,2025-05-10 19:20:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.676700,...,0.032867,0.280200,0.690733,2.469367,5.067467,5.830800,279.993800,287.047367,288.647533,2025-05-10 19:20:00
2025-05-10 19:25:00,2025-05-10 19:25:00,GBBZT01500A231107192,2025-05-10 19:25:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.607600,...,0.016433,0.140100,0.345367,2.464833,5.078233,5.847300,278.167900,285.691233,287.394267,2025-05-10 19:25:00
2025-05-10 19:30:00,2025-05-10 19:30:00,GBBZT01500A231107192,2025-05-10 19:30:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.538500,...,0.000000,0.000000,0.000000,2.460300,5.089000,5.863800,276.342000,284.335100,286.141000,2025-05-10 19:30:00


In [86]:
DA.df_raw = DA.df_raw.reindex(expected_range)

In [87]:
# 获取插入位置（wind_direction_120m 的索引 + 1）
insert_pos = DA.df_raw.columns.get_loc('wind_direction_120m') + 1

# 插入 'pac_copy' 列作为 'pac' 的副本
DA.df_raw.insert(loc=insert_pos, column='pac_copy', value=DA.df_raw['pac'])

In [88]:
DA.df_raw

Unnamed: 0,full_time,sn,ts,er,pac,lng,lat,roof_type,angle,temperature_2m,...,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,pac_copy,variable_date
2024-01-26 15:45:00,2024-01-26 15:45:00,GBBZT01500A231107192,2024-01-26 15:45:00,0.0,3.46,119.595375,33.599117,斜屋顶双坡,15.0,6.323500,...,353.540000,63.663900,1.382300,2.304800,2.567900,317.375900,312.340700,310.954900,3.46,2024-01-26 15:45:00
2024-01-26 15:50:00,2024-01-26 15:50:00,GBBZT01500A231107192,2024-01-26 15:50:00,0.0,3.34,119.595375,33.599117,斜屋顶双坡,15.0,6.245000,...,334.365933,59.808500,1.334367,2.255767,2.520067,319.252267,313.759167,312.185367,3.34,2024-01-26 15:50:00
2024-01-26 15:55:00,2024-01-26 15:55:00,GBBZT01500A231107192,2024-01-26 15:55:00,0.0,2.80,119.595375,33.599117,斜屋顶双坡,15.0,6.166500,...,315.191867,55.953100,1.286433,2.206733,2.472233,321.128633,315.177633,313.415833,2.80,2024-01-26 15:55:00
2024-01-26 16:00:00,2024-01-26 16:00:00,GBBZT01500A231107192,2024-01-26 16:00:00,0.0,3.07,119.595375,33.599117,斜屋顶双坡,15.0,6.088000,...,296.017800,52.097700,1.238500,2.157700,2.424400,323.005000,316.596100,314.646300,3.07,2024-01-26 16:00:00
2024-01-26 16:05:00,2024-01-26 16:05:00,GBBZT01500A231107192,2024-01-26 16:05:00,0.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,6.044467,...,274.903267,48.209300,1.215867,2.108800,2.368067,325.385000,318.439200,316.222033,0.02,2024-01-26 16:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-10 19:15:00,2025-05-10 19:15:00,GBBZT01500A231107192,2025-05-10 19:15:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.745800,...,0.420300,1.036100,2.473900,5.056700,5.814300,281.819700,288.403500,289.900800,0.02,2025-05-10 19:15:00
2025-05-10 19:20:00,2025-05-10 19:20:00,GBBZT01500A231107192,2025-05-10 19:20:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.676700,...,0.280200,0.690733,2.469367,5.067467,5.830800,279.993800,287.047367,288.647533,0.02,2025-05-10 19:20:00
2025-05-10 19:25:00,2025-05-10 19:25:00,GBBZT01500A231107192,2025-05-10 19:25:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.607600,...,0.140100,0.345367,2.464833,5.078233,5.847300,278.167900,285.691233,287.394267,0.02,2025-05-10 19:25:00
2025-05-10 19:30:00,2025-05-10 19:30:00,GBBZT01500A231107192,2025-05-10 19:30:00,3000.0,0.02,119.595375,33.599117,斜屋顶双坡,15.0,15.538500,...,0.000000,0.000000,2.460300,5.089000,5.863800,276.342000,284.335100,286.141000,0.02,2025-05-10 19:30:00


In [89]:
# 检查所有列的空值情况
DA.getNanIndex(start_col='full_time',end_col='variable_date')

(DatetimeIndex([], dtype='datetime64[ns]', freq='5min'), False)

In [90]:
DA.plot_column_plotly(columns=['pac'])

## 第三步：基础统计量分析
借助数据中的统计量，我们可以了解到数据最直观的分布情况，为后续的分析打下基础。

目前包含的分析目标有：
- 数据形状
- 每列均值
- 每列方差
- 每列标准差
- 每列最大值
- 每列最小值
- 每列中位数
- 每列分位数

In [91]:
# 获取数据形状：（序列长度，变量数）
DA.getShape()

(135407, 31)

In [92]:
# 去掉非浮点数类型的变量，防止影响统计分析
DA.df_raw.drop(columns=['roof_type'], inplace=True)

In [93]:
# 获取数据每一列的均值
Average = DA.getAverageColumn(start_col='temperature_2m',end_col='pac_copy')
Average

Unnamed: 0,feature,average
0,temperature_2m,15.02696
1,relative_humidity_2m,66.848911
2,dew_point_2m,8.103955
3,pressure_msl,1016.861717
4,surface_pressure,1016.379153
5,precipitation_probability,11.119527
6,cloud_cover,56.471237
7,cloud_cover_low,14.809736
8,cloud_cover_mid,23.105875
9,cloud_cover_high,44.169403


In [94]:
fig = px.bar(Average, x='feature', y='average',color='average')
fig.show()

In [95]:
# 获取数据每一列的方差
Variance = DA.getVarianceColumn(start_col='temperature_2m',end_col='pac_copy')
Variance

Unnamed: 0,feature,variance
0,temperature_2m,107.415549
1,relative_humidity_2m,458.707902
2,dew_point_2m,130.045079
3,pressure_msl,94.114153
4,surface_pressure,93.741376
5,precipitation_probability,845.076495
6,cloud_cover,1872.736738
7,cloud_cover_low,921.54471
8,cloud_cover_mid,1341.009798
9,cloud_cover_high,1924.43518


In [96]:
fig = px.bar(Variance, x='feature', y='variance',color='variance')
fig.show()

In [97]:
# 获取数据每一列的标准差
Std = DA.getStdColumn(start_col='temperature_2m',end_col='pac_copy')
Std

Unnamed: 0,feature,standard deviation
0,temperature_2m,10.364147
1,relative_humidity_2m,21.417467
2,dew_point_2m,11.403731
3,pressure_msl,9.701245
4,surface_pressure,9.682013
5,precipitation_probability,29.070199
6,cloud_cover,43.275128
7,cloud_cover_low,30.356955
8,cloud_cover_mid,36.619801
9,cloud_cover_high,43.868385


In [98]:
fig = px.bar(Std, x='feature', y='standard deviation',color='standard deviation')
fig.show()

In [99]:
# 获取数据每一列的最大值
maxval = DA.getMaxColumn(start_col='temperature_2m',end_col='pac_copy')
maxval

Unnamed: 0,feature,max value
0,temperature_2m,37.3755
1,relative_humidity_2m,99.7154
2,dew_point_2m,28.6185
3,pressure_msl,1040.1781
4,surface_pressure,1039.6564
5,precipitation_probability,100.0
6,cloud_cover,100.0
7,cloud_cover_low,100.0
8,cloud_cover_mid,100.0
9,cloud_cover_high,100.0


In [100]:
fig = px.bar(maxval, x='feature', y='max value',color='max value')
fig.show()

In [101]:
# 获取数据每一列的最小值
minval = DA.getMinColumn(start_col='temperature_2m',end_col='pac_copy')
minval

Unnamed: 0,feature,min value
0,temperature_2m,-7.3524
1,relative_humidity_2m,6.4159
2,dew_point_2m,-30.0038
3,pressure_msl,993.1012
4,surface_pressure,992.6477
5,precipitation_probability,0.0
6,cloud_cover,0.0
7,cloud_cover_low,0.0
8,cloud_cover_mid,0.0
9,cloud_cover_high,0.0


In [102]:
fig = px.bar(minval, x='feature', y='min value',color='min value')
fig.show()

In [103]:
# 获取数据每一列的中位数
median = DA.getMedianColumn(start_col='temperature_2m',end_col='pac_copy')
median

Unnamed: 0,feature,median
0,temperature_2m,14.847367
1,relative_humidity_2m,70.304767
2,dew_point_2m,8.4126
3,pressure_msl,1016.944233
4,surface_pressure,1016.464667
5,precipitation_probability,0.0
6,cloud_cover,72.043833
7,cloud_cover_low,0.003
8,cloud_cover_mid,0.5019
9,cloud_cover_high,28.403267


In [104]:
fig = px.bar(median, x='feature', y='median',color='median')
fig.show()

In [105]:
# 获取数据每一列的分位数：定义percent值以设置分为数
DA.getQuantileColumn(percent=[1/3,2/3], start_col='temperature_2m',end_col='pac_copy')

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,pressure_msl,surface_pressure,precipitation_probability,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,...,direct_radiation,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,pac_copy
0.333333,9.395711,58.686722,1.999067,1011.242078,1010.771656,0.0,17.556967,0.0,0.0,0.2971,...,0.0,0.0,0.0,2.269711,4.054411,4.499511,105.159856,105.096956,105.074044,0.0
0.666667,20.910522,80.083556,13.586044,1022.553667,1022.061289,0.0,99.084533,2.406733,11.490067,86.713422,...,46.6545,123.734667,106.320789,3.511478,5.8149,6.495622,204.100044,200.858611,200.454178,1.89


## 第四步：变量相关性分析
真实工业智能化应用数据往往是具有复杂相关性的多变量数据，挖掘不同变量之间的相关性与各个变量的自相关性可以获取对于下游任务宝贵的数据先验知识。

目前包含的相关性分析类型有：
- 互相关性分析
- 自相关性分析

In [106]:
# 获取所有序列两两之间的互相关性：定义method以指定计算相关性标准（'pearson' | 'kendall' | 'spearman'）
CrossCorr = DA.getCorr(method='pearson', start_col='temperature_2m',end_col='pac_copy')
CrossCorr

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,pressure_msl,surface_pressure,precipitation_probability,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,...,direct_radiation,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_direction_10m,wind_direction_80m,wind_direction_120m,pac_copy
temperature_2m,1.0,0.002659,0.856233,-0.841781,-0.841257,0.127061,0.107972,-0.010373,-0.119939,0.135945,...,0.293357,0.211568,0.361662,0.127845,-0.019054,-0.05309,-0.20764,-0.172351,-0.157662,0.361278
relative_humidity_2m,0.002659,1.0,0.504566,-0.19659,-0.196877,0.308533,0.22537,0.28238,0.211954,0.139472,...,-0.434078,-0.463947,-0.339244,-0.218618,-0.07259,-0.039641,-0.173192,-0.185031,-0.188352,-0.493308
dew_point_2m,0.856233,0.504566,1.0,-0.813301,-0.81299,0.252598,0.208423,0.126174,2.2e-05,0.1873,...,0.030891,-0.053971,0.146929,-0.008985,-0.05942,-0.0715,-0.271813,-0.247576,-0.236504,0.058731
pressure_msl,-0.841781,-0.19659,-0.813301,1.0,1.0,-0.19763,-0.12714,-0.037092,0.084033,-0.176873,...,-0.069566,0.009198,-0.157362,-0.1321,-0.080562,-0.065511,0.14193,0.094196,0.07782,-0.11812
surface_pressure,-0.841257,-0.196877,-0.81299,1.0,1.0,-0.197702,-0.127131,-0.03716,0.083951,-0.176891,...,-0.069151,0.009588,-0.156954,-0.132066,-0.080713,-0.065701,0.141758,0.094016,0.077642,-0.117656
precipitation_probability,0.127061,0.308533,0.252598,-0.19763,-0.197702,1.0,0.302296,0.393692,0.475861,0.207357,...,-0.106184,-0.142063,0.023278,0.155453,0.108399,0.093246,-0.060512,-0.06735,-0.068996,-0.070818
cloud_cover,0.107972,0.22537,0.208423,-0.12714,-0.127131,0.302296,1.0,0.393047,0.534994,0.821876,...,-0.262576,-0.335268,0.087305,0.114651,0.058752,0.042481,-0.195308,-0.204226,-0.202153,-0.134017
cloud_cover_low,-0.010373,0.28238,0.126174,-0.037092,-0.03716,0.393692,0.393047,1.0,0.331672,0.092183,...,-0.152238,-0.184404,0.089564,0.214122,0.119032,0.093185,-0.100522,-0.11347,-0.114409,-0.089434
cloud_cover_mid,-0.119939,0.211954,2.2e-05,0.084033,0.083951,0.475861,0.534994,0.331672,1.0,0.299225,...,-0.25192,-0.294599,-0.058042,0.123907,0.095419,0.084386,-0.133,-0.157405,-0.163178,-0.182567
cloud_cover_high,0.135945,0.139472,0.1873,-0.176873,-0.176891,0.207357,0.821876,0.092183,0.299225,1.0,...,-0.20107,-0.258944,0.041504,0.055996,0.037196,0.030364,-0.136261,-0.138075,-0.135735,-0.098048


In [107]:
fig = px.imshow(CrossCorr, width=1000, height=1000)
fig.show()

In [108]:
# 获取所有序列自相关系数：定义lag以指定计算自相关的滞后期数（时间间隔）
SelfCorr = DA.getSelfCorr(lag=96, start_col='temperature_2m',end_col='pac_copy')
SelfCorr

Unnamed: 0,feature,self correlation
0,temperature_2m,0.830924
1,relative_humidity_2m,0.357044
2,dew_point_2m,0.962329
3,pressure_msl,0.970117
4,surface_pressure,0.970122
5,precipitation_probability,0.36067
6,cloud_cover,0.502775
7,cloud_cover_low,0.506674
8,cloud_cover_mid,0.51718
9,cloud_cover_high,0.428332


In [109]:
fig = px.bar(SelfCorr, x='feature', y='self correlation',color='self correlation')
fig.show()

## 第五步：周期性分析

In [110]:
# 获取3个最主要的周期
topk, sample_freq = DA.getFFTtopk('pac')
topk

{'top_k_power': array([ 49419.0652097 ,  88036.40012762, 218975.90490175]),
 'fft_periods': array([288, 144, 288])}

In [111]:
fig = px.line(sample_freq)
fig.show()

## 第六步：变量平稳性分析
针对变量平稳性的分析可以展示数据中的异常变化与波动。

目前包含的平稳性分析方法有：
- ADF
- Phillips-Perron
- DF-GLS
- KPSS
- Zivot-Andrew
- Variance Ratio

In [112]:
# 获取ADF平稳性测试结果
DA.getADF(start_col='pac',end_col='pac')

{'pac': {'Test Statistic': -48.081238826892,
  'P-value': 0.0,
  'Lags': np.int64(48),
  'Trend': 'c',
  'Summary': <class 'statsmodels.iolib.summary.Summary'>
  """
     Augmented Dickey-Fuller Results   
  Test Statistic                -48.081
  P-value                         0.000
  Lags                               48
  -------------------------------------
  
  Trend: Constant
  Critical Values: -3.43 (1%), -2.86 (5%), -2.57 (10%)
  Null Hypothesis: The process contains a unit root.
  Alternative Hypothesis: The process is weakly stationary.
  """}}

In [113]:
# 获取Phillips-Perron平稳性测试结果
DA.getPhillipsPerron(start_col='pac',end_col='pac')

{'pac': {'Test Statistic': np.float64(-54.647662063387315),
  'P-value': 0.0,
  'Lags': 73,
  'Trend': 'c',
  'Summary': <class 'statsmodels.iolib.summary.Summary'>
  """
       Phillips-Perron Test (Z-tau)    
  Test Statistic                -54.648
  P-value                         0.000
  Lags                               73
  -------------------------------------
  
  Trend: Constant
  Critical Values: -3.43 (1%), -2.86 (5%), -2.57 (10%)
  Null Hypothesis: The process contains a unit root.
  Alternative Hypothesis: The process is weakly stationary.
  """}}

In [114]:
# 获取DF-GLS平稳性测试结果
DA.getDFGLS(start_col='pac',end_col='pac')

{'pac': {'Test Statistic': -42.33243314733131,
  'P-value': 0.0,
  'Lags': np.int64(48),
  'Trend': 'c',
  'Summary': <class 'statsmodels.iolib.summary.Summary'>
  """
        Dickey-Fuller GLS Results      
  Test Statistic                -42.332
  P-value                         0.000
  Lags                               48
  -------------------------------------
  
  Trend: Constant
  Critical Values: -2.57 (1%), -1.94 (5%), -1.62 (10%)
  Null Hypothesis: The process contains a unit root.
  Alternative Hypothesis: The process is weakly stationary.
  """}}