In [51]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import glob
import os

In [52]:
def load_pm25_data(file_paths, station_ids=['76T']):
    """Load PM2.5 readings for the requested stations from Excel workbooks.

    Every workbook is read from its 'PM2.5' sheet, which is expected to hold
    a 'Date' column plus one column per station id. For each requested
    station present in a workbook, the ('Date', station) pair is extracted
    and relabelled to ('date', 'pm25').

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the Excel workbooks to read.
    station_ids : list of str, optional
        Station column names to extract. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns 'date' (datetime) and 'pm25' (numeric; unparseable values
        become NaN). Only the first row seen for each date is kept, so with
        several stations the earliest-appended station wins on a given date.
        An empty frame with the same two columns is returned when nothing
        was found.
    """
    frames = []

    for file_path in file_paths:
        sheet = pd.read_excel(file_path, sheet_name='PM2.5')
        sheet['Date'] = pd.to_datetime(sheet['Date'])

        for station_id in station_ids:
            if station_id in sheet.columns:
                frames.append(
                    sheet[['Date', station_id]].rename(
                        columns={'Date': 'date', station_id: 'pm25'}
                    )
                )

    if not frames:
        # Report the requested stations rather than the loop variable: the
        # loop variable is unbound when there are no files or no stations,
        # which used to raise UnboundLocalError here.
        requested = ', '.join(str(sid) for sid in station_ids)
        print(f"ไม่พบข้อมูลสำหรับสถานี {requested}")
        return pd.DataFrame(columns=['date', 'pm25'])

    combined_df = pd.concat(frames, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=['date']).reset_index(drop=True)
    combined_df['pm25'] = pd.to_numeric(combined_df['pm25'], errors='coerce')
    return combined_df

In [53]:
def _summarize_fire_ring(ring_df, suffix):
    """Aggregate one distance ring of fire detections into per-day features."""
    return (
        ring_df.groupby('acq_date')
        .agg(**{
            # 'size' counts rows per day regardless of NaN, so counting on
            # 'frp' gives the same number as counting on the grouping key.
            f'fire_count_{suffix}': ('frp', 'size'),
            f'fire_frp_sum_{suffix}': ('frp', 'sum'),
            f'fire_conf_{suffix}': ('confidence', 'mean'),
        })
        .reset_index()
        .rename(columns={'acq_date': 'date'})
    )


def load_fire_data(file_path, station_lat, station_lng, inner_radius=20, outer_radius=50, start_date=None, end_date=None):
    """Build daily fire features around a station from a fire-detection CSV.

    The CSV must provide 'acq_date', 'latitude', 'longitude', 'frp' and
    'confidence' columns. Detections are split into an inner ring
    (distance <= inner_radius) and an outer ring
    (inner_radius < distance <= outer_radius), then aggregated per day.

    Parameters
    ----------
    file_path : str
        Path of the fire-detection CSV.
    station_lat, station_lng : float
        Station coordinates in decimal degrees.
    inner_radius, outer_radius : float, optional
        Ring limits, in the same unit as the computed distance
        (degrees * 111.32, i.e. approximately kilometres).
    start_date, end_date : date-like, optional
        Inclusive bounds applied to 'acq_date'. When both are given they
        also define the output calendar; otherwise the calendar spans the
        min..max date of the (filtered) detections.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day with columns 'date',
        'fire_count_inner', 'fire_frp_sum_inner', 'fire_conf_inner',
        'fire_count_outer', 'fire_frp_sum_outer', 'fire_conf_outer',
        'fire_count_total' and 'fire_frp_sum_total'. Days without
        detections are filled with 0 in every feature column.
    """
    fire_df = pd.read_csv(file_path)
    fire_df['acq_date'] = pd.to_datetime(fire_df['acq_date'])

    if start_date is not None:
        fire_df = fire_df[fire_df['acq_date'] >= pd.to_datetime(start_date)]
    if end_date is not None:
        fire_df = fire_df[fire_df['acq_date'] <= pd.to_datetime(end_date)]

    # Equirectangular approximation: longitude degrees are scaled by
    # cos(latitude) of the station. `.assign` returns a new frame, so the
    # column is never written onto a filtered slice (no
    # SettingWithCopyWarning).
    lat_offset = (fire_df['latitude'] - station_lat) * 111.32
    lng_offset = (fire_df['longitude'] - station_lng) * 111.32 * np.cos(np.radians(station_lat))
    fire_df = fire_df.assign(distance=np.sqrt(lat_offset**2 + lng_offset**2))

    fire_inner = fire_df[fire_df['distance'] <= inner_radius]
    fire_outer = fire_df[(fire_df['distance'] > inner_radius) & (fire_df['distance'] <= outer_radius)]

    if start_date is not None and end_date is not None:
        calendar = pd.date_range(start=start_date, end=end_date)
    elif fire_df.empty:
        # No detections and no explicit window: fall back to a single row
        # for today, normalised to midnight so it lines up with daily data.
        today = pd.Timestamp('today').normalize()
        calendar = pd.date_range(start=today, end=today)
    else:
        calendar = pd.date_range(start=fire_df['acq_date'].min(), end=fire_df['acq_date'].max())
    all_dates = pd.DataFrame({'date': calendar})

    features_inner = _summarize_fire_ring(fire_inner, 'inner')
    features_outer = _summarize_fire_ring(fire_outer, 'outer')

    result = all_dates.merge(features_inner, on='date', how='left').fillna(0)
    result = result.merge(features_outer, on='date', how='left').fillna(0)

    result['fire_count_total'] = result['fire_count_inner'] + result['fire_count_outer']
    result['fire_frp_sum_total'] = result['fire_frp_sum_inner'] + result['fire_frp_sum_outer']

    return result

In [54]:
def load_weather_data(file_paths):
    """Load daily weather CSVs for several locations into one wide frame.

    Each file is read with its first three lines skipped and must contain a
    'time' column, which is parsed into 'date'. Files are labelled by their
    position in `file_paths`: 'center', 'north', 'south', 'east', 'west',
    then 'location_<i>' for any further file.

    * For the first file ('center'), the available center-only features are
      kept under their original names and the available multi-location
      features are kept with a '_center' suffix.
    * For every other file, only the available multi-location features are
      kept, suffixed with '_<location>'; a file with none of them is ignored.

    Frames are outer-merged on 'date'. A file that fails to load is reported
    with a printed message and skipped (best effort); the remaining files
    keep their positional labels.

    Parameters
    ----------
    file_paths : sequence of str
        CSV paths, ordered as center, north, south, east, west.

    Returns
    -------
    pandas.DataFrame
        The merged frame, or an empty frame with only a 'date' column when
        no usable data was found.
    """
    # Features taken from every location (suffixed with the location name).
    important_features = [
        'temperature_2m_max (°C)',
        'temperature_2m_mean (°C)',
        'temperature_2m_min (°C)',
        'precipitation_sum (mm)',
        'wind_speed_10m_max (km/h)',
        'wind_direction_10m_dominant (°)'
    ]

    # Features taken from the center location only (names left unchanged).
    center_only_features = [
        'weather_code (wmo code)',
        'cloud_cover_mean (%)',
        'cloud_cover_max (%)',
        'cloud_cover_min (%)',
        'relative_humidity_2m_mean (%)',
        'relative_humidity_2m_max (%)',
        'relative_humidity_2m_min (%)',
        'surface_pressure_mean (hPa)',
        'surface_pressure_max (hPa)',
        'surface_pressure_min (hPa)',
        'winddirection_10m_dominant (°)',
        'wind_speed_10m_mean (km/h)',
        'wind_speed_10m_min (km/h)',
        'vapour_pressure_deficit_max (kPa)'
    ]

    locations = ['center', 'north', 'south', 'east', 'west']
    combined_df = None

    for i, file_path in enumerate(file_paths):
        try:
            df = pd.read_csv(file_path, skiprows=3)
            df['date'] = pd.to_datetime(df['time'])

            location = locations[i] if i < len(locations) else f"location_{i}"

            # Multi-location features present in this file -> suffixed name.
            renamed_cols = {
                col: f"{col}_{location}"
                for col in important_features
                if col in df.columns
            }

            if location == 'center':
                # 'center' is always index 0, so nothing has been merged yet
                # and this frame simply seeds the result.
                center_cols = [col for col in center_only_features if col in df.columns]
                keep = ['date'] + center_cols + list(renamed_cols)
                combined_df = df[keep].rename(columns=renamed_cols)
            elif renamed_cols:
                df_selected = df[['date'] + list(renamed_cols)].rename(columns=renamed_cols)
                if combined_df is not None:
                    combined_df = pd.merge(combined_df, df_selected, on='date', how='outer')
                else:
                    combined_df = df_selected

        except Exception as e:
            # Deliberate best effort: report the file and continue.
            print(f"เกิดข้อผิดพลาดในการอ่านไฟล์ {file_path}: {e}")

    if combined_df is not None:
        return combined_df

    print("ไม่พบข้อมูลสภาพอากาศที่ใช้งานได้")
    return pd.DataFrame(columns=['date'])

In [55]:
def create_combined_dataset(pm25_files, fire_file, weather_files, station_ids=['76T'], 
                            station_lat=16.746463959589768, station_lng=98.57437426524484,
                            start_date='2019-01-01', end_date='2024-12-31'):
    """Assemble the daily PM2.5 + fire + weather dataset for a station.

    A complete daily calendar from `start_date` to `end_date` is built and
    the PM2.5, fire and weather frames are left-joined onto it by 'date',
    so every day in the window appears exactly once per matching key.

    Parameters
    ----------
    pm25_files : list of str
        Excel workbooks passed to `load_pm25_data`.
    fire_file : str
        Fire-detection CSV passed to `load_fire_data`.
    weather_files : list of str
        Weather CSVs passed to `load_weather_data`.
    station_ids : list of str, optional
        Station columns to extract from the PM2.5 workbooks.
    station_lat, station_lng : float, optional
        Station coordinates in decimal degrees. The latitude default was
        previously 116.746..., which is outside the valid latitude range;
        it now matches the station's actual latitude.
    start_date, end_date : date-like, optional
        Inclusive bounds of the output calendar.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day with the merged columns; missing
        'fire_count*' values are filled with 0.
    """
    # Read PM2.5 data
    pm25_df = load_pm25_data(pm25_files, station_ids)

    # Read wildfire data
    fire_df = load_fire_data(fire_file, station_lat, station_lng, start_date=start_date, end_date=end_date)

    # Read weather data
    weather_df = load_weather_data(weather_files)

    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    full_date_df = pd.DataFrame({'date': date_range})

    # Merge all sources onto the full calendar
    combined_df = full_date_df.merge(pm25_df, on='date', how='left')
    combined_df = combined_df.merge(fire_df, on='date', how='left')
    combined_df = combined_df.merge(weather_df, on='date', how='left')

    # A day with no fire record means zero detections, not missing data.
    fire_columns = [col for col in combined_df.columns if 'fire_count' in col]
    combined_df[fire_columns] = combined_df[fire_columns].fillna(0)

    return combined_df

In [56]:
# Station selection and study window used by the cells below.
station_id = '76T'
# Coordinates of the station (the "center" point), in decimal degrees.
station_lat, station_lng = 16.746463959589768, 98.57437426524484
# Inclusive date bounds of the dataset.
start_date, end_date = '2019-01-01', '2024-12-31'

In [57]:
# Input files, discovered relative to the notebook's working directory.
# Both file lists are sorted so their order is deterministic.
# NOTE(review): load_weather_data labels files by position
# (center, north, south, east, west) - confirm the sorted order matches.
pm25_files = glob.glob("dataset/PM2.5/PM2.5(*.xlsx")
pm25_files.sort()
fire_file = "dataset/Fire/fire_archive_M-C61_606028.csv"
weather_files = glob.glob("dataset/Weather_forecast/76T/*.csv")
weather_files.sort()

In [58]:
# Build the merged daily dataset for the configured station and window.
combined_df = create_combined_dataset(
    pm25_files=pm25_files,
    fire_file=fire_file,
    weather_files=weather_files,
    station_ids=[station_id],
    station_lat=station_lat,
    station_lng=station_lng,
    start_date=start_date,
    end_date=end_date,
)

In [59]:
def save_combined_dataset(combined_df, output_file='combined_pm25_dataset.csv'):
    """Write the combined dataset to CSV and print a short summary.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Dataset to save; the index is not written.
    output_file : str or os.PathLike, optional
        Destination CSV path. Missing parent directories are created so the
        save does not fail on a fresh checkout.

    Returns
    -------
    None
    """
    # Only path-like targets have a parent directory to create; anything
    # else is handed to `to_csv` untouched.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    combined_df.to_csv(output_file, index=False)
    print(f"บันทึกชุดข้อมูลรวมเรียบร้อยแล้วที่ {output_file}")
    print(f"จำนวนแถวทั้งหมด: {len(combined_df)}")
    print(f"จำนวนคอลัมน์ทั้งหมด: {len(combined_df.columns)}")
    print(f"คอลัมน์ในชุดข้อมูล: {combined_df.columns.tolist()}")

In [60]:
# Persist the merged dataset under a station-specific file name.
save_combined_dataset(
    combined_df,
    output_file=f'dataset/Full/combined_pm25_{station_id}_forecast_dataset.csv',
)

บันทึกชุดข้อมูลรวมเรียบร้อยแล้วที่ dataset/Full/combined_pm25_76T_forecast_dataset.csv
จำนวนแถวทั้งหมด: 2192
จำนวนคอลัมน์ทั้งหมด: 54
คอลัมน์ในชุดข้อมูล: ['date', 'pm25', 'fire_count_inner', 'fire_frp_sum_inner', 'fire_conf_inner', 'fire_count_outer', 'fire_frp_sum_outer', 'fire_conf_outer', 'fire_count_total', 'fire_frp_sum_total', 'weather_code (wmo code)', 'cloud_cover_mean (%)', 'cloud_cover_max (%)', 'cloud_cover_min (%)', 'relative_humidity_2m_mean (%)', 'relative_humidity_2m_max (%)', 'relative_humidity_2m_min (%)', 'surface_pressure_mean (hPa)', 'surface_pressure_max (hPa)', 'surface_pressure_min (hPa)', 'winddirection_10m_dominant (°)', 'wind_speed_10m_mean (km/h)', 'wind_speed_10m_min (km/h)', 'vapour_pressure_deficit_max (kPa)', 'temperature_2m_max (°C)_center', 'temperature_2m_mean (°C)_center', 'temperature_2m_min (°C)_center', 'precipitation_sum (mm)_center', 'wind_speed_10m_max (km/h)_center', 'wind_direction_10m_dominant (°)_center', 'temperature_2m_max (°C)_north', 'tem