In [18]:
# Refresh the in-kernel copies of the pipeline modules after editing them on disk
import importlib
import pipeline.features
import pipeline.preprocess

# Reload in a fixed order: features first, then preprocess
for _stale_module in (pipeline.features, pipeline.preprocess):
    importlib.reload(_stale_module)

# Re-run the wildcard imports so the notebook names point at the reloaded objects.
# NOTE(review): star imports hide where names come from; consider explicit imports.
from pipeline.features import *
from pipeline.preprocess import *

print("✓ Modules reloaded")

✓ Modules reloaded


In [19]:
import pandas as pd
from pipeline.extract import load_traffy_data, load_single_location_weather, load_bangkok_air_quality
from pipeline.utils import split_coordinates, clean_traffy_data
from pipeline.preprocess import parse_type_column, filter_empty_types, drop_missing_weather
from pipeline.features import prepare_features

In [20]:
# Build the Traffy frame in one chain: load the raw CSV, then apply the
# cleaning step and the coordinate-splitting step from the pipeline package.
df_traffy = (
    load_traffy_data('data/raw/bangkok_traffy.csv')
    .pipe(clean_traffy_data)
    .pipe(split_coordinates)
)

print(f"✓ Cleaned Traffy data: {df_traffy.shape}")
df_traffy.head()

Loading Traffy data from: data/raw/bangkok_traffy.csv
✓ Loaded 787,026 records
✓ Loaded 787,026 records
✓ Cleaned Traffy data: (651600, 9)
✓ Cleaned Traffy data: (651600, 9)


Unnamed: 0,type,comment,coords,subdistrict,district,province,timestamp,longitude,latitude
0,"{น้ำท่วม,ร้องเรียน}",น้ำท่วมเวลาฝนตกและทะลุเข้าบ้านเดือดร้อนมากทุกๆ...,"100.66709,13.67891",หนองบอน,ประเวศ,กรุงเทพมหานคร,2021-09-19 14:56:08.924992+00,100.66709,13.67891
1,{สะพาน},สะพานลอยปรับปรุงไม่เสร็จตามกำหนด\nปากซอย สาทร12,"100.52649,13.72060",ยานนาวา,สาทร,กรุงเทพมหานคร,2021-09-26 05:03:52.594898+00,100.52649,13.7206
2,"{น้ำท่วม,ถนน}",ซอยลาดพร้าววังหิน 75 ถนนลาดพร้าววังหิน แขวงลาด...,"100.59165,13.82280",ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,2021-12-09 12:29:08.408763+00,100.59165,13.8228
3,{},หน้าปากซอย ลาดพร้าววังหิน26,"100.59131,13.80910",ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,2021-12-13 05:53:36.861064+00,100.59131,13.8091
4,{},ยังไม่มีหน่วยงานไหนมาดูแลครับ รถจะเชี่ยวหลายคน...,"100.50848,13.77832",ดุสิต,ดุสิต,กรุงเทพมหานคร,2021-12-17 08:46:02.610983+00,100.50848,13.77832


In [21]:
# Load weather readings for the single reference location
df_weather = load_single_location_weather('data/raw/open-meteo-13.74N100.50E9m.csv')

# Parse complaint timestamps as timezone-aware UTC values
df_traffy['timestamp'] = pd.to_datetime(df_traffy['timestamp'], format='mixed', utc=True)

# Derive a tz-naive, midnight-normalised `date` join key on both frames.
# Staying in datetime64 (drop the tz, then normalize) avoids building one
# Python `date` object per row and parsing it back, which is slow on
# hundreds of thousands of rows; the resulting key values are the same.
# NOTE(review): the Traffy key is the UTC calendar day. If the weather `time`
# column is in Bangkok local time (UTC+7) the two keys are offset — confirm.
df_traffy['date'] = df_traffy['timestamp'].dt.tz_localize(None).dt.normalize()
df_weather['date'] = pd.to_datetime(df_weather['time']).dt.tz_localize(None).dt.normalize()

# Collapse weather to one row per day by averaging every numeric column
df_weather_daily = df_weather.groupby('date').mean(numeric_only=True).reset_index()

print(f"✓ Weather aggregated: {df_weather_daily.shape}")

✓ Weather aggregated: (1279, 10)


In [22]:
# Read the air quality table
df_air = load_bangkok_air_quality('data/processed/bangkok-air-quality.csv')
print(f"✓ Loaded {len(df_air):,} air quality records")

# Left-join the daily weather averages onto every complaint by `date`
df = pd.merge(df_traffy, df_weather_daily, how='left', on='date')
print(f"✓ Merged with weather: {df.shape}")

# Left-join the air quality readings the same way
df = pd.merge(df, df_air, how='left', on='date')
print(f"✓ Merged with air quality: {df.shape}")

✓ Loaded 4,351 air quality records
✓ Merged with weather: (651600, 19)
✓ Merged with air quality: (651600, 23)


In [23]:
# Run the three preprocessing steps from pipeline.preprocess in sequence.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already
# processed data back through the steps — re-run the merge cell first.
df = (
    df
    .pipe(parse_type_column)
    .pipe(filter_empty_types)
    .pipe(drop_missing_weather)
)

print(f"✓ After preprocessing: {df.shape}")
df.info()

✓ After preprocessing: (540633, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540633 entries, 0 to 540632
Data columns (total 23 columns):
 #   Column                         Non-Null Count   Dtype              
---  ------                         --------------   -----              
 0   type                           540633 non-null  object             
 1   comment                        540633 non-null  object             
 2   coords                         540633 non-null  object             
 3   subdistrict                    540633 non-null  object             
 4   district                       540633 non-null  object             
 5   province                       540633 non-null  object             
 6   timestamp                      540633 non-null  datetime64[ns, UTC]
 7   longitude                      540633 non-null  float64            
 8   latitude                       540633 non-null  float64            
 9   date                           540633 non-nul

In [25]:
# Inspect which merged columns hold air quality readings: keep any column
# whose lower-cased name contains one of the pollutant tokens.
air_cols = [
    name
    for name in df.columns
    if any(token in name.lower() for token in ('pm2.5', 'pm10', 'o3', 'no2', 'pm25'))
]
print("Air quality column names:", air_cols, sep="\n")

Air quality column names:
[' pm25', ' pm10', ' o3', ' no2']


In [26]:
# Remove every row that lacks at least one air quality reading.
# The merged column names carry a leading space (e.g. ' pm25'), so they are
# discovered by substring match rather than hard-coded.
air_quality_cols = [
    name
    for name in df.columns
    if any(token in name.strip().lower() for token in ('pm2.5', 'pm25', 'pm10', 'o3', 'no2'))
]
print(f"Air quality columns: {air_quality_cols}")
print(f"Before dropping nulls: {len(df):,} records")
# A row is dropped when any of the selected columns is null (the dropna defaults, spelled out)
df = df.dropna(axis=0, how='any', subset=air_quality_cols)
print(f"After dropping nulls: {len(df):,} records")
print("✓ Dropped records without complete air quality data")

Air quality columns: [' pm25', ' pm10', ' o3', ' no2']
Before dropping nulls: 540,633 records
After dropping nulls: 540,236 records
✓ Dropped records without complete air quality data


In [27]:
# Hand the cleaned frame to the pipeline's feature-engineering entry point.
# NOTE(review): `df` is rebound here, so re-running this cell applies
# prepare_features to its own output.
df = df.pipe(prepare_features)
print(f"✓ Features prepared: {df.shape}")
df.info()


FEATURE ENGINEERING

Extracting time features...
hello
Encoding cyclical features...
Encoding districts...
  ✓ Encoded 50 districts

✓ Feature engineering complete
  Final shape: (540236, 83)
  New features: hour, day_of_week, month, season, cyclical encodings, districts
✓ Features prepared: (540236, 83)
  ✓ Encoded 50 districts

✓ Feature engineering complete
  Final shape: (540236, 83)
  New features: hour, day_of_week, month, season, cyclical encodings, districts
✓ Features prepared: (540236, 83)
<class 'pandas.core.frame.DataFrame'>
Index: 540236 entries, 0 to 540632
Data columns (total 83 columns):
 #   Column                         Non-Null Count   Dtype              
---  ------                         --------------   -----              
 0   type                           540236 non-null  object             
 1   comment                        540236 non-null  object             
 2   coords                         540236 non-null  object             
 3   subdistrict       

In [29]:
# Display the current column labels of `df` (the cell's last expression is rendered as output)
df.columns

Index(['type', 'comment', 'coords', 'subdistrict', 'district', 'province',
       'timestamp', 'longitude', 'latitude', 'date', 'temperature_2m (°C)',
       'dew_point_2m (°C)', 'relative_humidity_2m (%)', 'rain (mm)',
       'vapour_pressure_deficit (kPa)', 'cloud_cover (%)',
       'wind_direction_10m (°)', 'surface_pressure (hPa)',
       'wind_speed_10m (km/h)', ' pm25', ' pm10', ' o3', ' no2',
       'timestamp_col', 'hour', 'day_of_week', 'month', 'hour_sin', 'hour_cos',
       'day_sin', 'day_cos', 'month_sin', 'month_cos', 'district_คลองสาน',
       'district_คลองสามวา', 'district_คลองเตย', 'district_คันนายาว',
       'district_จตุจักร', 'district_จอมทอง', 'district_ดอนเมือง',
       'district_ดินแดง', 'district_ดุสิต', 'district_ตลิ่งชัน',
       'district_ทวีวัฒนา', 'district_ทุ่งครุ', 'district_ธนบุรี',
       'district_บางกอกน้อย', 'district_บางกอกใหญ่', 'district_บางกะปิ',
       'district_บางขุนเทียน', 'district_บางคอแหลม', 'district_บางซื่อ',
       'district_บางนา', 'd