## Setup: Import Libraries

In [None]:
# Add the parent directory to the import path so the `pipeline` package
# imported by later cells can be found.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder, one level below the project root — confirm.
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Default (width, height) in inches for figures that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (12, 6)

## Step 1: Extract - Load Raw Data

In [None]:
from pipeline.extract import load_traffy_data

# Locations of the two raw inputs, relative to the notebook's working directory.
traffy_csv = '../data/raw/bangkok_traffy.csv'
weather_csv = '../data/raw/open-meteo-13.74N100.50E9m.csv'

# The Traffy file goes through the project loader; the weather file is read as a plain CSV.
df_traffy = load_traffy_data(traffy_csv)
df_weather = pd.read_csv(weather_csv)

for label, frame in (("Traffy", df_traffy), ("Weather", df_weather)):
    print(f"{label} shape: {frame.shape}")

df_traffy.head()

## Step 2: Transform - Clean Traffy Data

In [None]:
from pipeline.utils import clean_traffy_data

# Replace the raw frame with the output of the project's cleaning helper.
# NOTE(review): `df_traffy` is overwritten, so re-running this cell cleans
# already-cleaned data — confirm the helper tolerates that.
df_traffy = df_traffy.pipe(clean_traffy_data)
cleaned_shape = df_traffy.shape
print(f"After cleaning: {cleaned_shape}")
df_traffy.head()

## Step 3: Transform - Split Coordinates

In [None]:
from pipeline.utils import split_coordinates

# After this call the frame is expected to expose 'latitude' and 'longitude'
# alongside the original 'coords' column (all three are read below).
df_traffy = split_coordinates(df_traffy)
print(f"Coordinates split: {df_traffy.shape}")

# Print the bounding box as a quick sanity check on the parsed coordinates.
for label, column in (("Lat", 'latitude'), ("Lon", 'longitude')):
    lowest = df_traffy[column].min()
    highest = df_traffy[column].max()
    print(f"{label} range: {lowest:.2f} to {highest:.2f}")

coordinate_columns = ['coords', 'latitude', 'longitude']
df_traffy[coordinate_columns].head()

## Step 4: Transform - Parse Timestamps to Date

In [None]:
# Parse report timestamps as timezone-aware UTC values; 'mixed' lets each
# element's format be inferred individually.
df_traffy['timestamp'] = pd.to_datetime(df_traffy['timestamp'], format='mixed', utc=True)

# Reduce to a midnight-normalised, timezone-naive datetime column used as the merge key.
# NOTE(review): this is the UTC calendar date. If the weather 'time' column is
# in local time, reports near midnight may join to the neighbouring day — confirm.
traffy_days = df_traffy['timestamp'].dt.date
df_traffy['date'] = pd.to_datetime(traffy_days)

# Same reduction for the weather observations so both keys share a dtype.
weather_days = pd.to_datetime(df_weather['time']).dt.date
df_weather['date'] = pd.to_datetime(weather_days)

for label, frame in (("Traffy", df_traffy), ("Weather", df_weather)):
    print(f"{label} date range: {frame['date'].min()} to {frame['date'].max()}")

## Step 5: Transform - Aggregate Weather to Daily Average

In [None]:
# Average every numeric weather column per calendar date (non-numeric columns
# are skipped), then turn the date index back into a regular column.
daily_means = df_weather.groupby('date').mean(numeric_only=True)
df_weather_daily = daily_means.reset_index()

print(f"Daily weather shape: {df_weather_daily.shape}")
df_weather_daily.head()

## Step 6: Transform - Merge Traffy with Weather

In [None]:
# Left join keeps every report; rows without a matching weather date get NaN.
df_merged = df_traffy.merge(df_weather_daily, on='date', how='left')

# The temperature column may or may not carry a unit suffix; take whichever exists.
if 'temperature_2m (°C)' in df_merged.columns:
    temp_col = 'temperature_2m (°C)'
else:
    temp_col = 'temperature_2m'

# Share of reports that received a temperature value, as a percentage.
if temp_col in df_merged.columns:
    matched_rows = df_merged[temp_col].notna().sum()
    match_rate = matched_rows / len(df_merged) * 100
else:
    match_rate = 0

print(f"Merged shape: {df_merged.shape}")
print(f"Weather match rate: {match_rate:.1f}%")
df_merged.head()

## Step 7: Preprocess - Parse Type Column

In [None]:
from pipeline.preprocess import parse_type_column

# Run the project's type-column parser over the merged frame.
df_merged = df_merged.pipe(parse_type_column)
print(f"Type column parsed: {df_merged.shape}")

# Show the first few parsed values to eyeball the result.
sample_types = df_merged['type'].head().tolist()
print(f"Sample types: {sample_types}")

## Step 8: Preprocess - Filter Empty Types

In [None]:
from pipeline.preprocess import filter_empty_types

# Apply the project's empty-type filter.
# NOTE(review): presumably drops rows with no complaint type — verify in pipeline.preprocess.
df_merged = df_merged.pipe(filter_empty_types)
filtered_shape = df_merged.shape
print(f"After filtering empty types: {filtered_shape}")

## Step 9: Preprocess - Drop Missing Weather Data

In [None]:
from pipeline.preprocess import drop_missing_weather

# Apply the project's missing-weather filter.
# NOTE(review): presumably removes rows left without weather values by the
# left join — verify in pipeline.preprocess.
df_merged = df_merged.pipe(drop_missing_weather)
remaining_shape = df_merged.shape
print(f"After dropping missing weather: {remaining_shape}")

## Step 10: Feature Engineering - Prepare Features

In [None]:
from pipeline.features import prepare_features

df_merged = prepare_features(df_merged)
print(f"After feature engineering: {df_merged.shape}")

# List the columns whose names start with one of the expected feature prefixes.
feature_prefixes = ('hour_', 'day_', 'month_', 'district_')
engineered_cols = [name for name in df_merged.columns if name.startswith(feature_prefixes)]
print(f"Feature columns: {engineered_cols}")

## Step 11: Feature Engineering - Create Binary Target Columns

In [None]:
from pipeline.preprocess import create_binary_targets

# The helper returns the updated frame together with the names of the target columns.
targets_output = create_binary_targets(df_merged)
df_merged, binary_cols = targets_output

n_targets = len(binary_cols)
first_targets = binary_cols[:10]
print(f"Created {n_targets} binary target columns")
print(f"Binary columns: {first_targets}...")
print(f"\nFinal shape: {df_merged.shape}")

## Step 12: Sample Data for Faster Training (sampling is optional, but `df_sample` must be defined for Steps 14–17)

In [None]:
from pipeline.preprocess import sample_data

# Fixed sample size and seed so the training subset is the same on every run.
sample_size = 200000
sample_seed = 42

df_sample = sample_data(df_merged, n=sample_size, random_state=sample_seed)
print(f"Sampled data: {df_sample.shape}")

## Step 13: Explore Type Distribution

In [None]:
# Sum each binary target column to get the number of rows carrying that type,
# keyed by the bare type name. Only the LEADING 'type_' prefix is removed:
# str.replace would also delete any later 'type_' inside the name and could
# make two different columns collapse onto the same key.
type_prefix = 'type_'
type_counts = {
    (col[len(type_prefix):] if col.startswith(type_prefix) else col): df_merged[col].sum()
    for col in binary_cols
}
# Order from most to least frequent for both the printout and the chart.
type_counts = dict(sorted(type_counts.items(), key=lambda item: item[1], reverse=True))

print("Top 10 complaint types:")
for i, (type_name, count) in enumerate(list(type_counts.items())[:10], 1):
    # Percentage of all rows in the full merged frame that carry this type.
    pct = count / len(df_merged) * 100
    print(f"{i:2d}. {type_name:20s}: {count:6,} ({pct:5.2f}%)")

# Explicit figure/axes objects instead of pyplot's implicit current figure.
positions = range(len(type_counts))
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(positions, list(type_counts.values()))
ax.set_xticks(positions)
ax.set_xticklabels(list(type_counts.keys()), rotation=45, ha='right')
ax.set_ylabel('Count')
ax.set_title('Complaint Type Distribution')
fig.tight_layout()
plt.show()

## Step 14: Train - Prepare Features for Training

In [None]:
from pipeline.train import prepare_features_for_training

# The helper returns the feature matrix together with the matching feature names.
training_inputs = prepare_features_for_training(df_sample)
X, feature_names = training_inputs

n_features = len(feature_names)
leading_names = feature_names[:10]
print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {n_features}")
print(f"Feature names: {leading_names}...")

## Step 15: Train - Get Trainable Types

In [None]:
from pipeline.train import get_trainable_types

# Threshold passed to the helper as `min_samples`.
# NOTE(review): presumably the minimum number of positive rows a type needs — confirm.
min_samples_per_type = 50

trainable_types = get_trainable_types(df_sample, min_samples=min_samples_per_type)
n_trainable = len(trainable_types)
print(f"Trainable types: {n_trainable}")
print(f"Types: {trainable_types[:10]}")

## Step 16: Train - Train Single Type Model (Demo)

In [None]:
from pipeline.train import train_single_type_model

# Fail with an actionable message instead of a bare IndexError when the
# previous step found nothing to train on.
if len(trainable_types) == 0:
    raise ValueError(
        "No trainable types available: lower min_samples or use a larger sample "
        "before running the demo."
    )

# Use the first trainable type as the demo target.
demo_type = trainable_types[0]
print(f"Training model for: {demo_type}")

# NOTE(review): n_iter and cv are presumably the search iterations and CV
# folds, kept small so the demo is quick — confirm in pipeline.train.
result = train_single_type_model(
    df_sample,
    type_col=f"type_{demo_type}",
    feature_names=feature_names,
    n_iter=3,
    cv=2,
    random_state=42
)

# All reported numbers live under the 'metrics' key of the returned mapping.
demo_metrics = result['metrics']
print(f"\nModel trained successfully!")
for label, key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1'),
):
    print(f"{label}: {demo_metrics[key]:.4f}")
print(f"Best params: {demo_metrics['best_params']}")

## Step 17: Train - Train All Types (Full Pipeline)

In [None]:
from pipeline.train import train_all_types

# Options for the full training run, collected in one place.
# NOTE(review): no random_state is passed here, unlike the single-type demo —
# confirm whether train_all_types seeds itself.
training_options = {
    'n_iter': 5,
    'min_samples': 50,
    'output_dir': '../data/models',
    'adaptive_resampling': True,
}

results = train_all_types(df_sample, **training_options)

n_models = len(results)
print(f"\nTraining complete! Trained {n_models} models")

## Step 18: Results - Display Training Summary

In [None]:
# Column order of the summary table. Passing it explicitly to the DataFrame
# constructor keeps the 'f1' column present even when `results` is empty, so
# the sort below cannot raise KeyError.
summary_columns = ['type', 'accuracy', 'precision', 'recall', 'f1', 'n_estimators', 'max_depth']

# One row per trained type: headline metrics plus two of the tuned hyperparameters.
# The loop variable is named `type_result` so it does not overwrite the
# `result` produced by the single-type demo cell.
summary_data = []
for type_name, type_result in results.items():
    metrics = type_result['metrics']
    best_params = metrics['best_params']
    summary_data.append({
        'type': type_name,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1': metrics['f1'],
        # .get: a hyperparameter missing from best_params shows up as None.
        'n_estimators': best_params.get('n_estimators', None),
        'max_depth': best_params.get('max_depth', None)
    })

summary_df = pd.DataFrame(summary_data, columns=summary_columns)
# Best-performing types (by F1) first.
summary_df = summary_df.sort_values('f1', ascending=False)
print(summary_df.to_string(index=False))

## Step 19: Visualize - Run Streamlit Dashboard

In [None]:
import subprocess
import os

# Directory that holds streamlit_app.py: the parent of the current working directory.
# It is passed to the child process via `cwd` instead of calling os.chdir, so
# the kernel's own working directory is left untouched. Changing it would make
# every '../data/...' path in this notebook resolve one level too high, and
# re-running this cell would climb one more level each time.
project_root = os.path.abspath('..')

print("Starting Streamlit dashboard...")
print("Dashboard will open in your browser at http://localhost:8501")
print("\nPress Ctrl+C in the terminal to stop the server\n")

# subprocess.run waits for the command to finish, so this cell blocks (and
# later cells do not run) until the Streamlit server is stopped.
subprocess.run(['streamlit', 'run', 'streamlit_app.py'], cwd=project_root)

## Step 20: Save Final Dataset

In [None]:
import os

# Output location, relative to the notebook's working directory.
processed_dir = '../data/processed'
processed_csv = '../data/processed/traffy_weather_daily.csv'

# Create the folder on first run; exist_ok makes re-running this cell harmless.
os.makedirs(processed_dir, exist_ok=True)

# Persist the fully processed frame without the pandas index column.
df_merged.to_csv(processed_csv, index=False)

print('Saved: data/processed/traffy_weather_daily.csv')
print(f"Final shape: {df_merged.shape}")