In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pipeline functions
def extract_time_features(df, timestamp_col='timestamp'):
    df = df.copy()
    df['timestamp_col'] = pd.to_datetime(df[timestamp_col], format='mixed', utc=True)
    df['hour'] = df['timestamp_col'].dt.hour
    df['day_of_week'] = df['timestamp_col'].dt.dayofweek
    df['month'] = df['timestamp_col'].dt.month
    return df

def encode_cyclical_features(df):
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    return df

def parse_type_column(df):
    def parse_cell(cell):
        if isinstance(cell, str):
            s = cell.strip('{}').strip()
            return [x.strip() for x in s.split(',')] if s else []
        elif isinstance(cell, list):
            return cell
        else:
            return []
    df['type'] = df['type'].apply(parse_cell)
    return df

def filter_empty_types(df, column='type'):
    return df[df[column].map(lambda x: len(x) > 0)].reset_index(drop=True)

def drop_missing_weather(df, weather_columns=None):
    if weather_columns is None:
        weather_columns = ['temperature_2m (°C)', 'rain (mm)', 'wind_speed_10m (km/h)']
    return df.dropna(subset=weather_columns).reset_index(drop=True)

In [35]:
df = pd.read_csv('data/processed/traffy_weather_merged.csv')
print(f"Initial: {len(df):,} records")
df.info()

Initial: 651,600 records
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651600 entries, 0 to 651599
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   type                           651600 non-null  object 
 1   comment                        651600 non-null  object 
 2   coords                         651600 non-null  object 
 3   subdistrict                    651600 non-null  object 
 4   district                       651600 non-null  object 
 5   province                       651600 non-null  object 
 6   timestamp                      651600 non-null  object 
 7   longitude                      651600 non-null  float64
 8   latitude                       651600 non-null  float64
 9   timestamp_hour                 651600 non-null  object 
 10  grid_lat                       651600 non-null  float64
 11  grid_lon                       651600 non-null  float64
 12  time 

In [36]:
df.columns = df.columns.str.strip()

# Convert weather columns to numeric
weather_cols = ['pm25', 'pm10', 'o3', 'no2', 'temperature_2m (°C)', 'rain (mm)', 'wind_speed_10m (km/h)']
for col in weather_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Parse and filter complaint types
df = parse_type_column(df)
df = filter_empty_types(df)
print(f"After filter_empty_types: {len(df):,} records")

# Drop rows with missing weather data
df = drop_missing_weather(df, weather_columns=['temperature_2m (°C)', 'rain (mm)'])
print(f"After drop_missing_weather: {len(df):,} records")

# Extract temporal features
df = extract_time_features(df, timestamp_col='timestamp')
df = encode_cyclical_features(df)

# One-hot encode districts
district_encoded = pd.get_dummies(df['district'], prefix='district')
df = pd.concat([df, district_encoded], axis=1)

# Create binary target columns
all_types = set()
for type_list in df['type']:
    all_types.update(type_list)

for t in all_types:
    df[f'type_{t}'] = df['type'].apply(lambda x: 1 if t in x else 0)

print(f"Final shape: {df.shape}, Types: {len(all_types)}")

After filter_empty_types: 540,633 records
After drop_missing_weather: 187,138 records
Final shape: (187138, 84), Types: 24
Final shape: (187138, 84), Types: 24


In [37]:
def get_predictable_types(df, sort_by_distribution=False):
    """
    Get all predictable complaint types from dataframe.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with binary type columns (type_*)
    sort_by_distribution : bool
        If True, sort by frequency (most common first)
    
    Returns:
    --------
    list : List of all type column names (with 'type_' prefix)
    """
    type_cols = [c for c in df.columns if c.startswith('type_')]
    
    if sort_by_distribution:
        # Sort by count (most frequent first)
        type_counts = [(col, df[col].sum()) for col in type_cols]
        type_counts.sort(key=lambda x: x[1], reverse=True)
        type_names = [col for col, count in type_counts]
    else:
        type_names = type_cols
    
    return type_names

# Get list of all predictable types (sorted by frequency)
available_types = get_predictable_types(df, sort_by_distribution=True)
print(f"Total types: {len(available_types)}")
print(f"Top 10: {[t.replace('type_', '') for t in available_types[:10]]}")

Total types: 24
Top 10: ['ถนน', 'ทางเท้า', 'กีดขวาง', 'ความปลอดภัย', 'แสงสว่าง', 'ความสะอาด', 'จราจร', 'ท่อระบายน้ำ', 'สะพาน', 'ป้าย']


In [38]:
import pickle
import os

def save_model(model, type_name, output_dir='data/models'):
    """
    Save trained model to disk.
    
    Parameters:
    -----------
    model : sklearn model
        Trained model to save
    type_name : str
        Complaint type name (with 'type_' prefix)
    output_dir : str
        Directory to save models
    """
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{type_name.replace('type_', '')}_model.pkl"
    filepath = os.path.join(output_dir, filename)
    
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    
    return filepath

def save_feature_names(feature_names, output_dir='data/models'):
    """Save feature names to a separate file (same for all models)."""
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, 'feature_names.pkl')
    with open(filepath, 'wb') as f:
        pickle.dump(feature_names, f)
    return filepath

def train_models(df, type_indices, available_types, n_iter=10):
    """
    Train models for selected complaint types.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Prepared dataframe with features and targets
    type_indices : list
        List of indices to select from available_types
    available_types : list
        List of all available type column names
    n_iter : int
        Number of RandomizedSearchCV iterations
    
    Returns:
    --------
    dict : Dictionary with type names as keys and results as values
    """
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    
    results = {}
    
    # Parameter distribution for RandomizedSearchCV
    param_dist = {
        'n_estimators': randint(100, 501),
        'max_depth': randint(10, 51),
        'min_samples_split': randint(2, 11),
        'min_samples_leaf': randint(1, 5),
        'max_features': ['sqrt', 'log2', None],
        'class_weight': ['balanced', 'balanced_subsample']
    }
    
    # Prepare features once
    exclude = df.select_dtypes(include=['object']).columns.tolist() + ['timestamp', 'timestamp_col'] + [c for c in df.columns if c.startswith('type_')]
    X = df[[c for c in df.columns if c not in exclude]].fillna(0)
    feature_names = X.columns.tolist()
    
    # Save feature names once (same for all models)
    feature_names_path = save_feature_names(feature_names)
    print(f"✓ Feature names saved to: {feature_names_path}")
    print(f"  Total features: {len(feature_names)}\n")
    
    for idx in type_indices:
        if idx >= len(available_types):
            print(f"⚠ Index {idx} out of range, skipping...")
            continue
            
        target = available_types[idx]
        type_name = target.replace('type_', '')
        
        print("\n" + "="*80)
        print(f"Training model for: {type_name} (index {idx})")
        print("="*80)
        
        y = df[target]
        minority_ratio = y.sum() / len(y)
        print(f"Positive samples: {y.sum():,} ({minority_ratio:.2%})")
        
        # Resample if needed
        if minority_ratio < 0.05:
            try:
                smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.2)
                under = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
                X_res, y_res = ImbPipeline([('s', smote), ('u', under)]).fit_resample(X, y)
                print(f"After resampling: {len(X_res):,} samples")
            except Exception as e:
                print(f"⚠ Resampling failed: {e}, using original data")
                X_res, y_res = X, y
        else:
            X_res, y_res = X, y
        
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
        
        # Train with RandomizedSearchCV
        print(f"Training with RandomizedSearchCV (n_iter={n_iter})...")
        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        random_search = RandomizedSearchCV(rf, param_dist, n_iter=n_iter, cv=3, scoring='f1', random_state=42, n_jobs=-1, verbose=0)
        random_search.fit(X_train, y_train)
        
        # Evaluate
        model = random_search.best_estimator_
        y_pred = model.predict(X_test)
        
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'best_params': random_search.best_params_
        }
        
        # Save model
        filepath = save_model(model, target)
        
        print(f"\n✓ Results:")
        print(f"  Accuracy:  {metrics['accuracy']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall:    {metrics['recall']:.4f}")
        print(f"  F1:        {metrics['f1']:.4f}")
        print(f"  Saved to:  {filepath}")
        
        # Feature importance with names
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\nTop 10 Feature Importances:")
        for i, row in importance_df.head(10).iterrows():
            print(f"  {row['feature']:40s} {row['importance']:.6f}")
        
        results[type_name] = {
            'model': model,
            'metrics': metrics,
            'filepath': filepath,
            'feature_importance': importance_df
        }
    
    return results

In [39]:
# Select types to train (by index)
selected_indices = [22]  # Train top 3 types

print(f"Selected types for training:")
for idx in selected_indices:
    if idx < len(available_types):
        print(f"  [{idx}] {available_types[idx].replace('type_', '')}")

# Train models for selected types
trained_results = train_models(df, selected_indices, available_types, n_iter=1)

# Summary
print("\n" + "="*80)
print("Training Summary")
print("="*80)
for type_name, result in trained_results.items():
    print(f"{type_name}: F1={result['metrics']['f1']:.4f}, saved to {result['filepath']}")

Selected types for training:
  [22] การเดินทาง
✓ Feature names saved to: data/models\feature_names.pkl
  Total features: 50


Training model for: การเดินทาง (index 22)
Positive samples: 650 (0.35%)
After resampling: 161,620 samples
Training with RandomizedSearchCV (n_iter=1)...
After resampling: 161,620 samples
Training with RandomizedSearchCV (n_iter=1)...


Selected types for training:
  [22] การเดินทาง
✓ Feature names saved to: data/models\feature_names.pkl
  Total features: 50


Training model for: การเดินทาง (index 22)
Positive samples: 650 (0.35%)
After resampling: 161,620 samples
Training with RandomizedSearchCV (n_iter=1)...
After resampling: 161,620 samples
Training with RandomizedSearchCV (n_iter=1)...


KeyboardInterrupt: 

In [None]:
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline as ImbPipeline

# target = 'type_ถนน'
# exclude = df.select_dtypes(include=['object']).columns.tolist() + ['timestamp', 'timestamp_col'] + [c for c in df.columns if c.startswith('type_')]

# X = df[[c for c in df.columns if c not in exclude]].fillna(0)
# y = df[target]

# # Show weather features being used
# weather_cols = [c for c in X.columns if any(w in c.lower() for w in ['pm25', 'pm10', 'temperature', 'rain', 'wind', 'o3', 'no2'])]
# print(f"Weather features ({len(weather_cols)}): {weather_cols[:10]}")

# minority_ratio = y.sum() / len(y)
# print(f"\nTarget: {target} ({y.sum():,}, {minority_ratio:.2%})")

# if minority_ratio < 0.05:
#     smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.2)
#     under = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
#     X_res, y_res = ImbPipeline([('s', smote), ('u', under)]).fit_resample(X, y)
# else:
#     X_res, y_res = X, y

# X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
# print(f"Train: {len(X_train):,}, Test: {len(X_test):,}, Features: {X_train.shape[1]}")

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# param_dist = {
#     'n_estimators': randint(100, 501),
#     'max_depth': randint(10, 51),
#     'min_samples_split': randint(2, 11),
#     'min_samples_leaf': randint(1, 5),
#     'max_features': ['sqrt', 'log2', None],
#     'class_weight': ['balanced', 'balanced_subsample']
# }

# rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# random_search = RandomizedSearchCV(rf, param_dist, n_iter=10, cv=3, scoring='f1', random_state=42, n_jobs=-1, verbose=1)
# random_search.fit(X_train, y_train)

# model = random_search.best_estimator_
# y_pred = model.predict(X_test)

# print(f"\nBest params: {random_search.best_params_}")
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# print(f"Precision: {precision_score(y_test, y_pred):.4f}")
# print(f"Recall: {recall_score(y_test, y_pred):.4f}")
# print(f"F1: {f1_score(y_test, y_pred):.4f}")