In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pipeline functions
def extract_time_features(df, timestamp_col='timestamp'):
    df = df.copy()
    df['timestamp_col'] = pd.to_datetime(df[timestamp_col], format='mixed', utc=True)
    df['hour'] = df['timestamp_col'].dt.hour
    df['day_of_week'] = df['timestamp_col'].dt.dayofweek
    df['month'] = df['timestamp_col'].dt.month
    return df

def encode_cyclical_features(df):
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    return df

def parse_type_column(df):
    def parse_cell(cell):
        if isinstance(cell, str):
            s = cell.strip('{}').strip()
            return [x.strip() for x in s.split(',')] if s else []
        elif isinstance(cell, list):
            return cell
        else:
            return []
    df['type'] = df['type'].apply(parse_cell)
    return df

def filter_empty_types(df, column='type'):
    return df[df[column].map(lambda x: len(x) > 0)].reset_index(drop=True)

def drop_missing_weather(df, weather_columns=None):
    if weather_columns is None:
        weather_columns = ['temperature_2m (°C)', 'rain (mm)', 'wind_speed_10m (km/h)']
    return df.dropna(subset=weather_columns).reset_index(drop=True)

In [28]:
df = pd.read_csv('data/processed/traffy_weather_merged.csv')
print(f"Initial: {len(df):,} records")
df.info()

  df = pd.read_csv('data/processed/traffy_weather_merged.csv')


Initial: 651,600 records
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651600 entries, 0 to 651599
Data columns (total 22 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   type                           651600 non-null  object 
 1   comment                        651600 non-null  object 
 2   coords                         651600 non-null  object 
 3   subdistrict                    651600 non-null  object 
 4   district                       651600 non-null  object 
 5   province                       651600 non-null  object 
 6   timestamp                      651600 non-null  object 
 7   date                           651600 non-null  object 
 8    pm25                          651138 non-null  object 
 9    pm10                          651138 non-null  object 
 10   o3                            651138 non-null  object 
 11   no2                           651138 non-null  object 
 12  type_

In [29]:
# Sample 200k records for faster training

def sample(df):
    df_sampled = df.sample(n=200000, random_state=42).reset_index(drop=True)
    print(f"Sampled to: {len(df_sampled):,} records")
    return df_sampled

In [None]:
df.columns = df.columns.str.strip()

df = sample(df)

# Convert weather columns to numeric
weather_cols = ['pm25', 'pm10', 'o3', 'no2', 'temperature_2m (°C)', 'rain (mm)', 'wind_speed_10m (km/h)']
for col in weather_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Parse and filter complaint types
df = parse_type_column(df)
df = filter_empty_types(df)
print(f"After filter_empty_types: {len(df):,} records")

# Drop rows with missing weather data
df = drop_missing_weather(df, weather_columns=['temperature_2m (°C)', 'rain (mm)'])
print(f"After drop_missing_weather: {len(df):,} records")

# Extract temporal features
df = extract_time_features(df, timestamp_col='timestamp')
df = encode_cyclical_features(df)

# One-hot encode districts
district_encoded = pd.get_dummies(df['district'], prefix='district')
df = pd.concat([df, district_encoded], axis=1)

# Create binary target columns
all_types = set()
for type_list in df['type']:
    if isinstance(type_list, list):
        all_types.update(type_list)
    else:
        print(f"Warning: Non-list type found: {type_list}")

print(f"Found {len(all_types)} unique types")


for t in all_types:
    df[f'type_{t}'] = df['type'].apply(lambda x: 1 if (isinstance(x, list) and t in x) else 0)

# Verify binary columns are numeric
type_cols = [c for c in df.columns if c.startswith('type_')]
print(f"Created {len(type_cols)} binary type columns")
print(f"Sample type column dtypes: {df[type_cols[:3]].dtypes.tolist()}")

print(f"Final shape: {df.shape}, Types: {len(all_types)}")

Sampled to: 200,000 records
After filter_empty_types: 200,000 records
After drop_missing_weather: 200,000 records
After filter_empty_types: 200,000 records
After drop_missing_weather: 200,000 records
Found 25 unique types
Found 25 unique types
Created 26 binary type columns
Sample type column dtypes: [dtype('O'), dtype('int64'), dtype('int64')]
Final shape: (200000, 107), Types: 25
Created 26 binary type columns
Sample type column dtypes: [dtype('O'), dtype('int64'), dtype('int64')]
Final shape: (200000, 107), Types: 25


In [None]:
def get_predictable_types(df, sort_by_distribution=False):
    """
    Get all predictable complaint types from dataframe.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with binary type columns (type_*)
    sort_by_distribution : bool
        If True, sort by frequency (most common first)
    
    Returns:
    --------
    list : List of all type column names (with 'type_' prefix)
    """
    type_cols = [c for c in df.columns if c.startswith('type_')]
    
    if sort_by_distribution:
        # Count occurrences for each type and sort
        type_counts = {}
        for col in type_cols:
            type_counts[col] = (df[col] == 1).sum()
        
        # Sort by count (most frequent first)
        sorted_types = sorted(type_counts.items(), key=lambda x: x[1], reverse=True)
        type_names = [col for col, count in sorted_types]
    else:
        type_names = type_cols
    
    return type_names

# Get list of all predictable types (sorted by frequency)
available_types = get_predictable_types(df, sort_by_distribution=True)
print(f"Total types: {len(available_types)}")
print(f"Top 10: {[t.replace('type_', '') for t in available_types[:10]]}")

Total types: 26
Top 10: ['ถนน', 'ไม่ระบุ', 'ทางเท้า', 'ความปลอดภัย', 'แสงสว่าง', 'กีดขวาง', 'ความสะอาด', 'จราจร', 'น้ำท่วม', 'ท่อระบายน้ำ']


In [32]:
# Analyze class distribution for all types
print("Class Distribution Analysis:")
print("="*80)

type_stats = []
for col in available_types[:15]:  # Check top 15
    positive = (df[col] == 1).sum()
    negative = (df[col] == 0).sum()
    ratio = positive / len(df)
    type_stats.append({
        'type': col.replace('type_', ''),
        'positive': positive,
        'negative': negative,
        'ratio': ratio,
        'imbalance': negative / positive if positive > 0 else float('inf')
    })

stats_df = pd.DataFrame(type_stats)
print(stats_df.to_string(index=False))
print("\n" + "="*80)

Class Distribution Analysis:
       type  positive  negative    ratio  imbalance
        ถนน     43932    156068 0.219660   3.552490
    ไม่ระบุ     33967    166033 0.169835   4.888068
    ทางเท้า     30900    169100 0.154500   5.472492
ความปลอดภัย     20420    179580 0.102100   8.794319
   แสงสว่าง     17666    182334 0.088330  10.321182
    กีดขวาง     17511    182489 0.087555  10.421392
  ความสะอาด     16402    183598 0.082010  11.193635
      จราจร     12457    187543 0.062285  15.055230
    น้ำท่วม     10945    189055 0.054725  17.273184
ท่อระบายน้ำ     10869    189131 0.054345  17.400957
       ป้าย     10094    189906 0.050470  18.813751
  ร้องเรียน      9894    190106 0.049470  19.214271
      สะพาน      8420    191580 0.042100  22.752969
     ต้นไม้      7490    192510 0.037450  25.702270
 เสียงรบกวน      6967    193033 0.034835  27.706760



In [None]:
import pickle
import os

def save_model(model, type_name, output_dir='data/models'):
    """
    Save trained model to disk.
    
    Parameters:
    -----------
    model : sklearn model
        Trained model to save
    type_name : str
        Complaint type name (with 'type_' prefix)
    output_dir : str
        Directory to save models
    """
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{type_name.replace('type_', '')}_model.pkl"
    filepath = os.path.join(output_dir, filename)
    
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    
    return filepath

def save_feature_names(feature_names, output_dir='data/models'):
    """Save feature names to a separate file (same for all models)."""
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, 'feature_names.pkl')
    with open(filepath, 'wb') as f:
        pickle.dump(feature_names, f)
    return filepath

def train_models(df, type_indices, available_types, n_iter=5, adaptive_strategy=True):
    """
    Train models for selected complaint types with adaptive resampling strategy.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Prepared dataframe with features and targets
    type_indices : list
        List of indices to select from available_types
    available_types : list
        List of all available type column names
    n_iter : int
        Number of RandomizedSearchCV iterations (reduced to 5 for speed)
    adaptive_strategy : bool
        If True, use adaptive resampling based on class distribution
    
    Returns:
    --------
    dict : Dictionary with type names as keys and results as values
    """
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    
    results = {}
    
    # Optimized parameter distribution (fewer options for speed)
    param_dist = {
        'n_estimators': randint(100, 301),  # Reduced max from 501
        'max_depth': randint(10, 31),        # Reduced max from 51
        'min_samples_split': randint(2, 6),  # Reduced max from 11
        'min_samples_leaf': randint(1, 4),   # Reduced max from 5
        'max_features': ['sqrt', 'log2'],    # Removed None option
        'class_weight': ['balanced', 'balanced_subsample']
    }
    
    # Prepare features once
    exclude = df.select_dtypes(include=['object']).columns.tolist() + ['timestamp', 'timestamp_col'] + [c for c in df.columns if c.startswith('type_')]
    X = df[[c for c in df.columns if c not in exclude]].fillna(0)
    feature_names = X.columns.tolist()
    
    # Save feature names once (same for all models)
    feature_names_path = save_feature_names(feature_names)
    print(f"✓ Feature names saved to: {feature_names_path}")
    print(f"  Total features: {len(feature_names)}\n")
    
    for idx in type_indices:
        if idx >= len(available_types):
            print(f"⚠ Index {idx} out of range, skipping...")
            continue
            
        target = available_types[idx]
        type_name = target.replace('type_', '')
        
        print("\n" + "="*80)
        print(f"Training model for: {type_name} (index {idx})")
        print("="*80)
        
        y = df[target]
        minority_ratio = y.sum() / len(y)
        print(f"Positive samples: {y.sum():,} ({minority_ratio:.2%})")
        
        # ADAPTIVE RESAMPLING STRATEGY
        if adaptive_strategy:
            if minority_ratio >= 0.15:
                # High frequency: No resampling, use class_weight only
                print(f"Strategy: CLASS_WEIGHT ONLY (ratio >= 15%)")
                X_res, y_res = X, y
                
            elif minority_ratio >= 0.05:
                # Medium frequency: Pure undersampling to 1:2 ratio
                print(f"Strategy: UNDERSAMPLING to 1:2 ratio (5% <= ratio < 15%)")
                try:
                    under = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
                    X_res, y_res = under.fit_resample(X, y)
                    print(f"After undersampling: {len(X_res):,} samples (from {len(X):,})")
                except Exception as e:
                    print(f"⚠ Undersampling failed: {e}, using original data")
                    X_res, y_res = X, y
                    
            else:
                # Low frequency: SMOTE + undersampling
                print(f"Strategy: SMOTE + UNDERSAMPLING (ratio < 5%)")
                try:
                    smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.2)
                    under = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
                    X_res, y_res = ImbPipeline([('s', smote), ('u', under)]).fit_resample(X, y)
                    print(f"After SMOTE+under: {len(X_res):,} samples (from {len(X):,})")
                except Exception as e:
                    print(f"⚠ Resampling failed: {e}, using original data")
                    X_res, y_res = X, y
        else:
            # Original logic (for comparison)
            if minority_ratio < 0.05:
                try:
                    smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.2)
                    under = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
                    X_res, y_res = ImbPipeline([('s', smote), ('u', under)]).fit_resample(X, y)
                    print(f"After resampling: {len(X_res):,} samples")
                except Exception as e:
                    print(f"⚠ Resampling failed: {e}, using original data")
                    X_res, y_res = X, y
            else:
                X_res, y_res = X, y
        
        # Train/test split with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
        )
          
        # Train with RandomizedSearchCV (reduced CV from 3 to 2 for speed)
        print(f"Training with RandomizedSearchCV (n_iter={n_iter}, cv=2)...")
        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        random_search = RandomizedSearchCV(rf, param_dist, n_iter=n_iter, cv=2, scoring='f1', random_state=42, n_jobs=-1, verbose=0)
        random_search.fit(X_train, y_train)
        
        # Evaluate
        model = random_search.best_estimator_
        y_pred = model.predict(X_test)
        
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'best_params': random_search.best_params_
        }
        
        # Save model
        filepath = save_model(model, target)
        
        print(f"\n✓ Results:")
        print(f"  Accuracy:  {metrics['accuracy']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall:    {metrics['recall']:.4f}")
        print(f"  F1:        {metrics['f1']:.4f}")
        print(f"  Saved to:  {filepath}")
        
        # Feature importance with names
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\nTop 5 Feature Importances:")  # Reduced from 10 to 5
        for i, row in importance_df.head(5).iterrows():
            print(f"  {row['feature']:40s} {row['importance']:.6f}")
        
        results[type_name] = {
            'model': model,
            'metrics': metrics,
            'filepath': filepath,
            'feature_importance': importance_df
        }
    
    return results

In [None]:
# Filter types with sufficient samples for training
min_positive_samples = 50  # Minimum for SMOTE k_neighbors=5
valid_indices = []
skipped_types = []

print("Filtering types by minimum sample threshold...")
print("="*80)

for idx in range(len(available_types)):
    col = available_types[idx]
    count = (df[col] == 1).sum()
    type_name = col.replace('type_', '')
    
    if count >= min_positive_samples:
        valid_indices.append(idx)
        print(f"✓ [{idx:2d}] {type_name:20s} - {count:,} samples")
    else:
        skipped_types.append((type_name, count))

print("\n" + "="*80)
print(f"Valid types for training: {len(valid_indices)}/{len(available_types)}")
print(f"Skipped (< {min_positive_samples} samples): {len(skipped_types)}")

if skipped_types:
    print("\nSkipped types:")
    for type_name, count in skipped_types[:5]:
        print(f"  {type_name}: {count} samples")
    if len(skipped_types) > 5:
        print(f"  ... and {len(skipped_types) - 5} more")

# Train ALL valid types with adaptive strategy
selected_indices = valid_indices
print(f"\n{'='*80}")
print(f"Training {len(selected_indices)} models with ADAPTIVE STRATEGY")
print(f"Estimated time: ~{len(selected_indices) * 3.5:.0f}-{len(selected_indices) * 5:.0f} minutes")
print(f"{'='*80}\n")

import time
start_time = time.time()

# Train with optimized parameters (n_iter=5, cv=2)
trained_results = train_models(df, selected_indices, available_types, n_iter=5, adaptive_strategy=True)

elapsed_time = time.time() - start_time

# Summary
print("\n" + "="*80)
print("TRAINING SUMMARY")
print("="*80)
print(f"Total models trained: {len(trained_results)}/{len(selected_indices)}")
print(f"Total time: {elapsed_time/60:.1f} minutes ({elapsed_time/3600:.2f} hours)")
print(f"Average time per model: {elapsed_time/len(trained_results):.1f} seconds")
print("\n" + "-"*80)
print(f"{'Type':<20s} {'F1':>8s} {'Precision':>10s} {'Recall':>8s} {'Filepath':<30s}")
print("-"*80)

for type_name, result in sorted(trained_results.items(), key=lambda x: x[1]['metrics']['f1'], reverse=True):
    metrics = result['metrics']
    filepath = result['filepath'].split('/')[-1]
    print(f"{type_name:<20s} {metrics['f1']:>8.4f} {metrics['precision']:>10.4f} {metrics['recall']:>8.4f} {filepath:<30s}")

print("="*80)

Filtering types by minimum sample threshold...
✓ [ 0] ถนน                  - 43,932 samples
✓ [ 1] ไม่ระบุ              - 33,967 samples
✓ [ 2] ทางเท้า              - 30,900 samples
✓ [ 3] ความปลอดภัย          - 20,420 samples
✓ [ 4] แสงสว่าง             - 17,666 samples
✓ [ 5] กีดขวาง              - 17,511 samples
✓ [ 6] ความสะอาด            - 16,402 samples
✓ [ 7] จราจร                - 12,457 samples
✓ [ 8] น้ำท่วม              - 10,945 samples
✓ [ 9] ท่อระบายน้ำ          - 10,869 samples
✓ [10] ป้าย                 - 10,094 samples
✓ [11] ร้องเรียน            - 9,894 samples
✓ [12] สะพาน                - 8,420 samples
✓ [13] ต้นไม้               - 7,490 samples
✓ [14] เสียงรบกวน           - 6,967 samples
✓ [15] สายไฟ                - 6,890 samples
✓ [16] คลอง                 - 5,446 samples
✓ [17] สัตว์จรจัด           - 3,375 samples
✓ [18] คนจรจัด              - 1,915 samples
✓ [19] PM2.5                - 1,663 samples
✓ [20] สอบถาม               - 1,149 samples
✓ [21] เสนอแนะ    

In [35]:
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline as ImbPipeline

# target = 'type_ถนน'
# exclude = df.select_dtypes(include=['object']).columns.tolist() + ['timestamp', 'timestamp_col'] + [c for c in df.columns if c.startswith('type_')]

# X = df[[c for c in df.columns if c not in exclude]].fillna(0)
# y = df[target]

# # Show weather features being used
# weather_cols = [c for c in X.columns if any(w in c.lower() for w in ['pm25', 'pm10', 'temperature', 'rain', 'wind', 'o3', 'no2'])]
# print(f"Weather features ({len(weather_cols)}): {weather_cols[:10]}")

# minority_ratio = y.sum() / len(y)
# print(f"\nTarget: {target} ({y.sum():,}, {minority_ratio:.2%})")

# if minority_ratio < 0.05:
#     smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.2)
#     under = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
#     X_res, y_res = ImbPipeline([('s', smote), ('u', under)]).fit_resample(X, y)
# else:
#     X_res, y_res = X, y

# X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
# print(f"Train: {len(X_train):,}, Test: {len(X_test):,}, Features: {X_train.shape[1]}")

In [36]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# param_dist = {
#     'n_estimators': randint(100, 501),
#     'max_depth': randint(10, 51),
#     'min_samples_split': randint(2, 11),
#     'min_samples_leaf': randint(1, 5),
#     'max_features': ['sqrt', 'log2', None],
#     'class_weight': ['balanced', 'balanced_subsample']
# }

# rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# random_search = RandomizedSearchCV(rf, param_dist, n_iter=10, cv=3, scoring='f1', random_state=42, n_jobs=-1, verbose=1)
# random_search.fit(X_train, y_train)

# model = random_search.best_estimator_
# y_pred = model.predict(X_test)

# print(f"\nBest params: {random_search.best_params_}")
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# print(f"Precision: {precision_score(y_test, y_pred):.4f}")
# print(f"Recall: {recall_score(y_test, y_pred):.4f}")
# print(f"F1: {f1_score(y_test, y_pred):.4f}")