In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('data/processed/traffy_merged.csv')

  df = pd.read_csv('data/processed/traffy_merged.csv')


In [4]:
df.columns = df.columns.str.strip()
df['pm25'] = pd.to_numeric(df['pm25'], errors='coerce')
df['pm10'] = pd.to_numeric(df['pm10'], errors='coerce')
df['o3'] = pd.to_numeric(df['o3'], errors='coerce')
df['no2'] = pd.to_numeric(df['no2'], errors='coerce')

df['type'] = df['type_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed', utc=True)
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

district_encoded = pd.get_dummies(df['district'], prefix='district')
df = pd.concat([df, district_encoded], axis=1)

print(f"Shape: {df.shape}")

Shape: (651600, 70)


In [5]:
all_types = set()
for type_list in df['type']:
    all_types.update(type_list)
all_types = sorted(list(all_types))

for t in all_types:
    df[f'type_{t}'] = df['type'].apply(lambda x: 1 if t in x else 0)

df = df.drop(['comment', 'coords', 'subdistrict', 'district', 'province', 'type', 'type_list', 'date'], axis=1)

print(f"Created {len(all_types)} binary target columns")

Created 24 binary target columns


In [6]:
type_cols = [c for c in df.columns if c.startswith('type_')]
type_distribution = []

for col in type_cols:
    count = df[col].sum()
    pct = (count / len(df)) * 100
    type_name = col.replace('type_', '')
    type_distribution.append({'type': type_name, 'count': count, 'percentage': pct})

dist_df = pd.DataFrame(type_distribution).sort_values('count', ascending=False)

print("="*80)
print("Class Distribution for All Complaint Types")
print("="*80)
print(dist_df.to_string(index=False))
print(f"\nTotal samples: {len(df):,}")
print(f"Total complaint types: {len(type_cols)}")

Class Distribution for All Complaint Types
       type  count  percentage
        ถนน 142104   21.808471
    ทางเท้า 101322   15.549724
ความปลอดภัย  66055   10.137354
   แสงสว่าง  57715    8.857428
    กีดขวาง  56902    8.732658
  ความสะอาด  53568    8.220994
      จราจร  41069    6.302793
    น้ำท่วม  35979    5.521639
ท่อระบายน้ำ  35583    5.460866
       ป้าย  32687    5.016421
  ร้องเรียน  31596    4.848987
      สะพาน  27306    4.190608
     ต้นไม้  24145    3.705494
      สายไฟ  22496    3.452425
 เสียงรบกวน  22345    3.429251
       คลอง  17743    2.722990
 สัตว์จรจัด  10836    1.662983
    คนจรจัด   6275    0.963014
      PM2.5   5580    0.856354
     สอบถาม   3729    0.572284
    เสนอแนะ   2709    0.415746
 การเดินทาง   1879    0.288367
    ห้องน้ำ   1306    0.200430
  ป้ายจราจร   1237    0.189840

Total samples: 651,600
Total complaint types: 24


In [7]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

target_type = 'type_ถนน'

text_cols = df.select_dtypes(include=['object']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
exclude = text_cols + datetime_cols + ['timestamp'] + [c for c in df.columns if c.startswith('type_')]

X = df[[c for c in df.columns if c not in exclude]].select_dtypes(include=[np.number]).fillna(0)
y = df[target_type]

print(f"Target: {target_type}")
print(f"Original - Positive: {y.sum():,} ({y.sum()/len(y)*100:.2f}%)")
print(f"Original - Negative: {(y==0).sum():,} ({(y==0).sum()/len(y)*100:.2f}%)")

minority_ratio = y.sum() / len(y)
print(f"Minority ratio: {minority_ratio:.4f}")

try:
    if minority_ratio < 0.01:
        # For very imbalanced data, only use SMOTE to bring minority to 10% of majority
        print("\n⚠ Severely imbalanced - using SMOTE only")
        smote = SMOTE(random_state=42, k_neighbors=min(5, y.sum()-1), sampling_strategy=0.1)
        X_resampled, y_resampled = smote.fit_resample(X, y)
    elif minority_ratio < 0.05:
        # Moderately imbalanced - SMOTE to 20% then undersample to 30%
        print("\n⚙ Using SMOTE + moderate undersampling")
        smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.2)
        under = RandomUnderSampler(random_state=42, sampling_strategy=0.3)
        pipeline = ImbPipeline([('smote', smote), ('under', under)])
        X_resampled, y_resampled = pipeline.fit_resample(X, y)
    else:
        # Less imbalanced - SMOTE + aggressive undersampling
        print("\n⚙ Using SMOTE + aggressive undersampling")
        smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.5)
        under = RandomUnderSampler(random_state=42, sampling_strategy=0.7)
        pipeline = ImbPipeline([('smote', smote), ('under', under)])
        X_resampled, y_resampled = pipeline.fit_resample(X, y)
    
    print(f"\nAfter Resampling:")
    print(f"Positive: {y_resampled.sum():,} ({y_resampled.sum()/len(y_resampled)*100:.2f}%)")
    print(f"Negative: {(y_resampled==0).sum():,} ({(y_resampled==0).sum()/len(y_resampled)*100:.2f}%)")
    print(f"Total samples: {len(y_resampled):,} (was {len(y):,})")
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    print(f"\n✓ Balanced dataset ready for training")
    print(f"Train - Positive: {y_train.sum():,}, Negative: {(y_train==0).sum():,}")
    print(f"Test - Positive: {y_test.sum():,}, Negative: {(y_test==0).sum():,}")
    
except Exception as e:
    print(f"\n⚠ Resampling failed: {e}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print("Using original imbalanced data with stratified split + class_weight")

Target: type_ถนน
Original - Positive: 142,104 (21.81%)
Original - Negative: 509,496 (78.19%)
Minority ratio: 0.2181

⚙ Using SMOTE + aggressive undersampling

After Resampling:
Positive: 254,748 (41.18%)
Negative: 363,925 (58.82%)
Total samples: 618,673 (was 651,600)

✓ Balanced dataset ready for training
Train - Positive: 203,762, Negative: 291,176
Test - Positive: 50,986, Negative: 72,749

After Resampling:
Positive: 254,748 (41.18%)
Negative: 363,925 (58.82%)
Total samples: 618,673 (was 651,600)

✓ Balanced dataset ready for training
Train - Positive: 203,762, Negative: 291,176
Test - Positive: 50,986, Negative: 72,749


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define parameter distributions (continuous ranges)
param_dist = {
    'n_estimators': randint(100, 501),  # Uniform distribution from 100 to 500
    'max_depth': randint(10, 51),  # Uniform distribution from 10 to 50
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', 'balanced_subsample']
}

print("Starting RandomizedSearchCV...")
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(
    rf, 
    param_distributions=param_dist,
    n_iter=5,  # Number of parameter settings sampled
    cv=2,  # 3-fold cross-validation
    scoring='f1',  # Optimize for F1 score
    random_state=42,
    n_jobs=-1,
    verbose=3
)

random_search.fit(X_train, y_train)

print("\n" + "="*60)
print("Best Parameters:")
print("="*60)
for param, value in random_search.best_params_.items():
    print(f"{param}: {value}")

print(f"\nBest CV F1 Score: {random_search.best_score_:.4f}")

# Use best model
model = random_search.best_estimator_
print("\n✓ Training complete with optimized parameters")

Starting RandomizedSearchCV...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Best Parameters:
class_weight: balanced
max_depth: 38
max_features: None
min_samples_leaf: 3
min_samples_split: 9
n_estimators: 288

Best CV F1 Score: 0.5232

✓ Training complete with optimized parameters

Best Parameters:
class_weight: balanced
max_depth: 38
max_features: None
min_samples_leaf: 3
min_samples_split: 9
n_estimators: 288

Best CV F1 Score: 0.5232

✓ Training complete with optimized parameters


In [9]:
y_pred = model.predict(X_test)

print("="*60)
print("Model Performance: type_การเดินทาง")
print("="*60)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"F1: {f1_score(y_test, y_pred, zero_division=0):.4f}")
print("\n" + classification_report(y_test, y_pred, target_names=['Not การเดินทาง', 'การเดินทาง']))

Model Performance: type_การเดินทาง
Accuracy: 0.5805
Precision: 0.4926
Recall: 0.6000
F1: 0.5410

                precision    recall  f1-score   support

Not การเดินทาง       0.67      0.57      0.61     72749
    การเดินทาง       0.49      0.60      0.54     50986

      accuracy                           0.58    123735
     macro avg       0.58      0.58      0.58    123735
  weighted avg       0.60      0.58      0.58    123735



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651600 entries, 0 to 651599
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   pm25         651600 non-null  float64
 1   pm10         651600 non-null  float64
 2   o3           651600 non-null  float64
 3   no2          651600 non-null  float64
 4   hour         651600 non-null  int32  
 5   day_of_week  651600 non-null  int32  
 6   month        651600 non-null  int32  
 7   hour_sin     651600 non-null  float64
 8   hour_cos     651600 non-null  float64
 9   day_sin      651600 non-null  float64
 10  day_cos      651600 non-null  float64
dtypes: float64(8), int32(3)
memory usage: 47.2 MB


In [10]:
importance_df = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_}).sort_values('importance', ascending=False)

print("="*60)
print("Top 10 Factors Affecting type_การเดินทาง")
print("="*60)
print(importance_df.head(10).to_string(index=False))

Top 10 Factors Affecting type_การเดินทาง
    feature  importance
   hour_sin    0.175511
   hour_cos    0.149521
       pm25    0.123423
       pm10    0.113733
       hour    0.110488
         o3    0.097769
        no2    0.064648
      month    0.051365
    day_sin    0.044763
day_of_week    0.038911
