# Model Training Pipeline Test

This notebook tests the training pipeline on the preprocessed data.

In [9]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from pipeline.train import *

print("✓ Imports successful")

✓ Imports successful


In [10]:
# Clean column names (remove leading/trailing spaces)
df.columns = df.columns.str.strip()

# Convert to numeric
df['pm25'] = pd.to_numeric(df['pm25'], errors='coerce')
df['pm10'] = pd.to_numeric(df['pm10'], errors='coerce')
df['o3'] = pd.to_numeric(df['o3'], errors='coerce')
df['no2'] = pd.to_numeric(df['no2'], errors='coerce')

print(f"\n✓ After conversion:")
print(df[['pm25', 'pm10', 'o3', 'no2']].dtypes)


✓ After conversion:
pm25    float64
pm10    float64
o3      float64
no2     float64
dtype: object


In [11]:
import ast

df = pd.read_csv('data/processed/traffy_merged.csv')
print(f"Loaded {len(df):,} records with {len(df.columns)} columns")

df['type'] = df['type_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

numeric_cols = ['pm25', 'pm10', 'o3', 'no2']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed', utc=True)
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

district_encoded = pd.get_dummies(df['district'], prefix='district')
df = pd.concat([df, district_encoded], axis=1)

print(f"\n✓ Feature engineering complete")
print(f"Final shape: {df.shape}")
df.head()

  df = pd.read_csv('data/processed/traffy_merged.csv')


Loaded 651,600 records with 13 columns

✓ Feature engineering complete
Final shape: (651600, 72)

✓ Feature engineering complete
Final shape: (651600, 72)


Unnamed: 0,type,comment,coords,subdistrict,district,province,timestamp,date,pm25,pm10,...,district_วัฒนา,district_สวนหลวง,district_สะพานสูง,district_สัมพันธวงศ์,district_สาทร,district_สายไหม,district_หนองจอก,district_หนองแขม,district_หลักสี่,district_ห้วยขวาง
0,"[น้ำท่วม, ร้องเรียน]",น้ำท่วมเวลาฝนตกและทะลุเข้าบ้านเดือดร้อนมากทุกๆ...,"100.66709,13.67891",หนองบอน,ประเวศ,กรุงเทพมหานคร,2021-09-19 14:56:08.924992+00:00,2021-09-19,52,27,...,False,False,False,False,False,False,False,False,False,False
1,[สะพาน],สะพานลอยปรับปรุงไม่เสร็จตามกำหนด\nปากซอย สาทร12,"100.52649,13.72060",ยานนาวา,สาทร,กรุงเทพมหานคร,2021-09-26 05:03:52.594898+00:00,2021-09-26,25,18,...,False,False,False,False,True,False,False,False,False,False
2,"[น้ำท่วม, ถนน]",ซอยลาดพร้าววังหิน 75 ถนนลาดพร้าววังหิน แขวงลาด...,"100.59165,13.82280",ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,2021-12-09 12:29:08.408763+00:00,2021-12-09,100,54,...,False,False,False,False,False,False,False,False,False,False
3,[],หน้าปากซอย ลาดพร้าววังหิน26,"100.59131,13.80910",ลาดพร้าว,ลาดพร้าว,กรุงเทพมหานคร,2021-12-13 05:53:36.861064+00:00,2021-12-13,79,65,...,False,False,False,False,False,False,False,False,False,False
4,[],ยังไม่มีหน่วยงานไหนมาดูแลครับ รถจะเชี่ยวหลายคน...,"100.50848,13.77832",ดุสิต,ดุสิต,กรุงเทพมหานคร,2021-12-17 08:46:02.610983+00:00,2021-12-17,117,48,...,False,False,False,False,False,False,False,False,False,False


In [12]:
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(df['type'])

print(f"Encoded shape: {y_encoded.shape}")
print(f"Number of unique labels: {len(mlb.classes_)}")
print(f"\nAll labels: {list(mlb.classes_)}")

Encoded shape: (651600, 24)
Number of unique labels: 24

All labels: ['PM2.5', 'การเดินทาง', 'กีดขวาง', 'คนจรจัด', 'คลอง', 'ความปลอดภัย', 'ความสะอาด', 'จราจร', 'ต้นไม้', 'ถนน', 'ทางเท้า', 'ท่อระบายน้ำ', 'น้ำท่วม', 'ป้าย', 'ป้ายจราจร', 'ร้องเรียน', 'สอบถาม', 'สะพาน', 'สัตว์จรจัด', 'สายไฟ', 'ห้องน้ำ', 'เสนอแนะ', 'เสียงรบกวน', 'แสงสว่าง']


In [13]:
text_cols = df.select_dtypes(include=['object']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
exclude_cols = [col for col in text_cols if col != 'type'] + datetime_cols + ['timestamp', 'type_list']

print(f"Total columns: {len(df.columns)}")
print(f"Columns to exclude: {len(exclude_cols)}")
print(f"Excluded: {exclude_cols}")

Total columns: 72
Columns to exclude: 13
Excluded: ['comment', 'coords', 'subdistrict', 'district', 'province', 'date', ' pm25', ' pm10', ' o3', ' no2', 'type_list', 'timestamp', 'type_list']


In [14]:
# Split data into train/test
X_train, X_test, y_train, y_test = split_data(
    df,
    target_col='type',
    exclude_cols=exclude_cols,
    test_size=0.2,
    random_state=42
)

# Encode targets after split
y_train_encoded = mlb.transform(y_train)
y_test_encoded = mlb.transform(y_test)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Training target shape: {y_train_encoded.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Test target shape: {y_test_encoded.shape}")

Train set: 521,280 samples
Test set: 130,320 samples
Features: 59

Training features shape: (521280, 59)
Training target shape: (521280, 24)
Test features shape: (130320, 59)
Test target shape: (130320, 24)

Training features shape: (521280, 59)
Training target shape: (521280, 24)
Test features shape: (130320, 59)
Test target shape: (130320, 24)


In [15]:
# Check for data quality issues
print("Data Quality Check:")
print("="*60)
print(f"Missing values in training features:")
print(X_train.isnull().sum().sum())
print(f"\nMissing values in test features:")
print(X_test.isnull().sum().sum())

print(f"\nInfinite values in training features:")
print(np.isinf(X_train.select_dtypes(include=[np.number])).sum().sum())

print(f"\nFeature value ranges (first 5 features):")
for col in X_train.columns[:5]:
    print(f"  {col}: [{X_train[col].min():.2f}, {X_train[col].max():.2f}]")

Data Quality Check:
Missing values in training features:
0

Missing values in test features:
0

Infinite values in training features:
0

Feature value ranges (first 5 features):
  hour: [0.00, 23.00]
  day_of_week: [0.00, 6.00]
  month: [1.00, 12.00]
  hour_sin: [-1.00, 1.00]
  hour_cos: [-1.00, 1.00]


In [16]:
# Train multi-label classifier
model = train_classifier(
    X_train,
    y_train_encoded,
    n_estimators=100,
    max_depth=20,
    random_state=42,
    multi_output=True  # Enable multi-label
)


Training Random Forest Classifier...
  Estimators: 100
  Max depth: 20
  Multi-output mode: Enabled
✓ Training complete
✓ Training complete


In [17]:
# Evaluate model
metrics = evaluate_classifier(model, X_test, y_test_encoded, target_names=mlb.classes_)


CLASSIFICATION METRICS
Accuracy: 0.1795
Precision Micro: 0.5440
Precision Macro: 0.4355
Recall Micro: 0.0206
Recall Macro: 0.0139
F1 Micro: 0.0397
F1 Macro: 0.0263

------------------------------------------------------------
CLASSIFICATION REPORT
------------------------------------------------------------
              precision    recall  f1-score   support

       PM2.5       0.40      0.01      0.01      1117
  การเดินทาง       0.00      0.00      0.00       380
     กีดขวาง       0.54      0.03      0.05     11410
     คนจรจัด       0.00      0.00      0.00      1274
        คลอง       0.62      0.02      0.03      3499
 ความปลอดภัย       0.63      0.01      0.02     13293
   ความสะอาด       0.61      0.03      0.06     10721
       จราจร       0.50      0.01      0.02      8151
      ต้นไม้       0.64      0.01      0.02      4852
         ถนน       0.49      0.03      0.06     28582
     ทางเท้า       0.46      0.02      0.04     20179
 ท่อระบายน้ำ       0.35      0.00      0.

In [18]:
# Feature importance analysis
importance_df = get_feature_importance(model, X_train.columns.tolist())


TOP 10 FEATURE IMPORTANCE
         feature  importance
            hour    0.148542
        hour_cos    0.128350
        hour_sin    0.126087
           month    0.090301
         day_sin    0.086388
     day_of_week    0.085298
       month_cos    0.080703
       month_sin    0.074453
         day_cos    0.056422
district_หนองจอก    0.006699


In [19]:
# Check raw predictions - more detailed
# For MultiOutputClassifier, we need to check each estimator
print("Checking first label's classifier:")
first_estimator = model.estimators_[0]
first_pred = first_estimator.predict(X_test.head(5))
first_pred_proba = first_estimator.predict_proba(X_test.head(5))

print(f"First label predictions: {first_pred}")
print(f"First label probabilities shape: {len(first_pred_proba)} classes")
print(f"Probability for class 1: {first_pred_proba[:, 1] if first_pred_proba.shape[1] > 1 else 'Only one class!'}")

# Check overall predictions
sample_pred_binary = model.predict(X_test.head(5))

print("\n" + "="*60)
print("Overall Predictions:")
print(f"Prediction shape: {sample_pred_binary.shape}")
print(f"Predictions for 5 samples:")
for i, pred in enumerate(sample_pred_binary):
    print(f"  Sample {i+1}: {pred.sum()} labels predicted (values: {pred[:10]}...)")

# Check if model is actually predicting anything
print("\n" + "="*60)
print(f"Total predictions across all test samples:")
print(f"Average labels per sample (true): {y_test_encoded.sum(axis=1).mean():.2f}")
all_preds = model.predict(X_test)
print(f"Average labels per sample (predicted): {all_preds.sum(axis=1).mean():.2f}")
print(f"Total samples with 0 predictions: {(all_preds.sum(axis=1) == 0).sum()}")

Checking first label's classifier:
First label predictions: [0 0 0 0 0]
First label probabilities shape: 5 classes
Probability for class 1: [0.00166731 0.00292064 0.00384766 0.00085534 0.00728545]

Overall Predictions:
Prediction shape: (5, 24)
Predictions for 5 samples:
  Sample 1: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 2: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 3: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 4: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 5: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)

Total predictions across all test samples:
Average labels per sample (true): 1.23

Overall Predictions:
Prediction shape: (5, 24)
Predictions for 5 samples:
  Sample 1: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 2: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 3: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 4: 0 labels predicted (values: [0

In [20]:
# Use predict_proba with custom threshold
def predict_with_threshold(model, X, threshold=0.3):
    """Predict using custom probability threshold"""
    # Get probabilities for each label
    predictions = []
    for estimator in model.estimators_:
        proba = estimator.predict_proba(X)
        # Get probability of positive class (class 1)
        if proba.shape[1] > 1:
            pred = (proba[:, 1] >= threshold).astype(int)
        else:
            pred = np.zeros(len(X), dtype=int)  # If only one class, pre dict 0
        predictions.append(pred)
    
    return np.column_stack(predictions)

# Test with different thresholds
print("Testing different thresholds:")
print("="*60)
for threshold in [0.5, 0.3, 0.2, 0.1]:
    preds = predict_with_threshold(model, X_test, threshold=threshold)
    avg_labels = preds.sum(axis=1).mean()
    zero_preds = (preds.sum(axis=1) == 0).sum()
    print(f"Threshold {threshold}: Avg labels = {avg_labels:.2f}, Zero preds = {zero_preds:,}")

print(f"\nTrue average: {y_test_encoded.sum(axis=1).mean():.2f}")

Testing different thresholds:
Threshold 0.5: Avg labels = 0.05, Zero preds = 124,922
Threshold 0.5: Avg labels = 0.05, Zero preds = 124,922
Threshold 0.3: Avg labels = 0.23, Zero preds = 104,406
Threshold 0.3: Avg labels = 0.23, Zero preds = 104,406
Threshold 0.2: Avg labels = 1.14, Zero preds = 20,830
Threshold 0.2: Avg labels = 1.14, Zero preds = 20,830
Threshold 0.1: Avg labels = 3.50, Zero preds = 37

True average: 1.23
Threshold 0.1: Avg labels = 3.50, Zero preds = 37

True average: 1.23


In [21]:
# Test predictions with optimal threshold (0.2)
OPTIMAL_THRESHOLD = 0.2

sample_predictions = predict_with_threshold(model, X_test.head(5), threshold=OPTIMAL_THRESHOLD)
sample_labels = mlb.inverse_transform(sample_predictions)

print(f"Sample Predictions (threshold={OPTIMAL_THRESHOLD}):")
print("="*60)
for i, (true, pred) in enumerate(zip(y_test.head(5), sample_labels)):
    print(f"\n{i+1}. True: {true}")
    print(f"   Pred: {pred}")
    
# Calculate metrics with optimal threshold
all_preds_optimal = predict_with_threshold(model, X_test, threshold=OPTIMAL_THRESHOLD)
print("\n" + "="*60)
print("Overall Performance with Optimal Threshold:")
print(f"Average labels per sample: {all_preds_optimal.sum(axis=1).mean():.2f} (true: 1.47)")
print(f"Samples with 0 predictions: {(all_preds_optimal.sum(axis=1) == 0).sum():,}")

Sample Predictions (threshold=0.2):

1. True: ['ความสะอาด']
   Pred: ('แสงสว่าง',)

2. True: ['ป้ายจราจร']
   Pred: ()

3. True: []
   Pred: ('ถนน',)

4. True: ['ถนน']
   Pred: ('จราจร', 'ถนน')

5. True: ['ถนน', 'กีดขวาง']
   Pred: ('ถนน',)

Overall Performance with Optimal Threshold:
Average labels per sample: 1.14 (true: 1.47)
Samples with 0 predictions: 20,830

Overall Performance with Optimal Threshold:
Average labels per sample: 1.14 (true: 1.47)
Samples with 0 predictions: 20,830


In [22]:
# # Save model
# save_model(
#     model,
#     'data/models/complaint_classifier.pkl',
#     metadata={
#         'features': X_train.columns.tolist(),
#         'labels': mlb.classes_.tolist(),
#         'n_samples': len(X_train),
#         'accuracy': metrics['accuracy'],
#         'f1_micro': metrics['f1_micro']
#     }
# )