# Model Training Pipeline Test

This notebook tests the training pipeline on the preprocessed data.

In [41]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from pipeline.train import *

print("✓ Imports successful")

✓ Imports successful


In [42]:
# Load preprocessed data
df = pd.read_csv('data/processed/traffy_weather_final.csv')
print(f"Loaded {len(df):,} records with {len(df.columns)} columns")
df.head()

Loaded 187,138 records with 60 columns


Unnamed: 0,type,comment,coords,subdistrict,district,province,timestamp,longitude,latitude,timestamp_hour,...,district_ปทุมวัน,district_ป้อมปราบศัตรูพ่าย,district_พญาไท,district_พระนคร,district_ภาษีเจริญ,district_ยานนาวา,district_ราชเทวี,district_ราษฎร์บูรณะ,district_สัมพันธวงศ์,district_สาทร
0,['สะพาน'],สะพานลอยปรับปรุงไม่เสร็จตามกำหนด\nปากซอย สาทร12,"100.52649,13.72060",ยานนาวา,สาทร,กรุงเทพมหานคร,2021-09-26 05:03:52.594898+00:00,100.52649,13.7206,2021-09-26 05:00:00+00:00,...,False,False,False,False,False,False,False,False,False,True
1,"['ถนน', 'ทางเท้า']",บริเวณนราธิวาส แยกถนนจันทน์ ใกล้สวนสาธารณะช่อ...,"100.53764,13.70716",ทุ่งมหาเมฆ,สาทร,กรุงเทพมหานคร,2022-01-02 10:53:25.580723+00:00,100.53764,13.70716,2022-01-02 10:00:00+00:00,...,False,False,False,False,False,False,False,False,False,True
2,"['ท่อระบายน้ำ', 'ทางเท้า']",ทางเท้าช่วง จันทน์ 18/5 สภาพโอเค แต่ฝาท่อเก่าแ...,"100.53025,13.70566",ทุ่งวัดดอน,สาทร,กรุงเทพมหานคร,2022-01-14 01:32:03.715912+00:00,100.53025,13.70566,2022-01-14 01:00:00+00:00,...,False,False,False,False,False,False,False,False,False,True
3,"['ถนน', 'สะพาน']",ถนนจันทน์ช่วงสะพานสามถึงจันทน์ 18/5 น่าจะลาดยา...,"100.52970,13.70569",ทุ่งวัดดอน,สาทร,กรุงเทพมหานคร,2022-01-16 10:57:39.184772+00:00,100.5297,13.70569,2022-01-16 10:00:00+00:00,...,False,False,False,False,False,False,False,False,False,True
4,['สะพาน'],ริมถนน ตรงสะพานสามช่วงจันทน์ 18/6-18/4 ชอบมีรถ...,"100.52760,13.70615",ทุ่งวัดดอน,สาทร,กรุงเทพมหานคร,2022-01-16 11:08:54.141813+00:00,100.5276,13.70615,2022-01-16 11:00:00+00:00,...,False,False,False,False,False,False,False,False,False,True


In [43]:
# Check and parse type column (convert string to list)
import ast

# Check current format
print("Type column sample (before parsing):")
print(df['type'].head())
print(f"\nType of first element: {type(df['type'][0])}")

# Parse string representation to actual list
if isinstance(df['type'][0], str):
    print("\n⚠ Type column is string, parsing to list...")
    df['type'] = df['type'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    print("✓ Parsed to list format")

print("\nType column sample (after parsing):")
print(df['type'].head())
print(f"\nType of first element: {type(df['type'][0])}")

Type column sample (before parsing):
0                     ['สะพาน']
1            ['ถนน', 'ทางเท้า']
2    ['ท่อระบายน้ำ', 'ทางเท้า']
3              ['ถนน', 'สะพาน']
4                     ['สะพาน']
Name: type, dtype: object

Type of first element: <class 'str'>

⚠ Type column is string, parsing to list...
✓ Parsed to list format

Type column sample (after parsing):
0                   [สะพาน]
1            [ถนน, ทางเท้า]
2    [ท่อระบายน้ำ, ทางเท้า]
3              [ถนน, สะพาน]
4                   [สะพาน]
Name: type, dtype: object

Type of first element: <class 'list'>
✓ Parsed to list format

Type column sample (after parsing):
0                   [สะพาน]
1            [ถนน, ทางเท้า]
2    [ท่อระบายน้ำ, ทางเท้า]
3              [ถนน, สะพาน]
4                   [สะพาน]
Name: type, dtype: object

Type of first element: <class 'list'>


In [44]:
# Check data types and identify text columns
print("Data Types:")
print("="*60)
print(df.dtypes)

print("\n\nText/Object Columns:")
print("="*60)
text_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in text_cols:
    print(f"  • {col}")
    print(f"    Sample: {df[col].iloc[0]}")
    print()

Data Types:
type                              object
comment                           object
coords                            object
subdistrict                       object
district                          object
province                          object
timestamp                         object
longitude                        float64
latitude                         float64
timestamp_hour                    object
grid_lat                         float64
grid_lon                         float64
time                              object
temperature_2m (°C)              float64
dew_point_2m (°C)                float64
relative_humidity_2m (%)         float64
rain (mm)                        float64
vapour_pressure_deficit (kPa)    float64
cloud_cover (%)                  float64
wind_direction_10m (°)           float64
surface_pressure (hPa)           float64
wind_speed_10m (km/h)            float64
latitude_weather                 float64
longitude_weather                float64
time

In [45]:
# Encode multi-label target
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(df['type'])

print(f"Encoded shape: {y_encoded.shape}")
print(f"Number of unique labels: {len(mlb.classes_)}")
print(f"\nLabels: {mlb.classes_[:10]}...")  # Show first 10

Encoded shape: (187138, 24)
Number of unique labels: 24

Labels: ['PM2.5' 'การเดินทาง' 'กีดขวาง' 'คนจรจัด' 'คลอง' 'ความปลอดภัย' 'ความสะอาด'
 'จราจร' 'ต้นไม้' 'ถนน']...


In [46]:
# Prepare data for training - automatically exclude text columns
# Get all text/object columns
text_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'type' from text_cols if it exists (we need it as target)
exclude_cols = [col for col in text_cols if col != 'type']

print(f"Total columns: {len(df.columns)}")
print(f"Text columns to exclude: {len(exclude_cols)}")
print(f"Excluded columns: {exclude_cols}")

Total columns: 60
Text columns to exclude: 9
Excluded columns: ['comment', 'coords', 'subdistrict', 'district', 'province', 'timestamp', 'timestamp_hour', 'time', 'timestamp_col']


In [47]:
# Split data into train/test
X_train, X_test, y_train, y_test = split_data(
    df,
    target_col='type',
    exclude_cols=exclude_cols,
    test_size=0.2,
    random_state=42
)

# Encode targets after split
y_train_encoded = mlb.transform(y_train)
y_test_encoded = mlb.transform(y_test)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Training target shape: {y_train_encoded.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Test target shape: {y_test_encoded.shape}")

Train set: 149,710 samples
Test set: 37,428 samples
Features: 50

Training features shape: (149710, 50)
Training target shape: (149710, 24)
Test features shape: (37428, 50)
Test target shape: (37428, 24)

Training features shape: (149710, 50)
Training target shape: (149710, 24)
Test features shape: (37428, 50)
Test target shape: (37428, 24)


In [48]:
# Check for data quality issues
print("Data Quality Check:")
print("="*60)
print(f"Missing values in training features:")
print(X_train.isnull().sum().sum())
print(f"\nMissing values in test features:")
print(X_test.isnull().sum().sum())

print(f"\nInfinite values in training features:")
print(np.isinf(X_train.select_dtypes(include=[np.number])).sum().sum())

print(f"\nFeature value ranges (first 5 features):")
for col in X_train.columns[:5]:
    print(f"  {col}: [{X_train[col].min():.2f}, {X_train[col].max():.2f}]")

Data Quality Check:
Missing values in training features:
0

Missing values in test features:
0

Infinite values in training features:
0

Feature value ranges (first 5 features):
  longitude: [100.45, 100.63]
  latitude: [13.65, 13.85]
  grid_lat: [13.60, 13.80]
  grid_lon: [100.50, 100.60]
  temperature_2m (°C): [16.80, 39.30]


In [49]:
# Train multi-label classifier
model = train_classifier(
    X_train,
    y_train_encoded,
    n_estimators=100,
    max_depth=20,
    random_state=42,
    multi_output=True  # Enable multi-label
)


Training Random Forest Classifier...
  Estimators: 100
  Max depth: 20
  Multi-output mode: Enabled
✓ Training complete
✓ Training complete


In [50]:
# Evaluate model
metrics = evaluate_classifier(model, X_test, y_test_encoded, target_names=mlb.classes_)


CLASSIFICATION METRICS
Accuracy: 0.0686
Precision Micro: 0.7347
Precision Macro: 0.6822
Recall Micro: 0.0746
Recall Macro: 0.0502
F1 Micro: 0.1355
F1 Macro: 0.0900

------------------------------------------------------------
CLASSIFICATION REPORT
------------------------------------------------------------
              precision    recall  f1-score   support

       PM2.5       0.93      0.04      0.08       299
  การเดินทาง       0.33      0.02      0.04       132
     กีดขวาง       0.84      0.17      0.28      5362
     คนจรจัด       0.76      0.02      0.04       625
        คลอง       0.77      0.02      0.04       842
 ความปลอดภัย       0.63      0.02      0.04      4313
   ความสะอาด       0.79      0.03      0.05      3433
       จราจร       0.84      0.06      0.12      2892
      ต้นไม้       0.62      0.02      0.04      1680
         ถนน       0.71      0.06      0.12      9060
     ทางเท้า       0.70      0.13      0.22      8716
 ท่อระบายน้ำ       0.54      0.02      0.

In [51]:
# Feature importance analysis
importance_df = get_feature_importance(model, X_train.columns.tolist())


TOP 10 FEATURE IMPORTANCE
                      feature  importance
                     latitude    0.134276
                    longitude    0.133593
        wind_speed_10m (km/h)    0.060167
       wind_direction_10m (°)    0.059178
       surface_pressure (hPa)    0.058869
vapour_pressure_deficit (kPa)    0.056616
          temperature_2m (°C)    0.053813
            dew_point_2m (°C)    0.053435
     relative_humidity_2m (%)    0.045894
              cloud_cover (%)    0.035548


In [52]:
# Check raw predictions - more detailed
# For MultiOutputClassifier, we need to check each estimator
print("Checking first label's classifier:")
first_estimator = model.estimators_[0]
first_pred = first_estimator.predict(X_test.head(5))
first_pred_proba = first_estimator.predict_proba(X_test.head(5))

print(f"First label predictions: {first_pred}")
print(f"First label probabilities shape: {len(first_pred_proba)} classes")
print(f"Probability for class 1: {first_pred_proba[:, 1] if first_pred_proba.shape[1] > 1 else 'Only one class!'}")

# Check overall predictions
sample_pred_binary = model.predict(X_test.head(5))

print("\n" + "="*60)
print("Overall Predictions:")
print(f"Prediction shape: {sample_pred_binary.shape}")
print(f"Predictions for 5 samples:")
for i, pred in enumerate(sample_pred_binary):
    print(f"  Sample {i+1}: {pred.sum()} labels predicted (values: {pred[:10]}...)")

# Check if model is actually predicting anything
print("\n" + "="*60)
print(f"Total predictions across all test samples:")
print(f"Average labels per sample (true): {y_test_encoded.sum(axis=1).mean():.2f}")
all_preds = model.predict(X_test)
print(f"Average labels per sample (predicted): {all_preds.sum(axis=1).mean():.2f}")
print(f"Total samples with 0 predictions: {(all_preds.sum(axis=1) == 0).sum()}")

Checking first label's classifier:
First label predictions: [0 0 0 0 0]
First label probabilities shape: 5 classes
Probability for class 1: [0.00153323 0.0141542  0.00101816 0.01072854 0.00439033]

Overall Predictions:
Prediction shape: (5, 24)
Predictions for 5 samples:
  Sample 1: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 2: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 3: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 4: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 5: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)

Total predictions across all test samples:
Average labels per sample (true): 1.47

Overall Predictions:
Prediction shape: (5, 24)
Predictions for 5 samples:
  Sample 1: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 2: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 3: 0 labels predicted (values: [0 0 0 0 0 0 0 0 0 0]...)
  Sample 4: 0 labels predicted (values: [0

In [None]:
# Use predict_proba with custom threshold
def predict_with_threshold(model, X, threshold=0.3):
    """Predict using custom probability threshold"""
    # Get probabilities for each label
    predictions = []
    for estimator in model.estimators_:
        proba = estimator.predict_proba(X)
        # Get probability of positive class (class 1)
        if proba.shape[1] > 1:
            pred = (proba[:, 1] >= threshold).astype(int)
        else:
            pred = np.zeros(len(X), dtype=int)  # If only one class, pre dict 0
        predictions.append(pred)
    
    return np.column_stack(predictions)

# Test with different thresholds
print("Testing different thresholds:")
print("="*60)
for threshold in [0.5, 0.3, 0.2, 0.1]:
    preds = predict_with_threshold(model, X_test, threshold=threshold)
    avg_labels = preds.sum(axis=1).mean()
    zero_preds = (preds.sum(axis=1) == 0).sum()
    print(f"Threshold {threshold}: Avg labels = {avg_labels:.2f}, Zero preds = {zero_preds:,}")

print(f"\nTrue average: {y_test_encoded.sum(axis=1).mean():.2f}")

Testing different thresholds:
Threshold 0.5: Avg labels = 0.15, Zero preds = 32,566
Threshold 0.5: Avg labels = 0.15, Zero preds = 32,566
Threshold 0.3: Avg labels = 0.68, Zero preds = 17,217
Threshold 0.3: Avg labels = 0.68, Zero preds = 17,217
Threshold 0.2: Avg labels = 1.89, Zero preds = 1,046
Threshold 0.2: Avg labels = 1.89, Zero preds = 1,046
Threshold 0.1: Avg labels = 4.61, Zero preds = 0

True average: 1.47
Threshold 0.1: Avg labels = 4.61, Zero preds = 0

True average: 1.47


In [54]:
# Test predictions with optimal threshold (0.2)
OPTIMAL_THRESHOLD = 0.2

sample_predictions = predict_with_threshold(model, X_test.head(5), threshold=OPTIMAL_THRESHOLD)
sample_labels = mlb.inverse_transform(sample_predictions)

print(f"Sample Predictions (threshold={OPTIMAL_THRESHOLD}):")
print("="*60)
for i, (true, pred) in enumerate(zip(y_test.head(5), sample_labels)):
    print(f"\n{i+1}. True: {true}")
    print(f"   Pred: {pred}")
    
# Calculate metrics with optimal threshold
all_preds_optimal = predict_with_threshold(model, X_test, threshold=OPTIMAL_THRESHOLD)
print("\n" + "="*60)
print("Overall Performance with Optimal Threshold:")
print(f"Average labels per sample: {all_preds_optimal.sum(axis=1).mean():.2f} (true: 1.47)")
print(f"Samples with 0 predictions: {(all_preds_optimal.sum(axis=1) == 0).sum():,}")

Sample Predictions (threshold=0.2):

1. True: ['กีดขวาง', 'ทางเท้า']
   Pred: ('ถนน', 'ทางเท้า')

2. True: ['จราจร']
   Pred: ('จราจร', 'ทางเท้า')

3. True: ['ความสะอาด']
   Pred: ('กีดขวาง', 'ท่อระบายน้ำ')

4. True: ['ทางเท้า']
   Pred: ('ทางเท้า', 'แสงสว่าง')

5. True: ['ความสะอาด']
   Pred: ('จราจร', 'ถนน', 'ทางเท้า')

Overall Performance with Optimal Threshold:
Average labels per sample: 1.89 (true: 1.47)
Samples with 0 predictions: 1,046

Overall Performance with Optimal Threshold:
Average labels per sample: 1.89 (true: 1.47)
Samples with 0 predictions: 1,046


In [55]:
# # Save model
# save_model(
#     model,
#     'data/models/complaint_classifier.pkl',
#     metadata={
#         'features': X_train.columns.tolist(),
#         'labels': mlb.classes_.tolist(),
#         'n_samples': len(X_train),
#         'accuracy': metrics['accuracy'],
#         'f1_micro': metrics['f1_micro']
#     }
# )