In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [12]:
# อ่านข้อมูล
data_train = pd.read_csv('./data/salary.train.processed.csv').set_index('id')
data_test = pd.read_csv('./data/salary.test.processed.csv').set_index('id')


In [13]:
# กำหนด target
target = 'label'

# แยกข้อมูลฝึกและทดสอบ
X_train = data_train.drop(columns=[target])
y_train = data_train[target]
X_test = data_test.drop(columns=[target])
y_test = data_test[target]

# สร้างและฝึกโมเดล Random Forest เริ่มต้น
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [14]:
# ทดสอบและคำนวณ F1 Score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print("\nF1 Score จาก Random Forest เริ่มต้น:", f1)


F1 Score จาก Random Forest เริ่มต้น: 0.8204091782669011


In [15]:
import pandas
import optuna
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

# --- 1. โหลดและเตรียมข้อมูล (เหมือนเดิม) ---
# (ใช้ไฟล์ salary.train.processed.csv เหมือนเดิม)
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # คุณต้องจัดการ error นี้ ในตัวอย่างนี้จะสมมติว่าโหลดไฟล์ได้
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

# หมายเหตุ: Optuna ทำงานได้ดีที่สุดเมื่อใช้ cross-validation
# เราไม่จำเป็นต้องแบ่ง X_train/X_val เหมือนตอนทำ early stopping กับ XGBoost
# เราจะใช้ X_full, y_full ไปเลย
print(f"ใช้ข้อมูลเทรนทั้งหมด {len(y_full)} records สำหรับการจูนด้วย Optuna (K-Fold CV)")

# คำนวณ class_weight สำหรับ RF (RF ใช้ 'class_weight' แทน 'scale_pos_weight')
# 'balanced' เป็นวิธีที่ง่ายและมักจะได้ผลดี
rf_class_weight = 'balanced' 


# --- 2. สร้างฟังก์ชัน Objective (หัวใจของ Optuna) ---

def objective(trial):
    """
    ฟังก์ชันนี้จะถูกเรียกโดย Optuna ในแต่ละ "trial" (การทดลอง)
    trial: คือ object ที่ใช้ในการ "เสนอ" ค่าพารามิเตอร์
    """
    
    # กำหนดช่วงของพารามิเตอร์ที่เราอยากให้ Optuna ไป "สุ่มหา"
    param_rf = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0) # สุ่มเป็น %
    }
    
    # สร้างโมเดล RF ด้วยพารามิเตอร์ที่ Optuna สุ่มมาให้
    model_rf = RandomForestClassifier(
        **param_rf,
        class_weight=rf_class_weight, # จัดการ imbalance
        random_state=42,
        n_jobs=-1
    )
    
    # ประเมินผลโมเดลด้วย Cross-validation (cv=3 คือ 3-Fold)
    # เราจะใช้ 'f1_weighted' เพราะข้อมูลเรา imbalance
    score = cross_val_score(
        model_rf, 
        X_full, 
        y_full, 
        cv=3,                 # 3-Fold Cross-validation
        scoring='f1_weighted',
        n_jobs=-1
    )
    
    # คืนค่า F1 เฉลี่ยกลับไปให้ Optuna
    f1_avg = np.mean(score)
    return f1_avg

# --- 3. เริ่มการค้นหา (Study) ---

print("\nกำลังเริ่มการค้นหาพารามิเตอร์ด้วย Optuna...")

# สร้าง study object, บอกว่าเราต้องการ 'maximize' (หาค่า F1 สูงสุด)
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)

# สั่งให้เริ่มค้นหา (optimize) โดยเรียก objective 50 ครั้ง
study.optimize(
    objective, 
    n_trials=50,  # จำนวนครั้งที่จะให้ลอง (ยิ่งเยอะยิ่งดี แต่ยิ่งนาน)
    show_progress_bar=True # แสดงแถบความคืบหน้า
)

# --- 4. แสดงผลลัพธ์ ---

print("\n--- Optuna ค้นหาเสร็จสิ้น! ---")

# พารามิเตอร์ที่ดีที่สุดที่หาได้
print("พารามิเตอร์ที่ดีที่สุด (Best Parameters):")
print(study.best_params)

# F1-score ที่ดีที่สุดที่ทำได้ (จากการ cross-validation)
print(f"\nF1-Weighted ที่ดีที่สุด (จากการ CV): {study.best_value:.6f}")


# --- 5. (ขั้นตอนต่อไป) เทรนโมเดลสุดท้ายและประเมินผลบน Test Set ---

print("\nกำลังเทรนโมเดล RF สุดท้ายด้วยพารามิเตอร์ที่ดีที่สุด...")

# ดึงพารามิเตอร์ที่ดีที่สุดมา
best_parameter = study.best_params  # 👈 *** เปลี่ยนชื่อตัวแปรตรงนี้ ***

# สร้างโมเดลสุดท้าย
rf_model_final = RandomForestClassifier(
    **best_parameter,             # 👈 *** และเปลี่ยนตรงนี้ ***
    class_weight=rf_class_weight,
    random_state=42,
    n_jobs=-1
)

# เทรนโมเดลด้วยข้อมูล "ทั้งหมด" (X_full, y_full)
rf_model_final.fit(X_full, y_full)

print("เทรนโมเดลสุดท้ายเสร็จสิ้น! กำลังประเมินผลบน Test Set...")

# โหลดข้อมูล Test (เหมือนโค้ด XGBoost ของคุณ)
data_test_rf = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
data_test_rf['prediction'] = rf_model_final.predict(data_test_rf.drop(['label'], axis='columns'))

# แสดงผลลัพธ์บน Test Set
print("\nRandom Forest (Optuna-Tuned) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction']
))

report_scores_rf = sklearn.metrics.classification_report(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()
print("\nRandom Forest (Optuna-Tuned) Report:")
print(df_score_rf)

[I 2025-10-22 06:29:25,930] A new study created in memory with name: no-name-29070943-7e87-4cd2-b8aa-9bbdb8d468ce


ใช้ข้อมูลเทรนทั้งหมด 16720 records สำหรับการจูนด้วย Optuna (K-Fold CV)

กำลังเริ่มการค้นหาพารามิเตอร์ด้วย Optuna...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-22 06:29:31,802] Trial 0 finished with value: 0.7857324367021715 and parameters: {'n_estimators': 200, 'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 14, 'criterion': 'gini', 'max_features': 0.716616423471629}. Best is trial 0 with value: 0.7857324367021715.
[I 2025-10-22 06:29:47,663] Trial 1 finished with value: 0.8174553467305552 and parameters: {'n_estimators': 750, 'max_depth': 15, 'min_samples_leaf': 6, 'min_samples_split': 12, 'criterion': 'entropy', 'max_features': 0.3036272421534816}. Best is trial 1 with value: 0.8174553467305552.
[I 2025-10-22 06:30:01,550] Trial 2 finished with value: 0.8095809503113371 and parameters: {'n_estimators': 650, 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 9, 'criterion': 'gini', 'max_features': 0.8522371823038665}. Best is trial 1 with value: 0.8174553467305552.
[I 2025-10-22 06:30:04,818] Trial 3 finished with value: 0.8148595084011747 and parameters: {'n_estimators': 300, 'max_depth': 12, 'min_samples_le

In [None]:
print(best_parameter)

{'n_estimators': 750, 'max_depth': 13, 'min_samples_leaf': 2, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 0.30517819653345885}


### class weight

In [16]:
import pandas
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import classification_report

# --- 1. Load Data (Needed for training) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

# --- 2. Define Your Best Parameters ---
# These are the params you provided from Optuna
best_rf_params = best_parameter
# --- 3. Create and Train the Final Model ---
print("\nTraining final RF model with best params and class weighting...")

rf_model_final = RandomForestClassifier(
    **best_rf_params,
    class_weight='balanced_subsample', # 👈 Here is the class weight
    random_state=42,
    n_jobs=-1
)

# Train the model on the full training dataset
rf_model_final.fit(X_full, y_full)
print("Model training complete.")

# --- 4. Evaluate on Test Data ---
print("\nEvaluating model on test data...")

data_test_rf = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
data_test_rf['prediction'] = rf_model_final.predict(data_test_rf.drop(['label'], axis='columns'))

# Print the report
report_scores_rf = sklearn.metrics.classification_report(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()

print("\nRandom Forest (Tuned + Weighted) Report:")
print(df_score_rf)


Training final RF model with best params and class weighting...
Model training complete.

Evaluating model on test data...

Random Forest (Tuned + Weighted) Report:
              precision    recall  f1-score      support
0.0            0.881270  0.792632  0.834604  2416.000000
1.0            0.750374  0.853741  0.798727  1764.000000
accuracy       0.818421  0.818421  0.818421     0.818421
macro avg      0.815822  0.823187  0.816666  4180.000000
weighted avg   0.826031  0.818421  0.819464  4180.000000


### class SMOTE

In [17]:
import pandas
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE  # 👈 1. Import SMOTE

# --- 1. Load Data (Needed for training) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"Original training data shape: {X_full.shape}")
print(f"Original label distribution:\n{y_full.value_counts()}")

# --- 2. Define Your Best Parameters ---
# These are the params you provided from Optuna
best_rf_params = best_parameter

# --- 3. Apply SMOTE to the Training Data ---
print("\nApplying SMOTE to the training data...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)

print(f"New resampled training data shape: {X_resampled.shape}")
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")


# --- 4. Create and Train the Final Model (with NO class_weight) ---
print("\nTraining final RF model on SMOTEd data...")

rf_model_final = RandomForestClassifier(
    **best_rf_params,
    # class_weight='balanced_subsample', # 👈 2. REMOVE class_weight
    random_state=42,
    n_jobs=-1
)

# 4. Train the model on the NEW resampled data
rf_model_final.fit(X_resampled, y_resampled)
print("Model training complete.")

# --- 5. Evaluate on ORIGINAL Test Data ---
print("\nEvaluating model on *original* test data...")

data_test_rf = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')

# IMPORTANT: Do NOT apply SMOTE to the test data.
X_test = data_test_rf.drop(['label'], axis='columns')
y_test = data_test_rf['label']

data_test_rf['prediction'] = rf_model_final.predict(X_test)

# Print the report
report_scores_rf = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()

print("\nRandom Forest (Tuned + SMOTE) Report:")
print(df_score_rf)

Original training data shape: (16720, 56)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64

Applying SMOTE to the training data...


[WinError 2] The system cannot find the file specified
  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


New resampled training data shape: (19438, 56)
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64

Training final RF model on SMOTEd data...
Model training complete.

Evaluating model on *original* test data...

Random Forest (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.875620  0.804222  0.838403  2416.000000
1.0            0.758797  0.843537  0.798926  1764.000000
accuracy       0.820813  0.820813  0.820813     0.820813
macro avg      0.817208  0.823880  0.818665  4180.000000
weighted avg   0.826319  0.820813  0.821744  4180.000000


### SMOTETomek

In [18]:
import json
import joblib
import pandas
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek

# --- 1. Load Data (Needed for training) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']  # 👈 *** THIS IS THE FIX ***

print(f"Original training data shape: {X_full.shape}")
print(f"Original label distribution:\n{y_full.value_counts()}")

# --- 2. Define Your Best Parameters ---
# These are the params you provided from Optuna
best_rf_params = best_parameter

# --- 3. Apply SMOTETomek to the Training Data ---
print("\nApplying SMOTETomek to the training data...")
# Note: SMOTETomek can take a bit longer than plain SMOTE
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)

print(f"New resampled training data shape: {X_resampled.shape}")
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- 4. Create and Train the Final Model (with NO class_weight) ---
print("\nTraining final RF model on SMOTETomek data...")

rf_model_final = RandomForestClassifier(
    **best_rf_params,
    # ⚠️ NO 'class_weight' here
    random_state=42,
    n_jobs=-1
)

# 5. Train the model on the NEW resampled data
rf_model_final.fit(X_resampled, y_resampled)
print("Model training complete.")

# --- 6. Evaluate on ORIGINAL Test Data ---
print("\nEvaluating model on *original* test data...")

data_test_rf = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')

# IMPORTANT: Do NOT resample the test data.
X_test = data_test_rf.drop(['label'], axis='columns')
y_test = data_test_rf['label']

data_test_rf['prediction'] = rf_model_final.predict(X_test)

# Print the report
report_scores_rf = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()

print("\nRandom Forest (Tuned + SMOTETomek) Report:")
print(df_score_rf)
# joblib.dump(rf_model_final, './model/rf/rf_model_final_smote.pkl')
# with open('./model/rf/rf_config.json','w')as f:
#     json.dump(
#         obj=rf_model_final.get_params(),
#         fp=f,
#         indent = 4
#     )

Original training data shape: (16720, 56)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64

Applying SMOTETomek to the training data...




New resampled training data shape: (18440, 56)
New resampled label distribution:
label
1.0    9220
0.0    9220
Name: count, dtype: int64

Training final RF model on SMOTETomek data...
Model training complete.

Evaluating model on *original* test data...

Random Forest (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.876293  0.806291  0.839836  2416.000000
1.0            0.760858  0.844104  0.800322  1764.000000
accuracy       0.822249  0.822249  0.822249     0.822249
macro avg      0.818576  0.825198  0.820079  4180.000000
weighted avg   0.827579  0.822249  0.823161  4180.000000


### ADASYN (Adaptive Synthetic Sampling)

In [19]:
import pandas
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN  # 👈 1. Import ADASYN

# --- 1. Load Data (Needed for training) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"Original training data shape: {X_full.shape}")
print(f"Original label distribution:\n{y_full.value_counts()}")

# --- 2. Define Your Best Parameters ---
# These are the params you provided from Optuna
best_rf_params = best_parameter

# --- 3. Apply ADASYN to the Training Data ---
print("\nApplying ADASYN to the training data...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)

print(f"New resampled training data shape: {X_resampled.shape}")
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")
# Note: The counts might not be perfectly equal, as ADASYN's generation is adaptive.


# --- 4. Create and Train the Final Model (with NO class_weight) ---
print("\nTraining final RF model on ADASYN data...")

rf_model_final = RandomForestClassifier(
    **best_rf_params,
    # ⚠️ NO 'class_weight' here
    random_state=42,
    n_jobs=-1
)

# 5. Train the model on the NEW resampled data
rf_model_final.fit(X_resampled, y_resampled)
print("Model training complete.")

# --- 6. Evaluate on ORIGINAL Test Data ---
print("\nEvaluating model on *original* test data...")

data_test_rf = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')

# IMPORTANT: Do NOT resample the test data.
X_test = data_test_rf.drop(['label'], axis='columns')
y_test = data_test_rf['label']

data_test_rf['prediction'] = rf_model_final.predict(X_test)

# Print the report
report_scores_rf = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()

print("\nRandom Forest (Tuned + ADASYN) Report:")
print(df_score_rf)

Original training data shape: (16720, 56)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64

Applying ADASYN to the training data...




New resampled training data shape: (19417, 56)
New resampled label distribution:
label
0.0    9719
1.0    9698
Name: count, dtype: int64

Training final RF model on ADASYN data...
Model training complete.

Evaluating model on *original* test data...

Random Forest (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.887582  0.781043  0.830911  2416.000000
1.0            0.742454  0.864512  0.798848  1764.000000
accuracy       0.816268  0.816268  0.816268     0.816268
macro avg      0.815018  0.822778  0.814880  4180.000000
weighted avg   0.826337  0.816268  0.817380  4180.000000
