In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Load Your Data (You already did this) ---
datatrain_df = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
test_df = pd.read_csv('./data/salary.test.processed.csv', index_col='id')

# --- 2. Separate Features (X) and Target (y) ---
# --- IMPORTANT: Change 'target' to your actual target column name! ---
target_column = 'label' # Or 'salary', 'income', etc.

X_train = datatrain_df.drop(target_column, axis=1)
y_train = datatrain_df[target_column]

X_test = test_df.drop(target_column, axis=1)
y_test = test_df[target_column]

# --- 3. CRITICAL STEP: Feature Scaling ---
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler ONLY on the training data
scaler.fit(X_train)

# Transform both training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 4. Initialize and Train the MLP/DNN ---
print("Training the MLP Classifier...")

# This creates a network with 2 hidden layers: one with 64 neurons, one with 32
mlp_model = MLPClassifier(
    hidden_layer_sizes=(64, 32),  # The architecture of your network
    max_iter=1000,               # Max epochs (passes through data)
    early_stopping=True,         # Stops training when validation score stops improving
    random_state=42,             # For reproducible results
    verbose=True                 # Set to True to see training progress
)

# Train the model on the SCALED data
mlp_model.fit(X_train_scaled, y_train)

print("Training complete.")

# --- 5. Evaluate the Model ---
# Make predictions on the SCALED test data
y_pred_mlp = mlp_model.predict(X_test_scaled)

# Print the results
print(f"\nMLP Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")
print("\nMLP Classification Report:")
print(classification_report(y_test, y_pred_mlp, digits=4))

Training the MLP Classifier...
Iteration 1, loss = 0.52827667
Validation score: 0.811005
Iteration 2, loss = 0.39811452
Validation score: 0.808612
Iteration 3, loss = 0.38356991
Validation score: 0.815191
Iteration 4, loss = 0.37720204
Validation score: 0.815789
Iteration 5, loss = 0.37307379
Validation score: 0.818182
Iteration 6, loss = 0.36958425
Validation score: 0.815789
Iteration 7, loss = 0.36655434
Validation score: 0.811603
Iteration 8, loss = 0.36371961
Validation score: 0.814593
Iteration 9, loss = 0.36154710
Validation score: 0.809809
Iteration 10, loss = 0.35873219
Validation score: 0.815789
Iteration 11, loss = 0.35707486
Validation score: 0.812799
Iteration 12, loss = 0.35470151
Validation score: 0.812201
Iteration 13, loss = 0.35336092
Validation score: 0.814593
Iteration 14, loss = 0.35042624
Validation score: 0.812799
Iteration 15, loss = 0.34902052
Validation score: 0.810407
Iteration 16, loss = 0.34739592
Validation score: 0.811603
Validation score did not improve m

In [None]:
import pandas as pd
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Load Your Data ---
datatrain_df = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
test_df = pd.read_csv('./data/salary.test.processed.csv', index_col='id')

# --- 2. Separate Features (X) and Target (y) ---
# --- IMPORTANT: Change 'target' to your actual target column name! ---
target_column = 'label' # Or 'salary', 'income', etc.

X_train = datatrain_df.drop(target_column, axis=1)
y_train = datatrain_df[target_column]

X_test = test_df.drop(target_column, axis=1)
y_test = test_df[target_column]

# --- 3. CRITICAL STEP: Feature Scaling ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 4. Define the Optuna Objective Function ---
# This function will be called once per trial
def objective(trial):
    # --- Define the Hyperparameter Search Space ---
    
    # 1. Number of hidden layers (e.g., 1, 2, or 3)
    n_layers = trial.suggest_int('n_layers', 1, 3)
    
    # 2. Size of each hidden layer
    layers = []
    for i in range(n_layers):
        # Suggest a number of neurons (e.g., 16 to 128)
        layers.append(trial.suggest_int(f'n_units_l{i}', 16, 128))
    
    hidden_layer_sizes = tuple(layers)
    
    # 3. Activation function
    activation = trial.suggest_categorical('activation', ['relu', 'tanh'])
    
    # 4. Solver (Algorithm for weight optimization)
    solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
    
    # 5. Regularization strength (helps prevent overfitting)
    # --- CHANGED: Using suggest_float as per the warning ---
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    
    # 6. Initial learning rate (only for 'sgd' or 'adam')
    # --- CHANGED: Using suggest_float as per the warning ---
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True)

    # --- Create the Model ---
    model = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate_init=learning_rate_init,
        max_iter=500,  # Give it enough time to converge
        early_stopping=True, # Good practice for individual trials
        random_state=42
    )

    # --- Evaluate the Model ---
    # We use 3-fold cross-validation on the training data.
    # This gives a more stable score than a single train/validation split.
    score = cross_val_score(model, X_train_scaled, y_train, cv=3, scoring='accuracy')
    
    # Return the mean accuracy from the cross-validation
    return score.mean()

# --- 5. Run the Optuna Study ---
print("Starting Optuna hyperparameter search...")

# We want to MAXIMIZE accuracy
study = optuna.create_study(direction='maximize')

# Run 100 trials, and show the progress bar!
study.optimize(
    objective, 
    n_trials=100, 
    show_progress_bar=True  # This enables the progress bar
)

print("\nSearch complete.")
print(f"Best trial (accuracy): {study.best_value:.4f}")
print("Best hyperparameters found:")
print(study.best_params)

# --- 6. Train the FINAL Model with the Best Params ---
print("\nTraining final model with best hyperparameters...")

# Get a copy of the best parameters
best_params = study.best_params.copy()

# --- Reconstruct the hidden_layer_sizes tuple from the best params ---
# 1. Get the number of layers and REMOVE it from the dictionary
n_layers = best_params.pop('n_layers')

# 2. Build the layers list by REMOVING each n_units_l{i} key
layers = []
for i in range(n_layers):
    layer_size = best_params.pop(f'n_units_l{i}')
    layers.append(layer_size)

# 3. Create the final tuple that MLPClassifier understands
final_hidden_layer_sizes = tuple(layers)

# Now, 'best_params' only contains keys that MLPClassifier accepts
# (e.g., 'activation', 'solver', 'alpha')

# Create a new MLP model using the reconstructed tuple and the rest of the params
final_mlp = MLPClassifier(
    hidden_layer_sizes=final_hidden_layer_sizes, # Pass the tuple we just built
    max_iter=1000, 
    early_stopping=True,
    random_state=42,
    **best_params # This unpacks the *cleaned* dictionary
)

# Train on the FULL scaled training set
final_mlp.fit(X_train_scaled, y_train)

# --- 7. Evaluate on the Test Set ---
y_pred_final = final_mlp.predict(X_test_scaled)

print(f"\nFinal Model Accuracy on Test Set: {accuracy_score(y_test, y_pred_final):.4f}")
print("\nFinal Model Classification Report:")
print(classification_report(y_test, y_pred_final,digits=4))

[I 2025-10-22 07:13:44,310] A new study created in memory with name: no-name-f0369536-b741-4225-b59b-5e1b3637bcf9


Starting Optuna hyperparameter search...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-10-22 07:13:54,013] Trial 0 finished with value: 0.807894921464363 and parameters: {'n_layers': 3, 'n_units_l0': 46, 'n_units_l1': 50, 'n_units_l2': 89, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0004629269617008387, 'learning_rate_init': 0.002625198512757082}. Best is trial 0 with value: 0.807894921464363.




### ADASYN

In [None]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing MLPClassifier with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_mlp = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_mlp.drop(['label'], axis='columns')
y_test = data_test_mlp['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters (Translated) ---
best_mlp_params = {
    'hidden_layer_sizes': (99,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0009771293502957021,
    'learning_rate_init': 0.0005471619343332291
}

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', MLPClassifier(
        **best_mlp_params,
        # ⚠️ NO 'class_weight'
        random_state=42,
        max_iter=1000
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on ADASYN data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nMLPClassifier (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing MLPClassifier with ADASYN ---
Applying ADASYN...


[WinError 2] The system cannot find the file specified
  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


New resampled label distribution:
label
1.0    9726
0.0    9719
Name: count, dtype: int64
Model training complete.

MLPClassifier (Tuned + ADASYN) Report:
              precision    recall  f1-score     support
0.0            0.842930  0.757450  0.797907  2416.00000
1.0            0.708313  0.806689  0.754307  1764.00000
accuracy       0.778230  0.778230  0.778230     0.77823
macro avg      0.775621  0.782070  0.776107  4180.00000
weighted avg   0.786120  0.778230  0.779507  4180.00000


### SMOTETomek

In [None]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing MLPClassifier with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_mlp = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_mlp.drop(['label'], axis='columns')
y_test = data_test_mlp['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters (Translated) ---
best_mlp_params = {
    'hidden_layer_sizes': (99,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0009771293502957021,
    'learning_rate_init': 0.0005471619343332291
}

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', MLPClassifier(
        **best_mlp_params,
        # ⚠️ NO 'class_weight'
        random_state=42,
        max_iter=1000
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nMLPClassifier (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing MLPClassifier with SMOTETomek ---
Applying SMOTETomek...




New resampled label distribution:
label
1.0    8914
0.0    8914
Name: count, dtype: int64
Model training complete.

MLPClassifier (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.838025  0.800911  0.819048  2416.000000
1.0            0.742918  0.787982  0.764787  1764.000000
accuracy       0.795455  0.795455  0.795455     0.795455
macro avg      0.790472  0.794446  0.791917  4180.000000
weighted avg   0.797889  0.795455  0.796149  4180.000000


### SMOTE

In [None]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

print("\n--- 2. Testing MLPClassifier with SMOTE ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_mlp = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_mlp.drop(['label'], axis='columns')
y_test = data_test_mlp['label']

# --- Apply SMOTE ---
print("Applying SMOTE...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters (Translated) ---
best_mlp_params = {
    'hidden_layer_sizes': (99,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0009771293502957021,
    'learning_rate_init': 0.0005471619343332291
}

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', MLPClassifier(
        **best_mlp_params,
        # ⚠️ NO 'class_weight'
        random_state=42,
        max_iter=1000
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on SMOTEd data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nMLPClassifier (Tuned + SMOTE) Report:")
print(df_report)


--- 2. Testing MLPClassifier with SMOTE ---
Applying SMOTE...
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64




Model training complete.

MLPClassifier (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.839664  0.786838  0.812393  2416.000000
1.0            0.731211  0.794218  0.761413  1764.000000
accuracy       0.789952  0.789952  0.789952     0.789952
macro avg      0.785438  0.790528  0.786903  4180.000000
weighted avg   0.793896  0.789952  0.790879  4180.000000
