In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report, accuracy_score

# -----------------------------------------------------------------
# 1. Feature subset used for training
# -----------------------------------------------------------------
# One column name per line. The list order becomes the column order of the
# feature matrices built from it, so keep it stable between runs.
# NOTE(review): both 'sex_male' and 'sex_female' are listed; if they are
# complementary one-hot columns, one of them is redundant -- confirm.
SELECTED_FEATURES = [
    'marital-status_married-civ-spouse',
    'relationship_husband',
    'marital-status_never-married',
    'education-num',
    'capitalgain',
    'age-group',
    'relationship_own-child',
    'hoursperweek',
    'sex_male',
    'sex_female',
    'relationship_not-in-family',
    'occupation_prof-specialty',
    'occupation_other-service',
    'relationship_unmarried',
    'marital-status_divorced',
    'capitalloss',
    'occupation_exec-managerial',
    'workclass_self-emp-inc',
    'relationship_wife',
    'workclass_private',
    'race_black',
    'race_white',
    'relationship_other-relative',
    'occupation_handlers-cleaners',
]

# -----------------------------------------------------------------
# 2. Load and Prepare Data
# -----------------------------------------------------------------
# Read the processed train/test CSVs and index both frames by their 'id' column.
try:
    data_train = pd.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test = pd.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError as e:
    # Report whichever file is actually missing, then re-raise so the cell
    # stops with a normal traceback. Calling exit() inside a notebook asks the
    # kernel to shut down instead, which throws away all notebook state.
    print(f"Error: {e}")
    print("Please check the path to your data files.")
    raise


# Keep only the selected feature columns plus the target column.
data_train = data_train[SELECTED_FEATURES + ['label']]
data_test = data_test[SELECTED_FEATURES + ['label']]

# Split each frame into the feature matrix (X) and the target vector (y).
X_train = data_train[SELECTED_FEATURES]
y_train = data_train['label']
X_test = data_test[SELECTED_FEATURES]
y_test = data_test['label']

# -----------------------------------------------------------------
# 3. Define Your Samplers
# -----------------------------------------------------------------
# Resamplers used as the first step of some pipelines below. The same
# instance appears in more than one pipeline.
sampler_adasyn = ADASYN(random_state=42)
sampler_smotetomek = SMOTETomek(random_state=42)

# -----------------------------------------------------------------
# 4. Define Your Best Base Models
# -----------------------------------------------------------------
# Each base learner is wrapped in an imblearn Pipeline so that every entry has
# the same shape: an optional 'sampler' step followed by a 'model' step.
# NOTE(review): the hyperparameter values look like the output of an earlier
# tuning run -- confirm their provenance before changing them.

# Random forest: no resampling step.
params_rf = {
    'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 2, 'min_samples_split': 9,
    'criterion': 'entropy', 'max_features': 0.42082357754585725, 'random_state': 42, 'n_jobs': -1
}
pipe_rf = Pipeline([('model', RandomForestClassifier(**params_rf))])

# LightGBM: no resampling step. 'verbose': -1 matches the other cells of this
# notebook and silences LightGBM's per-fit log lines.
params_lgbm = {
    'learning_rate': 0.03713445949663834, 'num_leaves': 24, 'max_depth': 4,
    'min_child_samples': 10, 'subsample': 0.8821481250200053,
    'colsample_bytree': 0.7473748858313227, 'random_state': 42, 'n_jobs': -1, 'verbose': -1
}
pipe_lgbm = Pipeline([('model', LGBMClassifier(**params_lgbm))])

# Elastic-net logistic regression (requires the 'saga' solver), fitted on
# ADASYN-resampled data.
params_logreg = {
    'C': 2.7420181030569966, 'penalty': 'elasticnet', 'l1_ratio': 0.9302376392883114,
    'solver': 'saga', 'max_iter': 1000, 'random_state': 42
}
pipe_logreg = Pipeline([('sampler', sampler_adasyn), ('model', LogisticRegression(**params_logreg))])

# Three-hidden-layer MLP with early stopping, fitted on ADASYN-resampled data.
params_mlp = {
    'hidden_layer_sizes': (226, 112, 185), 'activation': 'relu', 'alpha': 0.09988973301090445,
    'learning_rate_init': 0.0003520144637184677, 'random_state': 42,
    'max_iter': 500, 'early_stopping': True
}
pipe_mlp = Pipeline([('sampler', sampler_adasyn), ('model', MLPClassifier(**params_mlp))])

# CatBoost (silent via verbose=0), fitted on SMOTETomek-resampled data.
params_cat = {
    'iterations': 457, 'depth': 7, 'learning_rate': 0.013380005910139176,
    'l2_leaf_reg': 0.06785680825867879, 'border_count': 237,
    'random_strength': 0.004259876881997753, 'random_state': 42, 'verbose': 0
}
pipe_cat = Pipeline([('sampler', sampler_smotetomek), ('model', CatBoostClassifier(**params_cat))])

# -----------------------------------------------------------------
# 5. Define the Stacking Classifier
# -----------------------------------------------------------------
# The final estimator that combines the base models' predictions.
meta_model = LogisticRegression(random_state=42)

# (name, pipeline) pairs for the five base learners.
base_estimators = [
    ('rf', pipe_rf),
    ('lgbm', pipe_lgbm),
    ('logreg_adasyn', pipe_logreg),
    ('mlp_adasyn', pipe_mlp),
    ('cat_smotetomek', pipe_cat),
]

# Ensemble options gathered in one place:
#   cv=5              -> 5-fold cross-validation for the base-model predictions
#   n_jobs=-1         -> use all available cores
#   verbose=11        -> high verbosity for frequent progress messages
#   passthrough=False -> original features are not passed to the meta-model
stacking_options = dict(cv=5, n_jobs=-1, verbose=11, passthrough=False)

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    **stacking_options,
)

# -----------------------------------------------------------------
# 6. Train and Evaluate
# -----------------------------------------------------------------
print("Training the Stacking Classifier... (This will take a long time)")
print("Watch the console for progress updates like [Parallel(...)]:")

# Fit the whole ensemble on the training split.
stacking_model.fit(X_train, y_train)

print("\nTraining complete!")

# --- Evaluate ---
# Hard labels feed the metrics below; the class probabilities are computed
# but not used anywhere else in this cell.
y_pred_stack = stacking_model.predict(X_test)
y_pred_proba_stack = stacking_model.predict_proba(X_test)

test_accuracy = accuracy_score(y_test, y_pred_stack)
test_report = classification_report(y_test, y_pred_stack, digits=4)

print("\n--- Stacking Model Performance ---")
print(f"Accuracy: {test_accuracy:.4f}")
print(test_report)

Training the Stacking Classifier... (This will take a long time)
Watch the console for progress updates like [Parallel(...)]:


KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report, accuracy_score
import time

# -----------------------------------------------------------------
# 1. Load and Prepare Data (No Feature Selection)
# -----------------------------------------------------------------
print("Loading and preparing data...")
try:
    data_train = pd.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test = pd.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError as e:
    # Print the missing path, then re-raise so the cell stops with a normal
    # traceback. Calling exit() inside a notebook asks the kernel to shut down
    # instead, which throws away all notebook state.
    print(f"Error: {e}")
    print("Please check the path to your data files.")
    raise

# Features are every column except the target column 'label'.
X_train = data_train.drop('label', axis=1)
y_train = data_train['label']

# Same split for the test frame.
X_test = data_test.drop('label', axis=1)
y_test = data_test['label']

print(f"Data loaded successfully. Training with {X_train.shape[1]} features.")

# -----------------------------------------------------------------
# 2. Define Your Samplers
# -----------------------------------------------------------------
# Resamplers placed ahead of some of the models below.
sampler_adasyn = ADASYN(random_state=42)
sampler_smotetomek = SMOTETomek(random_state=42)

# -----------------------------------------------------------------
# 3. Define Your Best Base Models
# -----------------------------------------------------------------
# Every learner lives in an imblearn Pipeline whose last step is named
# 'model'; pipelines that resample start with a 'sampler' step.

# Random forest -- no resampling.
params_rf = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=9,
    criterion='entropy',
    max_features=0.42082357754585725,
    random_state=42,
    n_jobs=-1,
)
pipe_rf = Pipeline(steps=[('model', RandomForestClassifier(**params_rf))])

# LightGBM -- no resampling; verbose=-1 keeps its logging quiet.
params_lgbm = dict(
    learning_rate=0.03713445949663834,
    num_leaves=24,
    max_depth=4,
    min_child_samples=10,
    subsample=0.8821481250200053,
    colsample_bytree=0.7473748858313227,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
pipe_lgbm = Pipeline(steps=[('model', LGBMClassifier(**params_lgbm))])

# Elastic-net logistic regression on ADASYN-resampled data.
params_logreg = dict(
    C=2.7420181030569966,
    penalty='elasticnet',
    l1_ratio=0.9302376392883114,
    solver='saga',
    max_iter=1000,
    random_state=42,
)
pipe_logreg = Pipeline(steps=[
    ('sampler', sampler_adasyn),
    ('model', LogisticRegression(**params_logreg)),
])

# MLP with early stopping on ADASYN-resampled data.
params_mlp = dict(
    hidden_layer_sizes=(226, 112, 185),
    activation='relu',
    alpha=0.09988973301090445,
    learning_rate_init=0.0003520144637184677,
    random_state=42,
    max_iter=500,
    early_stopping=True,
)
pipe_mlp = Pipeline(steps=[
    ('sampler', sampler_adasyn),
    ('model', MLPClassifier(**params_mlp)),
])

# CatBoost on SMOTETomek-resampled data; verbose=0 keeps it silent.
params_cat = dict(
    iterations=457,
    depth=7,
    learning_rate=0.013380005910139176,
    l2_leaf_reg=0.06785680825867879,
    border_count=237,
    random_strength=0.004259876881997753,
    random_state=42,
    verbose=0,
)
pipe_cat = Pipeline(steps=[
    ('sampler', sampler_smotetomek),
    ('model', CatBoostClassifier(**params_cat)),
])

# -----------------------------------------------------------------
# 4. Define the Stacking Classifier
# -----------------------------------------------------------------
# Final estimator that combines the base models' predictions.
meta_model = LogisticRegression(random_state=42)

# (name, pipeline) pairs for the five base learners.
base_estimators = [
    ('rf', pipe_rf),
    ('lgbm', pipe_lgbm),
    ('logreg_adasyn', pipe_logreg),
    ('mlp_adasyn', pipe_mlp),
    ('cat_smotetomek', pipe_cat),
]

# verbose=11 prints text updates like [Parallel(..): Done 2 of 5 ..];
# passthrough=False means the original features are not passed to the meta-model.
stacking_options = dict(cv=5, n_jobs=-1, verbose=11, passthrough=False)

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    **stacking_options,
)

# -----------------------------------------------------------------
# 5. Train and Evaluate
# -----------------------------------------------------------------
n_features = X_train.shape[1]
print(f"\nTraining the Stacking Classifier on {n_features} features...")
print("This will take a long time. Watch for [Parallel(...)] messages.")

# Wall-clock timing around the fit only.
start_time = time.time()
stacking_model.fit(X_train, y_train)
end_time = time.time()

elapsed_minutes = (end_time - start_time) / 60
print(f"\nTraining complete! Total time: {elapsed_minutes:.2f} minutes")

# --- Evaluate ---
# Hard labels feed the metrics; the class probabilities are computed but not
# used anywhere else in this cell.
y_pred_stack = stacking_model.predict(X_test)
y_pred_proba_stack = stacking_model.predict_proba(X_test)

test_accuracy = accuracy_score(y_test, y_pred_stack)
test_report = classification_report(y_test, y_pred_stack, digits=4)

print("\n--- Stacking Model Performance (4 Digits) ---")

# Accuracy, formatted to 4 decimal places.
print(f"Accuracy: {test_accuracy:.4f}")

# Per-class precision/recall/F1, formatted to 4 decimal places.
print(test_report)

Loading and preparing data...
Data loaded successfully. Training with 56 features.

Training the Stacking Classifier on 56 features...
This will take a long time. Watch for [Parallel(...)] messages.

Training complete! Total time: 2.19 minutes

--- Stacking Model Performance (4 Digits) ---
Accuracy: 0.8280
              precision    recall  f1-score   support

         0.0     0.8631    0.8349    0.8487      2416
         1.0     0.7835    0.8186    0.8007      1764

    accuracy                         0.8280      4180
   macro avg     0.8233    0.8267    0.8247      4180
weighted avg     0.8295    0.8280    0.8284      4180



In [9]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from sklearn.metrics import classification_report, accuracy_score
import time

# -----------------------------------------------------------------
# 1. Selected feature columns
# -----------------------------------------------------------------
# One column name per line. The list order becomes the column order of the
# feature matrices built from it, so keep it stable between runs.
SELECTED_FEATURES = [
    'marital-status_married-civ-spouse',
    'relationship_husband',
    'marital-status_never-married',
    'education-num',
    'capitalgain',
    'age-group',
    'relationship_own-child',
    'hoursperweek',
    'sex_male',
    'sex_female',
    'relationship_not-in-family',
    'occupation_prof-specialty',
    'occupation_other-service',
    'relationship_unmarried',
    'marital-status_divorced',
    'capitalloss',
    'occupation_exec-managerial',
    'workclass_self-emp-inc',
    'relationship_wife',
    'workclass_private',
    'race_black',
    'race_white',
    'relationship_other-relative',
    'occupation_handlers-cleaners',
]

# -----------------------------------------------------------------
# 2. Load and Prepare Data
# -----------------------------------------------------------------
print("Loading and preparing data...")
try:
    data_train = pd.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test = pd.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError as e:
    # Print the missing path, then re-raise so the cell stops with a normal
    # traceback. Calling exit() inside a notebook asks the kernel to shut down
    # instead, which throws away all notebook state.
    print(f"Error: {e}")
    raise

# Restrict the feature matrices to the selected columns; the target is the
# 'label' column of each frame.
X_train = data_train[SELECTED_FEATURES]
y_train = data_train['label']
X_test = data_test[SELECTED_FEATURES]
y_test = data_test['label']

print(f"Data loaded successfully. Training with {X_train.shape[1]} selected features.")

# -----------------------------------------------------------------
# 3. Define Your Sampler
# -----------------------------------------------------------------
# Only the logistic-regression pipeline below resamples its training data.
sampler_adasyn = ADASYN(random_state=42)

# -----------------------------------------------------------------
# 4. Define Your "A-List" Models (using your params)
# -----------------------------------------------------------------
# Each estimator is built first and then wrapped in an imblearn Pipeline whose
# final step is named 'model'.

# Random forest -- no resampling.
params_rf = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=9,
    criterion='entropy',
    max_features=0.42082357754585725,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**params_rf)
pipe_rf = Pipeline([('model', rf_model)])

# LightGBM -- no resampling; verbose=-1 keeps its logging quiet.
params_lgbm = dict(
    learning_rate=0.03713445949663834,
    num_leaves=24,
    max_depth=4,
    min_child_samples=10,
    subsample=0.8821481250200053,
    colsample_bytree=0.7473748858313227,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model = LGBMClassifier(**params_lgbm)
pipe_lgbm = Pipeline([('model', lgbm_model)])

# Elastic-net logistic regression on ADASYN-resampled data.
params_logreg = dict(
    C=2.7420181030569966,
    penalty='elasticnet',
    l1_ratio=0.9302376392883114,
    solver='saga',
    max_iter=1000,
    random_state=42,
)
logreg_model = LogisticRegression(**params_logreg)
pipe_logreg = Pipeline([('sampler', sampler_adasyn), ('model', logreg_model)])

# -----------------------------------------------------------------
# 5. Define the Stacking Classifier
# -----------------------------------------------------------------
# Final estimator that combines the base models' predictions.
meta_model = LogisticRegression(random_state=42)

# (name, pipeline) pairs for the three base learners.
base_estimators = [
    ('rf', pipe_rf),
    ('lgbm', pipe_lgbm),
    ('logreg_adasyn', pipe_logreg),
]

# passthrough=False: the meta-model only sees the base models' predictions.
stacking_options = dict(cv=5, n_jobs=-1, verbose=11, passthrough=False)

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    **stacking_options,
)

# -----------------------------------------------------------------
# 6. Train and Evaluate
# -----------------------------------------------------------------
n_features = X_train.shape[1]
print(f"\nTraining Stack (Top 3) with Feature Selection ({n_features} features)...")

# Wall-clock timing around the fit only.
start_time = time.time()
stacking_model.fit(X_train, y_train)
end_time = time.time()

elapsed_minutes = (end_time - start_time) / 60
print(f"\nTraining complete! Total time: {elapsed_minutes:.2f} minutes")

# Score the fitted ensemble on the test split.
y_pred_stack = stacking_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_stack)
test_report = classification_report(y_test, y_pred_stack, digits=4)

print(f"\n--- Stack (Top 3) + Feature Selection Performance (4 Digits) ---")
print(f"Accuracy: {test_accuracy:.4f}")
print(test_report)

Loading and preparing data...
Data loaded successfully. Training with 24 selected features.

Training Stack (Top 3) with Feature Selection (24 features)...

Training complete! Total time: 0.53 minutes

--- Stack (Top 3) + Feature Selection Performance (4 Digits) ---
Accuracy: 0.8244
              precision    recall  f1-score   support

         0.0     0.8625    0.8282    0.8450      2416
         1.0     0.7769    0.8192    0.7975      1764

    accuracy                         0.8244      4180
   macro avg     0.8197    0.8237    0.8212      4180
weighted avg     0.8264    0.8244    0.8249      4180



In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
import time
from sklearn.metrics import classification_report, accuracy_score

# --- THE FIX IS HERE ---
# We must use the Pipeline from imblearn, NOT from sklearn
from imblearn.pipeline import Pipeline 
from imblearn.combine import SMOTETomek
# -------------------------

# --- The 24 selected feature columns ---
# One column name per line. The list order becomes the column order of the
# feature matrices built from it, so keep it stable between runs.
SELECTED_FEATURES = [
    'marital-status_married-civ-spouse',
    'relationship_husband',
    'marital-status_never-married',
    'education-num',
    'capitalgain',
    'age-group',
    'relationship_own-child',
    'hoursperweek',
    'sex_male',
    'sex_female',
    'relationship_not-in-family',
    'occupation_prof-specialty',
    'occupation_other-service',
    'relationship_unmarried',
    'marital-status_divorced',
    'capitalloss',
    'occupation_exec-managerial',
    'workclass_self-emp-inc',
    'relationship_wife',
    'workclass_private',
    'race_black',
    'race_white',
    'relationship_other-relative',
    'occupation_handlers-cleaners',
]

# --- Load and filter data ---
print("Loading and preparing data...")
try:
    data_train = pd.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test = pd.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError as e:
    # Print the missing path, then re-raise so the cell stops with a normal
    # traceback. Calling exit() inside a notebook asks the kernel to shut down
    # instead, which throws away all notebook state.
    print(f"Error: {e}")
    raise

# Restrict the feature matrices to the selected columns; the target is the
# 'label' column of each frame.
X_train_filtered = data_train[SELECTED_FEATURES]
y_train = data_train['label']
X_test_filtered = data_test[SELECTED_FEATURES]
y_test = data_test['label']

print(f"Data loaded successfully. Training with {X_train_filtered.shape[1]} selected features.")

# --- Resampler used by both pipelines below ---
sampler_smotetomek = SMOTETomek(random_state=42)

# --- Top 2 models, each preceded by a SMOTETomek resampling step ---
# Random forest.
# NOTE(review): the original comment says SMOTETomek gave the best RF score;
# that experiment is not shown in this cell -- confirm.
params_rf = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=9,
    criterion='entropy',
    max_features=0.42082357754585725,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**params_rf)
pipe_rf = Pipeline(steps=[('sampler', sampler_smotetomek), ('model', rf_model)])

# LightGBM; verbose=-1 keeps its logging quiet.
params_lgbm = dict(
    learning_rate=0.03713445949663834,
    num_leaves=24,
    max_depth=4,
    min_child_samples=10,
    subsample=0.8821481250200053,
    colsample_bytree=0.7473748858313227,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model = LGBMClassifier(**params_lgbm)
pipe_lgbm = Pipeline(steps=[('sampler', sampler_smotetomek), ('model', lgbm_model)])

# --- Soft-voting ensemble over the two pipelines ---
# voting='soft' combines the members' predicted class probabilities rather
# than their hard labels.
voting_estimators = [('rf', pipe_rf), ('lgbm', pipe_lgbm)]
voting_options = dict(voting='soft', n_jobs=-1, verbose=True)

voting_model = VotingClassifier(estimators=voting_estimators, **voting_options)

# --- Fit the voting ensemble and score it on the test split ---
print("Training VotingClassifier (RF + LGBM) with Feature Selection + SMOTETomek...")

# Wall-clock timing around the fit only.
start_time = time.time()
voting_model.fit(X_train_filtered, y_train)
end_time = time.time()

elapsed_minutes = (end_time - start_time) / 60
print(f"\nTraining complete! Total time: {elapsed_minutes:.2f} minutes")

y_pred_vote = voting_model.predict(X_test_filtered)
test_accuracy = accuracy_score(y_test, y_pred_vote)
test_report = classification_report(y_test, y_pred_vote, digits=4)

print("\n--- VOTING Model Performance (4 Digits) ---")
print(f"Accuracy: {test_accuracy:.4f}")
print(test_report)

Loading and preparing data...
Data loaded successfully. Training with 24 selected features.
Training VotingClassifier (RF + LGBM) with Feature Selection + SMOTETomek...

Training complete! Total time: 0.14 minutes

--- VOTING Model Performance (4 Digits) ---
Accuracy: 0.8220
              precision    recall  f1-score   support

         0.0     0.8881    0.7918    0.8372      2416
         1.0     0.7517    0.8634    0.8037      1764

    accuracy                         0.8220      4180
   macro avg     0.8199    0.8276    0.8204      4180
weighted avg     0.8306    0.8220    0.8231      4180

