In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
import os.path as path
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import VotingClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [39]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [40]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from lightgbm import LGBMClassifier


In [41]:
# Read the train and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [42]:
# Feature matrix: everything except the target "y" and the "id" column.
X = train.drop(columns=["y", "id"])
# Target vector.
y = train["y"]

In [43]:
# Test features are every column except "id"; the ids themselves are kept
# separately so they can be written into the submission file later.
X_test = test.drop(columns="id")
test_ids = test["id"]

In [44]:
# Feature groups
# Categorical columns are listed by hand (yes/no style flags first, then the
# multi-level ones); every remaining column of X is treated as numeric.
bin_features = ["default", "housing", "loan"]
non_bin_features = ["job", "education", "contact", "month", "poutcome", "marital"]
cat_features = [*bin_features, *non_bin_features]
num_features = list(filter(lambda col: col not in cat_features, X.columns))

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones; categories unseen during fit are ignored at transform time.
column_transformer = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

In [45]:
# LightGBM: randomized hyperparameter search, hold-out evaluation, submission.
# Uses X, y, X_test, test_ids and column_transformer defined in earlier cells.

# ============================
# 3. Train/Validation Split
# ============================
# 80/20 split, stratified on y so both parts keep the same class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ============================
# 4. Define LightGBM + Hyperparameter Grid
# ============================
lgb_base = LGBMClassifier(
    objective="binary",
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is
    # enabled with a non-zero frequency; without this the `subsample` values
    # searched below would all train identical models.
    subsample_freq=1,
    random_state=42,
    # RandomizedSearchCV below already runs one fit per core (n_jobs=-1).
    # Keeping each fit single-threaded avoids processes x threads
    # oversubscription, which makes the search much slower.
    n_jobs=1
)

# Keys are prefixed with "classifier__" because the model is the "classifier"
# step of the pipeline built in section 5.
param_dist = {
    "classifier__n_estimators": [300, 500, 700],
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__max_depth": [-1, 6, 10],   # -1 means no limit
    "classifier__num_leaves": [31, 64, 128],
    "classifier__min_child_samples": [20, 50, 100],
    "classifier__subsample": [0.7, 0.8, 0.9, 1.0],   # effective via subsample_freq=1
    "classifier__colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "classifier__reg_alpha": [0, 0.1, 0.5],
    "classifier__reg_lambda": [0.5, 1.0, 2.0]
}

# ============================
# 5. Create Pipeline
# ============================
# Preprocessing lives inside the pipeline so it is re-fitted on each CV
# training fold rather than on the full training split.
pipeline = Pipeline(steps=[
    ("preprocessor", column_transformer),
    ("classifier", lgb_base)
])

# ============================
# 6. RandomizedSearchCV
# ============================
# 20 sampled configurations x 3 folds = 60 fits, scored by ROC-AUC.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring="roc_auc",
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit on training set
random_search.fit(X_train, y_train)

print("Best hyperparameters:", random_search.best_params_)
# best_score_ is the mean cross-validated score on X_train for the best
# configuration; the hold-out validation score is computed in section 7.
print("Best CV ROC-AUC:", random_search.best_score_)

# ============================
# 7. Evaluate on Validation Set
# ============================
# best_estimator_ is the pipeline refitted on all of X_train with the best
# parameters found by the search.
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
y_val_prob = best_model.predict_proba(X_val)[:, 1]   # probability of the positive class

val_acc = accuracy_score(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, y_val_prob)
print(f"Validation Accuracy: {val_acc:.4f}, ROC-AUC: {val_auc:.4f}")

# ============================
# 8. Predict on Test Set and Save Submission
# ============================
# The submission contains positive-class probabilities, not hard labels.
y_test_probs = best_model.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({"id": test_ids, "y": y_test_probs})
submission.to_csv("submission4.csv", index=False)
print("✅ Submission file saved as submission4.csv")


Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from lightgbm import LGBMClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from scipy.stats import uniform, randint
# import numpy as np

# # ===========================
# # Preprocessing (same as before)
# # ===========================
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), num_features),
#         ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_features)
#     ]
# )

# # Base LightGBM model
# lgbm = LGBMClassifier(
#     objective="binary",
#     random_state=42,
#     n_jobs=-1
# )

# pipe = Pipeline(steps=[("preprocessor", preprocessor),
#                        ("classifier", lgbm)])

# # ===========================
# # Refined Search Space
# # (around your best params)
# # ===========================
# param_dist = {
#     "classifier__num_leaves": randint(50, 120),       # Best was 60
#     "classifier__max_depth": randint(3, 8),           # Best was 4
#     "classifier__n_estimators": randint(600, 1200),   # Best was 800
#     "classifier__learning_rate": uniform(0.05, 0.15), # Center around 0.1
#     "classifier__subsample": uniform(0.5, 0.3),       # Best was 0.6
#     "classifier__colsample_bytree": uniform(0.7, 0.3),# Best was 1.0
#     "classifier__reg_alpha": randint(0, 3),           # Best was 1
#     "classifier__reg_lambda": randint(0, 5),          # Best was 0
#     "classifier__min_child_samples": randint(20, 100) # New: controls leaf overfitting
# }

# # ===========================
# # Randomized Search CV
# # ===========================
# random_search_refined = RandomizedSearchCV(
#     pipe,
#     param_distributions=param_dist,
#     n_iter=50,              # More focused iterations
#     scoring="roc_auc",
#     cv=3,
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )

# # Fit
# random_search_refined.fit(X_train, y_train)

# # Best params and score
# print("Best params from Refined RandomizedSearchCV:")
# print(random_search_refined.best_params_)
# print("Best ROC-AUC:", random_search_refined.best_score_)

# # Evaluate on the hold-out validation split
# # (`y_test` is never defined in this notebook; the hold-out data are X_val / y_val)
# best_model = random_search_refined.best_estimator_
# y_pred = best_model.predict(X_val)
# y_pred_proba = best_model.predict_proba(X_val)[:, 1]

# print("Validation Accuracy:", accuracy_score(y_val, y_pred))
# print("Validation ROC-AUC:", roc_auc_score(y_val, y_pred_proba))


In [None]:
# # 3. Train/Validation Split
# # ============================
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=42
# )

# # ============================
# # 4. Define Base Models + Param Grids
# # ============================

# # LightGBM
# lgb_base = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
# lgb_param_dist = {
#     "classifier__n_estimators": [300, 500, 700],
#     "classifier__learning_rate": [0.01, 0.05, 0.1],
#     "classifier__max_depth": [-1, 6, 10],
#     "classifier__num_leaves": [31, 64, 128],
#     "classifier__min_child_samples": [20, 50, 100],
#     "classifier__subsample": [0.7, 0.8, 0.9, 1.0],
#     "classifier__colsample_bytree": [0.7, 0.8, 0.9, 1.0],
#     "classifier__reg_alpha": [0, 0.1, 0.5],
#     "classifier__reg_lambda": [0.5, 1.0, 2.0]
# }

# # XGBoost
# xgb_base = XGBClassifier(
#     objective="binary:logistic",
#     eval_metric="auc",
#     use_label_encoder=False,
#     random_state=42,
#     n_jobs=-1
# )
# xgb_param_dist = {
#     "classifier__n_estimators": [300, 500, 600],
#     "classifier__learning_rate": [0.01, 0.05, 0.1],
#     "classifier__max_depth": [4, 6, 8],
#     "classifier__min_child_weight": [1, 3, 5],
#     "classifier__gamma": [0, 0.1, 0.3],
#     "classifier__subsample": [0.7, 0.8, 0.9],
#     "classifier__colsample_bytree": [0.7, 0.8, 0.9],
#     "classifier__reg_alpha": [0, 0.1, 0.5],
#     "classifier__reg_lambda": [0.5, 1.0, 2.0]
# }

# # ============================
# # 5. RandomizedSearchCV for LightGBM
# # ============================
# lgb_pipeline = Pipeline([
#     ("preprocessor", column_transformer),
#     ("classifier", lgb_base)
# ])

# lgb_search = RandomizedSearchCV(
#     estimator=lgb_pipeline,
#     param_distributions=lgb_param_dist,
#     n_iter=20,
#     scoring="roc_auc",
#     cv=3,
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )

# lgb_search.fit(X_train, y_train)
# print("Best LightGBM params:", lgb_search.best_params_)
# print("Best LightGBM ROC-AUC:", lgb_search.best_score_)

# # ============================
# # 6. RandomizedSearchCV for XGBoost
# # ============================
# xgb_pipeline = Pipeline([
#     ("preprocessor", column_transformer),
#     ("classifier", xgb_base)
# ])

# xgb_search = RandomizedSearchCV(
#     estimator=xgb_pipeline,
#     param_distributions=xgb_param_dist,
#     n_iter=20,
#     scoring="roc_auc",
#     cv=3,
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )

# xgb_search.fit(X_train, y_train)
# print("Best XGBoost params:", xgb_search.best_params_)
# print("Best XGBoost ROC-AUC:", xgb_search.best_score_)

# # ============================
# # 7. Ensemble Voting
# # ============================
# ensemble = VotingClassifier(
#     estimators=[
#         ("lgbm", lgb_search.best_estimator_),
#         ("xgb", xgb_search.best_estimator_)
#     ],
#     voting="soft"  # average predicted probabilities
# )

# ensemble.fit(X_train, y_train)

# # Validation
# y_val_pred = ensemble.predict(X_val)
# y_val_prob = ensemble.predict_proba(X_val)[:, 1]

# val_acc = accuracy_score(y_val, y_val_pred)
# val_auc = roc_auc_score(y_val, y_val_prob)
# print(f"Ensemble Validation Accuracy: {val_acc:.4f}, ROC-AUC: {val_auc:.4f}")

# # ============================
# # 8. Predict Test Set
# # ============================
# y_test_probs = ensemble.predict_proba(X_test)[:, 1]
# submission = pd.DataFrame({"id": test_ids, "y": y_test_probs})
# submission.to_csv("submission_ensemble_randomsearch.csv", index=False)
# print("✅ Ensemble submission saved as submission_ensemble_randomsearch.csv")

In [None]:

# # ============================
# # 3. Train/Validation Split
# # ============================
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=42
# )

# # ============================
# # 4. Define Base Models
# # ============================
# lgb_base = LGBMClassifier(
#     objective="binary",
#     random_state=42,
#     n_jobs=-1
# )

# xgb_base = XGBClassifier(
#     objective="binary:logistic",
#     eval_metric="logloss",
#     use_label_encoder=False,
#     random_state=42,
#     n_jobs=-1
# )

# # ============================
# # 5. Create Pipelines
# # ============================
# lgb_pipeline = Pipeline(steps=[
#     ("preprocessor", column_transformer),
#     ("classifier", lgb_base)
# ])

# xgb_pipeline = Pipeline(steps=[
#     ("preprocessor", column_transformer),
#     ("classifier", xgb_base)
# ])

# # ============================
# # 6. Ensemble with Voting
# # ============================
# ensemble = VotingClassifier(
#     estimators=[
#         ("lgbm", lgb_pipeline),
#         ("xgb", xgb_pipeline)
#     ],
#     voting="soft"   # average probabilities
# )

# # ============================
# # 7. Train Ensemble
# # ============================
# ensemble.fit(X_train, y_train)

# # Validation Performance
# y_val_pred = ensemble.predict(X_val)
# y_val_prob = ensemble.predict_proba(X_val)[:, 1]

# val_acc = accuracy_score(y_val, y_val_pred)
# val_auc = roc_auc_score(y_val, y_val_prob)
# print(f"Validation Accuracy: {val_acc:.4f}, ROC-AUC: {val_auc:.4f}")

# # ============================
# # 8. Predict on Test Set
# # ============================
# y_test_probs = ensemble.predict_proba(X_test)[:, 1]

# submission = pd.DataFrame({"id": test_ids, "y": y_test_probs})
# submission.to_csv("submission_ensemble.csv", index=False)
# print(" Ensemble submission saved as submission_ensemble.csv")