In [None]:
# Basic Libraries
import pandas as pd
import numpy as np
from scipy import stats

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.utils import resample

# For data manipulation
import pandas as pd
import numpy as np

# For splitting the data
from sklearn.model_selection import train_test_split

# For resampling (handling class imbalance)
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# For scaling numeric features
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.model_selection import StratifiedKFold, GridSearchCV
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Configurations (optional)
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')



In [None]:
def _add_engineered_features(df, eps=1e-6):
    """Return a new frame: ``df`` plus the ratio, interaction, polynomial and log features.

    Every ratio adds ``eps`` to its denominator (except the ``Dependents + 1``
    ones, which are already offset) so that a zero income, amount or term
    produces a large finite number rather than inf/NaN.

    NOTE(review): 'LoanAmount_per_Term', 'EMI' and 'LoanAmount_to_Term_Ratio' are
    the same expression, as are 'Debt_to_Income_Ratio' and 'loan_to_income_ratio'.
    They are all kept so the output columns stay unchanged.
    """
    applicant = df['ApplicantIncome']
    coapplicant = df['CoapplicantIncome']
    total = df['Total_Income']
    loan = df['LoanAmount']
    term = df['Loan_Amount_Term']
    dependents = df['Dependents']
    credit = df['Credit_History']

    household_size = dependents + 1
    loan_per_term = loan / (term + eps)
    # Sum of the three income columns, exactly as named in the data.
    all_income = applicant + coapplicant + total

    feats = {}

    # Basic ratios
    feats['Loan_to_Income_Ratio'] = loan / (applicant + coapplicant + eps)
    feats['Income_per_Dependent'] = total / household_size
    feats['LoanAmount_per_Term'] = loan_per_term
    feats['EMI'] = loan_per_term
    feats['EMI_to_Income_Ratio'] = loan_per_term / (total + eps)
    feats['Debt_to_Income_Ratio'] = loan / (total + eps)
    feats['all_income'] = all_income
    feats['loan_to_all_income_ratio'] = loan / (all_income + eps)
    feats['all_Income_per_Dependent'] = all_income / household_size

    # Additional interactions
    feats['Dependents_Credit_History_Interaction'] = dependents * credit
    feats['Income_LoanAmount_Interaction'] = total * loan
    feats['all_Income_LoanAmount_Interaction'] = all_income * loan
    feats['Credit_History_LoanAmount'] = credit * loan
    feats['Credit_History_Income_Interaction'] = credit * total
    feats['Credit_History_all_Income_Interaction'] = credit * all_income
    feats['loan_to_income_ratio'] = loan / (total + eps)
    feats['income_per_year_emp'] = applicant / (term + eps)
    feats['coapplicant_income_per_year_emp'] = coapplicant / (term + eps)
    feats['all_income_per_year_emp'] = all_income / (term + eps)
    feats['Coapplicant_Income_Ratio'] = coapplicant / (total + eps)
    feats['Applicant_Income_Ratio'] = applicant / (total + eps)
    feats['TotalIncome_LoanAmount_Ratio'] = total / (loan + eps)
    feats['all_Income_LoanAmount_Ratio'] = all_income / (loan + eps)
    feats['CreditHistory_LoanTerm_Interaction'] = credit * term
    feats['Education_LoanAmount_Interaction'] = df['Education'] * loan
    feats['SelfEmployed_Income_Interaction'] = df['Self_Employed'] * total

    # Squared and log-transformed features
    feats['ApplicantIncome_Squared'] = applicant ** 2
    feats['all_income_Squared'] = all_income ** 2
    feats['LoanAmount_Squared'] = loan ** 2
    feats['Log_LoanAmount_per_Term'] = np.log1p(loan_per_term)
    feats['Log_ApplicantIncome'] = np.log1p(applicant)
    feats['Log_CoapplicantIncome'] = np.log1p(coapplicant)
    feats['Log_LoanAmount'] = np.log1p(loan)
    feats['Log_Total_Income'] = np.log1p(total)

    # Dependents and loan-based interactions
    feats['Dependents_TotalIncome_Interaction'] = dependents * total
    feats['Dependents_LoanAmount_Interaction'] = dependents * loan
    feats['LoanAmount_to_Term_Ratio'] = loan_per_term

    # Feature interactions with Gender and Married
    feats['Gender_ApplicantIncome'] = df['Gender'] * applicant
    feats['Married_CoapplicantIncome'] = df['Married'] * coapplicant
    feats['Married_Total_Income'] = df['Married'] * total
    feats['Gender_Married_Interaction'] = df['Gender'] * df['Married']

    # assign() returns a new frame and appends the columns in insertion order.
    return df.assign(**feats)


def preprocess_data(train, test):
    """
    Clean, impute, one-hot encode and feature-engineer the train and test frames.

    The inputs are left untouched: all work happens on new frames, so the cell
    calling this function can be re-run without reloading the CSVs.

    Parameters:
        train (DataFrame): Training dataset. Must contain 'ID', 'Loan_ID',
            'Dependents', 'Property_Area' and the numeric/categorical columns
            listed below. The multiplicative interactions assume 'Gender',
            'Married', 'Education' and 'Self_Employed' are already numeric.
        test (DataFrame): Testing dataset with the same feature columns.

    Returns:
        train (DataFrame): Preprocessed training dataset.
        test (DataFrame): Preprocessed testing dataset, with the same
            'Property_Area_*' dummy columns as the training output.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    # Step 1: Drop the unique identifiers. drop() returns a new frame, so the
    # caller's DataFrames are never modified.
    id_cols = ['ID', 'Loan_ID']
    train = train.drop(columns=id_cols)
    test = test.drop(columns=id_cols)

    # Step 2: Replace '3+' with 3 in 'Dependents' and convert to float
    train['Dependents'] = train['Dependents'].replace('3+', 3).astype(float)
    test['Dependents'] = test['Dependents'].replace('3+', 3).astype(float)

    # Step 3: Impute missing values. Statistics are learned on the training
    # frame only and reused for the test frame, so no test-set information
    # leaks into preprocessing. Plain assignment is used instead of chained
    # `inplace=True`, which does nothing under pandas Copy-on-Write.
    numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income']
    for col in numeric_cols:
        fill_value = train[col].median()
        train[col] = train[col].fillna(fill_value)
        test[col] = test[col].fillna(fill_value)

    categorical_cols = ['Credit_History', 'Self_Employed', 'Education', 'Gender', 'Married']
    for col in categorical_cols:
        fill_value = train[col].mode()[0]
        train[col] = train[col].fillna(fill_value)
        test[col] = test[col].fillna(fill_value)

    # Any remaining NaN (e.g. in 'Dependents') becomes 0.
    train = train.fillna(0)
    test = test.fillna(0)

    # Step 4: One-hot encode 'Property_Area'. The dummy columns are decided by
    # the training frame; the test frame is encoded without drop_first and then
    # reindexed to those columns. Encoding the two frames independently could
    # drop a different reference level when test lacks a category.
    encoded_train = pd.get_dummies(train, columns=['Property_Area'], drop_first=True)
    area_cols = [col for col in encoded_train.columns if col not in train.columns]
    train = encoded_train

    test_area = pd.get_dummies(test['Property_Area'], prefix='Property_Area')
    test_area = test_area.reindex(columns=area_cols, fill_value=0)
    test_area = test_area.astype(train[area_cols].dtypes.to_dict())
    test = pd.concat([test.drop(columns='Property_Area'), test_area], axis=1)

    # Step 5: Add the engineered features to both frames
    return _add_engineered_features(train), _add_engineered_features(test)

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# Helper function for oversampling
def resample_minority_class(X, y):
    """Balance the classes by random oversampling of the rarest label.

    Rows whose label equals the least frequent value of ``y`` are drawn with
    replacement (seed 42) until there are as many as all remaining rows.
    Returns the features and labels of the other rows followed by the resampled
    rows; the original index labels are kept, so they may repeat.
    """
    target = y.name
    combined = pd.concat([X, y], axis=1)

    # Least frequent label value
    rarest_label = combined[target].value_counts().idxmin()
    is_minority = combined[target] == rarest_label
    minority_rows = combined[is_minority]
    other_rows = combined[~is_minority]

    # Draw with replacement until the minority matches the rest in size
    boosted_minority = resample(
        minority_rows,
        replace=True,
        n_samples=len(other_rows),
        random_state=42,
    )

    balanced = pd.concat([other_rows, boosted_minority])
    features = balanced.drop(columns=[target])
    labels = balanced[target]
    return features, labels

# Helper function for undersampling
def resample_majority_class(X, y):
    """Balance the classes by random undersampling of the non-minority rows.

    Rows whose label differs from the least frequent value of ``y`` are sampled
    without replacement (seed 42) down to the number of minority rows.
    Returns the features and labels of the minority rows followed by the kept
    subset of the other rows.
    """
    target = y.name
    combined = pd.concat([X, y], axis=1)

    # Least frequent label value
    rarest_label = combined[target].value_counts().idxmin()
    is_minority = combined[target] == rarest_label
    minority_rows = combined[is_minority]
    other_rows = combined[~is_minority]

    # Keep only as many of the other rows as there are minority rows
    reduced_rows = resample(
        other_rows,
        replace=False,
        n_samples=len(minority_rows),
        random_state=42,
    )

    balanced = pd.concat([minority_rows, reduced_rows])
    features = balanced.drop(columns=[target])
    labels = balanced[target]
    return features, labels

# Main function for resampling and scaling
def resample_split(train, test, resampling_method='SMOTE', scaling_option=None):
    """
    Split the training frame into train/hold-out parts, rebalance the training part, and optionally scale.

    Parameters:
        train (DataFrame): Preprocessed training dataset; must contain 'Loan_Status'.
        test (DataFrame): Preprocessed testing dataset (no labels).
        resampling_method (str): Resampling technique to handle class imbalance.
                                 Options: 'SMOTE', 'ADASYN', 'Borderline-SMOTE', 'SMOTEENN', 'SMOTETomek', 'oversample', 'undersample'.
        scaling_option (str): Scaling technique for continuous features.
                              Options: 'standard', 'minmax', 'robust', or None (any falsy value skips scaling).

    Returns:
        X_train_res (ndarray): Resampled (and optionally scaled) training features.
        y_train_res (Series): Resampled training labels with a fresh 0..n-1 index,
            so positional indices (e.g. from StratifiedKFold.split) can be used
            directly. Left as returned by the resampler if it is not a Series.
        X_test (ndarray): Hold-out features (40% stratified split of `train`), scaled if requested.
        y_test (Series): Hold-out labels.
        test_scaled (ndarray): Features of `test`, aligned to the training columns and scaled if requested.

    Raises:
        ValueError: If `resampling_method` or `scaling_option` is not one of the
            supported options. Both are checked before any work is done.
    """
    synthetic_methods = ('SMOTE', 'ADASYN', 'Borderline-SMOTE', 'SMOTEENN', 'SMOTETomek')
    manual_methods = ('oversample', 'undersample')
    scaling_options = ('standard', 'minmax', 'robust')

    # Fail fast: reject bad options before splitting and resampling.
    if resampling_method not in synthetic_methods + manual_methods:
        raise ValueError(f"Invalid resampling method: {resampling_method}")
    if scaling_option and scaling_option not in scaling_options:
        raise ValueError(f"Invalid scaling option: {scaling_option}")

    # Step 1: Create X and y
    X = train.drop('Loan_Status', axis=1)
    y = train['Loan_Status']
    test = test.copy()

    # Step 2: Align columns between train and test. Test gets exactly the
    # training columns; columns missing from test are filled with 0.
    X, test = X.align(test, join='left', axis=1, fill_value=0)

    # Step 3: Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1, stratify=y)

    # Step 4: Handle class imbalance (training part only)
    if resampling_method == 'oversample':
        X_train_res, y_train_res = resample_minority_class(X_train, y_train)
    elif resampling_method == 'undersample':
        X_train_res, y_train_res = resample_majority_class(X_train, y_train)
    else:
        resampler_classes = {
            'SMOTE': SMOTE,
            'ADASYN': ADASYN,
            'Borderline-SMOTE': BorderlineSMOTE,
            'SMOTEENN': SMOTEENN,
            'SMOTETomek': SMOTETomek,
        }
        resampler = resampler_classes[resampling_method](random_state=42)
        X_train_res, y_train_res = resampler.fit_resample(X_train, y_train)

    # Normalise containers before anything reads `.columns` or `.values`:
    # a resampler may hand back plain arrays.
    if not isinstance(X_train_res, pd.DataFrame):
        X_train_res = pd.DataFrame(X_train_res, columns=X.columns)
    if not isinstance(X_test, pd.DataFrame):
        X_test = pd.DataFrame(X_test, columns=X.columns)
    if not isinstance(test, pd.DataFrame):
        test = pd.DataFrame(test, columns=X.columns)

    # The manual over/undersamplers keep the original (possibly repeated) index
    # labels. Reset them so row i of the features matches label i.
    X_train_res = X_train_res.reset_index(drop=True)
    if isinstance(y_train_res, pd.Series):
        y_train_res = y_train_res.reset_index(drop=True)

    # Step 5: Scale numeric features (optional, based on model requirements)
    if scaling_option:
        # Binary indicator columns are left unscaled.
        binary_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History']
        binary_cols += [col for col in X_train_res.columns if 'Property_Area_' in col]
        cols_to_scale = [col for col in X_train_res.columns if col not in binary_cols]

        scaler_classes = {
            'standard': StandardScaler,
            'minmax': MinMaxScaler,
            'robust': RobustScaler,
        }
        scaler = scaler_classes[scaling_option]()

        if cols_to_scale:
            # Work on an explicit copy so the assignment never targets a view
            # of the frame passed in by the caller.
            X_test = X_test.copy()

            # Fit on the (resampled) training part only, then apply everywhere.
            X_train_res[cols_to_scale] = scaler.fit_transform(X_train_res[cols_to_scale])
            X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
            test[cols_to_scale] = scaler.transform(test[cols_to_scale])

    # Convert back to arrays for modeling
    X_train_res = X_train_res.values
    X_test = X_test.values
    test_scaled = test.values

    return X_train_res, y_train_res, X_test, y_test, test_scaled


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

def preprocess_data(train, test):
    """All-in-one preprocessing: clean, transform, split, SMOTE-resample and standard-scale.

    NOTE(review): this function has the same name as the two-DataFrame
    `preprocess_data` defined earlier in this notebook. Running this cell
    rebinds the name, and this version returns five values. Any call site that
    unpacks two values (`train_processed, test_processed = ...`) will then fail.
    Confirm which variant is intended and rename or remove the other.

    Parameters:
        train (DataFrame): Raw training data; must contain 'Loan_ID', 'Dependents',
            'Gender', 'Married', 'Loan_Status' and the numeric columns listed below.
        test (DataFrame): Raw test data with the same feature columns.

    Returns:
        X_train_res (ndarray): SMOTE-resampled, standard-scaled training features.
        y_train_res: Resampled training labels, as returned by SMOTE.
        X_test (ndarray): Scaled hold-out features (20% split of `train`).
        y_test (Series): Hold-out labels.
        test (ndarray): Scaled features of the `test` frame, aligned to the training columns.
    """
    # Step 1: Drop the 'Loan_ID' column (drop() returns new frames; inputs are untouched)
    train = train.drop('Loan_ID', axis=1)
    test = test.drop('Loan_ID', axis=1)

    # Step 2: Replace '3+' with 3 in the 'Dependents' column
    train['Dependents'] = train['Dependents'].replace('3+', '3')
    test['Dependents'] = test['Dependents'].replace('3+', '3')

    # Step 3: Convert 'Dependents' to numeric datatype
    train['Dependents'] = pd.to_numeric(train['Dependents'])
    test['Dependents'] = pd.to_numeric(test['Dependents'])

    # Handle missing values before transformations. Assign the result back:
    # `df[col].fillna(..., inplace=True)` operates on a temporary object and
    # has no effect under pandas Copy-on-Write.
    numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income']
    for col in numeric_cols:
        train[col] = train[col].fillna(train[col].median())
        test[col] = test[col].fillna(test[col].median())

    # Step 4: Apply log transformation to 'ApplicantIncome' to create additional features
    train['Log_ApplicantIncome'] = np.log1p(train['ApplicantIncome'])
    test['Log_ApplicantIncome'] = np.log1p(test['ApplicantIncome'])

    # Step 5: Apply log transformation to 'CoapplicantIncome' to create additional features
    train['Log_CoapplicantIncome'] = np.log1p(train['CoapplicantIncome'])
    test['Log_CoapplicantIncome'] = np.log1p(test['CoapplicantIncome'])

    # Step 6: Apply log transformation to 'LoanAmount' to create additional features
    train['Log_LoanAmount'] = np.log1p(train['LoanAmount'])
    test['Log_LoanAmount'] = np.log1p(test['LoanAmount'])

    # Step 7: Apply Box-Cox transformation to 'Loan_Amount_Term' to create additional features
    # Box-Cox requires strictly positive values; zeros are replaced by the
    # column median (negative values are not handled here).
    train['Loan_Amount_Term'] = train['Loan_Amount_Term'].replace(0, train['Loan_Amount_Term'].median())
    test['Loan_Amount_Term'] = test['Loan_Amount_Term'].replace(0, test['Loan_Amount_Term'].median())

    # Lambda is estimated on the training column and reused for the test column.
    train['BoxCox_Loan_Amount_Term'], lam = boxcox(train['Loan_Amount_Term'])
    test['BoxCox_Loan_Amount_Term'] = boxcox(test['Loan_Amount_Term'], lmbda=lam)

    # Step 8: Apply log transformation to 'Total_Income' to create additional features
    train['Log_Total_Income'] = np.log1p(train['Total_Income'])
    test['Log_Total_Income'] = np.log1p(test['Total_Income'])

    # Step 9: Drop 'Gender' and 'Married' columns
    train = train.drop(['Gender', 'Married'], axis=1)
    test = test.drop(['Gender', 'Married'], axis=1)

    # Step 10: Create X and y
    X = train.drop('Loan_Status', axis=1)
    y = train['Loan_Status']

    # Encode categorical variables (if any remain after dropping)
    X = pd.get_dummies(X, drop_first=True)
    test = pd.get_dummies(test, drop_first=True)

    # Align test set columns with training set (missing columns are filled with 0)
    X, test = X.align(test, join='left', axis=1, fill_value=0)

    # Step 11: Split, then oversample only the training part with SMOTE
    sm = SMOTE(random_state=25)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # Step 12: Scale the features using StandardScaler (fit on the resampled
    # training part, then applied to the hold-out and test features)
    scaler = StandardScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_test = scaler.transform(X_test)
    test = scaler.transform(test)

    return X_train_res, y_train_res, X_test, y_test, test


In [None]:
# Load the raw train and test CSV files (paths are relative to the notebook's working directory)
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')

# Preprocess the data
# NOTE(review): this call unpacks two values, i.e. it expects the two-DataFrame
# `preprocess_data` variant. The cell just above redefines `preprocess_data`
# with a version that returns five values; if that cell has been run, this
# line fails while unpacking. Confirm which definition is meant to be active.
train_processed, test_processed = preprocess_data(train, test)

# Resample, split, and scale the data.
# X_test / y_test are the hold-out part of the training data; test_scaled holds
# the features of the unlabeled test file.
X_train_res, y_train_res, X_test, y_test, test_scaled = resample_split(
    train_processed,
    test_processed,
    resampling_method='SMOTE',
    scaling_option='standard'
)

# Alternative path (disabled): use the five-output `preprocess_data` variant in
# place of the two steps above.
# X_train_res, y_train_res, X_test, y_test, test_scaled = preprocess_data(train, test)


In [None]:
def grid_search_lightgbm(X_train, y_train):
    """Grid-search a binary LightGBM classifier and return the best estimator.

    The grid has 3 values for each of 5 parameters (243 combinations), each
    scored by ROC-AUC over 5 shuffled stratified folds. The best parameters
    and score are printed before the fitted best estimator is returned.
    """
    search = GridSearchCV(
        estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', random_state=42),
        param_grid=dict(
            n_estimators=[100, 200, 300],
            learning_rate=[0.01, 0.05, 0.1],
            max_depth=[3, 5, 7],
            num_leaves=[31, 50, 70],
            min_data_in_leaf=[20, 50, 100],
        ),
        scoring='roc_auc',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,  # use every available core
        verbose=1,
    )
    search.fit(X_train, y_train)

    print("Best LightGBM Parameters:", search.best_params_)
    print("Best LightGBM ROC-AUC:", search.best_score_)

    return search.best_estimator_

def grid_search_xgboost(X_train, y_train):
    """Grid-search a binary-logistic XGBoost classifier and return the best estimator.

    The grid has 3 values for each of 5 parameters (243 combinations), each
    scored by ROC-AUC over 5 shuffled stratified folds. The best parameters
    and score are printed before the fitted best estimator is returned.
    """
    base_model = xgb.XGBClassifier(
        objective='binary:logistic',
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )
    search = GridSearchCV(
        estimator=base_model,
        param_grid=dict(
            n_estimators=[100, 200, 300],
            learning_rate=[0.01, 0.05, 0.1],
            max_depth=[3, 5, 7],
            subsample=[0.7, 0.8, 0.9],
            colsample_bytree=[0.7, 0.8, 0.9],
        ),
        scoring='roc_auc',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,  # use every available core
        verbose=1,
    )
    search.fit(X_train, y_train)

    print("Best XGBoost Parameters:", search.best_params_)
    print("Best XGBoost ROC-AUC:", search.best_score_)

    return search.best_estimator_

def grid_search_catboost(X_train, y_train):
    """Grid-search a CPU CatBoost classifier and return the best estimator.

    The grid has 2 x 3 x 3 x 3 = 54 combinations, each scored by ROC-AUC over
    5 shuffled stratified folds. The best parameters and score are printed
    before the fitted best estimator is returned.
    """
    base_model = CatBoostClassifier(
        verbose=0,
        random_state=42,
        eval_metric='AUC',
        task_type='CPU',
    )
    search = GridSearchCV(
        estimator=base_model,
        param_grid=dict(
            iterations=[500, 1000],
            learning_rate=[0.01, 0.05, 0.1],
            depth=[4, 6, 8],
            l2_leaf_reg=[1, 3, 5],
        ),
        scoring='roc_auc',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,  # use every available core
        verbose=1,
    )
    search.fit(X_train, y_train)

    print("Best CatBoost Parameters:", search.best_params_)
    print("Best CatBoost ROC-AUC:", search.best_score_)

    return search.best_estimator_



In [None]:

# LightGBM Grid Search on the resampled training data
best_lgb_model = grid_search_lightgbm(X_train_res, y_train_res)

# XGBoost and CatBoost searches are disabled; uncomment to run them as well.
# # XGBoost Grid Search
# best_xgb_model = grid_search_xgboost(X_train_res, y_train_res)

# # CatBoost Grid Search
# best_cat_model = grid_search_catboost(X_train_res, y_train_res)

# Evaluate on the hold-out split (Example using LightGBM)
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is the score for the second class; ROC-AUC is
# computed from these probabilities rather than from hard predictions.
y_pred = best_lgb_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
print("LightGBM Test ROC-AUC:", roc_auc)


In [None]:

# Predict hard class labels for the hold-out features with the grid-searched LightGBM model
y_test_pred = best_lgb_model.predict(X_test)

# Function to print metrics
def print_metrics(y_true, y_pred, dataset_name):
    """Print accuracy, precision, recall, F1, a classification report and the confusion matrix.

    Precision, recall and F1 treat label 1 as the positive class. Adjust
    ``pos_label`` if the dataset encodes the positive class differently.
    """
    print(f"Metrics for {dataset_name}:")

    # (label, scoring function, extra keyword arguments), in print order
    headline_scores = (
        ("Accuracy:", accuracy_score, {}),
        ("Precision:", precision_score, {"pos_label": 1}),
        ("Recall:", recall_score, {"pos_label": 1}),
        ("F1 Score:", f1_score, {"pos_label": 1}),
    )
    for label, scorer, extra in headline_scores:
        print(label, scorer(y_true, y_pred, **extra))

    report = classification_report(y_true, y_pred)
    print("\nClassification Report:\n", report)
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)
    separator = "\n" + "="*50 + "\n"
    print(separator)

# Print metrics for the hold-out split (y_test) against the predictions made above
print_metrics(y_test, y_test_pred, "Test Set")

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import time

def objective_lightgbm(trial):
    """Optuna objective: mean 5-fold ROC-AUC of a LightGBM classifier on the resampled training data.

    Reads the notebook-level `X_train_res` / `y_train_res`. Each fold trains
    with early stopping (patience 50) on its own validation part, and the
    fold's ROC-AUC is computed on that same part.

    Parameters:
        trial (optuna.trial.Trial): Trial used to sample the hyperparameters.

    Returns:
        float: Average validation ROC-AUC across the folds (to be maximised).
    """
    # Suggest hyperparameters. suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform; the sampled ranges are unchanged.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }

    # StratifiedKFold yields positional indices. Plain arrays make the indexing
    # below positional even when the labels are a pandas Series whose index is
    # not 0..n-1.
    features = np.asarray(X_train_res)
    labels = np.asarray(y_train_res)

    # Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in skf.split(features, labels):
        X_train_fold, X_val_fold = features[train_idx], features[val_idx]
        y_train_fold, y_val_fold = labels[train_idx], labels[val_idx]

        model = LGBMClassifier(**params, random_state=42)
        # Early stopping and log silencing go through callbacks: the
        # `early_stopping_rounds` / `verbose` arguments of fit() no longer
        # exist in LightGBM 4.
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )
        preds = model.predict_proba(X_val_fold)[:, 1]
        cv_scores.append(roc_auc_score(y_val_fold, preds))

    # Return the average ROC-AUC across folds
    return np.mean(cv_scores)

# Create Optuna study; the objective returns a ROC-AUC, so it is maximised.
# NOTE(review): no sampler seed is passed here, so the sequence of trials
# presumably differs between runs. Confirm whether reproducibility is needed.
study = optuna.create_study(direction='maximize')

# Use tqdm for progress tracking
n_trials = 50
start_time = time.time()
with tqdm(total=n_trials, desc="Optimizing LightGBM", unit="trial") as pbar:
    # Passed to study.optimize via `callbacks`; advances the bar by one per call.
    def callback(study, trial):
        pbar.update(1)
    
    study.optimize(objective_lightgbm, n_trials=n_trials, callbacks=[callback])

# Calculate total time (wall-clock seconds for all trials)
elapsed_time = time.time() - start_time
print(f"Total Optimization Time (LightGBM): {elapsed_time:.2f} seconds")

# Best parameters and score
print("Best Parameters (LightGBM):", study.best_params)
print("Best AUC (LightGBM):", study.best_value)


In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import time
import numpy as np

def objective_lightgbm(trial):
    """Optuna objective: mean 5-fold ROC-AUC of a LightGBM classifier on the resampled training data.

    Reads the notebook-level `X_train_res` / `y_train_res`. Each fold trains
    with early stopping (patience 50, AUC on the fold's validation part), and
    the fold's ROC-AUC is computed on that same part.

    Parameters:
        trial (optuna.trial.Trial): Trial used to sample the hyperparameters.

    Returns:
        float: Average validation ROC-AUC across the folds (to be maximised).
    """
    # Suggest hyperparameters. suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform; the sampled ranges are unchanged.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }

    # StratifiedKFold yields positional indices. Plain arrays make the indexing
    # below positional even when the labels are a pandas Series whose index is
    # not 0..n-1.
    features = np.asarray(X_train_res)
    labels = np.asarray(y_train_res)

    # Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in skf.split(features, labels):
        X_train_fold, X_val_fold = features[train_idx], features[val_idx]
        y_train_fold, y_val_fold = labels[train_idx], labels[val_idx]

        # Instantiate the model with suggested parameters
        model = LGBMClassifier(**params)

        # Fit with early stopping. Early stopping and log silencing go through
        # callbacks: the `early_stopping_rounds` / `verbose` arguments of fit()
        # no longer exist in LightGBM 4.
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_names=['valid'],
            eval_metric='auc',
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0),  # keep per-iteration logging off
            ],
        )

        # Predict probabilities for validation set
        preds = model.predict_proba(X_val_fold)[:, 1]
        cv_scores.append(roc_auc_score(y_val_fold, preds))

    # Return the average ROC-AUC across folds
    return np.mean(cv_scores)

# Create Optuna study; the objective returns a ROC-AUC, so it is maximised.
# NOTE(review): no sampler seed is passed here, so the sequence of trials
# presumably differs between runs. Confirm whether reproducibility is needed.
study = optuna.create_study(direction='maximize')

# Use tqdm for progress tracking
n_trials = 50
start_time = time.time()
with tqdm(total=n_trials, desc="Optimizing LightGBM", unit="trial") as pbar:
    # Passed to study.optimize via `callbacks`; advances the bar by one per call.
    def callback(study, trial):
        pbar.update(1)
    
    study.optimize(objective_lightgbm, n_trials=n_trials, callbacks=[callback])

# Calculate total time (wall-clock seconds for all trials)
elapsed_time = time.time() - start_time
print(f"Total Optimization Time (LightGBM): {elapsed_time:.2f} seconds")

# Best parameters and score
print("Best Parameters (LightGBM):", study.best_params)
print("Best AUC (LightGBM):", study.best_value)

# Train final model with best parameters on the full resampled training set.
# NOTE(review): this fit passes no eval_set and uses no early stopping, unlike
# the per-fold fits inside the objective. The fixed 'objective' / 'metric' /
# 'boosting_type' entries of the objective's params dict are presumably not
# part of study.best_params. Verify the final model is configured as intended.
best_params = study.best_params
final_model_lgb = LGBMClassifier(**best_params, random_state=42)
final_model_lgb.fit(X_train_res, y_train_res)


In [None]:
# Predict hard class labels for the hold-out features with the Optuna-tuned LightGBM model
y_test_pred = final_model_lgb.predict(X_test)

# Function to print metrics
def print_metrics(y_true, y_pred, dataset_name):
    """Print accuracy, precision, recall, F1, a classification report and the confusion matrix.

    Precision, recall and F1 treat label 1 as the positive class. Adjust
    ``pos_label`` if the dataset encodes the positive class differently.
    """
    print(f"Metrics for {dataset_name}:")

    # (label, scoring function, extra keyword arguments), in print order
    headline_scores = (
        ("Accuracy:", accuracy_score, {}),
        ("Precision:", precision_score, {"pos_label": 1}),
        ("Recall:", recall_score, {"pos_label": 1}),
        ("F1 Score:", f1_score, {"pos_label": 1}),
    )
    for label, scorer, extra in headline_scores:
        print(label, scorer(y_true, y_pred, **extra))

    report = classification_report(y_true, y_pred)
    print("\nClassification Report:\n", report)
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)
    separator = "\n" + "="*50 + "\n"
    print(separator)

# Print metrics for the hold-out split (y_test) against the tuned model's predictions
print_metrics(y_test, y_test_pred, "Test Set")

In [None]:
# Show the installed LightGBM version (useful when checking which fit() arguments are supported)
import lightgbm as lgb
print("LightGBM Version:", lgb.__version__)
