***LightGBM with Lasso feature selection + Lasso predictions as a feature***

In [4]:
import numpy as np 
import pandas as pd
import re
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from scipy import special
from tqdm import tqdm
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import SimpleImputer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report, accuracy_score, roc_curve
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline



# Suppress pandas' PerformanceWarning so it does not clutter the output.
warnings.filterwarnings(action="ignore", category=pd.errors.PerformanceWarning)

# Widen the pandas display limits and print floats with thousands
# separators and two decimals.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.width = 1000
pd.options.display.float_format = '{:20,.2f}'.format

In [2]:
# Read the four CSV files in the order train-X, train-y, test-X, test-y.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'X_train_corr.csv',
        'y_train_large.csv',
        'X_test_corr.csv',
        'y_test_large.csv',
    )
)
# Collapse the target frames with squeeze() (a single-column frame becomes a Series).
y_train, y_test = y_train.squeeze(), y_test.squeeze()

In [None]:
# Preview the first rows of the training features (shown as cell output in a notebook).
X_train.head()

In [None]:
def evaluate_lightgbm(X_train, y_train, X_test, y_test):
    """
    Fit a LightGBM classifier on the training data and score it on the test data.

    Parameters:
    - X_train: Training feature set.
    - y_train: Training target variable.
    - X_test: Testing feature set.
    - y_test: Testing target variable.

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    """
    # Fixed seed for reproducibility; n_jobs=-1 asks LightGBM to use all cores.
    lgbm_params = {
        "random_state": 69,
        "n_jobs": -1,
        "force_col_wise": True,
    }
    model = lgb.LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train)

    # ROC-AUC is computed from positive-class probabilities (second column),
    # the classification report from hard class labels.
    positive_scores = model.predict_proba(X_test)[:, 1]
    predicted_labels = model.predict(X_test)

    roc_auc = roc_auc_score(y_test, positive_scores)
    report = classification_report(y_test, predicted_labels, digits=4)

    print(f"ROC-AUC: {roc_auc:.8f}")
    print("Classification Report:\n", report)

    return model, roc_auc, report

In [9]:
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
import pandas as pd

def evaluate_l1(X_train, y_train, X_test, y_test, C_values=None):
    """
    Select features with an L1-penalised logistic regression (Lasso), without
    cross-validation, and evaluate a LightGBM model on the selected features.

    Parameters:
    - X_train (pd.DataFrame): Training feature set; its columns name the features.
    - y_train: Training target variable.
    - X_test (pd.DataFrame): Testing feature set with the same columns.
    - y_test: Testing target variable.
    - C_values (iterable of float, optional): Candidate inverse regularization
      strengths. Defaults to [0.1].

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    - selected_features (list): List of features selected by Lasso.

    Raises:
    - ValueError: If C_values is empty, or if Lasso sets every coefficient to
      zero so that no feature is left for LightGBM.
    """
    # Step 1: Candidate C values (inverse of regularization strength).
    # A wider grid such as [0.01, 0.1, 1, 10, 100] can be passed by the caller.
    candidate_Cs = [0.1] if C_values is None else list(C_values)
    if not candidate_Cs:
        raise ValueError("C_values must contain at least one candidate value.")

    best_auc = -1
    best_C = None

    # Step 2: Manual hyperparameter search.
    # NOTE(review): candidates are ranked by ROC-AUC on the data they were
    # fitted on, which tends to favour the weakest regularization; consider
    # ranking on held-out data instead.
    for C in candidate_Cs:
        lasso = LogisticRegression(
            penalty='l1',
            C=C,
            solver='saga',  # saga supports the L1 penalty
            # NOTE(review): far fewer iterations than the final fit below
            # (1000), so the search may compare unconverged models.
            max_iter=10,
            random_state=42,
            n_jobs=-1
        )
        lasso.fit(X_train, y_train)

        # ROC-AUC of the positive-class probabilities on the training set
        y_train_pred_proba = lasso.predict_proba(X_train)[:, 1]
        auc = roc_auc_score(y_train, y_train_pred_proba)
        print(f"Lasso with C={C}: Training ROC-AUC = {auc:.4f}")

        # Strict comparison: on ties the earliest candidate wins
        if auc > best_auc:
            best_auc = auc
            best_C = C

    print(f"Best C value for Lasso: {best_C} with Training ROC-AUC: {best_auc:.4f}")

    # Step 3: Refit on the entire training set with the best C and a larger
    # iteration budget.
    final_lasso = LogisticRegression(
        penalty='l1',
        C=best_C,
        solver='saga',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )
    final_lasso.fit(X_train, y_train)

    # Steps 4-6: Pair each coefficient with its feature name and keep the
    # features whose coefficient is non-zero.
    feature_coefficients = pd.Series(final_lasso.coef_[0], index=X_train.columns)
    selected_features = feature_coefficients[feature_coefficients != 0].index.tolist()

    print(f"Number of features selected by Lasso: {len(selected_features)}")

    # Fail early with a clear message instead of passing an empty frame on.
    if not selected_features:
        raise ValueError(
            f"Lasso with C={best_C} set every coefficient to zero; "
            "increase C to keep at least one feature."
        )

    # Step 7: Reduce the training and testing sets to the selected features
    X_train_lasso = X_train[selected_features]
    X_test_lasso = X_test[selected_features]

    # Step 8: Evaluate the LightGBM model using the selected features
    model, roc_auc, report = evaluate_lightgbm(X_train_lasso, y_train, X_test_lasso, y_test)

    return model, roc_auc, report, selected_features


In [None]:
# Run Lasso feature selection followed by the LightGBM evaluation.
model_l1, roc_auc_l1, report_l1, selected_features_l1 = evaluate_l1(X_train, y_train, X_test, y_test)
# NOTE: evaluate_lightgbm (called inside evaluate_l1) already prints this report,
# so it appears twice in the output.
print(report_l1)

In [None]:
def evaluate_elasticnet(X_train, y_train, X_test, y_test, C_values=None, l1_ratios=None):
    """
    Performs feature selection using Elastic Net Regularization without cross-validation
    and evaluates a LightGBM model.

    Parameters:
    - X_train (pd.DataFrame): Training feature set (already scaled and encoded).
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set (already scaled and encoded).
    - y_test (pd.Series): Testing target variable.
    - C_values (iterable of float, optional): Candidate inverse regularization
      strengths. Defaults to [0.01, 0.1, 1, 10, 100].
    - l1_ratios (iterable of float, optional): Candidate L1/L2 mixing ratios.
      Defaults to [0.1, 0.5, 0.7, 0.9, 1.0].

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    - selected_features (list): List of features selected by Elastic Net.

    Raises:
    - ValueError: If either candidate grid is empty, or if the chosen model
      sets every coefficient to zero so that no feature is left for LightGBM.
    """
    # Step 1: Hyperparameter grid (l1_ratio=1.0 corresponds to pure L1 / Lasso)
    candidate_Cs = [0.01, 0.1, 1, 10, 100] if C_values is None else list(C_values)
    candidate_ratios = [0.1, 0.5, 0.7, 0.9, 1.0] if l1_ratios is None else list(l1_ratios)
    if not candidate_Cs or not candidate_ratios:
        raise ValueError("C_values and l1_ratios must each contain at least one candidate value.")

    best_auc = -1
    best_C = None
    best_l1_ratio = None
    final_enet = None

    # Step 2: Manual hyperparameter search.
    # NOTE(review): candidates are ranked by ROC-AUC on the data they were
    # fitted on, which tends to favour the weakest regularization; consider
    # ranking on held-out data instead.
    for C in candidate_Cs:
        for l1_ratio in candidate_ratios:
            enet = LogisticRegression(
                penalty='elasticnet',
                C=C,
                l1_ratio=l1_ratio,
                solver='saga',  # Supports Elastic Net penalty
                max_iter=1000,
                random_state=42,
                n_jobs=-1
            )
            enet.fit(X_train, y_train)

            # ROC-AUC of the positive-class probabilities on the training set
            y_train_pred_proba = enet.predict_proba(X_train)[:, 1]
            auc = roc_auc_score(y_train, y_train_pred_proba)
            print(f"Elastic Net with C={C}, l1_ratio={l1_ratio}: Training ROC-AUC = {auc:.4f}")

            # Strict comparison: on ties the earliest candidate wins
            if auc > best_auc:
                best_auc = auc
                best_C = C
                best_l1_ratio = l1_ratio
                # Step 3: Keep the fitted winner. Refitting afterwards would
                # use the same data, hyperparameters and seed, so it only
                # repeats work.
                final_enet = enet

    print(f"Best parameters for Elastic Net: C={best_C}, l1_ratio={best_l1_ratio} with Training ROC-AUC: {best_auc:.4f}")

    # Steps 4-6: Pair each coefficient with its feature name and keep the
    # features whose coefficient is non-zero.
    feature_coefficients = pd.Series(final_enet.coef_[0], index=X_train.columns)
    selected_features = feature_coefficients[feature_coefficients != 0].index.tolist()

    print(f"Number of features selected by Elastic Net: {len(selected_features)}")

    # Fail early with a clear message instead of passing an empty frame on.
    if not selected_features:
        raise ValueError(
            f"Elastic Net with C={best_C}, l1_ratio={best_l1_ratio} set every "
            "coefficient to zero; increase C or lower l1_ratio to keep at least one feature."
        )

    # Step 7: Reduce the training and testing sets to the selected features
    X_train_enet = X_train[selected_features]
    X_test_enet = X_test[selected_features]

    # Step 8: Evaluate the LightGBM model using the selected features
    model, roc_auc, report = evaluate_lightgbm(X_train_enet, y_train, X_test_enet, y_test)

    return model, roc_auc, report, selected_features

In [None]:
# Run Elastic Net feature selection followed by the LightGBM evaluation.
print("\n=== Evaluating Elastic Net Regularization ===")
model_enet, roc_auc_enet, report_enet, selected_features_enet = evaluate_elasticnet(X_train, y_train, X_test, y_test)

# NOTE: evaluate_lightgbm (called inside evaluate_elasticnet) already prints this
# report, so it appears twice in the output.
print("\n=== Elastic Net Classification Report ===")
print(report_enet)

In [None]:
from sklearn.linear_model import LogisticRegression
import pandas as pd

def generate_lasso_predictions(X_train, y_train, X_test, best_C, cv=5):
    """
    Trains a Lasso model and generates predictions for training and testing sets.

    The training-set predictions are meant to be used as an input feature of a
    second model. By default they are therefore produced out-of-fold: each
    training row is scored by a model that was not fitted on that row. Scoring
    the training rows with a model fitted on those same rows would leak the
    target into the feature.

    Parameters:
    - X_train (pd.DataFrame): Training feature set (already scaled and encoded).
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set (already scaled and encoded).
    - best_C (float): The best C value identified from previous feature selection.
    - cv (int, CV splitter or None): Passed to sklearn's cross_val_predict to
      build the out-of-fold training predictions. Pass None to get the
      in-sample training predictions instead.

    Returns:
    - y_train_pred_proba (np.array): Predicted probabilities for the training set.
    - y_test_pred_proba (np.array): Predicted probabilities for the testing set.
    """
    # Local import keeps this block self-contained.
    from sklearn.model_selection import cross_val_predict

    # Initialize the Lasso model with the best C
    lasso = LogisticRegression(
        penalty='l1',
        C=best_C,
        solver='saga',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )

    if cv is None:
        oof_proba = None
    else:
        # cross_val_predict fits clones, so `lasso` itself is still unfitted here.
        oof_proba = cross_val_predict(
            lasso, X_train, y_train, cv=cv, method='predict_proba'
        )[:, 1]

    # The model fitted on the entire training set scores the test set.
    lasso.fit(X_train, y_train)
    y_test_pred_proba = lasso.predict_proba(X_test)[:, 1]

    if oof_proba is None:
        y_train_pred_proba = lasso.predict_proba(X_train)[:, 1]
    else:
        y_train_pred_proba = oof_proba

    return y_train_pred_proba, y_test_pred_proba

def add_lasso_predictions(X_train, X_test, y_train_pred_proba, y_test_pred_proba):
    """
    Return copies of both feature sets extended with a 'lasso_pred_proba' column.

    The input frames are left unmodified.

    Parameters:
    - X_train (pd.DataFrame): Original training feature set.
    - X_test (pd.DataFrame): Original testing feature set.
    - y_train_pred_proba (np.array): Lasso predictions for the training set.
    - y_test_pred_proba (np.array): Lasso predictions for the testing set.

    Returns:
    - X_train_new (pd.DataFrame): Training set with Lasso predictions added.
    - X_test_new (pd.DataFrame): Testing set with Lasso predictions added.
    """
    # DataFrame.assign returns a new frame, so no explicit copy is needed.
    return (
        X_train.assign(lasso_pred_proba=y_train_pred_proba),
        X_test.assign(lasso_pred_proba=y_test_pred_proba),
    )

def evaluate_lightgbm_with_lasso(X_train, y_train, X_test, y_test):
    """
    Fit a LightGBM classifier on features that include the Lasso prediction
    column and score it on the test set.

    Parameters:
    - X_train (pd.DataFrame): Training feature set with Lasso predictions.
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set with Lasso predictions.
    - y_test (pd.Series): Testing target variable.

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    """
    # Fixed seed for reproducibility; n_jobs=-1 asks LightGBM to use all cores.
    model = lgb.LGBMClassifier(random_state=69, n_jobs=-1, force_col_wise=True)
    model.fit(X_train, y_train)

    # ROC-AUC uses the positive-class probability column; the report uses
    # hard class labels.
    test_proba = model.predict_proba(X_test)
    test_labels = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, test_proba[:, 1])
    report = classification_report(y_test, test_labels, digits=4)

    print(f"ROC-AUC: {roc_auc:.4f}")
    print("Classification Report:\n", report)

    return model, roc_auc, report

def evaluate_l1_with_predictions(X_train, y_train, X_test, y_test, best_C):
    """
    Add Lasso predicted probabilities as an extra feature and evaluate LightGBM on the result.

    No features are removed here: all original columns are kept and one
    prediction column is appended.

    Parameters:
    - X_train (pd.DataFrame): Training feature set (already scaled and encoded).
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set (already scaled and encoded).
    - y_test (pd.Series): Testing target variable.
    - best_C (float): The C value used for the Lasso model.

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    """
    # (train probabilities, test probabilities) from the Lasso model
    train_proba, test_proba = generate_lasso_predictions(X_train, y_train, X_test, best_C)

    # Append them to copies of the feature sets
    augmented_train, augmented_test = add_lasso_predictions(
        X_train, X_test, train_proba, test_proba
    )

    # Fit and score LightGBM on the augmented features
    model, roc_auc, report = evaluate_lightgbm_with_lasso(
        augmented_train, y_train, augmented_test, y_test
    )
    return model, roc_auc, report

# Driver for the "Lasso predictions as a feature" experiment.
# Uses the X_train / y_train / X_test / y_test already defined at module level;
# replace them with your own split if needed:
# X_train, X_test, y_train, y_test = your_data_splitting_function()

# NOTE(review): best_C is hard-coded here rather than taken from the C search
# in evaluate_l1; replace it with the value you actually selected.
best_C = 1  # Replace with your actual best C value

# Evaluate LightGBM with Lasso Predictions as an Additional Feature
print("=== Evaluating LightGBM with Lasso Predictions ===")
model_lgbm_with_lasso, roc_auc_lgbm_lasso, report_lgbm_lasso = evaluate_l1_with_predictions(
    X_train, y_train, X_test, y_test, best_C
)

# NOTE: evaluate_lightgbm_with_lasso already prints this report, so it appears twice.
print("\n=== LightGBM with Lasso Predictions Classification Report ===")
print(report_lgbm_lasso)


In [None]:
def generate_elasticnet_predictions(X_train, y_train, X_test, best_C, best_l1_ratio, cv=5):
    """
    Trains an Elastic Net model and generates predictions for training and testing sets.

    The training-set predictions are meant to be used as an input feature of a
    second model. By default they are therefore produced out-of-fold: each
    training row is scored by a model that was not fitted on that row. Scoring
    the training rows with a model fitted on those same rows would leak the
    target into the feature.

    Parameters:
    - X_train (pd.DataFrame): Training feature set (already scaled and encoded).
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set (already scaled and encoded).
    - best_C (float): The best C value identified from previous feature selection.
    - best_l1_ratio (float): The best l1_ratio value identified from previous feature selection.
    - cv (int, CV splitter or None): Passed to sklearn's cross_val_predict to
      build the out-of-fold training predictions. Pass None to get the
      in-sample training predictions instead.

    Returns:
    - y_train_pred_proba (np.array): Predicted probabilities for the training set.
    - y_test_pred_proba (np.array): Predicted probabilities for the testing set.
    """
    # Local import keeps this block self-contained.
    from sklearn.model_selection import cross_val_predict

    # Initialize the Elastic Net model with the best C and l1_ratio
    enet = LogisticRegression(
        penalty='elasticnet',
        C=best_C,
        l1_ratio=best_l1_ratio,
        solver='saga',
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )

    if cv is None:
        oof_proba = None
    else:
        # cross_val_predict fits clones, so `enet` itself is still unfitted here.
        oof_proba = cross_val_predict(
            enet, X_train, y_train, cv=cv, method='predict_proba'
        )[:, 1]

    # The model fitted on the entire training set scores the test set.
    enet.fit(X_train, y_train)
    y_test_pred_proba = enet.predict_proba(X_test)[:, 1]

    if oof_proba is None:
        y_train_pred_proba = enet.predict_proba(X_train)[:, 1]
    else:
        y_train_pred_proba = oof_proba

    return y_train_pred_proba, y_test_pred_proba


def add_elasticnet_predictions(X_train, X_test, y_train_pred_proba, y_test_pred_proba):
    """
    Return copies of both feature sets extended with an 'elasticnet_pred_proba' column.

    The input frames are left unmodified.

    Parameters:
    - X_train (pd.DataFrame): Original training feature set.
    - X_test (pd.DataFrame): Original testing feature set.
    - y_train_pred_proba (np.array): Elastic Net predictions for the training set.
    - y_test_pred_proba (np.array): Elastic Net predictions for the testing set.

    Returns:
    - X_train_new (pd.DataFrame): Training set with Elastic Net predictions added.
    - X_test_new (pd.DataFrame): Testing set with Elastic Net predictions added.
    """
    feature_name = 'elasticnet_pred_proba'

    # DataFrame.assign returns a new frame, so no explicit copy is needed.
    augmented_train = X_train.assign(**{feature_name: y_train_pred_proba})
    augmented_test = X_test.assign(**{feature_name: y_test_pred_proba})

    return augmented_train, augmented_test


def evaluate_lightgbm_with_elasticnet(X_train, y_train, X_test, y_test):
    """
    Fit a LightGBM classifier on features that include the Elastic Net
    prediction column and score it on the test set.

    Parameters:
    - X_train (pd.DataFrame): Training feature set with Elastic Net predictions.
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set with Elastic Net predictions.
    - y_test (pd.Series): Testing target variable.

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    """
    # Fixed seed for reproducibility; n_jobs=-1 asks LightGBM to use all cores.
    classifier_options = dict(random_state=69, n_jobs=-1, force_col_wise=True)
    model = lgb.LGBMClassifier(**classifier_options)
    model.fit(X_train, y_train)

    # Positive-class probabilities feed the ROC-AUC score
    positive_proba = model.predict_proba(X_test)[:, 1]
    # Hard class labels feed the classification report
    hard_labels = model.predict(X_test)

    roc_auc = roc_auc_score(y_test, positive_proba)
    report = classification_report(y_test, hard_labels, digits=4)

    print(f"ROC-AUC: {roc_auc:.4f}")
    print("Classification Report:\n", report)

    return model, roc_auc, report

def evaluate_elasticnet_with_predictions(X_train, y_train, X_test, y_test, best_C, best_l1_ratio):
    """
    Add Elastic Net predicted probabilities as an extra feature and evaluate LightGBM on the result.

    No features are removed here: all original columns are kept and one
    prediction column is appended.

    Parameters:
    - X_train (pd.DataFrame): Training feature set (already scaled and encoded).
    - y_train (pd.Series): Training target variable.
    - X_test (pd.DataFrame): Testing feature set (already scaled and encoded).
    - y_test (pd.Series): Testing target variable.
    - best_C (float): The C value used for the Elastic Net model.
    - best_l1_ratio (float): The l1_ratio value used for the Elastic Net model.

    Returns:
    - model (lgb.LGBMClassifier): Trained LightGBM model.
    - roc_auc (float): ROC-AUC score on the test set.
    - report (str): Classification report with precision, recall, and F1-score up to 4 decimals.
    """
    # (train probabilities, test probabilities) from the Elastic Net model
    train_proba, test_proba = generate_elasticnet_predictions(
        X_train, y_train, X_test, best_C, best_l1_ratio
    )

    # Append them to copies of the feature sets
    augmented_train, augmented_test = add_elasticnet_predictions(
        X_train, X_test, train_proba, test_proba
    )

    # Fit and score LightGBM on the augmented features
    model, roc_auc, report = evaluate_lightgbm_with_elasticnet(
        augmented_train, y_train, augmented_test, y_test
    )
    return model, roc_auc, report

# Driver for the "Elastic Net predictions as a feature" experiment.
# Uses the X_train / y_train / X_test / y_test already defined at module level;
# replace them with your own split if needed:
# X_train, X_test, y_train, y_test = your_data_splitting_function()

# NOTE(review): both hyperparameters are hard-coded here rather than taken from
# the search in evaluate_elasticnet; replace them with the values you selected.
best_C = 1       # Replace with your actual best C value
best_l1_ratio = 0.5  # Replace with your actual best l1_ratio value

# Evaluate LightGBM with Elastic Net Predictions as an Additional Feature
print("=== Evaluating LightGBM with Elastic Net Predictions ===")
model_lgbm_with_enet, roc_auc_lgbm_enet, report_lgbm_enet = evaluate_elasticnet_with_predictions(
    X_train, y_train, X_test, y_test, best_C, best_l1_ratio
)

# NOTE: evaluate_lightgbm_with_elasticnet already prints this report, so it appears twice.
print("\n=== LightGBM with Elastic Net Predictions Classification Report ===")
print(report_lgbm_enet)
