In [1]:
import numpy as np 
import pandas as pd
import re
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from scipy import special
from tqdm import tqdm
import warnings
from sklearn.metrics import roc_auc_score, f1_score, classification_report, accuracy_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, classification_report
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.callbacks import ProgressBar, ConsecutiveStopping, TensorBoard
from sklearn.feature_selection import RFE

# Silence pandas' PerformanceWarning only; all other warning categories are untouched.
warnings.filterwarnings(action="ignore", category=pd.errors.PerformanceWarning)

# Raise the display limits to 1000 columns / rows / characters of width, and
# render floats in a 20-character field with thousands separators and two
# decimal places.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.width = 1000
pd.options.display.float_format = '{:20,.2f}'.format

In [2]:
# Directory that holds the pre-split CSV files. It is defined once so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = '/Users/sakshamjain/Desktop/Projects/JAIN-WIN'

# Feature matrices.
X_train = pd.read_csv(f'{DATA_DIR}/X_train_corr.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_corr.csv')

# Targets. .squeeze() collapses a single-column frame into a Series.
# NOTE(review): assumes each label file has exactly one column — confirm.
y_train = pd.read_csv(f'{DATA_DIR}/y_train_large.csv').squeeze()
y_test = pd.read_csv(f'{DATA_DIR}/y_test_large.csv').squeeze()

In [3]:
# Swap every space in a column label for an underscore. The same substitution
# is applied to both splits so their column labels stay aligned.
X_train.columns = X_train.columns.str.replace(pat=' ', repl='_')
X_test.columns = X_test.columns.str.replace(pat=' ', repl='_')

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
print("Training LightGBM model...")

# Baseline classifier fitted on every column currently in X_train.
model = lgb.LGBMClassifier(
    random_state=69,
    n_jobs=-1,
    force_col_wise=True,
    verbose=0,
)
model.fit(X_train, y_train)

# Hard labels feed the classification report; column 1 of predict_proba
# supplies the scores used for ROC AUC.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Report both metrics on the test split.
auc_roc = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC:", auc_roc)
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Importance scores of the fitted baseline model, paired with the column names.
feature_importance = model.feature_importances_
feature_names = X_train.columns

# One row per feature, most important first.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values(by='importance', ascending=False).reset_index(drop=True)

# Running share of the total importance, accumulated from the top of the ranking.
importance_df['cumulative_importance'] = importance_df['importance'].cumsum() / importance_df['importance'].sum()

# Keep the smallest leading set of features whose combined share reaches 95%.
# The test is made on the share accumulated *before* each feature, so the
# feature that crosses the threshold is kept as well. Filtering on
# `cumulative_importance <= 0.95` would drop that feature, leaving less than
# 95% covered, and would select nothing at all when the top feature alone
# accounts for more than 95%.
importance_before = importance_df['cumulative_importance'].shift(fill_value=0)
selected_features = importance_df.loc[importance_before < 0.95, 'feature']

# Restrict both splits to the selected columns.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Refit a fresh classifier on the reduced feature set.
model_selected = lgb.LGBMClassifier(random_state=69, n_jobs=-1, force_col_wise=True)
model_selected.fit(X_train_selected, y_train)

# Score the reduced model on the test split.
y_pred_selected = model_selected.predict_proba(X_test_selected)[:, 1]
auc_roc_selected = roc_auc_score(y_test, y_pred_selected)

print(f"AUC-ROC on the selected features: {auc_roc_selected}")

In [7]:
# Rebind X_train / X_test to independent deep copies of the importance-filtered
# frames; the cells that follow work on these reduced frames.
X_train = X_train_selected.copy(deep=True)
X_test = X_test_selected.copy(deep=True)

In [None]:
def find_dataframes(namespace=None):
    """Return the DataFrame-valued entries of a namespace, keyed by name.

    Parameters
    ----------
    namespace : mapping of str to object, optional
        Namespace to scan. When omitted, the globals of the module (or
        notebook session) in which this function is defined are scanned.

    Returns
    -------
    dict
        ``{name: dataframe}`` for every value that is a ``pd.DataFrame``.
    """
    if namespace is None:
        namespace = globals()
    # Snapshot the items first so the scan cannot fail if the namespace is
    # modified while it is being iterated.
    return {name: obj for name, obj in list(namespace.items()) if isinstance(obj, pd.DataFrame)}

def print_memory_usage_of_dataframes(dataframes=None):
    """Print the deep memory footprint of each DataFrame, then the total.

    Parameters
    ----------
    dataframes : mapping of str to pd.DataFrame, optional
        Frames to report on. When omitted, ``find_dataframes()`` supplies
        every DataFrame bound to a global name.

    Notes
    -----
    Sizes are byte counts divided by ``1024 ** 3``. Every name gets its own
    line, but a frame that is bound to several names is added to the total
    only once, so aliases of one object do not inflate the total.
    """
    if dataframes is None:
        dataframes = find_dataframes()
    total_memory = 0
    counted_ids = set()  # id() of every frame already included in the total
    print("Memory usage of dataframes (in GB):")
    for name, df in dataframes.items():
        mem_usage = df.memory_usage(deep=True).sum() / 1024 ** 3  # Convert bytes to gigabytes
        print(f"{name}: {mem_usage:.6f} GB")
        if id(df) not in counted_ids:
            counted_ids.add(id(df))
            total_memory += mem_usage
    print(f"Total memory used by dataframes: {total_memory:.6f} GB")

# Report the footprint of every DataFrame currently bound to a global name.
print_memory_usage_of_dataframes()

In [9]:
# Drop the intermediates that are no longer needed. IPython's output-history
# names (`_`, `_4`) are deliberately not listed: they exist only when cells
# happened to run under those execution counts, so deleting them by name
# raises NameError under any other run order.
del importance_df, X_train_selected, X_test_selected

In [None]:
# Estimator handed to RFE; verbose=1 is set on this instance.
model = lgb.LGBMClassifier(
    random_state=69,
    n_jobs=-1,
    force_col_wise=True,
    verbose=1,
)

# Recursive feature elimination down to 639 columns. `step` is given as a
# fraction (0.05) rather than an absolute count.
rfe = RFE(estimator=model, n_features_to_select=639, step=0.05)
print("Fitting RFE...")
rfe.fit(X_train, y_train)
print("RFE fitting completed.\n")

# Names of the columns RFE kept, taken from its boolean support mask.
selected_features_rfe = list(X_train.columns[rfe.support_])
print(f"Selected {len(selected_features_rfe)} features after RFE: {selected_features_rfe}\n")

# Restrict both splits to the surviving columns.
X_train_rfe, X_test_rfe = X_train[selected_features_rfe], X_test[selected_features_rfe]

# Fit a fresh classifier on the RFE subset.
print("Training LightGBM model on RFE-selected features...")
model_rfe = lgb.LGBMClassifier(random_state=69, n_jobs=-1, force_col_wise=True)
model_rfe.fit(X_train_rfe, y_train)
print("Model trained on RFE-selected features.\n")

# Scores (column 1 of predict_proba) and hard labels for the test split.
y_proba_rfe = model_rfe.predict_proba(X_test_rfe)[:, 1]
y_pred_rfe = model_rfe.predict(X_test_rfe)

# Test-split metrics for the RFE model.
print("Classification Report for RFE-selected Features:")
print(classification_report(y_test, y_pred_rfe))
auc_rfe = roc_auc_score(y_test, y_proba_rfe)
print(f"AUC-ROC for RFE-selected Features: {auc_rfe:.8f}\n")

In [11]:
# Rebind X_train / X_test to independent deep copies of the RFE-filtered frames.
X_train = X_train_rfe.copy(deep=True)
X_test = X_test_rfe.copy(deep=True)

In [12]:
# Unbind the RFE intermediates; X_train / X_test now hold copies of them.
del X_train_rfe
del X_test_rfe

In [16]:
# Fresh, unfitted classifier with LightGBM's own logging turned down (verbose=0).
model = lgb.LGBMClassifier(
    random_state=69,
    n_jobs=-1,
    force_col_wise=True,
    verbose=0,
)

In [18]:
# Genetic-algorithm feature search wrapped around `model`, scored by ROC AUC:
# 200 individuals per generation, 3 generations, at most 320 features.
selector = GAFeatureSelectionCV(
    estimator=model,
    scoring="roc_auc",
    max_features=320,
    population_size=200,
    generations=3,
    n_jobs=-1,
    verbose=True,
)

# Callbacks for the search: a progress bar, TensorBoard logging, and a stopper
# keyed on the "fitness_max" metric.
# NOTE(review): the stopper is configured with generations=5 while the search
# itself runs for 3 generations — confirm the stopper can ever trigger.
progress_bar = ProgressBar()
tensor = TensorBoard()
stopper = ConsecutiveStopping(generations=5, metric="fitness_max")
callbacks = [progress_bar, tensor, stopper]

In [None]:
# Run the genetic feature search on the training split with the configured callbacks.
selector.fit(
    X_train,
    y_train,
    callbacks=callbacks,
)

In [None]:
# Boolean support mask produced by the fitted selector, and the column labels it keeps.
selected_features_mask = selector.support_
selected_features = X_train.columns[selected_features_mask]
num_features_selected = selected_features.size

# Report how many columns survived, followed by their labels.
print(f"Number of features selected: {num_features_selected}")
print("Selected Features:")
print("selected features:", selected_features)

In [None]:
# Project both splits onto the columns chosen by the selector.
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Refit the classifier on the reduced training matrix.
model.fit(X_train_selected, y_train)

# Scores (column 1 of predict_proba) and hard labels for the reduced test matrix.
y_pred_proba = model.predict_proba(X_test_selected)[:, 1]
y_pred = model.predict(X_test_selected)

In [None]:
# ROC AUC is computed from the scores; the report is built from the hard labels.
auc_score = roc_auc_score(y_test, y_pred_proba)
report_text = classification_report(y_test, y_pred)

print("Classification Report:")
print(report_text)
print("AUC-ROC Score:", auc_score)

In [24]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from lightgbm import early_stopping, log_evaluation

# Hyper-parameter sets for the final LightGBM models, keyed by an integer id.
# Nothing is fitted here; the data is prepared in earlier cells.
lparams = {}

lparams[0] = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 200,
    'max_bin': 500,
    'min_child_weight': 0.035,
    'subsample': 0.45,
    # LightGBM performs row subsampling only when the bagging frequency is
    # non-zero. It defaults to 0, in which case `subsample` is ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.3,
    'min_data_in_leaf': 150,
    'max_depth': -1,
    'reg_alpha': 0.4,
    'reg_lambda': 0.7,
    'verbose': 1,
    'random_state': 0,  # Combining seed and bagging_seed for reproducibility
    'n_jobs': -1,
    'n_estimators': 30000
}

# Build the (still unfitted) classifier from parameter set 0.
model_0 = lgb.LGBMClassifier(
    **lparams[0],
)

In [None]:
# Hold out part of the training data to drive early stopping. Monitoring the
# test split instead would pick the stopping round on the same data that the
# final AUC is reported on, which makes that AUC optimistically biased.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train_selected, y_train,
    test_size=0.2,
    stratify=y_train,  # keep the class balance the same in both parts
    random_state=0,
)

# Fit with early stopping on the validation part only.
model_0.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    eval_metric='auc',
    callbacks=[
        early_stopping(stopping_rounds=300),
        # Log every 100th round; with up to 30000 rounds configured, logging
        # every round floods the cell output.
        log_evaluation(period=100)
    ]
)

# The split copies are only needed for fitting; release them.
del X_fit, X_valid, y_fit, y_valid

# The test split is used once, for the final score.
preds_0 = model_0.predict_proba(X_test_selected)[:, 1]
auc_0 = roc_auc_score(y_test, preds_0)
print(f"Model 0 AUC: {auc_0:.4f}")