In [3]:
import numpy as np 
import pandas as pd
import re
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from scipy import special
from tqdm import tqdm
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import SimpleImputer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report, accuracy_score, roc_curve
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE
from boruta import BorutaPy

warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [4]:
X_train = pd.read_csv('/Users/sakshamjain/Desktop/Projects/JAIN-WIN/X_train_corr.csv')
y_train = pd.read_csv('/Users/sakshamjain/Desktop/Projects/JAIN-WIN/y_train_large.csv').squeeze()
X_test = pd.read_csv('/Users/sakshamjain/Desktop/Projects/JAIN-WIN/X_test_corr.csv')
y_test = pd.read_csv('/Users/sakshamjain/Desktop/Projects/JAIN-WIN/y_test_large.csv').squeeze()

In [5]:
print("Training LightGBM model...")
model = lgb.LGBMClassifier( random_state=69, n_jobs=-1, force_col_wise=True )
# Train the initial LightGBM model
model.fit(X_train, y_train)

# Get feature importance
feature_importance = model.feature_importances_
feature_names = X_train.columns

Training LightGBM model...
[LightGBM] [Info] Number of positive: 19677, number of negative: 71432
[LightGBM] [Info] Total Bins 850113
[LightGBM] [Info] Number of data points in the train set: 91109, number of used features: 3698
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215972 -> initscore=-1.289296
[LightGBM] [Info] Start training from score -1.289296


In [6]:
# Create a DataFrame for importance
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})

# Sort by importance
importance_df = importance_df.sort_values(by='importance', ascending=False).reset_index(drop=True)

# Calculate cumulative importance
importance_df['cumulative_importance'] = importance_df['importance'].cumsum() / importance_df['importance'].sum()

# Select features contributing to 95% of cumulative importance
selected_features = importance_df[importance_df['cumulative_importance'] <= 0.95]['feature']

# Filter the train and test sets for selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Retrain the model using the selected features
model_selected = lgb.LGBMClassifier(random_state=69, n_jobs=-1, force_col_wise=True)

# Train the model again on the selected features
model_selected.fit(X_train_selected, y_train)

# Make predictions and evaluate the model
y_pred_selected = model_selected.predict_proba(X_test_selected)[:, 1]
auc_roc_selected = roc_auc_score(y_test, y_pred_selected)

print(f"AUC-ROC on the selected features: {auc_roc_selected}")

[LightGBM] [Info] Number of positive: 19677, number of negative: 71432
[LightGBM] [Info] Total Bins 311277
[LightGBM] [Info] Number of data points in the train set: 91109, number of used features: 1277
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215972 -> initscore=-1.289296
[LightGBM] [Info] Start training from score -1.289296
AUC-ROC on the selected features: 0.8723564971241229


In [7]:
X_train=X_train_selected.copy()
X_test=X_test_selected.copy()

In [10]:
def find_dataframes():
    # This will check for variables that are instances of pd.DataFrame in the global scope
    return {name: obj for name, obj in globals().items() if isinstance(obj, pd.DataFrame)}

def print_memory_usage_of_dataframes():
    dataframes = find_dataframes()
    total_memory = 0
    print("Memory usage of dataframes (in GB):")
    for name, df in dataframes.items():
        mem_usage = df.memory_usage(deep=True).sum() / 1024 ** 3  # Convert bytes to gigabytes
        total_memory += mem_usage
        print(f"{name}: {mem_usage:.6f} GB")
    print(f"Total memory used by dataframes: {total_memory:.6f} GB")

print_memory_usage_of_dataframes()

Memory usage of dataframes (in GB):
X_train: 0.866253 GB
X_test: 0.371264 GB
Total memory used by dataframes: 1.237516 GB


In [9]:
del importance_df,X_train_selected,X_test_selected

In [14]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from lightgbm import early_stopping, log_evaluation

# Assuming xtrain, ytrain, X_test, ytest are already defined

# Define parameter sets
lparams = {}

lparams[0] = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 200,
    'max_bin': 500,
    'min_child_weight': 0.035,
    'subsample': 0.45,
    'colsample_bytree': 0.3,
    'min_data_in_leaf': 150,
    'max_depth': -1,
    'reg_alpha': 0.4,
    'reg_lambda': 0.7,
    'verbose': 1,
    'random_state': 0,  # Combining seed and bagging_seed for reproducibility
    'n_jobs': -1,
    'n_estimators': 30000
}

lparams[1] = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'n_estimators': 30000,
    'subsample': 1,
    'colsample_bytree': 0.225,
    'max_depth': -1,
    'reg_alpha': 1,
    'reg_lambda': 1,
    'verbose': 1,
    'random_state': 1,  # Combining seed and bagging_seed for reproducibility
    'n_jobs': -1
}

lparams[2] = {
        'boosting_type': 'dart',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'subsample': 1,
        'colsample_bytree': 0.1,
        'reg_alpha': 3,
        'reg_lambda': 1,
        'scale_pos_weight': 1,
        'n_estimators': 14000,
        'silent': -1,
        'verbose': -1,
        'max_depth': -1,
        'random_state': 0,
        'n_jobs': -1
    }

lparams[3] = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'subsample': 1,
        'colsample_bytree': 0.1,
        'reg_alpha': 3,
        'reg_lambda': 1,
        'scale_pos_weight': 1,
        'n_estimators': 300000,
        'silent': -1,
        'verbose': -1,
        'max_depth': -1,
        'random_state': 0,
        'n_jobs': -1

    }

lparams[4] = {
    'objective':'binary'
    ,'boosting': 'goss'
    ,'verbosity': -1
    ,'metric': 'AUC'
    ,'learning_rate': .01
    ,'num_leaves': 63
    ,'min_data_in_leaf': 250
    ,'feature_fraction': .3
    ,'extra_trees': True
    ,'top_rate': .8
    ,'other_rate': .1,
    'random_state': 0,
    'n_jobs': -1
}

# Initialize models
model_0 = lgb.LGBMClassifier(**lparams[0])
model_1 = lgb.LGBMClassifier(**lparams[1])
model_2 = lgb.LGBMClassifier(**lparams[2])
model_3 = lgb.LGBMClassifier(**lparams[3])
model_4 = lgb.LGBMClassifier(**lparams[4])


In [15]:
# Fit models with early stopping
model_0.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[
        early_stopping(stopping_rounds=300),  # Early stopping if no improvement for 300 rounds
        log_evaluation(period=1)  # Log evaluation after each round
    ]
)

preds_0 = model_0.predict_proba(X_test)[:, 1]
auc_0 = roc_auc_score(y_test, preds_0)
print(f"Model 0 AUC: {auc_0:.4f}")


[LightGBM] [Info] Number of positive: 19677, number of negative: 71432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.297949 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 555954
[LightGBM] [Info] Number of data points in the train set: 91109, number of used features: 1277
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.215972 -> initscore=-1.289296
[LightGBM] [Info] Start training from score -1.289296
[1]	valid_0's auc: 0.81543
Training until validation scores don't improve for 300 rounds
[2]	valid_0's auc: 0.829303
[3]	valid_0's auc: 0.83454
[4]	valid_0's auc: 0.837912
[5]	valid_0's auc: 0.839736
[6]	valid_0's auc: 0.840025
[7]	valid_0's auc: 0.841265
[8]	valid_0's auc: 0.842408
[9]	valid_0's auc: 0.842618
[10]	valid_0's auc: 0.843733
[11]	valid_0's auc: 0.844794
[12]	valid_0's auc: 0.845418
[13]	valid_0's auc: 0.845844
[14]	valid_0's auc: 0.846409
[15]	valid_0's auc: 0.846895
[16]	valid_0's a

: 

In [None]:
model_1.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=300)]

)

preds_1 = model_1.predict_proba(X_test)[:, 1]
auc_1 = roc_auc_score(y_test, preds_1)
print(f"Model 1 AUC: {auc_1:.4f}")

In [None]:
model_2.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=300)]

)

preds_2 = model_2.predict_proba(X_test)[:, 1]
auc_2 = roc_auc_score(y_test, preds_2)
print(f"Model 1 AUC: {auc_2:.4f}")

model_3.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=300)]

)

preds_3 = model_3.predict_proba(X_test)[:, 1]
auc_3 = roc_auc_score(y_test, preds_3)
print(f"Model 1 AUC: {auc_3:.4f}")

model_4.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=300)]

)

preds_4 = model_4.predict_proba(X_test)[:, 1]
auc_4 = roc_auc_score(y_test, preds_4)
print(f"Model 1 AUC: {auc_4:.4f}")

