In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
import time

In [2]:
dfX_train=np.load('/kaggle/input/latestembeddings/LatestEmbeddings.npy')

In [3]:
dfX_train.shape

(303005, 32)

In [4]:
X_train=dfX_train[:250000]

In [5]:
df=pd.read_csv('/kaggle/input/newdataframe/NewDatFrame.csv')

In [6]:
# X_train = df.iloc[:, :-1]  # Select all columns except the last one
y = df.iloc[:, -1]   # Select the last column

In [7]:
y_train=y[:250000]

In [8]:
X_test=dfX_train[250000:]

In [9]:
y_test=y[250000:]

In [10]:
# Define 5 different parameter sets for XGBoost models
params_list = [
    {
        'n_estimators': 594, 
        'max_depth': 42, 
        'learning_rate': 0.024204993124484677, 
        'subsample': 0.7923995978223708, 
        'colsample_bytree': 0.8032798086548559, 
        'min_child_weight': 9, 
        'gamma': 0.004024491722298412, 
        'scale_pos_weight': 70.63189827279774, 
        'reg_alpha': 7.837478547281585,
        'reg_lambda': 9.46120573949853
    },
    {
        'n_estimators': 837, 
        'max_depth': 14, 
        'learning_rate': 0.0353558846085311, 
        'subsample': 0.6616753378478161, 
        'colsample_bytree': 0.6462390124756214, 
        'min_child_weight': 2, 
        'gamma': 0.004161002955494408, 
        'scale_pos_weight': 78.32043308058358, 
        'reg_alpha': 9.215360358471834, 
        'reg_lambda': 0.03910209923605645
    },
    {
        'n_estimators': 523, 
        'max_depth': 38, 
        'learning_rate': 0.01979028328814464, 
        'subsample': 0.807725998559047, 
        'colsample_bytree': 0.749169830595586, 
        'min_child_weight': 10, 
        'gamma': 0.002169538530108702, 
        'scale_pos_weight': 70.85229745683928, 
        'reg_alpha': 8.803826204877566, 
        'reg_lambda': 9.919375860347836
    },
    {
        'n_estimators': 539, 
        'max_depth': 32, 
        'learning_rate': 0.04750293929946122, 
        'subsample': 0.7977338121149586, 
        'colsample_bytree': 0.9082578699425816, 
        'min_child_weight': 9, 
        'gamma': 0.005577192157605548, 
        'scale_pos_weight': 70.84549144876938, 
        'reg_alpha': 6.364814625912276, 
        'reg_lambda': 5.643091944274566
    },
    {
        'n_estimators': 709, 
        'max_depth': 37, 
        'learning_rate': 0.05131505621380027, 
        'subsample': 0.8005171137668711, 
        'colsample_bytree': 0.9283255559114961, 
        'min_child_weight': 9, 
        'gamma': 0.00788130008156799, 
        'scale_pos_weight': 70.48617541408962, 
        'reg_alpha': 6.202205304808387, 
        'reg_lambda': 5.19112005430535
    },
    {
        'n_estimators': 577, 
        'max_depth': 39, 
        'learning_rate': 0.015618168995032516, 
        'subsample': 0.8360754424291521, 
        'colsample_bytree': 0.7811875285918626, 
        'min_child_weight': 10, 
        'gamma': 0.0151264967433285, 
        'scale_pos_weight': 71.47221908023103, 
        'reg_alpha': 7.958591921588237, 
        'reg_lambda': 9.528992187942585
    }
]

In [11]:
# Train each XGBoost model with different parameters
models = []
predictions = []
training_times = []

print("Training individual XGBoost models on GPU...")
for i, params in enumerate(params_list):
    print(f"\nTraining model {i+1} with parameters: {params}")
    
    # Create and train the model
    model = xgb.XGBClassifier(
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        n_estimators=params['n_estimators'],
        tree_method='gpu_hist',
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        gpu_id= 0,
        predictor= 'gpu_predictor',
        sampling_method= 'gradient_based',
        objective= 'binary:logistic',
        eval_metric= ['auc', 'aucpr'],
        random_state=42,
        min_child_weight= 10, 
        gamma= params['gamma'], 
        scale_pos_weight= params['scale_pos_weight'], 
        reg_alpha= params['reg_alpha'], 
        reg_lambda= params['reg_lambda']
    )
    
    # Time the training process
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)
    
    models.append(model)
    
    # Make predictions
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    predictions.append(y_pred_prob)
    
    # Evaluate individual model
    y_pred_class = (y_pred_prob > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred_class)
    auc = roc_auc_score(y_test, y_pred_prob)
    
    print(f"Model {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, Training Time: {training_time:.2f} seconds")

Training individual XGBoost models on GPU...

Training model 1 with parameters: {'n_estimators': 594, 'max_depth': 42, 'learning_rate': 0.024204993124484677, 'subsample': 0.7923995978223708, 'colsample_bytree': 0.8032798086548559, 'min_child_weight': 9, 'gamma': 0.004024491722298412, 'scale_pos_weight': 70.63189827279774, 'reg_alpha': 7.837478547281585, 'reg_lambda': 9.46120573949853}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Model 1 - Accuracy: 0.9912, AUC: 0.7008, Training Time: 25.32 seconds

Training model 2 with parameters: {'n_estimators': 837, 'max_depth': 14, 'learning_rate': 0.0353558846085311, 'subsample': 0.6616753378478161, 'colsample_bytree': 0.6462390124756214, 'min_child_weight': 2, 'gamma': 0.004161002955494408, 'scale_pos_weight': 78.32043308058358, 'reg_alpha': 9.215360358471834, 'reg_lambda': 0.03910209923605645}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Model 2 - Accuracy: 0.9912, AUC: 0.6939, Training Time: 20.25 seconds

Training model 3 with parameters: {'n_estimators': 523, 'max_depth': 38, 'learning_rate': 0.01979028328814464, 'subsample': 0.807725998559047, 'colsample_bytree': 0.749169830595586, 'min_child_weight': 10, 'gamma': 0.002169538530108702, 'scale_pos_weight': 70.85229745683928, 'reg_alpha': 8.803826204877566, 'reg_lambda': 9.919375860347836}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Model 3 - Accuracy: 0.9912, AUC: 0.6917, Training Time: 24.55 seconds

Training model 4 with parameters: {'n_estimators': 539, 'max_depth': 32, 'learning_rate': 0.04750293929946122, 'subsample': 0.7977338121149586, 'colsample_bytree': 0.9082578699425816, 'min_child_weight': 9, 'gamma': 0.005577192157605548, 'scale_pos_weight': 70.84549144876938, 'reg_alpha': 6.364814625912276, 'reg_lambda': 5.643091944274566}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Model 4 - Accuracy: 0.9911, AUC: 0.6947, Training Time: 17.63 seconds

Training model 5 with parameters: {'n_estimators': 709, 'max_depth': 37, 'learning_rate': 0.05131505621380027, 'subsample': 0.8005171137668711, 'colsample_bytree': 0.9283255559114961, 'min_child_weight': 9, 'gamma': 0.00788130008156799, 'scale_pos_weight': 70.48617541408962, 'reg_alpha': 6.202205304808387, 'reg_lambda': 5.19112005430535}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Model 5 - Accuracy: 0.9902, AUC: 0.6915, Training Time: 19.37 seconds

Training model 6 with parameters: {'n_estimators': 577, 'max_depth': 39, 'learning_rate': 0.015618168995032516, 'subsample': 0.8360754424291521, 'colsample_bytree': 0.7811875285918626, 'min_child_weight': 10, 'gamma': 0.0151264967433285, 'scale_pos_weight': 71.47221908023103, 'reg_alpha': 7.958591921588237, 'reg_lambda': 9.528992187942585}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



Model 6 - Accuracy: 0.9912, AUC: 0.7042, Training Time: 28.62 seconds



    E.g. tree_method = "hist", device = "cuda"



In [12]:
ensemble_predictions = np.mean(predictions, axis=0)

# Evaluate the ensemble model
ensemble_class_preds = (ensemble_predictions > 0.5).astype(int)
ensemble_accuracy = accuracy_score(y_test, ensemble_class_preds)
ensemble_auc = roc_auc_score(y_test, ensemble_predictions)

In [13]:
print("\n--- Ensemble Model Results ---")
print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")
print(f"Ensemble AUC: {ensemble_auc:.4f}")
print(f"Average Training Time per Model: {np.mean(training_times):.2f} seconds")
print(f"Total Training Time: {sum(training_times):.2f} seconds")# Ensemble the predictions by averaging probabilities


--- Ensemble Model Results ---
Ensemble Accuracy: 0.9912
Ensemble AUC: 0.6983
Average Training Time per Model: 22.62 seconds
Total Training Time: 135.75 seconds


In [17]:
from sklearn.metrics import confusion_matrix

# Assume y_proba[:, 1] contains probabilities for class 1
y_pred = (ensemble_predictions >= 0.5).astype(int)  # Convert to binary labels (0 or 1)

# Compute confusion matrix
cm = confusion_matrix(y[250000:], y_pred)

print(cm)

[[52535     2]
 [  465     3]]


In [18]:
for sample_idx in range(5):
    print(f"{sample_idx+1}      | ", end="")
    for model_idx in range(5):
        print(f"{predictions[model_idx][sample_idx]:.4f}  | ", end="")
    print(f"{ensemble_predictions[sample_idx]:.4f}")

# Save the models
for i, model in enumerate(models):
    model.save_model(f"xgboost_model_{i+1}.json")
print("\nAll models saved to disk.")

1      | 0.0020  | 0.0011  | 0.0046  | 0.0006  | 0.0004  | 0.0024
2      | 0.0621  | 0.0340  | 0.0480  | 0.0685  | 0.0382  | 0.0538
3      | 0.0060  | 0.0020  | 0.0090  | 0.0019  | 0.0010  | 0.0049
4      | 0.0036  | 0.0012  | 0.0074  | 0.0010  | 0.0006  | 0.0034
5      | 0.0088  | 0.0024  | 0.0128  | 0.0028  | 0.0011  | 0.0067

All models saved to disk.


In [23]:
ensemble_predictions

array([0.0023722 , 0.05380079, 0.00493447, ..., 0.02722444, 0.06291053,
       0.06037589], dtype=float32)