In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.datasets import make_classification
import time
import GPUtil
import threading

In [2]:
# Load datasets
data_frb = pd.read_csv('chimefrbcat1.csv')
data_tns = pd.read_csv('tns_search.csv')

# Preprocessing FRB data
data_frb.dropna(inplace=True)
label_encoder = LabelEncoder()
data_frb['frb_name_encoded'] = label_encoder.fit_transform(data_frb['tns_name'])
features_frb = data_frb[['ra', 'dec', 'bonsai_dm', 'bonsai_snr', 'flux', 'fluence']]
labels_frb = data_frb['frb_name_encoded']

# Preprocessing TNS data
data_tns['dm'] = data_tns['DM']
data_tns['flux'] = np.random.uniform(0, 1, len(data_tns))  # Assuming random flux values as they are not provided
data_tns['fluence'] = np.random.uniform(0, 1, len(data_tns))  # Assuming random fluence values as they are not provided

# Convert RA and DEC from string to float values (simplified approach)
data_tns['RA'] = data_tns['RA'].apply(lambda x: sum([float(i) / 60 ** n for n, i in enumerate(x.split(':'))]))
data_tns['DEC'] = data_tns['DEC'].apply(lambda x: sum([float(i) / 60 ** n for n, i in enumerate(x.split(':'))]))

features_tns = data_tns[['RA', 'DEC', 'dm', 'DM-Err', 'flux', 'fluence']]
labels_tns = np.random.randint(0, len(np.unique(labels_frb)), len(data_tns))  # Random labels for simplicity

# Combine the datasets
features = pd.concat([features_frb, features_tns])
labels = np.hstack((labels_frb, labels_tns))

# Handle missing values
imputer = SimpleImputer(strategy='mean')
features = imputer.fit_transform(features)

# Handling class imbalance using resampling
majority_class = np.bincount(labels).argmax()
minority_classes = np.unique(labels[labels != majority_class])

# Upsample minority classes
majority_data = features[labels == majority_class]
resampled_data = majority_data.copy()

for cls in minority_classes:
    cls_data = features[labels == cls]
    cls_upsampled = resample(cls_data, 
                             replace=True,
                             n_samples=len(majority_data),
                             random_state=42)
    resampled_data = np.vstack([resampled_data, cls_upsampled])

features_resampled = resampled_data
labels_resampled = np.hstack((labels[labels == majority_class], np.repeat(minority_classes, len(majority_data))))

# Feature engineering
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
features_poly = poly.fit_transform(features_resampled)

# Create synthetic data
X_synthetic, y_synthetic = make_classification(n_samples=10000, n_features=features_poly.shape[1], 
                                               n_informative=10, n_redundant=0, random_state=42)

X_combined = np.vstack((features_poly, X_synthetic))
y_combined = np.hstack((labels_resampled, y_synthetic))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Benchmark a single run
xgb_model_single = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, tree_method='hist', device='cuda', use_label_encoder=False, eval_metric='mlogloss')

start_time = time.time()
xgb_model_single.fit(X_train_scaled, y_train)
end_time = time.time()

single_run_time = end_time - start_time
print(f"Single run training time: {single_run_time:.2f} seconds")

# Calculate total estimated training time
total_runs = 486 * 3  # Total grid search combinations * cross-validation folds
total_estimated_time = single_run_time * total_runs
print(f"Total estimated training time: {total_estimated_time / 60:.2f} minutes")

Single run training time: 32.65 seconds
Total estimated training time: 793.32 minutes


In [3]:
# Reduced parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

# Initialize the model with GPU support
xgb_model = XGBClassifier(tree_method='hist', device='cuda', use_label_encoder=False, eval_metric='mlogloss')

# Perform grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Best parameters and model
print("Best parameters:", grid_search.best_params_)
best_xgb_model = grid_search.best_estimator_

# Evaluate the model
xgb_accuracy = best_xgb_model.score(X_test_scaled, y_test)
print(f"Optimized GBDT Model - Test accuracy: {xgb_accuracy:.4f}")

Fitting 3 folds for each of 32 candidates, totalling 96 fits




Best parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Optimized GBDT Model - Test accuracy: 0.9100
