In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score


In [13]:
# Read the dataset, then separate the 'infected' target column from the
# remaining columns, which serve as the feature matrix.
aids_t = pd.read_csv('data/aids_t.csv')
y = aids_t['infected']
X = aids_t.drop('infected', axis=1)

In [14]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [15]:
# NOTE(review): these DMatrix objects are not used anywhere in this cell (the
# scikit-learn wrapper below is fitted on X_train / y_train directly). They are
# kept only in case a later cell relies on them -- confirm, and drop if not.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

# Candidate values that RandomizedSearchCV samples from for each hyper-parameter.
param_grid = {
    'max_depth': np.arange(3, 10),             # tree depths 3..9
    'eta': [0.1, 0.01, 0.001],                 # learning rate
    'gamma': [0, 0.1, 0.2, 0.3],               # minimum loss reduction to split
    'subsample': [0.6, 0.7, 0.8, 0.9],         # row sampling ratio per tree
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],  # column sampling ratio per tree
}

# Base estimator whose hyper-parameters are tuned below.
# NOTE(review): `num_class` is set by hand from the training labels;
# XGBClassifier presumably infers the class count from `y` on its own --
# confirm this override (and the multi-class objective) is intended.
xtb_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train)),
    eval_metric='merror',
    # Keep each individual fit single-threaded: the search below already
    # parallelises across candidates/folds with n_jobs=-1, and letting every
    # fit also grab all cores oversubscribes the CPU and slows both down.
    n_jobs=1,
)

# Randomised hyper-parameter search scored by cross-validated accuracy.
random_search = RandomizedSearchCV(
    xtb_classifier,
    param_distributions=param_grid,
    n_iter=10,           # number of random parameter combinations to try
    scoring='accuracy',
    cv=3,                # number of cross-validation folds
    verbose=2,
    n_jobs=-1,           # parallelise the search across all available cores
    random_state=42,     # reproducible sampling of candidates
)

# Run the search on the training split only; the test split stays unseen.
random_search.fit(X_train, y_train)

# The best candidate, as exposed by the fitted search object.
best_model_r = random_search.best_estimator_

# Predict on the held-out test split with the best model.
y_pred_xgb_r = best_model_r.predict(X_test)

# Report the winning hyper-parameters and the held-out accuracy.
print("Best Parameters:", random_search.best_params_)

accuracy_xgb_r = accuracy_score(y_test, y_pred_xgb_r)
print(f"Accuracy: {accuracy_xgb_r * 100.0:.2f}%")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 0.9, 'max_depth': 3, 'gamma': 0.3, 'eta': 0.1, 'colsample_bytree': 0.9}
Accuracy: 70.61%
