In [None]:
!pip install numpy pandas scikit-learn imblearn xgboost

In [1]:
import pandas as pd
import numpy as np

In [2]:
diabetes = pd.read_csv('diabetes03.csv')

In [3]:
remove = ['AnyHealthcare', 'Fruits', 'Veggies', 'NoDocbcCost', 'MentHlth', 'PhysHlth', 'Education', "Income"]
diabetes.drop(remove, axis=1, inplace=True)

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
columns_to_scale = ['BMI', 'GenHlth', 'Age']
diabetes[columns_to_scale] = scaler.fit_transform(diabetes[columns_to_scale])
diabetes

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,HvyAlcoholConsump,GenHlth,DiffWalk,Sex,Age
0,0.0,1.0,1.0,1.0,0.325581,1.0,0.0,0.0,0.0,0.0,1.00,1.0,0.0,0.666667
1,0.0,0.0,0.0,0.0,0.151163,1.0,0.0,0.0,1.0,0.0,0.50,0.0,0.0,0.500000
2,0.0,1.0,1.0,1.0,0.186047,0.0,0.0,0.0,0.0,0.0,1.00,1.0,0.0,0.666667
3,0.0,1.0,0.0,1.0,0.174419,0.0,0.0,0.0,1.0,0.0,0.25,0.0,0.0,0.833333
4,0.0,1.0,1.0,1.0,0.139535,0.0,0.0,0.0,1.0,0.0,0.25,0.0,0.0,0.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,0.383721,0.0,0.0,0.0,0.0,0.0,0.50,0.0,1.0,0.333333
253676,1.0,1.0,1.0,1.0,0.069767,0.0,0.0,0.0,0.0,0.0,0.75,1.0,0.0,0.833333
253677,0.0,0.0,0.0,1.0,0.186047,0.0,0.0,0.0,1.0,0.0,0.00,0.0,0.0,0.083333
253678,0.0,1.0,0.0,1.0,0.127907,0.0,0.0,0.0,0.0,0.0,0.50,0.0,1.0,0.500000


In [5]:
from sklearn.model_selection import train_test_split

# Assuming your dataframe is called 'diabetes' and the target is 'Diabetes_binary'
X = diabetes.drop('Diabetes_binary', axis=1)
y = diabetes['Diabetes_binary']

# First, split out the test set (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Then, split the remaining data into training (60% total) and validation (20% total)
# Note: Since X_train_val is 80% of the data, using test_size=0.25 here gives 0.25*80% = 20%
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)


In [8]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Convert your data to numpy arrays if needed
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
X_val_np, y_val_np = X_val.to_numpy(), y_val.to_numpy()
X_test_np, y_test_np = X_test.to_numpy(), y_test.to_numpy()

# Define the model with GPU support
xgb_model = xgb.XGBClassifier(tree_method='gpu_hist', random_state=42, use_label_encoder=False, eval_metric='logloss')

# Optional: Define a grid of hyperparameters to search
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Perform grid search with cross-validation on the training set
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_np, y_train_np)

# Evaluate on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val_np)
val_accuracy = accuracy_score(y_val_np, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Final evaluation on the test set
y_test_pred = best_model.predict(X_test_np)
test_accuracy = accuracy_score(y_test_np, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.8664
Test Accuracy: 0.8656


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE

# Convert data to numpy arrays if needed
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
X_val_np, y_val_np = X_val.to_numpy(), y_val.to_numpy()
X_test_np, y_test_np = X_test.to_numpy(), y_test.to_numpy()

# Apply SMOTE only on the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_np, y_train_np)

# Define the model with GPU support
xgb_model = xgb.XGBClassifier(tree_method='gpu_hist', random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# 5-Fold Stratified Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform GridSearchCV with 5-Fold Cross-Validation
grid_search = GridSearchCV(xgb_model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Best Model from Grid Search
best_model = grid_search.best_estimator_

# Evaluate on Validation Set
y_val_pred = best_model.predict(X_val_np)
val_accuracy = accuracy_score(y_val_np, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Final Evaluation on Test Set
y_test_pred = best_model.predict(X_test_np)
test_accuracy = accuracy_score(y_test_np, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")




In [10]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [7]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   - -------------------------------------- 3.4/124.9 MB 18.4 MB/s eta 0:00:07
   - -------------------------------------- 6.0/124.9 MB 15.4 MB/s eta 0:00:08
   -- ------------------------------------- 7.1/124.9 MB 13.2 MB/s eta 0:00:09
   -- ------------------------------------- 7.3/124.9 MB 9.6 MB/s eta 0:00:13
   -- ------------------------------------- 7.9/124.9 MB 8.0 MB/s eta 0:00:15
   -- ------------------------------------- 8.4/124.9 MB 7.0 MB/s eta 0:00:17
   -- ------------------------------------- 8.7/124.9 MB 6.0 MB/s eta 0:00:20
   -- ------------------------------------- 8.9/124.9 MB 5.8 MB/s eta 0:00:21
   --- ------------------------------------ 10.5/124.9 MB 5.6 MB/s eta 0:00:21
   --- ------------------------------------ 11.5/124.9 MB 5.6 MB/s eta 0:00