In [21]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, confusion_matrix, accuracy_score, log_loss
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, tpe, fmin, Trials, space_eval
import warnings
warnings.filterwarnings('ignore')

scaler = StandardScaler()

# Read the training data
Data_train = pd.read_csv('Segmented_Nuclei_train.csv')

# Extract features and target variable
X = Data_train.drop(['Category'], axis=1)
y = Data_train['Category']

scaler = StandardScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

xgb_clf = xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.1,
                            max_depth=8, eval_metric='logloss', n_estimators=300,
                            seed=42, use_label_encoder=False, n_jobs = -1)

start_time = time.time()
xgb_clf.fit(X_train, y_train)
end_time = time.time()

elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

Elapsed time: 1.7151896953582764 seconds


In [22]:
###Selecting the 20 most important features###

importances = xgb_clf.feature_importances_
sorted_indices = importances.argsort()[::-1]
top_features = X.columns[sorted_indices]

print(top_features)

Index(['extent', 'mean_intensity', 'area_percentage', 'solidity',
       'area_filled', 'area', 'area_bbox', 'euler_number'],
      dtype='object')


In [23]:
###Hyperparameter tuning with Bayes search and cross-validation
# Define the search space
# X_20 = X[top_20_features]

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', np.arange(3, 11, dtype=int)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.uniform('gamma', 0, 1),
    'n_estimators': hp.choice('n_estimators', np.arange(100, 1000, 100, dtype=int)),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10)
}

# Objective function to minimize (log loss)
def objective(params):
    model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', seed=42, use_label_encoder=False, **params, n_jobs = -1)
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_log_loss')
    return -np.mean(scores)

# Hyperparameter optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

# Retrieve the best hyperparameters
best_params = space_eval(space, best)

# Train the final model with the best hyperparameters
best_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', seed=42, use_label_encoder=False, **best_params, n_jobs = -1)
start_time = time.time()
best_model.fit(X_train, y_train)
end_time = time.time()

# Make predictions on the validation set
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_proba.round())
conf_matrix = confusion_matrix(y_test, y_pred_proba.round())
logloss = log_loss(y_test, y_pred_proba)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print("Confusion matrix:", conf_matrix)
print("LogLoss:", logloss)

elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

100%|██████████| 10/10 [02:18<00:00, 13.82s/trial, best loss: 0.22952606027694808]
Best Hyperparameters: {'colsample_bytree': 0.9370615852680544, 'gamma': 0.729249267441817, 'learning_rate': 0.03584245887917516, 'max_depth': 10, 'min_child_weight': 1.4123423054280826, 'n_estimators': 800, 'subsample': 0.6485582507030402}
Accuracy: 0.8948962945700303
Confusion matrix: [[1996  185]
 [ 266 1844]]
LogLoss: 0.2588742310719193
Elapsed time: 5.081980466842651 seconds


In [24]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# Define the best model with the best hyperparameters
best_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', seed=42, use_label_encoder=False, **best_params, n_jobs=-1)

# Define KFold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Arrays to store predictions and true labels
val_logloss_scores = []

# Loop through each fold
for train_index, val_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[val_index]
    y_train, y_test = y.iloc[train_index], y.iloc[val_index]
    
    # Train the model on the training set
    best_model.fit(X_train, y_train)
    
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    # Predict probabilities on the validation set
    val_logloss = log_loss(y_test, y_pred_proba)
    val_logloss_scores.append(val_logloss)

mean_logloss = np.mean(val_logloss_scores)
print("Mean LogLoss from Cross-Validation:", mean_logloss)

Mean LogLoss from Cross-Validation: 0.25712625542608397


In [25]:
# import os
# import csv

# # Create a folder named 'solutions' if it doesn't exist
# folder_name = 'solutions'
# if not os.path.exists(folder_name):
#     os.makedirs(folder_name)

# Write = True
# if Write:
# # Your list of variables
#     top_20_features_list = top_20_features.tolist()
#     variables = top_20_features

#     # Path to the CSV file
    
#     csv_file_path = os.path.join(folder_name, 'Classification_GeorgiosSevastakis_XGBoost1_VariableList.csv')

#     # Open the CSV file in write mode
#     with open(csv_file_path, mode='w', newline='') as file:
#         # Create a CSV writer object
#         writer = csv.writer(file)

#         # Write each variable as a row in the CSV file
#         for variable in variables:
#             writer.writerow([variable])
    
#     data = y_pred_prob

#     # Path to the CSV file
#     csv_file_path = os.path.join(folder_name, 'Classification_GeorgiosSevastakis_XGBoost1.csv')
    
#     # Open the CSV file in write mode
#     with open(csv_file_path, mode='w', newline='') as file:
#         # Create a CSV writer object
#         writer = csv.writer(file)
        
#         # Enumerate through the data and write each item along with its index as a row in the CSV file
#         for index, item in enumerate(data, start=0):
#             writer.writerow([index, item])