In [24]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [25]:
# Read the prostate dataset from the CSV file into a DataFrame.
data = pd.read_csv("MLprostateAverageV2.csv")

# NOTE(review): the rendered output of this cell shows a leading "Unnamed: 0"
# header. If that is a real CSV column (a previously saved index) rather than
# the DataFrame index, it is being fed to the model as a feature — confirm.

# The last column is taken as the target; every column before it is a feature.
y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# Hold out 10% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Last expression of the cell: display the training features.
X_train

Unnamed: 0,Age,IMC,MCAS,DM,TR,APS_preop,Volume,ASA,Poids_specimen,Prostatic_extension,Seminal_extension,pT,pN,Primary_Pgleason,Secondary_Pgleason,Total_Gleason,Nb_Positive_Nodes,Nb_Taken_Nodes,Margins,5PositiveCore
265,65.112329,32.097959,0,0,2,8.90,21.0,1,50.0,1,0,2,0,3,4,7,0,7,0,0
449,75.084932,27.660096,0,0,1,6.29,34.0,2,42.0,1,0,2,0,4,3,7,0,6,0,0
390,58.424658,23.040020,0,0,1,5.14,27.0,2,46.0,0,0,1,0,3,4,7,0,8,0,0
117,64.900000,28.200000,0,0,2,4.86,64.0,2,50.0,0,0,1,0,3,4,7,0,0,0,0
18,64.100000,31.200000,0,0,1,5.10,37.0,2,50.0,0,0,1,0,4,3,7,0,8,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,53.100000,27.800000,0,0,1,9.28,30.0,2,46.0,1,0,2,0,3,4,7,0,0,0,0
270,71.926027,25.013521,0,0,1,15.20,43.3,3,62.0,1,0,2,0,3,4,7,0,4,0,0
348,61.304110,35.154584,0,0,2,19.00,50.0,2,58.0,1,0,2,1,4,5,9,8,12,1,1
435,59.742466,34.350641,0,0,2,5.71,60.0,2,56.0,1,0,2,1,4,5,9,1,13,1,1


In [36]:
# Wrap each split in an XGBoost DMatrix, pairing the feature frame with its labels.
test_dmatrix = xgb.DMatrix(X_test, label=y_test)
train_dmatrix = xgb.DMatrix(X_train, label=y_train)

In [38]:
# Booster configuration handed to xgb.train below.
# NOTE(review): no `evals` list is passed to xgb.train in this cell, so the
# 'auc' eval_metric is presumably never reported during training — confirm.
params = dict(
    objective='binary:logistic',  # binary classification objective
    max_depth=6,                  # maximum depth of each tree
    learning_rate=0.1,            # step size shrinkage
    eval_metric='auc',            # evaluation metric
    seed=42,                      # random seed for reproducibility
)

# Fit a booster on the training DMatrix for 100 boosting rounds.
model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=100,
)



In [39]:
# Score the held-out test DMatrix with the trained booster; the results are
# treated as per-row probabilities of the positive class.
y_pred_proba = model.predict(test_dmatrix)

# Threshold the scores: strictly greater than 0.5 becomes class 1, everything
# else class 0. Converted back to a plain list of Python ints.
y_pred = (y_pred_proba > 0.5).astype(int).tolist()