In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
import xgboost as xgb

In [25]:
#  Load dataset
df = pd.read_csv(r"C:\Users\ROG\OneDrive\Documents\DeskDump\Python\Parkinsons\pd_speech_features.csv")


# Drop ID column and separate features/target
X = df.drop(columns=['id', 'class'])
y = df['class']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Train XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.64      0.70        39
           1       0.88      0.94      0.91       113

    accuracy                           0.86       152
   macro avg       0.83      0.79      0.81       152
weighted avg       0.86      0.86      0.86       152


Confusion Matrix:
[[ 25  14]
 [  7 106]]


In [26]:
mse_xgb = mean_squared_error(y_test,y_pred)
print(mse_xgb)

0.13815789473684212


In [27]:
y_train.value_counts()


class
1    451
0    153
Name: count, dtype: int64

In [28]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Check resampled class distribution
print("\nResampled class distribution:")
print(pd.Series(y_train_resampled).value_counts())


Resampled class distribution:
class
1    451
0    451
Name: count, dtype: int64


In [29]:
model.fit(X_train_resampled, y_train_resampled)

# Predictions and evaluation
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.67      0.71        39
           1       0.89      0.93      0.91       113

    accuracy                           0.86       152
   macro avg       0.83      0.80      0.81       152
weighted avg       0.86      0.86      0.86       152


Confusion Matrix:
[[ 26  13]
 [  8 105]]


In [34]:
# from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
# param_grid_xgb = {
#     'lambda': [0.01, 0.1, 1, 10],
#     'gamma': [0, 0.1, 1, 10],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [100, 200, 300]
# }

# # GridSearchCV for XGBRegressor (regression task => no accuracy score!)
# grid_search = GridSearchCV(
#     estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
#     param_grid=param_grid_xgb,
#     scoring='neg_mean_squared_error',  # Use appropriate regression metric
#     cv=10,
#     n_jobs=-1
# )

# from sklearn.model_selection import GridSearchCV

# param_grid_xgb = {
#     'lambda': [0.01, 0.1, 1, 10],
#     'gamma': [0, 0.1, 1, 10],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [100, 200, 300]
# }

# xgb_reg = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     tree_method='gpu_hist',        # Enable GPU acceleration
#     predictor='gpu_predictor',     # Optional: speeds up prediction too
#     random_state=42
# )

# grid_search = GridSearchCV(
#     estimator=xgb_reg,
#     param_grid=param_grid_xgb,
#     scoring='neg_mean_squared_error',
#     cv=10,
#     n_jobs=-1,
#     verbose=2                        # Optional: shows progress
# )

from sklearn.model_selection import RandomizedSearchCV

# Define parameter distribution
param_dist_xgb = {
    'lambda': [0.01, 0.1, 1, 10],
    'gamma': [0, 0.1, 1, 10],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

# Randomized search using XGBClassifier
random_search = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',       # Use 'multi:softprob' for multi-class
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ),
    param_distributions=param_dist_xgb,
    n_iter=20,                 # You can reduce this for faster results
    scoring='f1',              # Or 'accuracy', 'roc_auc', etc., depending on your goal
    cv=10,
    n_jobs=-1,
    verbose=1
)

# Fit the search on training data
random_result = random_search.fit(X_train_resampled, y_train_resampled)

# Display best score and parameters
print(f"Best score: {random_result.best_score_}")
print(f"Best parameters: {random_result.best_params_}")


Fitting 10 folds for each of 20 candidates, totalling 200 fits


Parameters: { "use_label_encoder" } are not used.



Best score: 0.9686220588581957
Best parameters: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'lambda': 1, 'gamma': 0}


In [35]:
# # Fit on training data (use x_train, y_train instead of full x and y)
# grid_result = grid_search.fit(X_train, y_train)

# # Display the best parameters and the (negative) MSE score
# print(f"Best MSE (negative): {grid_result.best_score_}")
# print(f"Best parameters: {grid_result.best_params_}")
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.74      0.73        39
           1       0.91      0.90      0.91       113

    accuracy                           0.86       152
   macro avg       0.82      0.82      0.82       152
weighted avg       0.86      0.86      0.86       152



In [36]:
import joblib

# Save the best model to a file
joblib.dump(best_model, 'xgb_parkinson_model.pkl')
print("Model saved as 'xgb_parkinson_model.pkl'")

Model saved as 'xgb_parkinson_model.pkl'


In [None]:
# Load the saved model
loaded_model = joblib.load('xgb_parkinson_model.pkl')

# Use it to predict
y_pred = loaded_model.predict(X_test)
