In [12]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# --- 1. Load Data (Assumes PIDD dataset is available as 'diabetes.csv') ---
# Replace 'diabetes.csv' with the actual path/name of your file
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    print("Please make sure 'diabetes.csv' is in the current directory or provide the correct path.")
    # Creating dummy data for demonstration if file not found
    data = {
        'Pregnancies': [6, 1, 8, 1, 0, 5],
        'Glucose': [148, 85, 183, 89, 137, 116],
        'BloodPressure': [72, 66, 64, 66, 40, 74],
        'SkinThickness': [35, 29, 0, 23, 35, 0],
        'Insulin': [0, 0, 0, 94, 168, 0],
        'BMI': [33.6, 26.6, 23.3, 28.1, 43.1, 25.6],
        'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288, 0.201],
        'Age': [50, 31, 32, 21, 33, 30],
        'Outcome': [1, 0, 1, 0, 1, 0]
    }
    df = pd.DataFrame(data)

# --- 2. Data Cleaning / Preprocessing ---

# Identify columns where 0 is an invalid/missing value
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Replace 0s with the median of the respective column (a robust imputation technique)
for col in zero_cols:
    df[col] = df[col].replace(0, df[col].median())

# Separate features (X) and target (y)
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features (important for most ML algorithms)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 3. Model Training: Gradient Boosting Classifier ---

# Initialize the GBC model with a starting set of parameters
gb_model = GradientBoostingClassifier(
    n_estimators=100,      # Number of boosting stages/trees
    learning_rate=0.1,     # Shrinkage (step size)
    max_depth=3,           # Depth of individual trees
    random_state=42
)

print("\n--- Training Gradient Boosting Classifier ---")
gb_model.fit(X_train_scaled, y_train)


# --- 4. Prediction and Evaluation ---

# Make predictions on the test set
y_pred = gb_model.predict(X_test_scaled)
y_pred_proba = gb_model.predict_proba(X_test_scaled)[:, 1] # Probability for the positive class (1)

# Print evaluation metrics
print("\n--- Model Evaluation (Initial GBC) ---")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# --- 5. Hyperparameter Tuning (Optional but recommended) ---

# Define the grid of parameters to search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 2]
}

# Use GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',  # Use ROC-AUC as the primary metric for optimization
    cv=5,
    verbose=1,
    n_jobs=-1
)

print("\n--- Starting Grid Search for Hyperparameter Tuning ---")
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_gb_model = grid_search.best_estimator_
print(f"\nBest Parameters found: {grid_search.best_params_}")

# Evaluate the best model
best_y_pred = best_gb_model.predict(X_test_scaled)
best_y_pred_proba = best_gb_model.predict_proba(X_test_scaled)[:, 1]

print("\n--- Model Evaluation (Tuned GBC) ---")
print(f"Accuracy Score: {accuracy_score(y_test, best_y_pred):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, best_y_pred_proba):.4f}")
print("\nClassification Report (Tuned):\n", classification_report(y_test, best_y_pred))
print("\nConfusion Matrix (Tuned):\n", confusion_matrix(y_test, best_y_pred))
# [[TN, FP]
#  [FN, TP]





--- Training Gradient Boosting Classifier ---

--- Model Evaluation (Initial GBC) ---
Accuracy Score: 0.7597
ROC-AUC Score: 0.8278

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       100
           1       0.68      0.59      0.63        54

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154


Confusion Matrix:
 [[85 15]
 [22 32]]

--- Starting Grid Search for Hyperparameter Tuning ---
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best Parameters found: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 100}

--- Model Evaluation (Tuned GBC) ---
Accuracy Score: 0.7403
ROC-AUC Score: 0.8204

Classification Report (Tuned):
               precision    recall  f1-score   support

           0       0.78      0.84      0.81       100
           1       0.65   

In [13]:
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

GROUP_NUMBER = "05"
try:
    
    model_filename = f'GROUP_{GROUP_NUMBER}.pkl'
    with open(model_filename, 'wb') as file:
        pickle.dump(best_gb_model, file)
    print(f"✅ Model successfully saved using pickle as: {model_filename}")
    scaler_filename = f'GROUP_{GROUP_NUMBER}.pkl'
    with open(scaler_filename, 'wb') as file:
        pickle.dump(scaler, file)
    
    print(f"✅ Scaler successfully saved using pickle as: {scaler_filename}")
except NameError:
    # This block executes if 'best_gb_model' or 'scaler' were not defined 
    # in the current Python environment (common outside of a full notebook run)
    print("⚠️ Error: 'best_gb_model' or 'scaler' objects were not found.")
    print("Please ensure you have run your model training and scaling steps before saving.")
except Exception as e:
    print(f"An unexpected error occurred during saving: {e}")

✅ Model successfully saved using pickle as: GROUP_05.pkl
✅ Scaler successfully saved using pickle as: GROUP_05.pkl


In [14]:
y_pred_proba

array([0.91891439, 0.12579324, 0.11085731, 0.53674362, 0.01208705,
       0.31781545, 0.48524307, 0.96394201, 0.11026595, 0.7545334 ,
       0.48123019, 0.51308619, 0.07062945, 0.02055208, 0.19108736,
       0.31124793, 0.87831188, 0.03361956, 0.88243621, 0.07350181,
       0.12997535, 0.68454507, 0.46793032, 0.97561068, 0.61459332,
       0.01853032, 0.7643017 , 0.01497763, 0.55474383, 0.03275041,
       0.02536682, 0.04048417, 0.45487121, 0.36631821, 0.77142554,
       0.12621261, 0.1597446 , 0.03040494, 0.9249302 , 0.73141954,
       0.44349649, 0.09682243, 0.03003284, 0.38275315, 0.22671467,
       0.19349082, 0.0404915 , 0.03202845, 0.7161119 , 0.33927862,
       0.57603654, 0.79743522, 0.34664096, 0.04785703, 0.70950587,
       0.23913723, 0.74833228, 0.0736025 , 0.87977765, 0.03696557,
       0.72328552, 0.19950916, 0.02624304, 0.93099939, 0.00908195,
       0.46460411, 0.83109539, 0.01965084, 0.28493054, 0.80048786,
       0.05126502, 0.0102724 , 0.44200226, 0.88549793, 0.04704

In [15]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,30.5,33.6,0.627,50,1
1,1,85,66,29,30.5,26.6,0.351,31,0
2,8,183,64,23,30.5,23.3,0.672,32,1
3,1,89,66,23,94.0,28.1,0.167,21,0
4,0,137,40,35,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180.0,32.9,0.171,63,0
764,2,122,70,27,30.5,36.8,0.340,27,0
765,5,121,72,23,112.0,26.2,0.245,30,0
766,1,126,60,23,30.5,30.1,0.349,47,1


In [16]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.65625,72.386719,27.334635,94.652344,32.450911,0.471876,33.240885,0.348958
std,3.369578,30.438286,12.096642,9.229014,105.547598,6.875366,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,23.0,30.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,31.25,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [17]:
import joblib
/joblib.dump(scaler, "25RP18236_model.joblib")
joblib.dump(scaler, "25RP18236_scaler.joblib")

print("Model and scaler saved successfully")

Model and scaler saved successfully
