In [None]:
# Core
import os
import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model

# Classifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# File handling (for prediction from uploaded image)
from PIL import Image


In [None]:
import shutil

import kagglehub

# Download the dataset; kagglehub returns the local directory holding the files.
path = kagglehub.dataset_download("anaghachoudhari/pcos-detection-using-ultrasound-images")
print("Dataset downloaded to:", path)

# List the top-level entries of the downloaded dataset
files = os.listdir(path)
print("Files in dataset:", files)

# (Optional) Copy to /content/ for easier access.
# shutil.copytree is used instead of `!mkdir` / `!cp` because a failing shell
# command does not raise in a notebook, so the success message below would be
# printed even if nothing had been copied. copytree creates the destination
# when missing and, with dirs_exist_ok=True, merges into it when it exists.
shutil.copytree(path, "/content/", dirs_exist_ok=True)
print("Files copied to /content/")

Downloading from https://www.kaggle.com/api/v1/datasets/download/anaghachoudhari/pcos-detection-using-ultrasound-images?dataset_version_number=1...


100%|██████████| 126M/126M [00:00<00:00, 227MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/anaghachoudhari/pcos-detection-using-ultrasound-images/versions/1
Files in dataset: ['data']
Files copied to /content/


In [None]:
# Set path to training directory
train_dir = "/content/data/train"
SIZE = 224  # images are resized to SIZE x SIZE; also used as the VGG16 input_shape

X_train = []
y_train = []
skipped_train = []  # paths that OpenCV could not decode

# Load images and labels (the sub-folder name is the class label)
for label in ['infected', 'notinfected']:
    folder = os.path.join(train_dir, label)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of X_train / y_train identical from run to run.
    for file in tqdm(sorted(os.listdir(folder)), desc=f"Loading {label}"):
        img_path = os.path.join(folder, file)
        img = cv2.imread(img_path)  # returns None instead of raising on unreadable files
        if img is None:
            # Record the file instead of dropping it silently.
            skipped_train.append(img_path)
            continue
        X_train.append(cv2.resize(img, (SIZE, SIZE)))
        y_train.append(label)

if skipped_train:
    print(f"Skipped {len(skipped_train)} unreadable training file(s), e.g. {skipped_train[:3]}")
if not X_train:
    # Fail here with a clear message rather than later inside feature extraction.
    raise RuntimeError(f"No readable images found under {train_dir}")

X_train = np.array(X_train)
y_train = np.array(y_train)

# Encode labels to 0 and 1
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

print("Train images shape:", X_train.shape)
print("Encoded labels:", np.unique(y_train_enc, return_counts=True))


Loading infected: 100%|██████████| 781/781 [00:00<00:00, 878.46it/s]
Loading notinfected: 100%|██████████| 1143/1143 [00:02<00:00, 422.38it/s]


Train images shape: (1924, 224, 224, 3)
Encoded labels: (array([0, 1]), array([ 781, 1143]))


In [None]:
# VGG16, preprocess_input and Model are already imported in the first cell.

# Load VGG16 without top layers (convolutional base only, ImageNet weights)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(SIZE, SIZE, 3))
model = Model(inputs=base_model.input, outputs=base_model.output)

# Preprocess and extract features
def extract_features(images, batch_size=16):
    """Run images through the VGG16 base and return one flat feature row per image.

    Args:
        images: sequence/array of images shaped (SIZE, SIZE, 3), indexable by slice.
        batch_size: number of images sent through the network per predict call.

    Returns:
        2-D array with len(images) rows; each row is the flattened output of
        `model` for that image. An empty input yields an array with 0 rows.

    NOTE(review): the notebook feeds images read with cv2.imread (BGR), while
    Keras' VGG16 preprocess_input is documented to expect RGB and to convert
    to BGR itself, so channels reach the network reversed. This is applied
    consistently to train, test and single-image prediction, so it is left
    unchanged here; presumably the effect is small on near-grayscale
    ultrasound frames - confirm before changing, and retrain if you do.
    """
    if len(images) == 0:
        # np.vstack raises on an empty list; return an empty matrix with the
        # right number of columns instead.
        n_features = int(np.prod(model.output_shape[1:]))
        return np.empty((0, n_features), dtype=np.float32)

    features = []
    for start in tqdm(range(0, len(images), batch_size), desc="Extracting features"):
        # Explicit float32 copy: preprocess_input overwrites floating-point
        # inputs in place, which would corrupt the caller's array if it were
        # ever passed in as float. For uint8 input the values are unchanged.
        batch = np.array(images[start:start + batch_size], dtype=np.float32)
        batch = preprocess_input(batch)
        batch_features = model.predict(batch, verbose=0)
        features.append(batch_features.reshape(batch_features.shape[0], -1))  # Flatten
    return np.vstack(features)

# Extract training features
X_train_features = extract_features(X_train)
print("Extracted features shape:", X_train_features.shape)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Extracting features: 100%|██████████| 121/121 [00:35<00:00,  3.37it/s]

Extracted features shape: (1924, 25088)





In [None]:
# Initialize and train the classifier on the flattened VGG16 features.
# `use_label_encoder` was dropped from the constructor call: the installed
# XGBoost ignores it and only emits a "Parameters ... are not used" warning.
# (xgboost is already imported as `xgb` in the first cell.)
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train_features, y_train_enc)

print("✅ XGBoost training complete!")



Parameters: { "use_label_encoder" } are not used.



✅ XGBoost training complete!


In [None]:
# Load test images
test_dir = "/content/data/test"
X_test = []
y_test = []
skipped_test = []  # paths that OpenCV could not decode

for label in ['infected', 'notinfected']:
    folder = os.path.join(test_dir, label)
    # Sorted so the row order of the test set is the same on every run.
    for file in tqdm(sorted(os.listdir(folder)), desc=f"Loading {label}"):
        img_path = os.path.join(folder, file)
        img = cv2.imread(img_path)  # returns None instead of raising on unreadable files
        if img is None:
            # Record the file instead of dropping it silently, so the support
            # counts in the report below can be reconciled with the folder sizes.
            skipped_test.append(img_path)
            continue
        X_test.append(cv2.resize(img, (SIZE, SIZE)))
        y_test.append(label)

if skipped_test:
    print(f"Skipped {len(skipped_test)} unreadable test file(s), e.g. {skipped_test[:3]}")
if not X_test:
    # Fail here with a clear message rather than inside the metric functions.
    raise RuntimeError(f"No readable images found under {test_dir}")

X_test = np.array(X_test)
y_test = np.array(y_test)
y_test_enc = le.transform(y_test)  # Use same label encoder as train

# Extract VGG16 features
X_test_features = extract_features(X_test)

# Predict with XGBoost
y_pred = xgb_model.predict(X_test_features)

# Evaluate (the metric functions are already imported in the first cell).
# NOTE(review): the recorded run scored 1.0 on a test set that is almost the
# same size as the training set (1922 vs 1924 images). Presumably the two
# folders overlap - check for duplicate files before trusting these numbers.
print("\nAccuracy:", accuracy_score(y_test_enc, y_pred))
print("\nClassification Report:\n", classification_report(y_test_enc, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_enc, y_pred))


Loading infected: 100%|██████████| 787/787 [00:00<00:00, 1046.74it/s]
Loading notinfected: 100%|██████████| 1145/1145 [00:02<00:00, 466.40it/s]
Extracting features: 100%|██████████| 121/121 [00:26<00:00,  4.62it/s]



Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

    infected       1.00      1.00      1.00       781
 notinfected       1.00      1.00      1.00      1141

    accuracy                           1.00      1922
   macro avg       1.00      1.00      1.00      1922
weighted avg       1.00      1.00      1.00      1922


Confusion Matrix:
 [[ 781    0]
 [   0 1141]]


In [None]:
def predict_image(image_path):
    """Classify a single image file with the VGG16 + XGBoost pipeline.

    The image goes through the same steps as the training data: cv2.imread,
    resize to SIZE x SIZE, preprocess_input, VGG16 features, XGBoost.

    Args:
        image_path: path to an image file that OpenCV can decode.

    Returns:
        (label, prob): the class name decoded through `le`, and the
        probability XGBoost assigns to that predicted class.

    Raises:
        FileNotFoundError: if `image_path` is not an existing file.
        ValueError: if the file exists but OpenCV cannot decode it.
    """
    # cv2.imread returns None rather than raising, which previously surfaced
    # as an obscure assertion error inside cv2.resize.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not decode image: {image_path}")

    # Load and preprocess
    img = cv2.resize(img, (SIZE, SIZE))
    img = np.expand_dims(img, axis=0)  # add the batch dimension
    img = preprocess_input(img)

    # Feature extraction
    features = model.predict(img, verbose=0)
    features = features.reshape(1, -1)

    # Prediction: run the booster once and derive both class and confidence
    # from the probability vector.
    probs = xgb_model.predict_proba(features)[0]
    pred = int(np.argmax(probs))
    prob = probs[pred]

    label = le.inverse_transform([pred])[0]
    print(f"\nPrediction: {'PCOS Detected' if label == 'infected' else 'No PCOS Detected'}")
    print(f"Confidence: {prob:.2f}")

    return label, prob

predict_image("/content/sample.jpg")  # Replace with your image path



Prediction: PCOS Detected
Confidence: 1.00


(np.str_('infected'), np.float32(0.999446))

In [None]:
import pickle

# Objects to persist, keyed by the file each one is written to.
# Only the XGBoost classifier and the LabelEncoder are saved here; the VGG16
# feature extractor (`model`) is not written out by this cell.
artifacts = {
    "pcos_xgb_model.pkl": xgb_model,      # trained XGBoost model
    "pcos_label_encoder.pkl": le,         # fitted LabelEncoder
}

for filename, obj in artifacts.items():
    with open(filename, "wb") as fh:
        pickle.dump(obj, fh)

print("✅ Model and label encoder saved successfully.")


✅ Model and label encoder saved successfully.


In [None]:
# Show the class order held by the fitted encoder: position i in this array is
# the label that encoded value i maps back to via le.inverse_transform.
print(le.classes_)  # the recorded run printed ['infected' 'notinfected']


['infected' 'notinfected']
