In [15]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# 1️⃣ Load the dataset
df = pd.read_csv("dataset/pcos_data.csv")  # <-- Replace with your actual path

# 2️⃣ Pre-processing
df.drop(['Sl. No','Unnamed: 42','Patient File No.','Marraige Status (Yrs)','Blood Group','Hip(inch)','Waist(inch)'], axis=1, inplace=True)
df['Follicle_count']=df['Follicle No. (L)'] + df['Follicle No. (R)']
df.drop(['Follicle No. (L)','Follicle No. (R)','Avg. F size (L) (mm)', 'Avg. F size (R) (mm)','Endometrium (mm)','Hb(g/dl)'],axis = 1,inplace=True)

df["Cycle(R/I)"].replace({2: 0, 4: 1})
df["Cycle(R/I)"].replace({5:0})

df.rename(columns={'PCOS (Y/N)':'PCOS',
                   'Age (yrs)':'Age',
                   'Weight (Kg)':'Weight',
                   'Height(Cm)':'Height',
                   'Pulse rate(bpm)':'PulseRate',
                   'RR (breaths/min)':'RR',
                   'Pregnant(Y/N)':'Pregnant',
                   'No. of aborptions':'Abortions',
                   'FSH(mIU/mL)':'FSH',
                   'TSH (mIU/L)':'TSH',
                   'LH(mIU/mL)':'LSH',
                   'AMH(ng/mL)':'AMH',
                   'PRL(ng/mL)':'PRL',
                   'Vit D3 (ng/mL)':'VitD3',
                   'PRG(ng/mL)':'PRG',
                   'RBS(mg/dl)':'RBS',
                   'Weight gain(Y/N)':'Weight_gain',
                   'hair growth(Y/N)':'hair_growth',
                   'Skin darkening (Y/N)':'Skin_darkening',
                   'Hair loss(Y/N)':'Hair_loss',
                   'FSH/LH':'FSH_LH_ratio',
                   'Pimples(Y/N)':'Pimples',
                   'Fast food (Y/N)':'Fast_food',
                   'Reg.Exercise(Y/N)':'Reg_exercise',
                   'BP _Systolic (mmHg)':'BP_systolic',
                   'BP _Diastolic (mmHg)':'BP_diastolic',
                   'Waist:Hip Ratio':'W_H_ratio' },  
                    inplace=True)


df.replace({'.': np.nan, 'NA': np.nan, '?': np.nan}, inplace=True)


# Fill NA Values with median
df.fillna(df.median(numeric_only=True), inplace=True)
df.columns


Index(['PCOS', 'Age', 'Weight', 'Height', 'BMI', 'PulseRate', 'RR',
       'Cycle(R/I)', 'Cycle length(days)', 'Pregnant', 'Abortions', 'FSH',
       'LSH', 'FSH_LH_ratio', 'W_H_ratio', 'TSH', 'AMH', 'PRL', 'VitD3', 'PRG',
       'RBS', 'Weight_gain', 'hair_growth', 'Skin_darkening', 'Hair_loss',
       'Pimples', 'Fast_food', 'Reg_exercise', 'BP_systolic', 'BP_diastolic',
       'Follicle_count'],
      dtype='object')

In [10]:

# Target column
target_col = "PCOS"
if target_col not in df.columns:
    raise ValueError(f"Target column '{target_col}' NOT found.")

# Split into X and y
X = df.drop(columns=[target_col])
y = df[target_col]
X,y

(     Age  Weight   Height        BMI  PulseRate  RR  Cycle(R/I)  \
 0     28    44.6  152.000  19.300000         78  22           2   
 1     36    65.0  161.500  24.921163         74  20           2   
 2     33    68.8  165.000  25.270891         72  18           2   
 3     37    65.0  148.000  29.674945         72  20           2   
 4     25    52.0  161.000  20.060954         72  18           2   
 ..   ...     ...      ...        ...        ...  ..         ...   
 536   35    50.0  164.592  18.500000         72  16           2   
 537   30    63.2  158.000  25.300000         72  18           2   
 538   36    54.0  152.000  23.400000         74  20           2   
 539   27    50.0  150.000  22.200000         74  20           4   
 540   23    82.0  165.000  30.100000         80  20           4   
 
      Cycle length(days)  Pregnant  Abortions  ...  Weight_gain  hair_growth  \
 0                     5         0          0  ...            0            0   
 1                    

In [11]:

print("\nFinal Shapes -> X:", X.shape, "y:", y.shape)
print("\nTarget Value Counts:\n", y.value_counts())



Final Shapes -> X: (541, 30) y: (541,)

Target Value Counts:
 PCOS
0    364
1    177
Name: count, dtype: int64


In [3]:

# 3️ Train/Test Split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

# 4️ Build the Pipeline
# Individual pipelines
rf_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(random_state=42, class_weight='balanced'))
])

gb_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", GradientBoostingClassifier(random_state=42))
])

lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(random_state=42, max_iter=1000))
])
ensemble = VotingClassifier(
    estimators=[
        ("rf", rf_pipeline),
        ("gb", gb_pipeline),
        ("lr", lr_pipeline),
    ],
    voting="soft"
)

In [4]:
# Train
ensemble.fit(X_train, y_train)


0,1,2
,estimators,"[('rf', ...), ('gb', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [5]:


# Evaluate
y_pred = ensemble.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92        77
           1       0.86      0.75      0.80        32

    accuracy                           0.89       109
   macro avg       0.88      0.85      0.86       109
weighted avg       0.89      0.89      0.89       109


Confusion Matrix:
 [[73  4]
 [ 8 24]]


In [7]:

# 7️ Save the Pipeline
os.makedirs("models", exist_ok=True)
joblib.dump(ensemble, "models/pcos.pkl")  # Single .pkl containing BOTH scaler + model
print(" Model saved as 'models/pcos.pkl'")


 Model saved as 'models/pcos.pkl'
