# PROJECT -  Polycystic ovary syndrome (PCOS) PREDICTION USING MACHINE LEARNING 

In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv("PCOS_infertility.csv")
df = df.fillna(0)

In [3]:
df.columns

Index(['Sl. No', 'Patient File No.', 'PCOS (Y/N)', '  I   beta-HCG(mIU/mL)',
       'II    beta-HCG(mIU/mL)', 'AMH(ng/mL)'],
      dtype='object')

In [4]:
df

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),I beta-HCG(mIU/mL),II beta-HCG(mIU/mL),AMH(ng/mL)
0,1,10001,0,1.99,1.99,2.07
1,2,10002,0,60.80,1.99,1.53
2,3,10003,1,494.08,494.08,6.63
3,4,10004,0,1.99,1.99,1.22
4,5,10005,0,801.45,801.45,2.26
...,...,...,...,...,...,...
536,537,10537,0,1.99,1.99,1.7
537,538,10538,0,80.13,1.99,5.6
538,539,10539,0,1.99,1.99,3.7
539,540,10540,0,292.92,1.99,5.2


# DATA SEGREGATION INTO FEATURES AND TARGET

In [5]:
inputs = df.drop(["PCOS (Y/N)","Sl. No","Patient File No."],axis = 'columns')

In [6]:
inputs.head(50)
inputx = inputs.head(4)

In [7]:
inputx
inputy = inputx.tail(2)

In [8]:
outputy = inputy.applymap(lambda x: "")

  outputy = inputy.applymap(lambda x: "")


In [9]:
target = df["PCOS (Y/N)"]

In [10]:
targetdum = target.head(4)
inputsdumy = inputs.head(4)


# DATA ENCODING FOR TEXT BASED FEATURES & REFORMATION 

In [11]:
inputs

Unnamed: 0,I beta-HCG(mIU/mL),II beta-HCG(mIU/mL),AMH(ng/mL)
0,1.99,1.99,2.07
1,60.80,1.99,1.53
2,494.08,494.08,6.63
3,1.99,1.99,1.22
4,801.45,801.45,2.26
...,...,...,...
536,1.99,1.99,1.7
537,80.13,1.99,5.6
538,1.99,1.99,3.7
539,292.92,1.99,5.2


In [12]:
# NOT REQUIRED SINCE ALL FEATURES HAVE NUMERIC DATATYPE

## DATA SEGRECATION FOR TRAINING AND TESTING MACHINE MODEL(WITHOUT PCA)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputsdumy, targetdum, test_size=0.2)

In [14]:
df.dtypes

Sl. No                      int64
Patient File No.            int64
PCOS (Y/N)                  int64
  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)    float64
AMH(ng/mL)                 object
dtype: object

In [15]:
df['II    beta-HCG(mIU/mL)'] = pd.to_numeric(df['II    beta-HCG(mIU/mL)'])


# Applying Machine Learning Models without  Principal Component Analysis

## Logistic Regression

In [16]:
print(inputs.dtypes)
print(target.dtype)

# Convert to numeric if needed
inputs = inputs.apply(pd.to_numeric, errors='coerce')
target = pd.to_numeric(target, errors='coerce')

  I   beta-HCG(mIU/mL)    float64
II    beta-HCG(mIU/mL)    float64
AMH(ng/mL)                 object
dtype: object
int64


In [17]:
print(inputs.isnull().sum())
print(target.isnull().sum())

# Drop rows with missing values
inputs = inputs.dropna()
target = target[inputs.index]

  I   beta-HCG(mIU/mL)    0
II    beta-HCG(mIU/mL)    0
AMH(ng/mL)                1
dtype: int64
0


In [18]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_cols = inputs.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(inputs[categorical_cols])

# Combine with numeric columns
numeric_cols = inputs.select_dtypes(include=['int64', 'float64']).columns
inputs_encoded = pd.concat([inputs[numeric_cols], 
                            pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))], 
                           axis=1)

In [19]:
import sklearn
print(sklearn.__version__)

1.5.1


In [27]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# Preprocess the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(inputs)

# Feature selection
selector = SelectKBest(f_classif, k=5)  # Select top 5 features
X_selected = selector.fit_transform(X_scaled, target)

# Create a pipeline
pipeline = Pipeline([
    ('classifier', RandomForestClassifier())
])

# Define parameter space
param_dist = {
    'classifier__n_estimators': randint(50, 500),
    'classifier__max_depth': randint(1, 20),
    'classifier__min_samples_split': randint(2, 11),
    'classifier__min_samples_leaf': randint(1, 11)
}

# Randomized search with cross-validation
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, 
                                   n_iter=100, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
random_search.fit(X_selected, target)

# Print best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Evaluate on the entire dataset
best_model = random_search.best_estimator_
score = best_model.score(X_selected, target)*100


print("Final model score:", score)



Best parameters: {'classifier__max_depth': 3, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 116}
Best cross-validation score: 0.6796296296296296
Final model score: 72.22222222222221


In [21]:
reg_model = LogisticRegression()
reg_model.fit(inputs,target)


In [22]:
reg_model.score(X_test,y_test)
Score = reg_model.score(X_train,y_train)*100
print("prediction score is",Score )

prediction score is 66.66666666666666


## Support Vector Machine

In [23]:
print("Unique classes in y_train:", np.unique(y_train))


Unique classes in y_train: [0 1]


In [24]:
SV_model = SVC()
SV_model.fit(X_train, y_train)
SV_model.score(X_test,y_test)
Score = SV_model.score(inputs,target)*100
print("prediction score is",Score )

prediction score is 62.40740740740741


## Random Forest Classification

In [25]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)
Score = clf.score(inputs,target)*100
print("prediction score is",Score )

prediction score is 67.4074074074074


### Machine learning model without Principle Component Analysis had Better prediction score.

## Making Pickle file to save the machine learning model

In [26]:
pickle.dump(clf, open('modelfinal1.pkl', 'wb'))