In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:

print("Proper implementation of ensemble learning with cascading technique for PCOS prediction")

# Load Data
data = pd.read_csv('clean_data.csv')
data.drop(data.columns[0], inplace=True, axis=1)

cols = list(data.columns.values)
cols.pop(cols.index('City'))
cols.pop(cols.index('PCOS'))
cols.pop(cols.index('PCOS_from'))
data = data[cols+['PCOS']]

data['PCOS'] = data['PCOS'].map(dict(Yes = 1, No = 0))
print("\nData loaded and preprocessed successfully.")

x = data.drop('PCOS', axis = 1)
y = data['PCOS']

# Split and scale data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Scale data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

print("\nOriginal model implementation (with issues):")
# Define Models
model1 = GaussianNB()
model2 = LogisticRegression(random_state=42)
model3 = DecisionTreeClassifier(random_state=42)

# Original implementation (problematic)
model1.fit(x_train_scaled, y_train)
pred1 = model1.predict(x_test_scaled)
print("Model 1 Accuracy:", accuracy_score(y_test, pred1))

X_train2 = pd.DataFrame(model1.predict_proba(x_train_scaled))
X_test2 = pd.DataFrame(model1.predict_proba(x_test_scaled))

model2.fit(X_train2, y_train)
pred2 = model2.predict(X_test2)
print("Model 2 Accuracy:", accuracy_score(y_test, pred2))

X_train3 = pd.DataFrame(model2.predict_proba(X_train2))
X_test3 = pd.DataFrame(model2.predict_proba(X_test2))

model3.fit(X_train3, y_train)
pred3 = model3.predict(X_test3)
print("Model 3 Accuracy:", accuracy_score(y_test, pred3))

print("\nImproved implementation with proper cascading ensemble technique:")
# Reset and define new models
model1 = GaussianNB()
model2 = LogisticRegression(max_iter=1000, random_state=42)
model3 = DecisionTreeClassifier(max_depth=5, random_state=42)

# Step 1: Train model 1 on original features
model1.fit(x_train_scaled, y_train)
pred1_train = model1.predict(x_train_scaled).reshape(-1, 1)
pred1_test = model1.predict(x_test_scaled).reshape(-1, 1)
prob1_train = model1.predict_proba(x_train_scaled)
prob1_test = model1.predict_proba(x_test_scaled)
print("Model 1 Accuracy:", accuracy_score(y_test, pred1_test.ravel()))

# Step 2: Combine original features with model 1 outputs for model 2
x_train_m2 = np.hstack((x_train_scaled, prob1_train))
x_test_m2 = np.hstack((x_test_scaled, prob1_test))

# Train model 2 with enhanced features
model2.fit(x_train_m2, y_train)
pred2_train = model2.predict(x_train_m2).reshape(-1, 1)
pred2_test = model2.predict(x_test_m2).reshape(-1, 1)
prob2_train = model2.predict_proba(x_train_m2)
prob2_test = model2.predict_proba(x_test_m2)
print("Model 2 Accuracy:", accuracy_score(y_test, pred2_test.ravel()))

# Step 3: Combine original features with model 1 and 2 outputs for model 3
x_train_m3 = np.hstack((x_train_scaled, prob1_train, prob2_train))
x_test_m3 = np.hstack((x_test_scaled, prob1_test, prob2_test))

# Train model 3 with enhanced features
model3.fit(x_train_m3, y_train)
pred3_test = model3.predict(x_test_m3)
print("Model 3 Accuracy:", accuracy_score(y_test, pred3_test))

# Step 4: Create final ensemble using weighted voting
weights = [1, 2, 1.5]  # Weights for model 1, 2, and 3
normalized_weights = [w/sum(weights) for w in weights]

# Get weighted probabilities for positive class
weighted_probs = (normalized_weights[0] * prob1_test[:, 1] + 
                 normalized_weights[1] * prob2_test[:, 1] + 
                 normalized_weights[2] * model3.predict_proba(x_test_m3)[:, 1])

# Convert probabilities to binary predictions
ensemble_pred = (weighted_probs > 0.5).astype(int)
print("Ensemble Model Accuracy:", accuracy_score(y_test, ensemble_pred)) 

Proper implementation of ensemble learning with cascading technique for PCOS prediction

Data loaded and preprocessed successfully.

Original model implementation (with issues):
Model 1 Accuracy: 0.8059701492537313
Model 2 Accuracy: 0.835820895522388
Model 3 Accuracy: 0.7761194029850746

Improved implementation with proper cascading ensemble technique:
Model 1 Accuracy: 0.8059701492537313
Model 2 Accuracy: 0.8656716417910447
Model 3 Accuracy: 0.835820895522388
Ensemble Model Accuracy: 0.8656716417910447
