Objective: Predict whether the patient will get pregnant based on IVF treatment factors.

In [161]:
# -----------------------------------------
# 📦 Step 1: Import Required Libraries
# -----------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for clean output

In [162]:
# -----------------------------------------
# 📂 Step 2: Read the raw dataset
# -----------------------------------------
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="HealthCareProject.csv")


In [163]:
# About the dataset: preview the first rows; a column legend follows.
df.head()

# Column legend (meanings as recorded by the notebook author):
# v1  - Age
# v2  - Categorical: type of sterility.
#       Primary infertility   = the patient has never conceived before.
#       Secondary infertility = the patient has conceived before but is now
#       struggling with infertility.
# v3  - Years before pregnancy (years after marriage)
# v4  - Categorical: source of infertility (male, female, or both)
# v5  - Categorical: whose egg is used (self or donated)
# v6  - Categorical: sperm source (self, donor, or a combination of both)
# v7  - Categorical: method used to activate sperm (stimulation)
# v8  - Categorical: how the sperm was obtained (4 to 5 methods, e.g.
#       masturbation); the author also labels it an "outcome classifier"
# v9  - Number of eggs
# v10 - Intermediate outcome: how many eggs formed
# v11 - Intermediate outcome: how many eggs were converted to embryos
# v12 - Timing: immediately or after freezing. Open question from the author:
#       is this a first-tier or a second-tier input variable?
# v13 - Pregnant (Yes/No)

Unnamed: 0.1,Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13
0,METype,1,1,1.0,1,1,1,1,1,1,2,2,1,3
1,C1,32,2,3.5,3,1,1,2,1,5,2,3,2,1
2,C2,42,1,3.5,3,1,1,1,2,3,2,3,2,3
3,C3,29,2,3.5,2,1,1,1,2,8,7,8,2,1
4,C4,23,1,3.0,3,1,1,1,2,15,9,9,2,1


In [164]:

# Remove the 'Unnamed: 0' column, discard row 0 (likely a header or
# placeholder row), then renumber the remaining rows from zero.
df = df.drop(columns='Unnamed: 0')
df = df.drop(index=0)
df = df.reset_index(drop=True)

In [167]:
# Treat the 999 sentinel as missing, then forward-fill each gap with the value
# from the preceding row. `DataFrame.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling, and the chained, non-inplace form avoids
# mutating the frame in place.
# NOTE(review): forward fill copies the value of the previous row; confirm that
# this is the intended imputation for this dataset.
df = df.replace(999, np.nan).ffill()

# Convert all columns to integers.
# NOTE: astype(int) truncates fractional values (e.g. 3.5 becomes 3) and raises
# if a NaN is still present, which happens when a column starts with 999.
df = df.astype(int)

In [171]:
#checking for unique values in dataset having a look at data
# Walk over every column and print its distinct values with their count,
# followed by a separator line.
for col, series in df.items():
    unique_vals = series.unique()
    num_unique = len(unique_vals)
    print(f"Column: {col}")
    print(f"Unique values ({num_unique}): {unique_vals}\n")

    print("*"*50)

Column: v1
Unique values (27): [32 42 29 23 22 27 44 34 45 30 25 39 28 18 19 38 26 31 40 35 36 43 33 37
 24 41 21]

**************************************************
Column: v2
Unique values (2): [2 1]

**************************************************
Column: v3
Unique values (19): [ 3  5 13 15 27  2  1  8 10 12  4  7  6 17  9 11 19 14 16]

**************************************************
Column: v4
Unique values (4): [3 2 1 4]

**************************************************
Column: v5
Unique values (2): [1 2]

**************************************************
Column: v6
Unique values (3): [1 2 3]

**************************************************
Column: v7
Unique values (2): [2 1]

**************************************************
Column: v8
Unique values (5): [1 2 4 3 5]

**************************************************
Column: v9
Unique values (18): [ 5  3  8 15 16 10  7 12 14  9 13  6 11  0  2  4 17  1]

**************************************************
Column: v10


In [169]:

# -----------------------------------------
# 🧹 Step 3: Clean Target & Remove Class 2
# -----------------------------------------
# v13 is the target (Pregnant: Yes/No), encoded as:
#   1 = Pregnant, 3 = Not Pregnant, 2 = Unknown.
# Keep only the rows whose outcome is not class 2.
outcome_known = df['v13'] != 2
df = df.loc[outcome_known].reset_index(drop=True)

# Re-encode the target: 1 → 0 (Pregnant), 3 → 1 (Not Pregnant).
# NOTE: run this once per data load. Mapping already re-encoded values a second
# time turns 0 into NaN, because 0 is not a key of the mapping dictionary.
df['v13'] = df['v13'].map({1: 0, 3: 1})

In [170]:
def treat_outliers_iqr(df, cols):
    """
    Cap outliers in the specified columns using the 1.5 * IQR rule.

    For each column, values below Q1 - 1.5 * IQR are raised to that lower
    bound and values above Q3 + 1.5 * IQR are lowered to that upper bound.
    The quartiles are computed from the data that is passed in.

    The input dataframe is not modified: capping is applied to a copy, so
    calling this function never alters a frame that other cells still use.

    Parameters:
        df (pd.DataFrame): The dataframe to read from (left untouched).
        cols (list): List of column names to apply IQR capping on.

    Returns:
        pd.DataFrame: A copy of ``df`` with outliers in ``cols`` capped.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    capped = df.copy()
    for col in cols:
        Q1 = capped[col].quantile(0.25)
        Q3 = capped[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # The bounds can be fractional, so an integer column may come back
        # as float; callers that need integers must cast afterwards.
        capped[col] = capped[col].clip(lower=lower_bound, upper=upper_bound)
    return capped

# Cap outliers only in the continuous features (v1 and v9), then cast every
# column back to integers.
df = treat_outliers_iqr(df, cols=['v1', 'v9']).astype(int)

df.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13
0,32,2,3,3,1,1,2,1,5,2,3,2,0
1,42,1,3,3,1,1,1,2,3,2,3,2,1
2,29,2,3,2,1,1,1,2,8,7,8,2,0
3,23,1,3,3,1,1,1,2,15,9,9,2,0
4,22,1,5,3,1,1,1,1,5,3,3,2,1


In [174]:
def restofthecolumns(df, cols):
    """
    Build a dictionary mapping each requested column name to the
    ``value_counts()`` result for that column.

    Parameters:
        df (pd.DataFrame): Dataframe that holds the columns.
        cols (list): Column names to summarise.

    Returns:
        dict: ``{column name: value counts Series}``, in the order of ``cols``.
    """
    return {col: df[col].value_counts() for col in cols}


# Categorical columns whose category frequencies are inspected below.
categorical_cols = [
    'v2', 'v4', 'v5', 'v6',
    'v7', 'v8', 'v12',
]
restofthecolumns(df, categorical_cols)


{'v2': v2
 1    142
 2     74
 Name: count, dtype: int64,
 'v4': v4
 3    161
 2     40
 1      9
 4      6
 Name: count, dtype: int64,
 'v5': v5
 1    168
 2     48
 Name: count, dtype: int64,
 'v6': v6
 1    190
 2     19
 3      7
 Name: count, dtype: int64,
 'v7': v7
 1    209
 2      7
 Name: count, dtype: int64,
 'v8': v8
 1    116
 2     46
 3     31
 4     21
 5      2
 Name: count, dtype: int64,
 'v12': v12
 2    191
 1     25
 Name: count, dtype: int64}

In [None]:
# -----------------------------------------
# 🎯 Step 5: Feature & Target Selection
# -----------------------------------------
# Feature subset chosen from an earlier feature-importance analysis
# (that analysis is not shown in this part of the notebook).
selected_features = ['v1', 'v3', 'v9', 'v10', 'v11', 'v5']
X, y = df[selected_features], df['v13']

# -----------------------------------------
# 📊 Step 6: Train/Test Split (with Stratify)
# 80/20 split; stratifying on y keeps the class ratio alike in both parts.
# -----------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# -----------------------------------------
# ⚖️ Step 7: Handle Class Imbalance with RandomOverSampler
# Only the training split is resampled; the test split is left as drawn.
# -----------------------------------------
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# -----------------------------------------
# ⚖️ Step 8: Scale Features with MinMaxScaler
# The scaler is fitted on the resampled training data only and then applied
# unchanged to the test data.
# -----------------------------------------
scaler = MinMaxScaler()
scaler.fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:


# -----------------------------------------
# 🌲 Step 9: Build & Train RandomForestClassifier
# The forest is fitted on the oversampled, scaled training data.
# -----------------------------------------
model = RandomForestClassifier(
    random_state=42,
    n_estimators=200,        # trees in the forest
    max_depth=8,             # cap on the depth of each tree
    min_samples_leaf=2,      # smallest allowed leaf
    min_samples_split=4,     # smallest node that may still be split
).fit(X_train_scaled, y_train_resampled)

# -----------------------------------------
# 📈 Step 10: Evaluate the Model
# -----------------------------------------
# Hard predictions, plus the score of the class in column 1 of predict_proba,
# for the untouched test split.
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

# Accuracy in percent. The training figure is measured on the oversampled
# training data, not on the original class distribution.
train_acc = 100 * model.score(X_train_scaled, y_train_resampled)
test_acc = 100 * model.score(X_test_scaled, y_test)

print(f"✅ Training Accuracy: {train_acc:.2f}%")
print(f"✅ Test Accuracy: {test_acc:.2f}%")

# Per-class precision / recall / F1
print("\n📋 Classification Report:")
print(classification_report(y_test, y_pred))

# Rows are true labels, columns are predicted labels
print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"🔍 ROC AUC Score: {roc_auc_score(y_test, y_proba):.2f}")


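Determined that 1 is success and 2, 3 are failure, hence the mapping. (Note: the cleaning step in this notebook drops class 2 as "unknown" instead of mapping it to failure, so only classes 1 and 3 are actually mapped.)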

In [None]:
# Frequency of each v12 category.
df.loc[:, 'v12'].value_counts()

Bivariate Analysis

In [None]:
# for col in df.columns:
#     plt.figure(figsize=(6,2))
#     sns.barplot(x=col, y="v13" , data = df)
#     plt.title(f"{col} vs v13")
#     plt.show()

Multivariate Analysis

In [None]:
# plt.figure(figsize=(12,8))
# sns.heatmap(df.corr(),annot= True, cmap= "coolwarm")
# plt.show()

In [None]:
# -----------------------------------------
# 📦 1. Import Libraries
# -----------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
import warnings

warnings.filterwarnings("ignore")

# -----------------------------------------
# 📂 2. Load & Preprocess Dataset
# -----------------------------------------
df = pd.read_csv("HealthCareProject.csv")
# Drop the 'Unnamed: 0' column and the first row, then renumber the rows.
df = df.drop(columns='Unnamed: 0').drop(index=0).reset_index(drop=True)
# 999 marks a missing value: convert it to NaN, forward-fill from the previous
# row and cast everything to int. `.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling.
# NOTE: astype(int) truncates fractional values and raises if a NaN remains
# (i.e. when a column starts with 999).
df = df.replace(999, np.nan).ffill().astype(int)
# Keep only rows whose target is not class 2, then re-encode the target.
df = df[df['v13'] != 2].reset_index(drop=True)
df['v13'] = df['v13'].map({1: 0, 3: 1})  # 0 = Pregnant, 1 = Not Pregnant

# -----------------------------------------
# ⚠️ 3. IQR Outlier Treatment (applied to v1 only)
# -----------------------------------------
def treat_outliers_iqr(df, cols):
    """
    Cap outliers in the given columns using the 1.5 * IQR rule.

    Values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are replaced by the
    respective bound. The quartiles come from the data that is passed in.

    The input dataframe is not modified; the capping is applied to a copy.

    Parameters:
        df (pd.DataFrame): The dataframe to read from (left untouched).
        cols (list): Names of the columns to cap.

    Returns:
        pd.DataFrame: A copy of ``df`` with the listed columns capped.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    capped = df.copy()
    for col in cols:
        Q1 = capped[col].quantile(0.25)
        Q3 = capped[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        # Fractional bounds can turn an integer column into float.
        capped[col] = capped[col].clip(lower=lower, upper=upper)
    return capped

# Cap outliers in v1 only.
df = treat_outliers_iqr(df, ['v1'])

# -----------------------------------------
# 🚨 4. Handle Rare Categories in Categorical Columns
# -----------------------------------------
def group_rare_categories(df, col, threshold=0.05):
    """
    Collapse infrequent categories of one column into a single bucket coded 0.

    A category counts as rare when its relative frequency in ``df[col]`` is
    strictly below ``threshold``. Every rare value is replaced by 0; all other
    values are kept as they are.

    The input dataframe is not modified; the result is a copy.

    Parameters:
        df (pd.DataFrame): Source dataframe (left untouched).
        col (str): Name of the categorical column to process.
        threshold (float): Relative-frequency cut-off; defaults to 0.05.

    Returns:
        pd.DataFrame: A copy of ``df`` with rare categories of ``col`` set to 0.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.

    Note:
        0 is the bucket code. If 0 is already a genuine category of the
        column, it becomes indistinguishable from the grouped rare values.
    """
    freq = df[col].value_counts(normalize=True)
    rare_vals = freq[freq < threshold].index
    grouped = df.copy()
    # Vectorised replacement instead of a per-row Python lambda.
    grouped[col] = grouped[col].mask(grouped[col].isin(rare_vals), 0)
    return grouped

# Group rare categories (relative frequency below 5%) in every categorical
# column, one column at a time.
categorical_cols = [
    'v2', 'v4', 'v5', 'v6',
    'v7', 'v8', 'v12',
]
for col in categorical_cols:
    df = group_rare_categories(df=df, col=col, threshold=0.05)

# -----------------------------------------
# 🎯 5. Features and Target
# -----------------------------------------
# Model inputs and the binary target column v13.
features = [
    'v1', 'v3', 'v5', 'v6',
    'v9', 'v10', 'v11', 'v12',
]
X, y = df[features], df['v13']

# -----------------------------------------
# ✂️ 6. Split, Resample, and Scale
# -----------------------------------------
# Stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Oversample the training split only; the test split is left as drawn.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------
# 🌲 7. Random Forest Model Training
# -----------------------------------------
# NOTE(review): the training data was already oversampled above, so
# class_weight='balanced' is probably redundant here; confirm before tuning.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=8,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train_resampled)

# -----------------------------------------
# 📊 8. Evaluation
# -----------------------------------------
# Hard predictions, plus the score of the class in column 1 of predict_proba,
# for the test split.
y_pred = rf_model.predict(X_test_scaled)
y_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Accuracy in percent; the training figure is measured on the oversampled data.
train_acc = 100 * rf_model.score(X_train_scaled, y_train_resampled)
test_acc = 100 * rf_model.score(X_test_scaled, y_test)

print(f"\n✅ Training Accuracy: {train_acc:.2f}%")
print(f"✅ Test Accuracy: {test_acc:.2f}%")

# Per-class precision / recall / F1
print("\n📋 Classification Report:")
print(classification_report(y_test, y_pred))

# Rows are true labels, columns are predicted labels
print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"🔍 ROC AUC Score: {roc_auc_score(y_test, y_proba):.2f}")
