In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif

# Load the dataset
file_path = r"C:\Users\Aman Desai\Desktop\IDA\4501_AMANHI_With_USG.xlsx"
data = pd.read_excel(file_path)

# Drop unnecessary columns
columns_to_drop = [
    'WHOWID', 'ORIG_ID', 'PARTICIPANT_ID', 'BABY_ID1', 'BIRTH_OUTCOME1', 'BABY_SEX1',
    'BABY_ID2', 'BIRTH_OUTCOME2', 'BABY_SEX2', 'BABY_ID3', 'BIRTH_OUTCOME3', 'BABY_SEX3',
    'DEL_DATE', 'age_death_b1', 'age_death_b2','age_death_b3'
]
data.drop(columns=columns_to_drop, inplace=True)


# Convert 'GAGEBRTH' to weeks and create target variable for spontaneous abortion (1 for <22 weeks, 0 for >=22 weeks)
data['GAGEBRTH'] = pd.to_numeric(data['GAGEBRTH'], errors='coerce') / 7
data['Spontaneous_Abortion'] = data['GAGEBRTH'].apply(lambda x: 1 if x < 22 else 0 if x >= 22 else np.nan)

# Drop the original 'GAGEBRTH' column as it's now converted
data.drop(columns=['GAGEBRTH'], inplace=True)

# Handle missing values in the target column by mode imputation
target_mode = data['Spontaneous_Abortion'].mode()[0]
data['Spontaneous_Abortion'].fillna(target_mode, inplace=True)

# Define target variable and features
target = 'Spontaneous_Abortion'
features = [col for col in data.columns if col != target]

# Replace special placeholders (-88, -77) with NaN
data.replace({-88: np.nan, -77: np.nan}, inplace=True)

print(data.head(20))

# Identify numerical columns (ignoring NaN)
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()

# Identify categorical columns (including NaN)
categorical_cols = [col for col in data.columns if col not in numerical_cols]

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)


    PW_AGE  PW_EDUCATION  PREV_SB  PREV_MIS  PREV_PTB  PREV_MULTIP  PREV_CS  \
0       36          10.0      1.0       2.0       0.0          1.0      0.0   
1       32          10.0      0.0       0.0       0.0          0.0      0.0   
2       18           6.0      0.0       1.0       0.0          0.0      0.0   
3       30           6.0      1.0       0.0       0.0          0.0      0.0   
4       22          10.0      NaN       NaN       NaN          NaN      NaN   
5       43          10.0      1.0       1.0       0.0          0.0      0.0   
6       18           6.0      NaN       NaN       NaN          NaN      NaN   
7       28          10.0      0.0       0.0       0.0          0.0      0.0   
8       22          10.0      NaN       NaN       NaN          NaN      NaN   
9       25           6.0      0.0       0.0       0.0          0.0      0.0   
10      27          10.0      NaN       NaN       NaN          NaN      NaN   
11      20          10.0      NaN       NaN       Na

In [12]:
import pandas as pd

# Print all column names to verify
print("Columns in the DataFrame:", data.columns)
 
# Replace blank spaces with NaN
data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Compute mode for categorical columns
mode_columns = [
    'PREV_SB', 'PREV_MIS', 'PREV_PTB', 'PREV_MULTIP', 'PREV_CS',
    'LABOUR_HTN', 'LABOUR_24', 'UDIP_PROT1', 'UDIP_PROT2', 
    'UDIP_PROT3', 'UDIP_PROT4', 'APH', 'SINGLE_TWIN'
]

for col in mode_columns:
    mode_value = data[col].mode()[0]
    data[col].fillna(mode_value, inplace=True)

# Compute mean for numerical columns
mean_columns = [
    'PW_EDUCATION', 'BIRTH_WEIGHT1', 'BIRTH_WEIGHT2', 
    'SBP1', 'DBP1', 'SBP2', 'DBP2', 'SBP3', 'DBP3', 
    'SBP4', 'DBP4', 'MAT_WEIGHT'
]

for col in mean_columns:
    mean_value = pd.to_numeric(data[col], errors='coerce').mean()
    data[col].fillna(mean_value, inplace=True)

# Set GRAVIDITY and PARITY to 0 if blank
data['GRAVIDITY'].fillna(0, inplace=True)
data['PARITY'].fillna(0, inplace=True)

# One-hot encode WEALTH_INDEX and TYPEDELIV
data = pd.get_dummies(data, columns=['WEALTH_INDEX', 'TYPEDELIV'], drop_first=True)

# Check the first few rows of the cleaned dataset
print(data.head())

Columns in the DataFrame: Index(['PW_AGE', 'PW_EDUCATION', 'PREV_SB', 'PREV_MIS', 'PREV_PTB',
       'PREV_MULTIP', 'PREV_CS', 'WEALTH_INDEX', 'SINGLE_TWIN', 'GRAVIDITY',
       'PARITY', 'LABOUR_HTN', 'LABOUR_24', 'BIRTH_OUTCOME', 'BABY_SEX',
       'BIRTH_WEIGHT', 'BIRTH_WEIGHT1', 'BIRTH_WEIGHT2', 'BIRTH_WEIGHT3',
       'SBP1', 'DBP1', 'UDIP_PROT1', 'SBP2', 'DBP2', 'UDIP_PROT2', 'SBP3',
       'DBP3', 'UDIP_PROT3', 'SBP4', 'DBP4', 'UDIP_PROT4', 'TYPEDELIV', 'APH',
       'MAT_WEIGHT', 'Spontaneous_Abortion'],
      dtype='object')
   PW_AGE  PW_EDUCATION  PREV_SB  PREV_MIS  PREV_PTB  PREV_MULTIP  PREV_CS  \
0      36          10.0      1.0       2.0       0.0          1.0      0.0   
1      32          10.0      0.0       0.0       0.0          0.0      0.0   
2      18           6.0      0.0       1.0       0.0          0.0      0.0   
3      30           6.0      1.0       0.0       0.0          0.0      0.0   
4      22          10.0      0.0       0.0       0.0          0.0     

In [25]:
import pandas as pd

# Convert True/False to 1/0
data = data.replace({True: 1, False: 0})
data.info()
#data = data.drop(['BIRTH_OUTCOME','BABY_SEX','BIRTH_WEIGHT','BIRTH_WEIGHT1','BIRTH_WEIGHT2','BIRTH_WEIGHT3'])
data.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4501 entries, 0 to 4500
Data columns (total 34 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   PW_AGE                                 4501 non-null   int64  
 1   PW_EDUCATION                           4501 non-null   float64
 2   PREV_SB                                4501 non-null   float64
 3   PREV_MIS                               4501 non-null   float64
 4   PREV_PTB                               4501 non-null   float64
 5   PREV_MULTIP                            4501 non-null   float64
 6   PREV_CS                                4501 non-null   float64
 7   SINGLE_TWIN                            4501 non-null   float64
 8   GRAVIDITY                              4501 non-null   float64
 9   PARITY                                 4501 non-null   float64
 10  LABOUR_HTN                             4501 non-null   float64
 11  LABO

Unnamed: 0,PW_AGE,PW_EDUCATION,PREV_SB,PREV_MIS,PREV_PTB,PREV_MULTIP,PREV_CS,SINGLE_TWIN,GRAVIDITY,PARITY,...,APH,MAT_WEIGHT,Spontaneous_Abortion,WEALTH_INDEX_Middle,WEALTH_INDEX_Poor,WEALTH_INDEX_Poorest,WEALTH_INDEX_Rich,WEALTH_INDEX_Richest,TYPEDELIV_Caesarean Section,TYPEDELIV_Normally through the vagina
0,36,10.0,1.0,2.0,0.0,1.0,0.0,1.0,8.0,3.0,...,0.0,45.8,0.0,1,0,0,0,0,0,1
1,32,10.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,3.0,...,0.0,59.798537,0.0,0,1,0,0,0,0,1
2,18,6.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,...,0.0,68.0,0.0,1,0,0,0,0,0,1
3,30,6.0,1.0,0.0,0.0,0.0,0.0,1.0,8.0,6.0,...,0.0,59.798537,0.0,0,1,0,0,0,0,1
4,22,10.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,59.798537,0.0,0,0,0,0,1,0,1
5,43,10.0,1.0,1.0,0.0,0.0,0.0,1.0,13.0,9.0,...,0.0,59.798537,0.0,1,0,0,0,0,0,1
6,18,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,58.0,0.0,0,0,0,0,1,0,1
7,28,10.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,...,0.0,62.7,0.0,0,1,0,0,0,0,1
8,22,10.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,59.798537,0.0,0,0,0,0,1,0,1
9,25,6.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,3.0,...,0.0,59.798537,0.0,0,1,0,0,0,0,1


In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score


# Define features (X) and target (y)
X = data.drop('Spontaneous_Abortion', axis=1)
y = data['Spontaneous_Abortion']

# Check class distribution before balancing
print("Class Distribution Before Balancing:")
print(y.value_counts())


# Step 3: Feature selection using Mutual Information
mi_scores = mutual_info_classif(X, y, discrete_features='auto', random_state=42)


# Step 2: Encode categorical variables (if any)
X_encoded = pd.get_dummies(X, drop_first=True)

# Step 3: Feature selection using Mutual Information
mi_scores = mutual_info_classif(X_encoded, y, discrete_features='auto', random_state=42)

# Create a DataFrame to display scores
mi_scores_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'MI_Score': mi_scores
}).sort_values(by='MI_Score', ascending=False)
# Filter datasets to include only selected features
X_selected = X_encoded[selected_features]

# Step 4: Balance the classes using ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_selected, y)

# Check class distribution after balancing
print("\nClass Distribution After Balancing:")
print(pd.Series(y_resampled).value_counts())

# Step 5: Train a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Step 6: Perform Stratified K-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf_model, X_resampled, y_resampled, cv=cv, scoring='f1')
print("\nCross-Validation F1 Scores:")
print(scores)
print("Mean F1 Score:", scores.mean())

# Step 7: Evaluate on a Synthetic Balanced Test Set
# Split the original data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, stratify=y, random_state=42
)

# Balance the test set using ADASYN
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = adasyn.fit_resample(X_test, y_test)

# Train the model on the balanced training set
rf_model.fit(X_resampled, y_resampled)

# Evaluate the model on the balanced test set
y_pred = rf_model.predict(X_test_resampled)
y_predy = rf_model.predict(X_train_resampled)

print("\nConfusion Matrix:")
print(confusion_matrix(y_train_resampled, y_predy))

print("\nClassification Report:")
print(classification_report(y_train_resampled, y_predy))
# Confusion matrix and classification report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_resampled, y_pred))

print("\nClassification Report:")
print(classification_report(y_test_resampled, y_pred))

# ROC-AUC Score
y_pred_proba = rf_model.predict_proba(X_test_resampled)[:, 1]
roc_auc = roc_auc_score(y_test_resampled, y_pred_proba)
print("\nROC-AUC Score:", roc_auc)

Class Distribution Before Balancing:
Spontaneous_Abortion
0    4445
1      56
Name: count, dtype: int64

Class Distribution After Balancing:
Spontaneous_Abortion
0    4445
1    4441
Name: count, dtype: int64

Cross-Validation F1 Scores:
[0.98776418 0.97952407 0.98230088 0.98065229 0.97738555]
Mean F1 Score: 0.9815253966658343

Confusion Matrix:
[[3436  119]
 [  27 3536]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3555
           1       0.97      0.99      0.98      3563

    accuracy                           0.98      7118
   macro avg       0.98      0.98      0.98      7118
weighted avg       0.98      0.98      0.98      7118


Confusion Matrix:
[[863  27]
 [ 21 870]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       890
           1       0.97      0.98      0.97       891

    accuracy                           0.97  

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import ADASYN


# Define features (X) and target (y)
X = data.drop('Spontaneous_Abortion', axis=1)
y = data['Spontaneous_Abortion']

# Check class distribution before balancing
print("Class Distribution Before Balancing:")
print(y.value_counts())

# Step 2: Encode categorical variables (if any)
X_encoded = pd.get_dummies(X, drop_first=True)

# Step 3: Feature selection using Mutual Information
mi_scores = mutual_info_classif(X_encoded, y, discrete_features='auto', random_state=42)

# Create a DataFrame to display scores
mi_scores_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'MI_Score': mi_scores
}).sort_values(by='MI_Score', ascending=False)

# Select top 10 features
top_n_features = 10
selected_features = mi_scores_df.head(top_n_features)['Feature'].tolist()
print("\nSelected Features:")
print(selected_features)

# Filter datasets to include only selected features
X_selected = X_encoded[selected_features]

# Step 4: Split the data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, stratify=y, random_state=42
)

# Balance the training and test sets using ADASYN
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = adasyn.fit_resample(X_test, y_test)

# Check class distribution after balancing
print("\nClass Distribution After Balancing (Training Set):")
print(pd.Series(y_train_resampled).value_counts())
print("\nClass Distribution After Balancing (Test Set):")
print(pd.Series(y_test_resampled).value_counts())

# Step 5: Define the XGBoost Classifier
xgb_model = XGBClassifier(random_state=42)

# Step 6: Define the hyperparameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],          # Number of trees
    'max_depth': [3, 5, 7],                 # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage
    'subsample': [0.6, 0.8, 1.0],           # Fraction of samples used for training each tree
    'colsample_bytree': [0.6, 0.8, 1.0],    # Fraction of features used for training each tree
    'scale_pos_weight': [1, 5, 10]          # Weight of the minority class (useful for imbalanced data)
}

# Step 7: Perform Grid Search with Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1-score
    cv=cv,
    n_jobs=-1,     # Use all available CPU cores
    verbose=2
)

# Fit the grid search to the resampled training data
grid_search.fit(X_train_resampled, y_train_resampled)


# ROC-AUC Score for Training Set
roc_auc_train = roc_auc_score(y_train_resampled, y_train_pred_proba)
print("\nROC-AUC Score (Balanced Training Set):", roc_auc_train)
# Print the best parameters and best score
print("\nBest Parameters:")
print(grid_search.best_params_)
print("\nBest F1 Score:")
print(grid_search.best_score_)

# Step 9: Evaluate the model on the balanced training set
y_train_pred = best_xgb_model.predict(X_train_resampled)
y_train_pred_proba = best_xgb_model.predict_proba(X_train_resampled)[:, 1]
print("\nConfusion Matrix (Balanced Training Set):")
print(confusion_matrix(y_train_resampled, y_train_pred))
print("\nClassification Report (Balanced Training Set):")
print(classification_report(y_train_resampled, y_train_pred))

# Step 8: Train the model with the best parameters
best_xgb_model = grid_search.best_estimator_

# Step 9: Evaluate the model on the balanced test set
y_pred = best_xgb_model.predict(X_test_resampled)
y_pred_proba = best_xgb_model.predict_proba(X_test_resampled)[:, 1]

# Confusion Matrix and Classification Report for Balanced Test Set
print("\nConfusion Matrix (Balanced Test Set):")
print(confusion_matrix(y_test_resampled, y_pred))
print("\nClassification Report (Balanced Test Set):")
print(classification_report(y_test_resampled, y_pred))

# ROC-AUC Score
roc_auc = roc_auc_score(y_test_resampled, y_pred_proba)
print("\nROC-AUC Score (Balanced Test Set):", roc_auc)

Class Distribution Before Balancing:
Spontaneous_Abortion
0    4445
1      56
Name: count, dtype: int64

Selected Features:
['TYPEDELIV_Normally through the vagina', 'DBP3', 'SBP3', 'SBP4', 'DBP4', 'SBP2', 'DBP2', 'APH', 'DBP1', 'UDIP_PROT1']

Class Distribution After Balancing (Training Set):
Spontaneous_Abortion
1    3563
0    3555
Name: count, dtype: int64

Class Distribution After Balancing (Test Set):
Spontaneous_Abortion
1    891
0    890
Name: count, dtype: int64
Fitting 5 folds for each of 729 candidates, totalling 3645 fits

ROC-AUC Score (Balanced Training Set): 0.9947172316822412

Best Parameters:
{'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 10, 'subsample': 1.0}

Best F1 Score:
0.9806021939883991

Confusion Matrix (Balanced Training Set):
[[3415  140]
 [   0 3563]]

Classification Report (Balanced Training Set):
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      