In [None]:
# Title: Feature Selection for Prediction of Perinatal Asphyxia
# Author: Cephas Ekow Biney
# Institution: Kwame Nkrumah University of Science and Technology (KNUST)
# Date: 6th October, 2025.
# Description: 
#               This notebook demonstrates the application of feature selection
#               techniques-SelectKBest and Recursive Feature Elimination (RFE)-to
#               identify the most predictive variables for perinatal asphyxia using cleaned
#               neonatal data.

In [None]:
# Libraries Used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectKBest, RFE

In [9]:
# Loading Preprocessed Data 
#data = pd.read_csv("cleaned_data.csv")
data = pd.read_csv("~/Desktop/New set/cleaned_data.csv")

# Defining Features and Targets
X = data.drop(["birth_asphyxia"], axis=1)
y = data["birth_asphyxia"]

In [None]:
# Splitting Data into Train and Test Sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X,y, test_size=0.2, random_state=42)

In [None]:
# Feature Selection Using SelectKBest
# This technique ranks features by statistical significance. Since dataset
# contains both numeric and categorical variables,
# ANOVA was used.

selector = SelectKBest(score_func=f_classif, k=8)
X_new = selector.fit_transform(X_train1, y_train1)

selected_features = X_train1.columns[selector.get_support(indices=True)]

# Get F-values and p-values
F_scores = selector.scores_
pvalues = selector.pvalues_

# Create a DataFrame to display F-values and p-values
results_df = pd.DataFrame({'Feature': X.columns, 'F-value': F_scores, 'P-value': pvalues})
results_df.sort_values(by='P-value', ascending=True, inplace=True)

print("\nTop Features selected by SelectKBest:")
print(results_df)


Top Features selected by SelectKBest:
                       Feature     F-value       P-value
8          labor_complications  142.345367  1.701609e-31
10    status_of_amniotic_fluid   86.125738  5.313903e-20
11       presentation_of_fetus   56.766462  8.171214e-14
2   gestational_age (in weeks)   21.182521  4.505510e-06
9                         PROM   20.487556  6.446029e-06
4          Placental_anomalies   14.258445  1.651786e-04
0        maternal_age (in yrs)    8.253190  4.121770e-03
1          birth_weight (in g)    4.257169  3.924621e-02
7     illness_during_pregnancy    3.150430  7.609727e-02
12      maternal_comorbidities    2.319145  1.279881e-01
6                       parity    1.239609  2.657139e-01
3              delivery_method    0.898019  3.434568e-01
5                 sex_of_fetus    0.378240  5.386342e-01


In [None]:
# Feature Selection Using RFE
# This technique iteratively removes least important features based
# on model coefficients (Logistic Regression here)

base_model = LogisticRegression(max_iter=500)
rfe = RFE(estimator=base_model, n_features_to_select=12)
rfe.fit(X_train1, y_train1)

rfe_features = X_train1.columns[rfe.support_]

# Get ranking of features (lower rank means more important)
ranking = rfe.ranking_

# Create a DataFrame to display feature ranking
feature_ranking = pd.DataFrame({'Feature': X.columns, 'Rank': ranking})
feature_ranking = feature_ranking.sort_values(by='Rank')


# 1 means most important
print("Feature Ranking:")
print(feature_ranking)

# Selecting the top k features
k = 9  # Adjust this to select the top k features
X_selected = X.loc[:, rfe.support_]

# Optionally, use cross-validation to evaluate model performance with selected features
scores = cross_val_score(base_model, X_selected, y, cv=10)  # 5-fold cross-validation
print(f"Mean cross-validation accuracy with top {k} features: {scores.mean()}")

Feature Ranking:
                       Feature  Rank
0        maternal_age (in yrs)     1
2   gestational_age (in weeks)     1
3              delivery_method     1
4          Placental_anomalies     1
5                 sex_of_fetus     1
6                       parity     1
7     illness_during_pregnancy     1
8          labor_complications     1
9                         PROM     1
10    status_of_amniotic_fluid     1
11       presentation_of_fetus     1
12      maternal_comorbidities     1
1          birth_weight (in g)     2
Mean cross-validation accuracy with top 9 features: 0.6935
