In [1]:
# Title: Exploratory Data Analysis for Prediction of Perinatal Asphyxia
# Author: Cephas Ekow Biney
# Institution: Kwame Nkrumah University of Science and Technology (KNUST)
# Date: 6th October, 2025.
# Description:
#               This notebook performs exploratory data analysis (EDA) on the
#               cleaned perinatal asphyxia dataset.

In [2]:
# Libraries Used
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [6]:
# Read the cleaned dataset from disk into a DataFrame
data = pd.read_csv("cleaned_data.csv")

# Structural overview first (printed), then summary statistics for every
# column (rendered as the cell's output value)
data.info()
data.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   maternal_age (in yrs)       2000 non-null   int64
 1   birth_weight (in g)         2000 non-null   int64
 2   gestational_age (in weeks)  2000 non-null   int64
 3   delivery_method             2000 non-null   int64
 4   Placental_anomalies         2000 non-null   int64
 5   sex_of_fetus                2000 non-null   int64
 6   parity                      2000 non-null   int64
 7   illness_during_pregnancy    2000 non-null   int64
 8   labor_complications         2000 non-null   int64
 9   PROM                        2000 non-null   int64
 10  status_of_amniotic_fluid    2000 non-null   int64
 11  presentation_of_fetus       2000 non-null   int64
 12  maternal_comorbidities      2000 non-null   int64
 13  birth_asphyxia              2000 non-null   int64
dtypes: int64

Unnamed: 0,maternal_age (in yrs),birth_weight (in g),gestational_age (in weeks),delivery_method,Placental_anomalies,sex_of_fetus,parity,illness_during_pregnancy,labor_complications,PROM,status_of_amniotic_fluid,presentation_of_fetus,maternal_comorbidities,birth_asphyxia
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,29.7425,2991.0545,36.523,0.5025,0.5185,0.4835,0.498,0.498,0.488,0.5015,0.4985,0.5085,0.5115,0.4035
std,4.902062,484.465146,1.95637,0.500119,0.499783,0.499853,0.500121,0.500121,0.499981,0.500123,0.500123,0.500053,0.499993,0.490722
min,20.0,1385.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,26.0,2666.75,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,30.0,2991.0,37.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
75%,33.0,3324.25,38.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,48.0,4000.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Target Variable Distribution
# Split the frame into the outcome column and the remaining predictor columns.
y = data['birth_asphyxia']
X = data.drop(columns=['birth_asphyxia'])

# Bar chart of how many records fall into each outcome class.
ax = sns.countplot(x=y, data=data)
ax.set_title('Distribution of Perinatal Asphyxia Cases')
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
plt.show()

# Same distribution expressed as percentages of all records.
outcome_pct = data['birth_asphyxia'].value_counts(normalize=True) * 100
print(outcome_pct)


In [None]:
# Numerical Variable Exploration

num_vars = [
    'gestational_age (in weeks)',
    'maternal_age (in yrs)',
    'birth_weight (in g)',
]

# Histograms: one panel per numerical column, sharing a single figure.
data[num_vars].hist(bins=20, figsize=(12, 10))
plt.suptitle('Distribution of Numerical Variables')
plt.show()

# Box plots: a separate figure for each numerical column.
for col in num_vars:
    box_ax = sns.boxplot(x=data[col])
    box_ax.set_title(f'{col} - Boxplot')
    plt.show()


In [None]:
# Categorical Variable Exploration

cat_vars = [
    'sex_of_fetus',
    'presentation_of_fetus',
    'status_of_amniotic_fluid',
    'delivery_method',
    'illness_during_pregnancy',
    'maternal_comorbidities',
    'labor_complications',
    'parity',
    'PROM',
    'Placental_anomalies',
]

# One frequency bar chart per categorical column.
for col in cat_vars:
    freq_ax = sns.countplot(x=col, data=data)
    freq_ax.set_title(f'{col} - Frequency')
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Bivariate Analysis

In [None]:
# a) Categorical vs Target
# For each categorical feature, print the row-normalised percentage of each
# birth_asphyxia outcome within every level of that feature.
for var in cat_vars:
    ct = pd.crosstab(data[var], data['birth_asphyxia'], normalize='index') * 100
    # Format as a single string: passing the table as a second print()
    # argument inserts a separator space before it, which shifts the header
    # row out of alignment with the rows below.
    print(f"\n{var}\n{ct}")



In [None]:
# b) Numeric vs Target
# One box plot per numerical column, grouped by the outcome series `y`.
for col in num_vars:
    grouped_ax = sns.boxplot(x=y, y=col, data=data)
    grouped_ax.set_title(f'{col} by Outcome (PA)')
    plt.show()


In [None]:
# Correlation Analysis
# Pairwise correlation between the numerical columns listed in num_vars.
corr = data[num_vars].corr()

plt.figure(figsize=(8,6))
# Pin the colour scale to the full correlation range [-1, 1] so the neutral
# midpoint of the diverging 'coolwarm' palette corresponds to zero correlation.
# Without this, the scale is inferred from the observed values and the
# colours no longer indicate the sign of the correlation.
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Variables')
plt.show()
