In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ds_functions import *

## Data Cleaning

In [2]:
# Load the maternal risk dataset from a CSV file; the relative path is
# resolved against the notebook's working directory.
maternal_risk = pd.read_csv('maternal_risk.csv')

### Preview Dataset

In [3]:
# Display the first five rows to get a feel for the columns and value formats.
maternal_risk.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [5]:
# Data type and missing data check to identify what data cleaning is needed.
# The recorded output is a DataFrame.info()-style summary followed by the
# number of null values per column.
# NOTE(review): `missing_data_check` is not defined in the visible cells;
# presumably it is supplied by the `ds_functions` star import -- confirm.
missing_data_check(maternal_risk)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BS           1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
 6   RiskLevel    1014 non-null   object 
dtypes: float64(2), int64(4), object(1)
memory usage: 55.6+ KB
None
Age            0
SystolicBP     0
DiastolicBP    0
BS             0
BodyTemp       0
HeartRate      0
RiskLevel      0
dtype: int64


### Renaming columns
Rename the columns to snake_case to match naming conventions and for convenience.

In [9]:
# Mapping from the original CamelCase column labels to snake_case names,
# passed to .rename below.
# NOTE(review): 'BodyTemp' has no entry here, so it keeps its original label
# (visible in the printed Index) -- confirm whether that is intentional before
# adding it, since other cells may look the column up as 'BodyTemp'.
mapped_columns = {
    'Age': 'age',
    'SystolicBP': 'systolic_bp',
    'DiastolicBP': 'diastolic_bp',
    'BS': 'blood_sugar',
    'HeartRate': 'heart_rate',
    'RiskLevel': 'risk_level',
}

# Rebind the name to the renamed frame instead of mutating with inplace=True:
# the result is the same, but the data lineage is explicit and the call stays
# chainable. Labels missing from the frame are ignored by .rename, so
# re-running this cell is still a harmless no-op.
maternal_risk = maternal_risk.rename(columns=mapped_columns)
print(maternal_risk.columns)

Index(['age', 'systolic_bp', 'diastolic_bp', 'blood_sugar', 'BodyTemp',
       'heart_rate', 'risk_level'],
      dtype='object')


### Columns for EDA (Metadata)

--- Metadata on columns for maternal risk in Bangladesh (All patients are women) ---<br>
age = Age of patient in years during pregnancy<br>
systolic_bp = Systolic blood pressure of patient in mmHg (pressure in your arteries when the heart beats)<br>
diastolic_bp = Diastolic blood pressure of patient in mmHg (pressure in your arteries when the heart relaxes)<br>
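blood_sugar = Blood glucose level of patient, expressed as a molar concentration in mmol/L<br>
BodyTemp = Body temperature of patient, presumably in degrees Fahrenheit (observed values range from 98 to 103); this column is not renamed<br>
heart_rate = Resting heart rate of patient in beats per minute<br>
risk_level = Predicted risk intensity level during pregnancy (high, mid, or low risk), based on the preceding attributes.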

In [12]:
# Column-name handles: each variable stores the string label of the renamed
# column with the same name, for future lookups in the dataset.
(
    age,
    systolic_bp,
    diastolic_bp,
    blood_sugar,
    heart_rate,
    risk_level,
) = (
    'age',
    'systolic_bp',
    'diastolic_bp',
    'blood_sugar',
    'heart_rate',
    'risk_level',
)

In [23]:
# Finish the data check by listing the distinct values found in every column,
# which exposes unexpected categories and implausible numeric entries.
# NOTE(review): the recorded output shows a heart_rate of 7, which looks like
# a data-entry error -- verify that a later step handles it.
columns = maternal_risk.columns

for column in columns:
    # sep=", " plus the explicit "\n" argument reproduces the
    # "<name>, <values>, " line followed by a blank line.
    print(column, maternal_risk[column].unique(), "\n", sep=", ")

age, [25 35 29 30 23 32 42 19 20 48 15 50 10 40 21 18 16 22 49 28 12 60 55 45
 31 17 26 54 44 33 13 34 38 39 63 14 37 51 62 43 65 66 56 70 27 36 59 24
 41 46], 

systolic_bp, [130 140  90 120  85 110  70 100  75  95  76  80 115 135 160 129  83  99
  78], 

diastolic_bp, [ 80  90  70  85  60  89  75 100  50  65  95  49  63  69  76  68], 

blood_sugar, [15.   13.    8.    7.    6.1   7.01 11.    6.9  18.    6.7   7.5   7.2
  7.1   6.4   9.    6.    7.7  12.   16.    7.8   6.8   7.9  17.   19.
 10.    6.3   6.6   6.5   7.6 ], 

BodyTemp, [ 98.  100.  102.  101.  103.   98.4  99.   98.6], 

heart_rate, [86 70 80 76 78 77 88 90 66 82 60 75 67 65 68  7], 

risk_level, ['high risk' 'low risk' 'mid risk'], 

