In [4]:
import logging
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import re

# Module-level debug logger that writes to a file on Drive.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Re-running this cell would otherwise attach one more FileHandler per run,
# which duplicates every log line and leaves the earlier file handles open.
for stale_handler in list(logger.handlers):
  logger.removeHandler(stale_handler)
  stale_handler.close()
# mode='w' truncates the debug file each time the handler is created.
handler = logging.FileHandler('/content/drive/MyDrive/SGA/India dataset/data/india_may8_yf_cleaned_debug.txt', mode='w')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

def rename_column(df, alternative = None):
  """Normalise the column labels of ``df`` and optionally rename them.

  Every label is lower-cased and stripped, spaces become underscores, and
  any run of commas, spaces or underscores is collapsed into one underscore.
  The normalised labels are assigned to ``df.columns``, so the frame passed
  in is modified as well.

  Args:
    df: DataFrame whose column labels are strings.
    alternative: optional mapping ``{normalised_label: new_label}`` applied
      with ``DataFrame.rename`` after normalisation.

  Returns:
    The DataFrame with the final column labels (a new object when
    ``alternative`` is given, otherwise ``df`` itself).
  """
  logger.debug(df.columns)
  df.columns = (
      df.columns.str.lower()
      .str.strip()
      .str.replace(' ', '_', regex=False)
      # The pattern is a character class, so it must be applied as a regular
      # expression. Without an explicit regex=True the behaviour depends on
      # the pandas version and the pattern may be matched literally.
      .str.replace(r'[, _]+', '_', regex=True)
  )
  if alternative is not None:
    df = df.rename(columns = alternative)
  logger.debug(df.columns)
  return df

def drop_null(df, cols):
  """Remove, in place, the rows of ``df`` that are missing a value in ``cols``.

  The columns are processed one after another; after each one an INFO record
  with the row count before and after is written to the module logger.

  Args:
    df: DataFrame to prune (modified in place).
    cols: iterable of column labels (strings) to check for missing values.

  Returns:
    The same DataFrame object, after pruning.
  """
  for name in cols:
    rows_before = len(df)
    # Index labels of the rows where this column is missing.
    missing_labels = df.index[df[name].isna()]
    df.drop(index = missing_labels, inplace = True)
    rows_after = len(df)
    logger.info(''.join(['Removed ', name, '.isna(). (', str(rows_before), ' --> ', str(rows_after), ')']))
  return df

def to_numeric(df, col):
  """Coerce one or more columns of ``df`` to numeric, in place.

  Values that cannot be parsed become NaN (``errors='coerce'``). If that
  reduces the number of non-null cells, a warning is logged so the data
  loss does not go unnoticed.

  Args:
    df: DataFrame to convert (its columns are reassigned).
    col: a single column label or a list-like of column labels.

  Returns:
    The same DataFrame object with the requested columns converted.
  """
  cols = list(col) if pd.api.types.is_list_like(col) else [col]
  # Count non-null cells directly. Indexing the frame with a boolean frame
  # keeps every row, so taking len() of that result can never detect a loss.
  non_null_before = int(df[cols].notna().sum().sum())
  for name in cols:
    df[name] = pd.to_numeric(df[name], errors='coerce')
  non_null_after = int(df[cols].notna().sum().sum())
  if non_null_after < non_null_before:
    # Not inside an except block, so warning() rather than exception().
    logger.warning(
        'Coercing %s to numeric turned %d value(s) into NaN (non-null count %d --> %d)',
        cols, non_null_before - non_null_after, non_null_before, non_null_after)
  return df

def replace_val(df, cols, replacement, to_num = False):
  """Recode values in the given columns and optionally make them numeric.

  Args:
    df: DataFrame to recode (its columns are reassigned).
    cols: iterable of column labels to process.
    replacement: mapping ``{old_value: new_value}`` passed to
      ``Series.replace``; values not listed are left unchanged.
    to_num: when True, each column is additionally passed through
      ``to_numeric`` after the replacement.

  Returns:
    The DataFrame with the recoded columns.
  """
  for col in cols:
    # Assign the result back instead of calling replace(inplace=True) on
    # df[col]: that is chained assignment, which may act on a temporary copy
    # and leave df unchanged (always so under pandas Copy-on-Write).
    df[col] = df[col].replace(replacement)
    if to_num:
      df = to_numeric(df, [col])
  return df

def convert_to_days(s):
    """Convert a ``'<weeks>W<days>D'`` string into a total number of days.

    Args:
        s: text containing a weeks part (digits followed by ``W``) and a
            days part (digits followed by ``D``), e.g. ``'32W4D'``.

    Returns:
        ``weeks * 7 + days`` as an int, or ``None`` when ``s`` is not a
        string or when either part is missing.
    """
    # Missing cells arrive as NaN/None when this is used with Series.apply;
    # re.search would raise TypeError on them.
    if not isinstance(s, str):
        return None
    weeks_match = re.search(r'(\d+)W', s)
    days_match = re.search(r'(\d+)D', s)
    if weeks_match is None or days_match is None:
        return None
    return int(weeks_match.group(1)) * 7 + int(days_match.group(1))

def compute_efw_centile(df):
    """Recompute ``efw_centile`` from a reference table and flag current SGA.

    The reference workbook is left-joined to ``df`` on ``ga``. For each row,
    ``efw`` is located between the reference values of the centile columns
    and the centile is obtained by linear interpolation between the two
    neighbouring columns. Rows whose ``efw`` cannot be placed (missing value,
    no reference row, or outside the first..last reference value) get centile
    0 and are then dropped. ``cur_sga`` is 1 where the centile is <= 10.

    Args:
        df: DataFrame with at least the columns ``ga`` and ``efw``.

    Returns:
        The merged DataFrame (reference columns included) with
        ``efw_centile`` and ``cur_sga`` set and unplaceable rows removed.
    """
    centile_df = pd.read_excel('/content/drive/MyDrive/SGA/Prev/EFW centile.xlsx')
    centile_df = centile_df.rename(columns={'GA' : 'ga'})

    df = pd.merge(df, centile_df, on = 'ga', how = 'left')
    # NOTE(review): refCentile is taken from the workbook's column order and
    # is assumed to line up with the hard-coded column list below - confirm.
    refCentile = list(centile_df.columns)[1:]
    ranges = df[[2.5, 5, 10, 25, 50, 75, 90, 95, 97.5]].values.tolist()
    efw = df['efw'].values.tolist()
    centile = []

    for weight, bounds in zip(efw, ranges):
        # Comparisons with NaN are False, so missing data falls to the else.
        if weight >= bounds[0] and weight <= bounds[-1]:
            for j, upper in enumerate(bounds):
                if weight == upper:
                    centile.append(refCentile[j])
                    break
                if weight < upper:
                    # Linear interpolation between columns j - 1 and j.
                    centile.append(refCentile[j] - (refCentile[j] - refCentile[j - 1]) * (upper - weight) / (upper - bounds[j - 1]))
                    break
            else:
                # No bracket found (e.g. a NaN among the interior reference
                # values). Append the sentinel so that ``centile`` keeps
                # exactly one entry per row.
                centile.append(0)
        else:
            centile.append(0)

    df['efw_centile'] = centile
    # 0 is the "could not be placed" sentinel; those rows are discarded.
    df.drop(df[df['efw_centile'] == 0].index, inplace = True)
    df['cur_sga'] = (df['efw_centile'] <= 10).astype(int)

    return df

def merge_groundtruth(df):
  """Join the birth-weight reference table and derive the outcome labels.

  The reference CSV is left-joined on ``birth_ga`` and ``gender`` (its
  ``ga`` column is renamed to ``birth_ga`` first). Three 0/1 columns are
  then added:

  * ``sga`` - ``bw / 1000 <= p_10``
  * ``lbw`` - ``bw / 1000 <= 2.5``
  * ``sc``  - ``cur_sga`` XOR ``sga`` (1 where the two disagree)

  Args:
    df: DataFrame with the columns ``birth_ga``, ``gender``, ``bw`` and
      ``cur_sga``.

  Returns:
    The merged DataFrame with ``sga``, ``lbw`` and ``sc`` added.
  """
  gt = pd.read_csv('/content/drive/MyDrive/SGA/Ref_Centile/I21_BW.csv')
  gt.columns = gt.columns.str.lower()
  gt.rename(columns = {'ga' : 'birth_ga'}, inplace = True)
  df = pd.merge(df, gt, on = ['birth_ga', 'gender'], how = 'left')
  df.info()
  # NOTE(review): rows without a reference match have p_10 = NaN, and the
  # comparison is then False, i.e. they are labelled "not SGA" - confirm.
  # Cast explicitly: numeric coercion leaves boolean columns as True/False,
  # whereas cur_sga is stored as 0/1.
  df['sga'] = (df['bw'] / 1000 <= df['p_10']).astype(int)
  df['lbw'] = (df['bw'] / 1000 <= 2.5).astype(int)
  df['sc'] = (df['cur_sga'] ^ df['sga']).astype(int)

  return df


# Load the raw workbook, give the columns short snake_case names, drop rows
# without an outcome, and recode the YES/NO style columns to numbers.

# Normalised source label -> short name used in the rest of the script.
COLUMN_ALIASES = {
    'studyid' : 'id',
    'maternalagecompletedyears' : 'm_age',
    'heightinmeters' : 'm_height',
    'weightinkilograms' : 'm_weight',
    'lastpregnancysga' : 'last_preg_sga',
    'lastpregnancyfgr' : 'last_preg_fgr',
    'lastpregnancynormalbaby' : 'last_preg_normal',
    'pregnancyinducedhypertension' : 'hypertension_0',
    'essentialhypertension' : 'hypertension_1',
    'gestationaldm' : 'diabetes_0',
    'pregestationaldm' : 'diabetes_1',
    'otherpregnancycomplication' : 'others',
    'gaatassesmentincompletedweek' : 'ga',
    'efwgrams' : 'efw',
    'efwcentile' : 'efw_centile',
    'meanutapi' : 'utapi_mean',
    'utapipercentile' : 'utapi_centile',
    'umbilicalapi' : 'umb_api',
    'umbilicalapipercentile' : 'umb_api_centile',
    'fetalpresentation' : 'presentation',
    'placentalsite' : 'placenta_site',
    'placentalthikness' : 'placenta_thickness',
    'singleverticalpocket' : 'single_vertical_pocket',
    'numberofumbilicalvessels' : 'num_umb_vessels',
    'knownhighriskofpe' : 'high_risk_pe',
    'onlowdoseaspirin' : 'low_dose_aspirin',
    'noncomplianceaspirinfollowu' : 'non_c_aspirin/follow_up',
    'knownhighriskoffgr' : 'high_risk_fgr',
    'gaatdeliveryweeks' : 'birth_ga',
    'birthweightgrams' : 'bw',
    'nicuadmission_(yes/no)' : 'nicu',
    'babymorbidityspecify' : 'morbidity',
}

# (columns, value mapping) pairs, applied in order; every column is made
# numeric afterwards. The extra keys cover spelling variants of YES/NO.
RECODE_PLAN = [
    (['last_preg_sga', 'last_preg_fgr', 'smoking', 'hypertension_0', 'hypertension_1', 'diabetes_0', 'diabetes_1', 'high_risk_pe', 'nicu'],
     {'YES' : 1, 'NO' : 0}),
    (['low_dose_aspirin'], {'NO' : 0, 'YES' : 1, ' NO' : 0, '`NO' : 0}),
    (['high_risk_fgr'], {'NO' : 0, 'YES' : 1, 'YES ' : 1}),
    (['presentation'], {'Vertex' : 1, 'Breech' : 0}),
]

df = pd.read_excel('/content/drive/MyDrive/SGA/India dataset/data/Cleaned data May 8.xlsx')
df = rename_column(df, alternative = COLUMN_ALIASES)
df.info()
df = drop_null(df, ['bw', 'gender'])
df.info()

# gender is not recoded here; it is encoded later, after merge_groundtruth
# has joined on it.
for recode_cols, recode_map in RECODE_PLAN:
  df = replace_val(df, recode_cols, replacement = recode_map, to_num = True)

# Remove death
# na = False on every str.contains below: a missing cell otherwise yields NaN
# in the mask, which breaks both `~` and boolean indexing.
df.drop(df[df['morbidity'].str.contains('DEATH|STILL BIRTH|PERINATAL MORTALITY ', na = False)].index, axis = 0, inplace = True)
# 'H/O' marks an entry as history, so only deaths without it are removed here.
df.drop(df[(~df['others'].str.contains('H/O', na = False)) & (df['others'].str.contains('DEATH|DIED', na = False))].index, axis = 0, inplace = True)
df.info()
df['prev_failed_preg'] = df['gravida'] - df['para'] - 1

# Cross check others and high_risk_fgr, high_risk_pe, prev_failed_preg
df.loc[(~df['others'].str.contains('H/O', na = False)) & (df['others'].str.contains('SGA|FGR', na = False)), 'high_risk_fgr'] = 1
# NOTE(review): 'PE' is a plain substring match, so it would also match words
# such as 'PERINATAL' or 'HYPERTENSION' - confirm this is acceptable.
df.loc[(~df['others'].str.contains('H/O', na = False)) & (df['others'].str.contains('PE', na = False)), 'high_risk_pe'] = 1
df.loc[(df['others'].str.contains('H/O', na = False)) & (df['others'].str.contains('DEATH|DIED|STILL BIRTH', na = False)) & (df['prev_failed_preg'] < 1), 'prev_failed_preg'] = 1
df.loc[(df['last_preg_normal'].str.contains('STILL BIRTH|PERINATAL DEATH|MTP', na = False)) & (df['prev_failed_preg'] < 1), 'prev_failed_preg'] = 1
# Anything recorded that is not 'YES' counts as "not normal"; missing stays missing.
df.loc[~((df['last_preg_normal'].isna()) | (df['last_preg_normal'].str.contains('YES', na = False))), 'last_preg_normal'] = 0
df = replace_val(df, ['last_preg_normal'], {'YES' : 1})
df.loc[df['others'].str.contains('H/O', na = False), 'last_preg_normal'] = 0

# Derive the remaining features, attach the outcome labels and write the
# cleaned table to disk.

# Columns kept in the exported file, in output order.
OUTPUT_COLUMNS = [
    'm_age', 'm_height', 'm_weight',
    'last_preg_sga', 'last_preg_fgr', 'last_preg_normal',
    'smoking', 'hypertension_0', 'hypertension_1', 'diabetes_0', 'diabetes_1',
    'ga', 'bpd', 'hc', 'ac', 'fl', 'efw', 'efw_centile',
    'utapi_mean', 'utapi_centile', 'umb_api', 'umb_api_centile', 'cpr',
    'presentation', 'placenta_thickness', 'single_vertical_pocket',
    'high_risk_pe', 'high_risk_fgr',
    'bw', 'gender', 'prev_failed_preg', 'birth_ga', 'bmi',
    'cur_sga', 'sga', 'lbw', 'sc',
]

# ga: strip all whitespace from the '<n>W<n>D' text, then convert to days.
df['ga'] = df['ga'].str.replace(r'\s+', '', regex = True).apply(convert_to_days)
# birth_ga: multiplied by 7 (weeks to days).
df['birth_ga'] = df['birth_ga'] * 7

# bmi = weight / (height / 100) squared
height_scaled = df['m_height'] / 100
df['bmi'] = df['m_weight'] / (height_scaled ** 2)

# Recompute efw_centile from the reference table (also adds cur_sga).
df = compute_efw_centile(df)
df.info()

# Make the remaining measurement columns numeric.
df = to_numeric(df, ['bpd', 'umb_api', 'umb_api_centile', 'placenta_thickness', 'last_preg_normal'])

# Add the sga, lbw and sc labels.
df = merge_groundtruth(df)

# Encode gender as 0/1 only now, after merge_groundtruth has joined on it.
df = replace_val(df, ['gender'], {'F' : 0, 'M' : 1}, to_num = True)

df = df[OUTPUT_COLUMNS]

# No split into tri2 and tri3: tri2 has only 2 entries.

df.to_csv('/content/drive/MyDrive/SGA/India dataset/data/india_may8_yf_cleaned.csv')
df.info()

  warn(msg)
DEBUG:__main__:Index(['studyid', 'maternalagecompletedyears', 'heightinmeters',
       'weightinkilograms', 'gravida', 'para', 'lastpregnancysga',
       'lastpregnancyfgr', 'lastpregnancynormalbaby', 'smoking',
       'pregnancyinducedhypertension', 'essentialhypertension',
       'gestationaldm', 'pregestationaldm', 'otherpregnancycomplication',
       'gaatassesmentincompletedweek', 'bpd', 'hc', 'ac', 'fl', 'tcd',
       'efwgrams', 'efwcentile', 'meanutapi', 'UtAPIPercentile',
       'umbilicalapi', 'UmbilicalAPIpercentile', 'cpr', 'fetalpresentation',
       'Placentalsite', 'Placentalthikness', 'afi', 'singleverticalpocket',
       'numberofumbilicalvessels', 'knownhighriskofpe', 'onlowdoseaspirin',
       'noncomplianceaspirinfollowu', 'knownhighriskoffgr',
       'gaatdeliveryweeks', 'birthweightgrams', 'gender',
       'nicuadmission (Yes/No)', 'babymorbidityspecify'],
      dtype='object')
  df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_', reg

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982 entries, 0 to 981
Data columns (total 43 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       982 non-null    int64  
 1   m_age                    982 non-null    int64  
 2   m_height                 982 non-null    int64  
 3   m_weight                 963 non-null    float64
 4   gravida                  982 non-null    int64  
 5   para                     982 non-null    int64  
 6   last_preg_sga            369 non-null    object 
 7   last_preg_fgr            369 non-null    object 
 8   last_preg_normal         369 non-null    object 
 9   smoking                  982 non-null    object 
 10  hypertension_0           982 non-null    object 
 11  hypertension_1           981 non-null    object 
 12  diabetes_0               982 non-null    object 
 13  diabetes_1               981 non-null    object 
 14  others                   9

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
last_preg_normal
array([nan, 'YES', 'NO', 'STILL BIRTH ', 'PE ', 'MTP ',
       'PERINATAL DEATH '], dtype=object)

others
array(['NO', 'H/O PE ', 'OLIGOHYDROMINUS', 'EARLY FGR STAGE 3 ',
       'HANDICAP INCREASE ', 'MILD HYDROMINUS ', 'LGA', 'SGA',
       'PREVIOUS ECTOPIA CORDIS WITH T18', 'URINE PROTEIN +++ ',
       'SEVER OLIGOHYDROMNIOS ', 'H/O PE', 'H/O PIH AND PE ',
       'H/O PE , PIH AND PERINATAL DEATH ', 'ONLY SGA',
       'H/O MTP FOR SPINAL DEFECTS IN 22 WK', 'H/O PE AND HTN',
       'ESOPHAGEAL AND EOSINOPHAILIC', 'SHORT LIMB ', 'H/O HTN',
       'H/O PE AND PERINATAL DEATH ', 'H/O GDM AND STILL BIRTH  ',
       'H/O PREVIOUS CHILD CEREBRAL PALSY', 'LGA ',
       'H/O RT AOVARIAN TORSION ', 'H/O STILL BIRTH',
       'DEVELOPED PE ,URINE PROTEIN +', 'H/O PIH ',
       'H/O PERINATAL DEATH ', 'H/O CONGENITAL ABNORMALATY ',
       'H/O PE AND THYROID AND HTN ', 'MAJOR STRUCTURAL ABNORMALITIES ',
       'H/O PE IN PREVIOUS  PREGNANCY',
       'H/O MISCARRIAGE AND STILL BIRTH ', 'H/O AC2 MALFORMATION ',
       'H/O DOWN SYNDROME AND DIED AFTER BIRTH ', 'H/O GDM ', 'H/O BPV',
       'SGA ', 'TORCH POSITIVE, H/O PERINATAL DEATH',
       'H/O PE AND  PERINATAL DEATH IN 36 WK 2 KG ',
       'H/O ECLAMPSIA AND LBW', 'H/O TWIN PREGNANCY 33 WK 1700 G ',
       'ONLY SGA ', 'ANEMIA ', 'THYRODISM ', 'CONGENITAL ', 'PE RISK ',
       'H/O HOLOPROSENCEPHLY 27 WEEK MTP', 'H/O PIH AND PE 36 WK 1600 G ',
       'H/O SWELLING OVER BODY ', 'H/O PTB 1500 G 33 WK ',
       'H/O PERINATAL DEATH 34 WEK 1500 G ', 'PPH RISK',
       'PERINATAL DEATH 32 W 1500 G',
       'DIED OF BILLIARY ARTERIAN WITHIN 6 MONTH ', 'H/O CLUB FEET ',
       'H/O PIH AND PE IN PREVIOUS PREGNANCY '], dtype=object)


non_c_aspirin/follow_up
array(['NO', 'NON C FOLLOW UP ', 'NON C ASPIRIN ', 'YES'], dtype=object)


placenta_site
array(['F ANTERIOR', 'POSTERIOR ', 'F POSTERIOR', 'LATRAL FUNDAL',
       'ANTERIOR ', 'FUNDAL', 'RIGHT LATRAL ANT POSTERIOR',
       'F POSTERIOR LEFT LATRAL', 'LEFT LATRAL ANT POSTERIOR ', nan,
       'F POSTETIOR', 'LEFT LATERAL POSTERIOR', 'LATRAL POSTERIOR',
       'F POSTERIUOR', 'LEFT LATRAL F ANTERIOR', 'LEFT LATRAL',
       'LEFT LATRAL POSTERIOR ', 'F POSTERIOR ',
       'LEFT LATEAL F POSTERIOR', 'F ANT. POSTERIOR',
       'LEFT LATE POSTERIOR', 'FL ANTERIOR '], dtype=object)