# Example: using the WonderD149Data package to extract correlations from the CDC Natality, 2016-2021 expanded database

First, add the package's source directory to `sys.path`.

In [3]:
import sys
sys.path.append('../src')

Now import the helper module from the package to explore all of the available query options.

In [4]:
import wonderD149Data.data.helper as hp
import wonderD149Data.wonderD149Data as wd

Listing all columns

In [5]:
# Build one flat {code: description} lookup covering every group-by category.
categories = hp.getGroupByCategories()
list_of_codes = [hp.getCodeDetailsForGivenCategory(category) for category in categories]
columns = {}
for ele in list_of_codes:
    # Merge in place; if two categories share a code, the later category's
    # description wins.
    columns.update(ele)

columns

{'D149.V60': 'Interval Since Last Live Birth',
 'D149.V61': 'Interval Since Last Other Pregnancy Outcome',
 'D149.V62': 'Interval of Last Pregnancy',
 'D149.V151': 'Prior Births Now Living',
 'D149.V152': 'Prior Births Now Dead',
 'D149.V153': 'Prior Other Pregnancy Outcomes',
 'D149.V28': 'Live Birth Order',
 'D149.V59': 'Total Birth Order',
 'D149.V66': 'WIC',
 'D149.V89': 'Successful External Cephalic Version',
 'D149.V90': 'Failed External Cephalic Version',
 'D149.V65': 'Number of Prenatal Visits Recode',
 'D149.V64': 'SNumber of Prenatal Visits',
 'D149.V63': 'Trimester Prenatal Care Began',
 'D149.V8': 'Month Prenatal Care Began',
 'D149.V74': 'Pre-pregnancy Diabetes',
 'D149.V75': 'Gestational Diabetes',
 'D149.V16': 'Pre-pregnancy Hypertension',
 'D149.V17': 'Gestational Hypertension',
 'D149.V18': 'Eclampsia',
 'D149.V76': 'Previous Preterm Birth',
 'D149.V77': 'Infertility Treatment Used',
 'D149.V78': 'Fertility Enhancing Drugs',
 'D149.V79': 'Assistive Reproductive Technol

Checking all possible measures

In [7]:
# Display every available measure: the helper returns a dict keyed by measure
# id whose values map the measure code to its human-readable description.
hp.getMeasureCodesAndDescription()

{'M_002': {'D149.M002': 'Births'},
 'M_007': {'D149.M007': 'Percent of Total Births'},
 'M_070': {'D149.M070': 'Average Age of Mother (years)'},
 'M_071': {'D149.M071': 'Average Age of Mother (years) Standard Deviation'},
 'M_080': {'D149.M080': 'Average OE Gestational Age (weeks)'},
 'M_081': {'D149.M081': 'Average OE Gestational Age (weeks) Standard Deviation'},
 'M_090': {'D149.M091': 'Average LMP Gestational Age (weeks)'},
 'M_091': {'D149.M091': 'Average LMP Gestational Age (weeks) Standard Deviation'},
 'M_095': {'D149.M095': 'Average Birth Weight (grams)'},
 'M_096': {'D149.M096': 'Average Birth Weight (grams) Standard Deviation'},
 'M_100': {'D149.M100': 'Average Pre-pregnancy BMI'},
 'M_101': {'D149.M101': 'Average Pre-pregnancy BMI Standard Deviation'},
 'M_110': {'D149.M110': 'Average Number of Prenatal Visits'},
 'M_111': {'D149.M111': 'Average Number of Prenatal Visits Standard Deviation'},
 'M_120': {'D149.M120': 'Average Interval Since Last Live Birth (months)'},
 'M_121

Creating all query params for getting data

In [8]:
# Measures to request, keyed by measure id; each value is the measure code as
# listed by hp.getMeasureCodesAndDescription().
measure_selection = {
    'M_002': 'D149.M002', # Births
}
# Left empty here.  NOTE(review): the accepted keys/values are defined by
# WonderD149Data, which is not visible in this notebook - confirm there.
observation_selection = {}
# No filter entries are active.  The commented-out line shows the shape of an
# entry (a variable key mapped to a list of value codes).
variable_filter = {
    # 'V_D149.V31': ['1','2'] # filtering only stated deliveries
}

Creating a WonderD149Data object for each column and getting its data

In [12]:
import pandas as pd

# Run one query per column code (the code is passed as the only entry of the
# first argument) and keep the 'Births' values of each result as a Series
# named after the column's description.
col_list = []
for code, description in columns.items():
    query = wd.WonderD149Data([code],measure_selection,observation_selection,variable_filter)
    births = pd.Series(query.getData()['Births'], name=description)
    col_list.append(births)
    print(description,'done')


Interval Since Last Live Birth done
Interval Since Last Other Pregnancy Outcome done
Interval of Last Pregnancy done
Prior Births Now Living done
Prior Births Now Dead done
Prior Other Pregnancy Outcomes done
Live Birth Order done
Total Birth Order done
WIC done
Successful External Cephalic Version done
Failed External Cephalic Version done
Number of Prenatal Visits Recode done
SNumber of Prenatal Visits done
Trimester Prenatal Care Began done
Month Prenatal Care Began done
Pre-pregnancy Diabetes done
Gestational Diabetes done
Pre-pregnancy Hypertension done
Gestational Hypertension done
Eclampsia done
Previous Preterm Birth done
Infertility Treatment Used done
Fertility Enhancing Drugs done
Assistive Reproductive Technology done
Previous Cesarean Delivery done
Number of Previous Cesareans done
Risk Factors Checked done
OE Gestational Age Recode 10 done
OE Gestational Age Recode 11 done
OE Gestational Age Weekly done
LMP Gestational Recode 10 done
LMP Gestational Recode 11 done
LMP Ges

Adding helper methods

In [13]:
def interpolateSeries(col,MAX_COL_LEN):
    """Stretch a Series to ``MAX_COL_LEN`` rows by repeating its values.

    The ``len(col)`` values are placed at the integer positions
    ``0, step, 2*step, ...`` with ``step = MAX_COL_LEN // len(col)``, and every
    gap (including the tail) is forward-filled.  The result is therefore a
    step function, not a smooth interpolation.

    Parameters
    ----------
    col : pandas.Series
        Series to stretch.  It is left unmodified; its existing index is
        ignored.
    MAX_COL_LEN : int
        Target length; must be at least ``len(col)``.

    Returns
    -------
    pandas.Series
        A new Series indexed ``0 .. MAX_COL_LEN - 1`` carrying the same name
        as ``col``.

    Raises
    ------
    ValueError
        If ``col`` is empty or longer than ``MAX_COL_LEN``.
    """
    curr_col_len = len(col)
    if curr_col_len == 0:
        raise ValueError('cannot interpolate an empty column')
    if curr_col_len > MAX_COL_LEN:
        raise ValueError(
            f'column length {curr_col_len} exceeds target length {MAX_COL_LEN}'
        )

    # Integer spacing between consecutive original values in the stretched index.
    step = MAX_COL_LEN // curr_col_len
    new_index = list(range(0, step * curr_col_len, step))

    # Re-index a copy so the caller's Series keeps its original index.
    stretched = col.copy()
    stretched.index = new_index
    return stretched.reindex(range(MAX_COL_LEN),method='ffill')

def is_Different_Category(col_name1,col_name2):
    """Tell whether two column descriptions belong to different categories.

    Every group-by category is scanned; when a description appears in more
    than one category, the last category scanned is the one recorded.  A
    description found in no category is recorded as ``''``, so two unknown
    descriptions compare as "same category".

    Returns ``True`` when the recorded categories differ, ``False`` otherwise.
    """
    wanted = (col_name1, col_name2)
    owners = ['', '']
    for group in hp.getGroupByCategories():
        members = hp.getCodeDetailsForGivenCategory(group).values()
        for slot, name in enumerate(wanted):
            if name in members:
                owners[slot] = group
    return owners[0] != owners[1]

Interpolating all columns to the same length

In [16]:
# Report how many columns were fetched, then stretch each one to the length of
# the longest column so they can sit side by side in one frame.
print(len(col_list), 'columns encountered')
max_col = max(col_list, key=len)
MAX_COL_LEN = len(max_col)
converted_col_list = [interpolateSeries(series, MAX_COL_LEN) for series in col_list]
# A list of Series is laid out one Series per row; transpose so that each
# Series becomes a column.
col_data_cleaned = pd.DataFrame(converted_col_list).T

137 columns encountered


In [80]:
# Spot-check a single resampled column by its description.
col_data_cleaned['Year']

0      3945875.0
1      3945875.0
2      3945875.0
3      3945875.0
4      3945875.0
         ...    
239    3664292.0
240    3664292.0
241    3664292.0
242    3664292.0
243    3664292.0
Name: Year, Length: 244, dtype: float64

Selecting columns of high negative correlations

In [92]:
import itertools

# Pairs whose correlation is below this value count as strongly negative.
NEGATIVE_CORR_THRESHOLD = -0.75

# Create correlation matrix
corr_matrix = col_data_cleaned.corr()

# Find column pairs from different categories whose correlation is below the
# threshold.  The full index x columns product is scanned, so every qualifying
# pair is listed twice, once in each order.
to_investigate = [
    (row, column)
    for row, column in itertools.product(corr_matrix.index, corr_matrix.columns)
    if corr_matrix.at[row, column] < NEGATIVE_CORR_THRESHOLD and is_Different_Category(row, column)
]
print(len(to_investigate),'combinations')


38 combinations


Selecting columns of high positive correlations

In [98]:
# Pairs whose correlation exceeds this value are treated as (numerically)
# perfectly positively correlated.
POSITIVE_CORR_THRESHOLD = 0.99999999

# Find column pairs from different categories whose correlation is above the
# threshold.  Despite the "_low" suffix this list holds the HIGH positive
# correlations; the name is kept because later cells refer to it.  Every
# qualifying pair is listed twice, once in each order.
to_investigate_low = [
    (row, column)
    for row, column in itertools.product(corr_matrix.index, corr_matrix.columns)
    if corr_matrix.at[row, column] > POSITIVE_CORR_THRESHOLD and is_Different_Category(row, column)
]
print(len(to_investigate_low),'combinations')

34 combinations


Creating graph for high negative correlation

In [103]:
import networkx as nx

# Every column description that takes part in at least one selected pair.
set_of_nodes = set()
for combi in to_investigate:
    set_of_nodes.update(combi)

nodes = list(set_of_nodes)
# Integer node id of each description (its position in `nodes`); a dict lookup
# avoids a linear list scan for every edge endpoint.
node_ids = {name: position for position, name in enumerate(nodes)}

G = nx.Graph()

# One node per description (labelled through the 'name' attribute) and one
# undirected edge per selected pair; the reversed duplicate of a pair maps to
# the same edge.
for left, right in to_investigate:
    left_id, right_id = node_ids[left], node_ids[right]
    G.add_nodes_from([(left_id,{'name': left}),(right_id,{'name': right})])
    G.add_edge(left_id,right_id)

divergence_edge_attrs = {}
name_edge_attrs = {}

for start_node, end_node in G.edges():
    start_name = G.nodes[start_node]["name"]
    end_name = G.nodes[end_node]["name"]
    # (1.1 - r) * 1000 grows as the correlation r becomes more negative.
    divergence_edge_attrs[(start_node, end_node)] = (1.1-corr_matrix.at[end_name, start_name])*1000
    name_edge_attrs[(start_node, end_node)] = f'({start_name}:{end_name})'

nx.set_edge_attributes(G, divergence_edge_attrs, "edge_length")
nx.set_edge_attributes(G, name_edge_attrs, "edge_nodes")




Creating graph for high positive correlation

In [100]:
import networkx as nx

# Every column description that takes part in at least one selected pair.
set_of_nodes = set()
for combi in to_investigate_low:
    set_of_nodes.update(combi)

nodes = list(set_of_nodes)
# Integer node id of each description (its position in `nodes`); a dict lookup
# avoids a linear list scan for every edge endpoint.
node_ids = {name: position for position, name in enumerate(nodes)}

G_low = nx.Graph()

# One node per description (labelled through the 'name' attribute) and one
# undirected edge per selected pair; the reversed duplicate of a pair maps to
# the same edge.
for left, right in to_investigate_low:
    left_id, right_id = node_ids[left], node_ids[right]
    G_low.add_nodes_from([(left_id,{'name': left}),(right_id,{'name': right})])
    G_low.add_edge(left_id,right_id)

divergence_edge_attrs = {}
name_edge_attrs = {}

for start_node, end_node in G_low.edges():
    start_name = G_low.nodes[start_node]["name"]
    end_name = G_low.nodes[end_node]["name"]
    # Edge length is the correlation itself, scaled by 1000.
    divergence_edge_attrs[(start_node, end_node)] = (corr_matrix.at[end_name, start_name])*1000
    name_edge_attrs[(start_node, end_node)] = f'({start_name}:{end_name})'

nx.set_edge_attributes(G_low, divergence_edge_attrs, "edge_length")
nx.set_edge_attributes(G_low, name_edge_attrs, "edge_nodes")




In [106]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, NodesAndLinkedEdges, MultiLine,EdgesAndLinkedNodes
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Spectral4
from bokeh.resources import CDN

#Choose a title! It is also used as the name of the saved HTML file.
title = 'Pregnancy_related_births_data_high_negative_correlation'

#Fixed seed so the spring layout (random initial positions) is reproducible
LAYOUT_SEED = 42

#Establish which edge attributes will appear when hovering over each edge
HOVER_TOOLTIPS = [("Feature Names", "@edge_nodes" ),("Divergence","@edge_length")]

#Create a plot — set dimensions, toolbar, and title
plot = figure(tooltips = HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
            x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1), title=title)

#Create a network graph object with spring layout
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
# Extra keyword arguments (scale, center, seed) are forwarded to the layout function.
network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0), seed=LAYOUT_SEED)

#Set node size and color
network_graph.node_renderer.glyph = Circle(size=15, fill_color='skyblue')
network_graph.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
network_graph.node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

#Set edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.8, line_width=1)
network_graph.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=1)
network_graph.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=1)

#Selecting a node highlights its edges; hovering inspects an edge and its end nodes
network_graph.selection_policy = NodesAndLinkedEdges()
network_graph.inspection_policy = EdgesAndLinkedNodes()

#Add network graph to the plot
plot.renderers.append(network_graph)

output_notebook()

show(plot)
#Pass resources and title explicitly so save() does not fall back to defaults with a warning
save(plot, filename=f"{title}.html", title=title, resources=CDN)

  save(plot, filename=f"{title}.html")
  save(plot, filename=f"{title}.html")


'/Users/abhijithdasharathi/Study/UCSD/Winter_23/ECE143/CDC_Project/wonderD149Data/examples/Pregnancy_related_births_data_high_negative_correlation.html'

In [107]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, NodesAndLinkedEdges, MultiLine,EdgesAndLinkedNodes
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Spectral4
from bokeh.resources import CDN

#Choose a title! It is also used as the name of the saved HTML file.
title = 'Pregnancy_related_births_data_high_positive_correlation'

#Fixed seed so the spring layout (random initial positions) is reproducible
LAYOUT_SEED = 42

#Establish which edge attributes will appear when hovering over each edge
HOVER_TOOLTIPS = [("Feature Names", "@edge_nodes" ),("Divergence","@edge_length")]

#Create a plot — set dimensions, toolbar, and title
plot = figure(tooltips = HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
            x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1), title=title)

#Create a network graph object with spring layout
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
# Extra keyword arguments (scale, center, seed) are forwarded to the layout function.
network_graph = from_networkx(G_low, nx.spring_layout, scale=10, center=(0, 0), seed=LAYOUT_SEED)

#Set node size and color
network_graph.node_renderer.glyph = Circle(size=15, fill_color='skyblue')
network_graph.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
network_graph.node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

#Set edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.8, line_width=1)
network_graph.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=1)
network_graph.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=1)

#Selecting a node highlights its edges; hovering inspects an edge and its end nodes
network_graph.selection_policy = NodesAndLinkedEdges()
network_graph.inspection_policy = EdgesAndLinkedNodes()

#Add network graph to the plot
plot.renderers.append(network_graph)

output_notebook()

show(plot)
#Pass resources and title explicitly so save() does not fall back to defaults with a warning
save(plot, filename=f"{title}.html", title=title, resources=CDN)

  save(plot, filename=f"{title}.html")
  save(plot, filename=f"{title}.html")


'/Users/abhijithdasharathi/Study/UCSD/Winter_23/ECE143/CDC_Project/wonderD149Data/examples/Pregnancy_related_births_data_high_positive_correlation.html'