In [7]:
import numpy as np
import pandas as pd
from scipy import stats

import dowhy
from dowhy import CausalModel

from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import graphviz

import numpy as np
import networkx as nx


In [8]:
#Treat dataset

#Import data 
# data = pd.read_csv('../datasets/consolidated.csv',header=0, index_col=0)
data = pd.read_csv('./GenerateDatasets/consolidated.csv',header=0, index_col=0)
consolidated = pd.DataFrame(data)

consolidated = consolidated.drop(columns=['experiment'])#, 'messageDuplicated'])#, 'n_nodes', 'consensus', 'messageDuplicated'])
consolidated = consolidated.rename(columns={'topology':'top'})
consolidated['topology'] = 0
consolidated.loc[consolidated['top'] == 'general', 'topology'] = 1
consolidated = consolidated.drop(columns=['top'])
# consolidated.head(100)

df = consolidated[['d', 'messageReceived', 'graft', 'prune', 'iwant', 'dlo', 'dhi', 'dscore', 'dlazy', 'ihave', 'dout', 'gossipFactor', 
                  'interval', 'topology', 'propagationTime', 'messageOverhead', 'messageBandwidth', 'totalBandwidth', 'gossipBandwidth']]#.to_numpy()

# X = consolidated.to_numpy()
print(df)

        d  messageReceived  graft  prune  iwant  dlo  dhi  dscore  dlazy  \
0       8           8684.0    0.0    0.0    0.0    6   12       4      8   
1       8          16138.0    0.0    0.0    0.0    6   12       4      8   
2       8          13689.0    0.0    0.0    0.0    6   12       4      8   
3       8          13750.0    0.0    0.0    0.0    6   12       4      8   
4       8          13993.0    0.0    0.0    2.0    6   12       4      8   
...    ..              ...    ...    ...    ...  ...  ...     ...    ...   
33816  21              0.0    0.0    0.0    0.0   16   24       5     16   
33817  21              0.0    0.0    0.0    0.0   16   24       5     16   
33818  21              0.0    0.0    0.0    0.0   16   24       5     16   
33819  21              0.0    0.0    0.0    0.0   16   24       5     16   
33820  21              0.0    0.0    0.0    0.0   16   24       5     16   

        ihave  dout  gossipFactor  interval  topology  propagationTime  \
0       698.0

In [9]:
#Causal graph
causal_graph = nx.DiGraph([('d', 'messageReceived'),
                           ('d', 'graft'),
                           ('d', 'prune'),
                           ('d', 'iwant'),
                           ('dlo', 'graft'),
                           ('dhi', 'prune'),
                           ('dscore', 'graft'),
                           ('dscore', 'prune'),
                           ('dlazy', 'iwant'),
                           ('dlazy', 'ihave'),
                           ('dout', 'graft'),
                           ('dout', 'prune'),
                           ('gossipFactor', 'iwant'),
                           ('gossipFactor', 'ihave'),
                           ('interval', 'graft'),
                           ('interval', 'prune'),
                           ('interval', 'iwant'),
                           ('interval', 'ihave'),
                           ('topology', 'graft'),
                           ('topology', 'prune'),
                           ('topology', 'propagationTime'),
                           ('topology', 'messageReceived'),
                           ('prune', 'graft'),
                           ('prune', 'propagationTime'),
                           ('messageReceived', 'messageOverhead'),
                           ('messageOverhead', 'messageBandwidth'),
                           ('messageReceived', 'messageBandwidth'),
                           ('messageBandwidth', 'totalBandwidth'),
                           ('gossipBandwidth', 'totalBandwidth'),
                           ('ihave', 'gossipBandwidth'),
                           ('iwant', 'gossipBandwidth')
                         ])

gml_graph = "\n".join(nx.generate_gml(causal_graph))
# nx.write_gml(causal_graph, gml_graph)

print(gml_graph)

graph [
  directed 1
  node [
    id 0
    label "d"
  ]
  node [
    id 1
    label "messageReceived"
  ]
  node [
    id 2
    label "graft"
  ]
  node [
    id 3
    label "prune"
  ]
  node [
    id 4
    label "iwant"
  ]
  node [
    id 5
    label "dlo"
  ]
  node [
    id 6
    label "dhi"
  ]
  node [
    id 7
    label "dscore"
  ]
  node [
    id 8
    label "dlazy"
  ]
  node [
    id 9
    label "ihave"
  ]
  node [
    id 10
    label "dout"
  ]
  node [
    id 11
    label "gossipFactor"
  ]
  node [
    id 12
    label "interval"
  ]
  node [
    id 13
    label "topology"
  ]
  node [
    id 14
    label "propagationTime"
  ]
  node [
    id 15
    label "messageOverhead"
  ]
  node [
    id 16
    label "messageBandwidth"
  ]
  node [
    id 17
    label "totalBandwidth"
  ]
  node [
    id 18
    label "gossipBandwidth"
  ]
  edge [
    source 0
    target 1
  ]
  edge [
    source 0
    target 2
  ]
  edge [
    source 0
    target 3
  ]
  edge [
    source 0
    ta

In [10]:
# With graph
model = CausalModel(
    data=df,
    treatment='d',
    outcome='totalBandwidth',
    graph=gml_graph
)
model.view_model()

In [11]:
estimand = model.identify_effect()
print(estimand)

Estimand type: nonparametric-ate

### Estimand : 1
Estimand name: backdoor
Estimand expression:
 d                     
────(E[totalBandwidth])
d[d]                   
Estimand assumption 1, Unconfoundedness: If U→{d} and U→totalBandwidth then P(totalBandwidth|d,,U) = P(totalBandwidth|d,)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!



In [12]:
estimate = model.estimate_effect(
    identified_estimand=estimand,
    method_name='backdoor.econml.dml.DML',
    method_params={
        'init_params': {
            'model_y': GradientBoostingRegressor(),
            'model_t': GradientBoostingRegressor(),
            'model_final': LassoCV(fit_intercept=False),
        },
        'fit_params': {}}
)

print(f'Estimate of causal effect (DML): {estimate.value}')

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


Estimate of causal effect (DML): 14.69242494042134


In [13]:
estimate_lr = model.estimate_effect(
    identified_estimand=estimand,
    method_name='backdoor.linear_regression')

print(f'Estimate of causal effect (linear regression): {estimate_lr.value}')

linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': ['gossipFactor', 'ihave', 'topology', 'interval', 'dlazy']}



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Estimate of causal effect (linear regression): 23.536463116463665


In [14]:
random_cause = model.refute_estimate(
    estimand=estimand, 
    estimate=estimate,
    method_name='random_common_cause'
)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please cha

In [15]:
print(random_cause)

Refute: Add a random common cause
Estimated effect:14.69242494042134
New effect:15.3857859271261
p value:0.02



In [16]:
placebo_refuter = model.refute_estimate(
    estimand=estimand, 
    estimate=estimate,
    method_name='placebo_treatment_refuter'
)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please cha

In [17]:
print(placebo_refuter)

Refute: Use a Placebo Treatment
Estimated effect:14.69242494042134
New effect:-0.007964416033333753
p value:1.28

