diff --git a/.github/workflows/deploy_docu_dev.yml b/.github/workflows/deploy_docu_dev.yml index a058ffc1..7ac3d8d1 100644 --- a/.github/workflows/deploy_docu_dev.yml +++ b/.github/workflows/deploy_docu_dev.yml @@ -70,7 +70,7 @@ jobs: run: | install.packages('remotes') remotes::install_github('DoubleML/doubleml-for-r', dependencies = TRUE) - install.packages(c('ggplot2', 'IRkernel', 'xgboost', 'hdm', 'reshape2', 'gridExtra', "igraph", "mlr3filters")) + install.packages(c('ggplot2', 'IRkernel', 'xgboost', 'hdm', 'reshape2', 'gridExtra', "igraph", "mlr3filters", 'DiagrammeR')) IRkernel::installspec() shell: Rscript {0} diff --git a/.github/workflows/test_build_docu_dev.yml b/.github/workflows/test_build_docu_dev.yml index e6a96e01..a4dea407 100644 --- a/.github/workflows/test_build_docu_dev.yml +++ b/.github/workflows/test_build_docu_dev.yml @@ -98,7 +98,7 @@ jobs: run: | install.packages('remotes') remotes::install_github('DoubleML/doubleml-for-r', dependencies = TRUE) - install.packages(c('ggplot2', 'IRkernel', 'xgboost', 'hdm', 'reshape2', 'gridExtra', "igraph", "mlr3filters")) + install.packages(c('ggplot2', 'IRkernel', 'xgboost', 'hdm', 'reshape2', 'gridExtra', "igraph", "mlr3filters", 'DiagrammeR')) IRkernel::installspec() shell: Rscript {0} @@ -107,7 +107,7 @@ jobs: run: | install.packages('remotes') remotes::install_github('DoubleML/doubleml-for-r@${{ github.event.inputs.doubleml-r-branch }}', dependencies = TRUE) - install.packages(c('ggplot2', 'IRkernel', 'xgboost', 'hdm', 'reshape2', 'gridExtra', "igraph", "mlr3filters")) + install.packages(c('ggplot2', 'IRkernel', 'xgboost', 'hdm', 'reshape2', 'gridExtra', "igraph", "mlr3filters", 'DiagrammeR')) IRkernel::installspec() shell: Rscript {0} diff --git a/doc/examples/index.rst b/doc/examples/index.rst index a67c9d6a..23bc059b 100644 --- a/doc/examples/index.rst +++ b/doc/examples/index.rst @@ -16,6 +16,7 @@ These are case studies with the R package :ref:`DoubleML `. 
R_double_ml_pension.ipynb r_double_ml_multiway_cluster.ipynb + r_double_ml_basic_iv.ipynb Python: Case studies --------------------- @@ -37,6 +38,7 @@ These are case studies with the Python package :ref:`DoubleML py_double_ml_learner.ipynb py_double_ml_sensitivity.ipynb py_double_ml_policy_tree.ipynb + py_double_ml_basic_iv.ipynb Sandbox ---------- diff --git a/doc/examples/py_double_ml_basic_iv.ipynb b/doc/examples/py_double_ml_basic_iv.ipynb new file mode 100644 index 00000000..261888f8 --- /dev/null +++ b/doc/examples/py_double_ml_basic_iv.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Instrumental Variables calculation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we show how to use the DoubleML functionality of Instrumental Variables (IVs) in the basic setting shown in the graph below, where:\n", + "\n", + "- Z is the instrument\n", + "- C is a vector of unobserved confounders\n", + "- D is the decision or treatment variable\n", + "- Y is the outcome\n", + "\n", + "So, we will first generate synthetic data using linear models compatible with the diagram, and then use the DoubleML package to estimate the causal effect from D to Y. \n", + "\n", + "We assume that you have basic knowledge of instrumental variables and linear regression."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "tags": [ + "nbsphinx-thumbnail" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import Image\n", + "from graphviz import Source\n", + "from numpy.random import seed, normal, binomial, uniform\n", + "from pandas import DataFrame\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "import doubleml as dml\n", + "\n", + "seed(1234)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instrumental Variables Directed Acyclic Graph (IV - DAG)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "C\n", + "\n", + "C\n", + "\n", + "\n", + "\n", + "D\n", + "\n", + "D\n", + "\n", + "\n", + "\n", + "C->D\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Y\n", + "\n", + "Y\n", + "\n", + "\n", + "\n", + "C->Y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "D->Y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Z\n", + "\n", + "Z\n", + "\n", + "\n", + "\n", + "Z->D\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "src = Source(\n", + " \"\"\"digraph G {\n", + " rankdir=\"UD\"\n", + "\n", + " node[shape=box, style=rounded]\n", + " C[style=\"rounded,dashed\"]\n", + " {\n", + " rank = same\n", + " D\n", + " Y\n", + " Z\n", + " }\n", + "\n", + " Z -> D[minlen=3]\n", + " C -> D\n", + " C -> Y\n", + " D -> Y[minlen=5]\n", + " }\"\"\")\n", + "src" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Simulation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This code generates `n` samples in which there is a unique binary confounder. 
The treatment is also a binary variable, while the outcome is a continuous linear model. \n", + "\n", + "The quantity we want to recover using IVs is the `decision_impact`, which is the impact of the decision variable into the outcome. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5f8b1555", + "metadata": {}, + "outputs": [], + "source": [ + "n = 1000\n", + "instrument_impact = 0.7\n", + "decision_impact = - 2\n", + "\n", + "confounder = binomial(1, 0.3, n)\n", + "instrument = binomial(1, 0.5, n)\n", + "decision = (uniform(0, 1, n) <= instrument_impact*instrument + 0.4*confounder).astype(int)\n", + "outcome = 30 + decision_impact*decision + 10 * confounder + normal(0, 2, n)\n", + "\n", + "df = DataFrame({\n", + " 'instrument': instrument,\n", + " 'decision': decision,\n", + " 'outcome': outcome\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Naive estimation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that if we make a direct estimation of the impact of the `decision` into the `outcome`, through the difference of the averages of outcomes between the two decision groups, we obtain a biased estimate. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2d00221a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.1099472942084532\n" + ] + } + ], + "source": [ + "outcome_1 = df[df.decision==1].outcome.mean()\n", + "outcome_0 = df[df.decision==0].outcome.mean()\n", + "print(outcome_1 - outcome_0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using DoubleML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DoubleML assumes that there is at least one observed confounder. For this reason, we create a fake variable that doesn't bring any kind of information to the model, called `obs_confounders`."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the DoubleML we need to specify the Machine Learning methods we want to use to estimate the different relationships between variables:\n", + "\n", + "- `ml_g` models the functional relationship between the `outcome` and the pair `instrument` and observed confounders `obs_confounders`. In this case we choose a `LinearRegression` because the outcome is continuous. \n", + "- `ml_m` models the functional relationship between the `obs_confounders` and the `instrument`. In this case we choose a `LogisticRegression` because the outcome is dichotomic.\n", + "- `ml_r` models the functional relationship between the `decision` and the pair `instrument` and observed confounders `obs_confounders`. In this case we choose a `LogisticRegression` because the outcome is dichotomic.\n", + "\n", + "\n", + "Notice that instead of using linear and logistic regression, we could use more flexible models capable of dealing with non-linearities such as random forests, boosting, ... " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "600b8196", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
coefstd errtP>|t|2.5 %97.5 %
decision-1.9170920.487533-3.9322270.000084-2.87264-0.961544
\n", + "
" + ], + "text/plain": [ + " coef std err t P>|t| 2.5 % 97.5 %\n", + "decision -1.917092 0.487533 -3.932227 0.000084 -2.87264 -0.961544" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['obs_confounders'] = 1\n", + "\n", + "ml_g = LinearRegression()\n", + "ml_m = LogisticRegression(penalty=None)\n", + "ml_r = LogisticRegression(penalty=None)\n", + "\n", + "obj_dml_data = dml.DoubleMLData(\n", + " df, y_col='outcome', d_cols='decision', \n", + " z_cols='instrument', x_cols='obs_confounders'\n", + ")\n", + "dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data, ml_g, ml_m, ml_r)\n", + "dml_iivm_obj.fit().summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the causal effect is estimated without bias." + ] + }, + { + "cell_type": "markdown", + "id": "7e3de685", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "Ruiz de Villa, A. Causal Inference for Data Science, Manning Publications, 2024." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/examples/r_double_ml_basic_iv.ipynb b/doc/examples/r_double_ml_basic_iv.ipynb new file mode 100644 index 00000000..1a1ef706 --- /dev/null +++ b/doc/examples/r_double_ml_basic_iv.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Instrumental Variables calculation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we show how to use the DoubleML functionality of Instrumental Variables (IVs) in the basic setting shown in the graph below, where:\n", + "\n", + "- Z is the instrument\n", + "- C is a vector of unobserved confounders\n", + "- D is the decision or treatment variable\n", + "- Y is the outcome\n", + "\n", + "So, we will first generate synthetic data using linear models compatible with the diagram, and then use the DoubleML package to estimate the causal effect from D to Y. \n", + "\n", + "We assume that you have basic knowledge of instrumental variables and linear regression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "library(DoubleML)\n", + "library(mlr3learners)\n", + "library(DiagrammeR)\n", + "\n", + "set.seed(1234)\n", + "options(warn=-1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instrumental Variables Directed Acyclic Graph (IV - DAG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "grViz(\n", + " 'digraph G {\n", + " rankdir=\"UD\"\n", + "\n", + " node[shape=box, style=rounded]\n", + " C[style=\"rounded,dashed\"]\n", + " {\n", + " rank = same\n", + " D\n", + " Y\n", + " Z\n", + " }\n", + "\n", + " Z -> D[minlen=3]\n", + " C -> D\n", + " C -> Y\n", + " D -> Y[minlen=5]\n", + " }')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "tags": [ + "nbsphinx-thumbnail" + ] + }, + "source": [ + "## Data Simulation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This code generates `n` samples in which there is a unique binary confounder. The treatment is also a binary variable, while the outcome is a continuous linear model. \n", + "\n", + "The quantity we want to recover using IVs is the `decision_impact`, which is the impact of the decision variable into the outcome. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f8b1555", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "n = 10000\n", + "decision_effect = -2\n", + "instrument_effect = 0.7\n", + "\n", + "confounder = rbinom(n, 1, 0.3)\n", + "instrument = rbinom(n, 1, 0.5)\n", + "decision = as.numeric(runif(n) = instrument_effect*instrument + 0.4*confounder)\n", + "outcome = 30 + decision_effect*decision + 10 * confounder + rnorm(n, sd=2)\n", + "df = data.frame(instrument, decision, outcome)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Naive estimation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that if we make a direct estimation of the impact of the `decision` into the `outcome`, though the difference of the averages of outcomes between the two decision groups, we obtain a biased estimate. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d00221a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "mean(df[df$decision==1, 'outcome']) - mean(df[df$decision==0, 'outcome'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using DoubleML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DoubleML assumes that there is at least one observed confounder. For this reason, we create a fake variable that doesn't bring any kind of information to the model, called `obs_confounder`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the DoubleML we need to specify the Machine Learning methods we want to use to estimate the different relationships between variables:\n", + "\n", + "- `ml_g` models the functional relationship betwen the `outcome` and the pair `instrument` and observed confounders `obs_confounders`. In this case we choose a `LinearRegression` because the outcome is continuous. 
\n", + "- `ml_m` models the functional relationship betwen the `obs_confounders` and the `instrument`. In this case we choose a `LogisticRegression` because the outcome is dichotomic.\n", + "- `ml_r` models the functional relationship betwen the `decision` and the pair `instrument` and observed confounders `obs_confounders`. In this case we choose a `LogisticRegression` because the outcome is dichotomic.\n", + "\n", + "\n", + "Notice that instead of using linear and logistic regression, we could use more flexible models capable of dealing with non-linearities such as random forests, boosting, ... " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "600b8196", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "df['obs_confounders'] <- 1\n", + "\n", + "obj_dml_data = DoubleMLData$new(\n", + " df, y_col=\"outcome\", d_col = \"decision\", \n", + " z_cols= \"instrument\", x_cols = \"obs_confounders\"\n", + ")\n", + "\n", + "ml_g = lrn(\"regr.lm\")\n", + "ml_m = lrn(\"classif.log_reg\")\n", + "ml_r = ml_m$clone()\n", + "\n", + "iv_2 = DoubleMLIIVM$new(obj_dml_data, ml_g, ml_m, ml_r)\n", + "result = iv_2$fit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the causal effect is estimated without bias." + ] + }, + { + "cell_type": "markdown", + "id": "fe005575", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "Ruiz de Villa, A. Causal Inference for Data Science, Manning Publications, 2024." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.2.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt index 4d619910..6d19fa1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ plotly seaborn xgboost lightgbm +graphviz \ No newline at end of file