In [152]:
import pandas as pd
import numpy as np
import sys
import traceback
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Final, List, TypeVar, Dict, Tuple, Any, Optional
import sklearn as skl

In [153]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later cells
# will not be shown either - confirm that is intended.
warnings.filterwarnings("ignore")

#print(sns.plotting_context())

# Seaborn theme applied to all later plots.
sns.set_theme(style="darkgrid",context="paper")

import matplotlib.style
# Only switch the matplotlib style when this installation lists it as available.
if "seaborn-darkgrid" in matplotlib.style.available:
    matplotlib.style.use("seaborn-darkgrid")

%matplotlib inline

In [154]:
# Single seed reused as the random_state of the splitters defined later in this notebook.
_seed: Final[int] = 42
"Using the meaning of life, the universe, and everything as the seed for RNG"

# NumPy generator built from the same seed.
_rng: Final[np.random.Generator] = np.random.default_rng(seed=_seed)
"and creating a numpy random generator, using our set seed of 42"

# Trailing empty string: the cell echoes '' instead of the string literal above.
""

''

# IHDP dataset processing


In [155]:
def turn_01_columns_into_int(
        dataframe_to_edit: pd.DataFrame,
) -> pd.DataFrame:
    """
    Downcast every column that holds nothing but 0s and 1s to ``uint8``.

    The frame is edited IN PLACE (no copy is made) and the very same object
    is handed back, so pass ``df.copy()`` if the original must stay intact.

    Two entries are written to ``dataframe_to_edit.attrs``:

    * ``'01'``     - tuple of the labels of the columns that were converted
    * ``'not_01'`` - tuple of the labels of every other column

    A column containing a missing value, or any value other than 0/1,
    is left untouched and is reported under ``'not_01'``.

    :param dataframe_to_edit: the dataframe to convert in place
    :return: the same dataframe object, after modification

    Example: for columns ``{"a": [0, 1], "b": [0.0, 1.0], "c": [0, 2]}``
    columns ``a`` and ``b`` become ``uint8``, ``attrs['01'] == ('a', 'b')``
    and ``attrs['not_01'] == ('c',)``.
    """
    binary_labels: List[str] = []
    other_labels: List[str] = []

    for label in dataframe_to_edit.columns:
        column = dataframe_to_edit[label]

        # Anything outside {0, 1} (NaN included) disqualifies the column.
        if not column.isin([0, 1]).all():
            other_labels.append(label)
            continue

        dataframe_to_edit[label] = column.astype(np.uint8)
        binary_labels.append(label)

    dataframe_to_edit.attrs["01"] = tuple(binary_labels)
    dataframe_to_edit.attrs["not_01"] = tuple(other_labels)
    return dataframe_to_edit

In [156]:

# Read the CSV and, in the same step, downcast its 0/1-only columns to uint8.
ihdp_full: pd.DataFrame = turn_01_columns_into_int(pd.read_csv("ihdp_full.csv"))
"The full IHDP dataset (with supplementary t0 and t1 info) as a dataframe"

# Preview the first rows.
ihdp_full.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,t,yf,ycf,ite,t0,t1
0,1.397395,0.996346,-1.105624,-0.879606,0.308569,-1.023402,1,0,0,0,...,0,0,0,1,1,4.771232,-0.298509,4.657928,-0.298509,4.771232
1,0.269033,0.196818,0.383828,0.161703,-0.629189,1.460832,1,0,1,0,...,0,0,0,0,0,2.956273,5.78377,3.428604,2.956273,5.78377
2,1.051537,1.795874,-1.105624,0.161703,-0.629189,0.963985,1,0,1,1,...,0,0,0,1,0,4.164164,7.055789,3.658195,4.164164,7.055789
3,0.662446,0.196818,-0.733261,-0.879606,0.371086,-0.692171,1,0,0,0,...,0,0,0,0,1,6.172307,1.379697,4.585505,1.379697,6.172307
4,0.856992,1.795874,0.011465,-0.879606,0.558638,0.301522,0,1,1,0,...,0,0,0,0,1,7.834469,2.747986,4.265591,2.747986,7.834469


In [157]:
# Column dtypes and non-null counts of the loaded frame.
ihdp_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      747 non-null    float64
 1   x1      747 non-null    float64
 2   x2      747 non-null    float64
 3   x3      747 non-null    float64
 4   x4      747 non-null    float64
 5   x5      747 non-null    float64
 6   x6      747 non-null    uint8  
 7   x7      747 non-null    uint8  
 8   x8      747 non-null    uint8  
 9   x9      747 non-null    uint8  
 10  x10     747 non-null    uint8  
 11  x11     747 non-null    uint8  
 12  x12     747 non-null    uint8  
 13  x13     747 non-null    uint8  
 14  x14     747 non-null    uint8  
 15  x15     747 non-null    uint8  
 16  x16     747 non-null    uint8  
 17  x17     747 non-null    uint8  
 18  x18     747 non-null    uint8  
 19  x19     747 non-null    uint8  
 20  x20     747 non-null    uint8  
 21  x21     747 non-null    uint8  
 22  x2

In [158]:
# Keep every column except the four that are excluded from the factual view.
ihdp_factuals: pd.DataFrame = ihdp_full.loc[
    :,
    [label not in ("ycf", "ite", "t0", "t1") for label in ihdp_full.columns],
]
"A version of the IHDP dataset containing ONLY the factual data"

# Everything but the factual outcome column.
ihdp_factuals_no_y: pd.DataFrame = ihdp_factuals.loc[
    :, ~ihdp_factuals.columns.isin(["yf"])
]
"IHDP dataset with the factual Y omitted"

# The factual outcome column alone (still a DataFrame, not a Series).
ihdp_factuals_y: pd.DataFrame = ihdp_factuals.loc[
    :, ihdp_factuals.columns.isin(["yf"])
]
"Only the Y data from the IHDP dataset"

""

''

In [159]:

from sklearn.model_selection import StratifiedKFold

ihdp_learn_validation_skf: StratifiedKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=_seed)
"Using this to remove 10% of the treated/untreated factuals from ihdp for use as part of the validation dataset later on"

# Only the first of the ten stratified folds is used, so take a single split from the
# generator instead of materialising all ten and indexing the list.
ihdp_learn_indices, ihdp_validation_indices = next(
    ihdp_learn_validation_skf.split(ihdp_factuals, ihdp_factuals["t"])
)

ihdp_learn_df: pd.DataFrame = ihdp_factuals.iloc[ihdp_learn_indices]
"The dataframe that is the subset of the IHDP factual data which will be used for learning feature importances etc"

ihdp_learn_df_x: pd.DataFrame = ihdp_learn_df.loc[:, ihdp_learn_df.columns != "yf"]
"X/T info for the dataframe that is the subset of the IHDP factual data which will be used for learning feature importances etc"
ihdp_learn_df_y: pd.DataFrame = ihdp_learn_df.loc[:, ihdp_learn_df.columns == "yf"]
"Y info for the dataframe that is the subset of the IHDP factual data which will be used for learning feature importances etc"

# Held-out fold of the factual data: the full rows, the rows without "yf",
# and the "yf" column on its own.
ihdp_validation_factual_df: pd.DataFrame = ihdp_factuals.iloc[ihdp_validation_indices]
ihdp_validation_factual_df_x: pd.DataFrame = ihdp_validation_factual_df.loc[:, ihdp_validation_factual_df.columns != "yf"]
ihdp_validation_factual_df_y: pd.DataFrame = ihdp_validation_factual_df.loc[:, ihdp_validation_factual_df.columns == "yf"]

""

''

In [160]:
from sklearn.model_selection import KFold
from sklearn.model_selection._validation import NotFittedError
from sklearn.base import RegressorMixin, TransformerMixin
from sklearn.linear_model import ARDRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.metrics import r2_score, make_scorer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from math import inf

# Ask scikit-learn to display estimators as diagrams.
set_config(display="diagram")

# Type variable bounded by RegressorMixin, used to annotate the regressor arguments below.
R = TypeVar('R', bound=RegressorMixin)

In [161]:
def np_data_and_targets(df: pd.DataFrame, targetname: str = "yf") -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a dataframe into a feature matrix and a target vector, both as numpy arrays.

    :param df: dataframe holding the features and the target column
    :param targetname: label of the column that holds the targets
    :return: ``(features, targets)`` - every column except ``targetname`` as one
        ndarray, and the ``targetname`` column by itself as another
    """
    is_feature_column = df.columns != targetname

    feature_frame = df.loc[:, is_feature_column]
    target_column = df.loc[:, targetname]

    return feature_frame.to_numpy(), target_column.to_numpy()

In [162]:
def halving_grid_searcher(
        regressor: R,
        param_grid: Dict[str, List[Any]],
        train_data: np.ndarray,
        train_targets: np.ndarray,
        k_fold_NO_SHUFFLE: KFold = KFold(n_splits=5, shuffle=False),
        class_weights: Optional[np.ndarray] = None
) -> HalvingGridSearchCV:
    """
    Fit a successive-halving grid search over a ``scaler -> regressor`` pipeline.

    The regressor is placed in a pipeline behind a ``QuantileTransformer`` (normal
    output distribution) under the step name ``"regressor"``, so the keys of
    ``param_grid`` must use the ``regressor__<param>`` form.

    :param regressor: the (unfitted) regressor to tune
    :param param_grid: pipeline parameter names mapped to the candidate values to try
    :param train_data: feature matrix used for the search
    :param train_targets: targets matching ``train_data``
    :param k_fold_NO_SHUFFLE: cross-validation splitter used inside the search
    :param class_weights: optional per-sample weights; when given they are forwarded
        to the regressor's ``fit`` as ``sample_weight``
    :return: the fitted ``HalvingGridSearchCV`` (refit on the best candidate)
    """

    pipe: Pipeline = Pipeline([
        ("scaler", QuantileTransformer(output_distribution="normal")),
        #("imputer",KNNImputer(add_indicator=False, weights="distance")),
        ("regressor",regressor)
    ])

    h_grid_search: HalvingGridSearchCV = HalvingGridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        factor=3,
        cv=k_fold_NO_SHUFFLE,
        scoring=make_scorer(r2_score),
        refit=True,
        verbose=1,
        n_jobs=-1,
        error_score=-1000000000000
        # Negative infinity would be the natural error score, but it triggers a flood of
        # warnings. A huge finite negative number ranks failed fits last just the same;
        # if a legitimate score ever gets this low, the error score is the least of the
        # problems.
    )

    # Fit parameters aimed at a Pipeline step must carry the "<step>__" prefix; a bare
    # "sample_weight" is rejected by Pipeline.fit, which would make every weighted fit fail.
    fit_params: Dict[str, np.ndarray] = {}
    if class_weights is not None:
        fit_params["regressor__sample_weight"] = class_weights

    h_grid_search.fit(train_data, train_targets, **fit_params)

    return h_grid_search



In [185]:
def nested_halving_grid_searcher(
        regressor: R,
        param_grid: Dict[str, List[Any]],
        train_data: np.ndarray,
        train_targets: np.ndarray,
        kfold_splits: int = 6,
        classes_ndarray: Optional[np.ndarray] = None
) -> Dict[HalvingGridSearchCV, float]:
    """
    Nested cross-validation around ``halving_grid_searcher``.

    The data is split into ``kfold_splits`` shuffled outer folds (seeded with ``_seed``).
    For each outer fold a halving grid search is fitted on the training part and the
    refitted best model is scored on the held-out part.

    :param regressor: the (unfitted) regressor to tune
    :param param_grid: ``regressor__<param>`` names mapped to candidate values
    :param train_data: feature matrix, indexable by an array of row indices
    :param train_targets: targets matching ``train_data``
    :param kfold_splits: number of outer folds; the inner search uses one fewer
        (but never fewer than 2, the minimum ``KFold`` accepts)
    :param classes_ndarray: optional per-row values; for each outer fold the training
        rows' values are divided by their sum and passed on as sample weights
    :return: every fitted search mapped to its score on its outer held-out fold.
        Folds that raise ``NotFittedError`` are reported on stderr and left out.
    """

    h_grid_search_dicts: Dict[HalvingGridSearchCV, float] = {}

    kf: KFold = KFold(n_splits=kfold_splits, shuffle=True, random_state=_seed)

    # KFold rejects n_splits < 2, so clamp at 2 rather than 1.
    inner_splits: int = max(2, kfold_splits - 1)

    for i, (train_indices, test_indices) in enumerate(kf.split(train_data, train_targets), 1):
        print(f"-- {i}/{kfold_splits} start --")
        try:
            # None means "unweighted"; halving_grid_searcher treats it as such.
            train_weights: Optional[np.ndarray] = None
            if classes_ndarray is not None:
                train_classes: np.ndarray = np.take(classes_ndarray, train_indices)
                train_weights = train_classes / np.sum(train_classes)

            current_search: HalvingGridSearchCV = halving_grid_searcher(
                regressor,
                param_grid,
                train_data[train_indices],
                train_targets[train_indices],
                KFold(n_splits=inner_splits, shuffle=False),
                class_weights=train_weights
            )

            current_score: float = current_search.score(
                train_data[test_indices],
                train_targets[test_indices]
            )

            h_grid_search_dicts[current_search] = current_score

            print(f"-- {i}/{kfold_splits} done --")

        except NotFittedError as e:
            # Diagnostics go to stderr via file=; a positional sys.stderr would just be
            # printed as one more value on stdout.
            print("oh no! there was a not fitted error!", file=sys.stderr)
            print(e, file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)

    return h_grid_search_dicts

In [164]:
from sklearn.ensemble import RandomForestRegressor


In [165]:
# Baseline model: quantile-scaled features fed into a random forest with default settings.
fpipeline = Pipeline(
    steps=[
        ("scaler",QuantileTransformer()),
        # Seeded so the baseline score is reproducible from a fresh kernel.
        # NOTE: scores recorded before this change came from an unseeded forest; re-run to refresh.
        ("learner",RandomForestRegressor(random_state=_seed))
        #("learner",ARDRegression())
        #("learner",AdaBoostRegressor(base_estimator=ARDRegression()))
        #("learner",LinearRegression())
    ]
)
learner = fpipeline["learner"]
# The target frame holds the single "yf" column; flatten it to the 1-D vector
# that the regressor's fit expects.
fpipeline.fit(ihdp_learn_df_x.to_numpy(), ihdp_learn_df_y.to_numpy().ravel())

fpipeline

In [166]:
# Score the baseline pipeline on the held-out validation fold.
fpipeline.score(ihdp_validation_factual_df_x.to_numpy(), ihdp_validation_factual_df_y.to_numpy())


0.6928530080365853

In [186]:
# Nested halving grid search for the random forest: one fitted search per outer fold,
# each mapped to its score on that fold's held-out rows.
random_forest_searched_dict: Dict[HalvingGridSearchCV, float] = nested_halving_grid_searcher(
    regressor=RandomForestRegressor(criterion="squared_error"),
    param_grid={
        #"regressor__criterion": ["squared_error", "poisson"],
        "regressor__n_estimators": [75, 100, 125],
        "regressor__min_samples_split": [2, 4, 6, 8],
        #"regressor__min_impurity_decrease": [0, *np.geomspace(0.00001,0.2,6)[1:]],
        "regressor__max_features": [None, "sqrt", "log2", 1, 2],
        "regressor__oob_score": [False, True],
        #"regressor__ccp_alpha": [0, *np.geomspace(0.00001,0.2,6)[1:]]
    },
    train_data=ihdp_learn_df_x.values,
    train_targets=ihdp_learn_df_y.values,
    kfold_splits=6,
    classes_ndarray=None #classes_ndarray=ihdp_learn_df_x["t"].to_numpy()
)

# (search, score) pair with the highest outer-fold score; the first one wins a tie.
best_rf: Tuple[HalvingGridSearchCV, float] = max(
    random_forest_searched_dict.items(),
    key=lambda search_and_score: search_and_score[1]
)

# The winning search on its own, for convenient access to best_estimator_.
rf_searched: HalvingGridSearchCV = best_rf[0]

-- 1/6 start --
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 560
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 120
n_resources: 10
Fitting 5 folds for each of 120 candidates, totalling 600 fits
----------
iter: 1
n_candidates: 40
n_resources: 30
Fitting 5 folds for each of 40 candidates, totalling 200 fits
----------
iter: 2
n_candidates: 14
n_resources: 90
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 270
Fitting 5 folds for each of 5 candidates, totalling 25 fits
-- 1/6 done --
-- 2/6 start --
n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 10
max_resources_: 560
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 120
n_resources: 10
Fitting 5 folds for each of 120 candidates, totalling 600 fits
----------
iter: 1
n_candidates: 40
n_resources: 30
Fitting 5 folds for each of 40 candidate

In [187]:
# Full (search, score) pair for the best random-forest search.
print(best_rf)

# Just its outer-fold score.
print(best_rf[1])

# The pipeline refitted with the best parameters found by that search.
rf_searched.best_estimator_

(HalvingGridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                    error_score=-1000000000000,
                    estimator=Pipeline(steps=[('scaler',
                                               QuantileTransformer(output_distribution='normal')),
                                              ('regressor',
                                               RandomForestRegressor())]),
                    n_jobs=-1,
                    param_grid={'regressor__max_features': [None, 'sqrt',
                                                            'log2', 1, 2],
                                'regressor__min_samples_split': [2, 4, 6, 8],
                                'regressor__n_estimators': [75, 100, 125],
                                'regressor__oob_score': [False, True]},
                    scoring=make_scorer(r2_score), verbose=1), 0.7744807092877399)
0.7744807092877399


In [192]:
# Candidate values for the ARDRegression grid. The same list of small constants is
# reused for alpha_1, alpha_2, lambda_1 and lambda_2.
ard_iter: List[int] = [200, 300, 400]
ard_tol: List[float] = [1e-2, 1e-3, 1e-4]
ard_alpha_lambda: List[float] = [1e-5, 1e-6, 1e-7]
ard_thresh_lambda: List[float] = [1e3, 1e4, 1e5]

# Nested halving grid search for ARD regression: one fitted search per outer fold,
# each mapped to its score on that fold's held-out rows.
# NOTE(review): the recorded output of this cell reports 729 candidates and a param_grid
# without regressor__n_iter, so it appears to predate this grid - re-run to refresh.
ard_searched_dict: Dict[HalvingGridSearchCV, float] = nested_halving_grid_searcher(
    regressor=ARDRegression(),
    param_grid={
        "regressor__n_iter": ard_iter,
        "regressor__tol": ard_tol,
        "regressor__alpha_1": ard_alpha_lambda,
        "regressor__alpha_2": ard_alpha_lambda,
        "regressor__lambda_1": ard_alpha_lambda,
        "regressor__lambda_2": ard_alpha_lambda,
        "regressor__threshold_lambda": ard_thresh_lambda
    },
    train_data=ihdp_learn_df_x.values,
    train_targets=ihdp_learn_df_y.values,
    kfold_splits=6,
    classes_ndarray=None
)

# (search, score) pair with the highest outer-fold score; the first one wins a tie.
best_ard: Tuple[HalvingGridSearchCV, float] = max(
    ard_searched_dict.items(),
    key=lambda search_and_score: search_and_score[1]
)

# The winning search on its own, for convenient access to best_estimator_.
ard_searched: HalvingGridSearchCV = best_ard[0]

-- 1/6 start --
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 560
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 729
n_resources: 10
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
----------
iter: 1
n_candidates: 243
n_resources: 30
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
----------
iter: 2
n_candidates: 81
n_resources: 90
Fitting 5 folds for each of 81 candidates, totalling 405 fits
----------
iter: 3
n_candidates: 27
n_resources: 270
Fitting 5 folds for each of 27 candidates, totalling 135 fits
-- 1/6 done --
-- 2/6 start --
n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 10
max_resources_: 560
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 729
n_resources: 10
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
----------
iter: 1
n_candidates: 243
n_resources: 30
Fitting 5 folds for each of 24

In [193]:
# Full (search, score) pair for the best ARD search.
print(best_ard)

# Just its outer-fold score.
print(best_ard[1])

# The pipeline refitted with the best parameters found by that search.
ard_searched.best_estimator_

(HalvingGridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                    error_score=-1000000000000,
                    estimator=Pipeline(steps=[('scaler',
                                               QuantileTransformer(output_distribution='normal')),
                                              ('regressor', ARDRegression())]),
                    n_jobs=-1,
                    param_grid={'regressor__alpha_1': [1e-05, 1e-06, 1e-07],
                                'regressor__alpha_2': [1e-05, 1e-06, 1e-07],
                                'regressor__lambda_1': [1e-05, 1e-06, 1e-07],
                                'regressor__lambda_2': [1e-05, 1e-06, 1e-07],
                                'regressor__threshold_lambda': [1000.0, 10000.0,
                                                                100000.0],
                                'regressor__tol': [0.01, 0.001, 0.0001]},
                    scoring=make_scorer(r2_score), verbose=1), 0.779