In [1]:
%load_ext autoreload
%autoreload 2

# MAR experiments

## Data Preparation (Run once to get the dataset csv file)

In [2]:
import pandas as pd

# Fetch the UCI concrete workbook directly from the archive.
data = pd.read_excel(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
)

# Discard the right-most column; only the remaining columns are kept.
last_col = data.columns[-1]
data = data.drop(columns=[last_col])

# Relabel the remaining columns positionally as Z_0, Z_1, ...
data = data.rename(
    columns={old: 'Z_{}'.format(pos) for pos, old in enumerate(data.columns)}
)

# Min-max scale every column in one vectorised step:
# (x - column minimum) / (column maximum - column minimum).
col_min = data.min()
col_max = data.max()
data = (data - col_min) / (col_max - col_min)

In [3]:
# Persist the normalized frame to 'normalized_data.csv'; index=False keeps the
# row index out of the file so that only the Z_* columns are written.
data.to_csv('normalized_data.csv', index=False)

In [4]:
# Assemble the graph description text: a '# nodes' header followed by one
# '<column>: data(./normalized_data.csv)' line for every column of `data`.
node_lines = ['{}: data(./normalized_data.csv)\n'.format(col) for col in data.columns]
gdf = ''.join(['# nodes\n'] + node_lines)

# Write the description to 'gdf_Z.yml'.
with open('gdf_Z.yml', 'w') as out:
    out.write(gdf)

## Main

### Run the same scenario as the original paper

Run the script `mar_original_paper_experiment.py`

## Gradually increasing Z->R edges' density

In [5]:
from parcs.helpers.missing_data import R_adj_matrix, indicator_graph_description_file, m_graph_convert
from parcs.graph_builder.randomizer import ConnectRandomizer, guideline_iterator
from parcs.cdag.graph_objects import Graph
import random as rand
import numpy as np
import pandas as pd
import json
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.distributions import enable_reproducible_results
import warnings
warnings.filterwarnings("ignore")

# 0. configs
data = pd.read_csv('normalized_data.csv')
N = 700  # number of samples
N_total = len(data.columns)  # number of total variables
N_O = 4  # number of fully observed variables
N_m = N_total - N_O  # number of missing features
miss_ratio = 0.5  # missing ratio in total


def RMSE(X, X_true, mask):
    """Root-mean-square error between ``X`` and ``X_true`` over the masked entries.

    Args:
        X: array of values (e.g. ground truth or imputed data).
        X_true: array of the same shape as ``X``.
        mask: boolean array of the same shape; True marks entries to score.

    Returns:
        float: sqrt(sum((X - X_true)[mask] ** 2) / mask.sum()), i.e. the same
        masked formula that is written inline further down in this notebook.
    """
    X, X_true, mask = np.asarray(X), np.asarray(X_true), np.asarray(mask)
    # Cast to a plain float so the value serialises cleanly with json.dump.
    return float(np.sqrt(np.sum(np.square(X[mask] - X_true[mask])) / mask.sum()))


# results[value] -> {'hyperimpute': [rmse, ...], 'missforest': [rmse, ...]}
results = {}
for dir_, epoch, value in guideline_iterator(guideline_dir='guideline_2.yml',
                                             to_iterate='graph/graph_density',
                                             steps=5, repeat=10):
    print('GRAPH_DENSITY: {}, EPOCH: {}'.format(value, epoch))
    # Create the bucket for this density only once. Re-creating it on every
    # iteration would throw away the scores already appended for this value.
    scores = results.setdefault(value, {'hyperimpute': [], 'missforest': []})
    # hyperimpute's reproducibility helper, called with the epoch index.
    enable_reproducible_results(epoch)
    # 2. fully and partially observed variables
    obs_v = sorted(rand.sample(['Z_{}'.format(i) for i in range(N_total)], N_O))
    miss_v = sorted(set(data.columns) - set(obs_v))
    total_v = sorted(obs_v + miss_v)
    # 3. write GDF for R (all-zero adjacency matrix among the R nodes)
    indicator_graph_description_file(adj_matrix=np.zeros(shape=(N_m, N_m)),
                                     node_names=miss_v, miss_ratio=miss_ratio, subscript_only=True,
                                     file_dir='./gdf_R.yml')
    # 4. randomize
    # Z-by-R mask: rows of fully observed variables are set to 1, all others stay 0.
    # NOTE(review): presumably a 1 permits a Z -> R edge, which would restrict
    # the parents of R to observed variables (MAR) - confirm against PARCS docs.
    adj_mask = pd.DataFrame(np.zeros(shape=(N_total, N_m)), index=total_v,
                            columns=['R_{}'.format(i.split('_')[1]) for i in miss_v])
    adj_mask.loc[obs_v, :] = 1
    rndz = ConnectRandomizer(parent_graph_dir='gdf_Z.yml', child_graph_dir='gdf_R.yml', guideline_dir=dir_,
                             adj_matrix_mask=adj_mask)
    # 5. samples
    nodes, edges = rndz.get_graph_params()
    g = Graph(nodes=nodes, edges=edges)
    s = g.sample(N)
    # outputs
    gt = s[total_v]
    ds = m_graph_convert(s, missingness_prefix='R_', shared_subscript=True)

    # main thread
    # Boolean map of the missing cells in ds; only these are scored.
    nan_mask = ds.isna().values
    mf = Imputers().get('missforest')
    hi = Imputers().get('hyperimpute')
    imp_hi = hi.fit_transform(ds)
    imp_mf = mf.fit_transform(ds)
    scores['hyperimpute'].append(RMSE(gt.values, imp_hi.values, nan_mask))
    scores['missforest'].append(RMSE(gt.values, imp_mf.values, nan_mask))
    # Release the per-iteration objects before the next run.
    del gt, ds, adj_mask, nan_mask, mf, hi, imp_hi, imp_mf

with open('MAR_ZR_edge_density_variation.json', 'w') as f:
    json.dump(results, f)

GRAPH_DENSITY: 0.0, EPOCH: 0


ConstructorError: could not determine a constructor for the tag 'tag:yaml.org,2002:python/object/apply:numpy.core.multiarray.scalar'
  in "./temp_analysis_guideline.yml", line 4, column 18

In [None]:
from parcs.helpers.missing_data import R_adj_matrix, indicator_graph_description_file, m_graph_convert
from parcs.graph_builder.randomizer import ConnectRandomizer
from parcs.cdag.graph_objects import Graph
import random as rand
import numpy as np
import pandas as pd

# 0. configs
# Module-level settings; get_miss_dataset() reads data, N, N_total, N_O and
# miss_ratio as globals.
data = pd.read_csv('normalized_data.csv')
N = 700  # number of samples
N_total = len(data.columns) # number of total variables
N_O = 4  # number of fully observed variables
miss_ratio = 0.5  # missing ratio in total

def get_miss_dataset(density_R=None):
    """Sample one dataset from a freshly randomized Z -> R graph.

    Randomly picks ``N_O`` of the Z variables as fully observed, writes an
    indicator (R) graph description for the remaining ones to './gdf_R.yml',
    connects 'gdf_Z.yml' to it through ``ConnectRandomizer`` using
    'guideline_1.yml', and draws ``N`` samples from the resulting graph.

    Relies on the module-level globals ``data``, ``N``, ``N_total``, ``N_O``
    and ``miss_ratio``.

    Args:
        density_R: forwarded unchanged as ``density`` to ``R_adj_matrix`` to
            build the adjacency matrix among the R nodes.

    Returns:
        tuple: ``(gt, ds)``. ``gt`` holds the sampled Z columns and ``ds`` is
        the output of ``m_graph_convert`` restricted to the same columns; both
        use the sorted Z column order. Callers treat NaNs in ``ds`` as the
        missing entries.

    NOTE(review): some cells in this notebook call this function with a
    ``linear=`` keyword, which this signature does not accept - confirm the
    intended behaviour before running them.
    """
    # 2. fully and partially observed variables
    # Both lists are sorted so the node order handed to the graph builders does
    # not depend on set iteration order, which for strings varies between
    # interpreter sessions (hash randomization).
    obs_v = sorted(rand.sample(['Z_{}'.format(i) for i in range(N_total)], N_O))
    miss_v = sorted(set(data.columns) - set(obs_v))
    total_v = sorted(obs_v + miss_v)
    n_miss = N_total - N_O  # number of partially observed variables
    r_names = ['R_{}'.format(i.split('_')[1]) for i in miss_v]
    # 3. write GDF for R
    r_mask = R_adj_matrix(size=n_miss, density=density_R)
    indicator_graph_description_file(
        adj_matrix=r_mask,
        node_names=miss_v,
        prefix='R',
        miss_ratio=miss_ratio,
        supress_asteriks=False,
        subscript_only=True,
        file_dir='./gdf_R.yml'
    )
    # 4. randomize
    # All-ones Z-by-R mask: no (Z, R) pair is masked out.
    rndz = ConnectRandomizer(
        parent_graph_dir='gdf_Z.yml',
        child_graph_dir='gdf_R.yml',
        guideline_dir='guideline_1.yml',
        adj_matrix_mask=pd.DataFrame(np.ones(shape=(N_total, n_miss)),
                                     index=data.columns,
                                     columns=r_names)
    )
    # 5. samples
    nodes, edges = rndz.get_graph_params()
    g = Graph(nodes=nodes, edges=edges)
    s = g.sample(N)

    # outputs
    gt = s[total_v]
    ds = m_graph_convert(s, missingness_prefix='R_', shared_subscript=True)
    return gt, ds[total_v]

## Linear vs. nonlinear

In [None]:
# linear
# Repeats the imputation benchmark `iters` times on datasets drawn with
# density_R=0 and stores the per-run RMSE of both imputers as JSON.
# NOTE(review): `iters` and `RMSE` are not defined in this cell - make sure
#   they exist in the kernel before running it.
# NOTE(review): the get_miss_dataset signature shown in this notebook only
#   takes `density_R`; passing `linear=` would raise a TypeError - confirm
#   which version of the function this cell expects.
# NOTE(review): np.random.seed does not seed the stdlib `random` module that
#   get_miss_dataset uses for rand.sample - TODO confirm reproducibility.
np.random.seed(2022)
rmse_hi = []  # per-run RMSE of the 'hyperimpute' imputer
rmse_mf = []  # per-run RMSE of the 'missforest' imputer
for it in range(iters):
    print(it)
    gt, ds = get_miss_dataset(density_R=0, linear=True)
    # Boolean map of the missing cells in ds, passed to RMSE as its mask.
    mask = ds.isna().values
    mf = Imputers().get('missforest')
    hi = Imputers().get('hyperimpute')
    imp_hi = hi.fit_transform(ds)
    imp_mf = mf.fit_transform(ds)
    rmse_hi.append(RMSE(gt.values, imp_hi.values, mask))
    rmse_mf.append(RMSE(gt.values, imp_mf.values, mask))

# Rebinds the global name `results` to this experiment's scores.
results = {'hyperimpute': rmse_hi, 'missforest': rmse_mf}

with open('MAR_linear_uci.json', 'w') as f:
    json.dump(results, f)

In [None]:
# Repeats the imputation benchmark `iters` times on datasets drawn with
# density_R=0 and stores the per-run RMSE of both imputers as JSON.
# NOTE(review): `iters` and `RMSE` are not defined in this cell - make sure
#   they exist in the kernel before running it.
# NOTE(review): the get_miss_dataset signature shown in this notebook only
#   takes `density_R`; passing `linear=` would raise a TypeError - confirm
#   which version of the function this cell expects.
# NOTE(review): np.random.seed does not seed the stdlib `random` module that
#   get_miss_dataset uses for rand.sample - TODO confirm reproducibility.
np.random.seed(2022)
# nonlinear
rmse_hi = []  # per-run RMSE of the 'hyperimpute' imputer
rmse_mf = []  # per-run RMSE of the 'missforest' imputer
for it in range(iters):
    print(it)
    gt, ds = get_miss_dataset(density_R=0, linear=False)
    # Boolean map of the missing cells in ds, passed to RMSE as its mask.
    mask = ds.isna().values
    mf = Imputers().get('missforest')
    hi = Imputers().get('hyperimpute')
    imp_hi = hi.fit_transform(ds)
    imp_mf = mf.fit_transform(ds)
    rmse_hi.append(RMSE(gt.values, imp_hi.values, mask))
    rmse_mf.append(RMSE(gt.values, imp_mf.values, mask))

# Rebinds the global name `results` to this experiment's scores.
results = {'hyperimpute': rmse_hi, 'missforest': rmse_mf}

with open('MAR_nonlinear_uci.json', 'w') as f:
    json.dump(results, f)

In [None]:
from matplotlib import pyplot as plt
import json

# Load the stored scores of the linear and the nonlinear experiment.
with open('MAR_linear_uci.json') as src:
    res_l = json.load(src)

with open('MAR_nonlinear_uci.json') as src:
    res_nl = json.load(src)

# One line per (experiment, imputer): mean and standard deviation of the RMSEs,
# linear results first, 'hyperimpute' before 'missforest'.
for scores in (res_l, res_nl):
    for method in ('hyperimpute', 'missforest'):
        print(np.mean(scores[method]), np.std(scores[method]))


In [None]:
from matplotlib import pyplot as plt
from hyperimpute.plugins.imputers import Imputers

# Sweep density_R over three evenly spaced values in [0, 1]; for each value
# run three repetitions and record mean / std of the masked RMSE.
range_ = np.linspace(0, 1, 3)

rmse_mean, rmse_std = [], []
for r_density in range_:
    print(r_density)
    run_scores = []
    for _ in range(3):
        gt, ds = get_miss_dataset(density_R=r_density)
        # Only cells that are NaN in ds are scored.
        missing = ds.isna().values
        n_missing = missing.sum()
        imputer = Imputers().get(
            'hyperimpute',
            optimizer='hyperband',
            classifier_seed=['random_forest'],
            regression_seed=['random_forest_regressor']
        )
        imputed = imputer.fit_transform(ds)
        residual = gt.values[missing] - imputed.values[missing]
        squared_error = np.sum(np.square(residual))
        run_scores.append(np.sqrt(squared_error / n_missing))
    rmse_mean.append(np.mean(run_scores))
    rmse_std.append(np.std(run_scores))

# Mean RMSE per density with the std as vertical error bars.
plt.errorbar(range_, rmse_mean, yerr=rmse_std, marker='^')
plt.show()

In [None]:
from sklearn.impute import KNNImputer
from matplotlib import pyplot as plt
from hyperimpute.plugins.imputers import Imputers

# Sweep density_R over the two extremes (0 and 1); for each value run ten
# repetitions with linear seed models and record mean / std of the RMSE.
range_ = np.linspace(0, 1, 2)
rmse_mean = []
rmse_std = []
for r_density in range_:
    print(r_density)
    temp = []  # RMSE of every repetition at this density
    for it in range(10):
        gt, ds = get_miss_dataset(density_R=r_density)
        # Score only the cells that were missing and normalise by their count.
        # Dividing the summed squared error by the number of rows (N) instead
        # would put this cell on a different scale than the masked RMSE used
        # elsewhere in the notebook.
        mask = ds.isna().values
        n = mask.sum()
        hpi = Imputers().get(
            'hyperimpute',
            optimizer='hyperband',
            classifier_seed=['logistic_regression'],
            regression_seed=['linear_regression']
        )
        imp = hpi.fit_transform(ds)
        temp.append(
            np.sqrt(np.sum(np.square(gt.values[mask] - imp.values[mask])) / n)
        )
    rmse_mean.append(np.mean(temp))
    rmse_std.append(np.std(temp))

# Mean RMSE per density with the std as vertical error bars.
plt.errorbar(range_, rmse_mean, rmse_std, marker='^')
plt.xlabel('density_R')
plt.ylabel('RMSE on missing entries')
plt.show()