In [1]:
%load_ext autoreload
%autoreload 2

## Check MNAR settings

We test the method in the following settings:

1. different identifiable MNAR scenarios (no self-censoring (NSC), block-conditional, etc.)
2. the self-censoring (SC) setting
3. the general MNAR setting (everything allowed)

In all of these settings, the relations between the variables are still linear.

## SC setting

In [18]:
from parcs.helpers.missing_data import R_adj_matrix, indicator_graph_description_file, m_graph_convert, sc_mask
from parcs.graph_builder.randomizer import ConnectRandomizer
from parcs.cdag.graph_objects import Graph
import random as rand
import numpy as np
import pandas as pd

# 0. configs
# The column names of the CSV drive everything below: they are the variable
# names, and each one must contain an underscore because the part after the
# first '_' is reused as the subscript of the matching 'R_<subscript>' name.
data = pd.read_csv('normalized_data.csv')
N = 1000  # number of samples drawn per dataset
N_total = data.shape[1]  # number of total variables (one per CSV column)
miss_ratio = 0.5  # missing ratio in total
v_names = data.columns.tolist()
r_names = [f"R_{name.split('_')[1]}" for name in v_names]

def miss_dataset():
    """Sample one dataset and return it with and without the missingness step.

    Relies on the module-level configuration (`data`, `N`, `N_total`,
    `miss_ratio`, `v_names`, `r_names`) and on the files 'gdf_Z.yml' and
    'guideline.yml' being present in the working directory.

    Steps:
      1. Describe the R nodes via `indicator_graph_description_file`, using an
         all-zero adjacency matrix, with './gdf_R.yml' as the target path.
      2. Build a `ConnectRandomizer` from 'gdf_Z.yml' (parent) and 'gdf_R.yml'
         (child) with an all-ones mask indexed by `v_names` x `r_names`.
      3. Build a `Graph` from the randomizer's parameters and draw `N` samples.
      4. Pass the samples through `m_graph_convert`.

    NOTE(review): presumably `m_graph_convert` blanks out entries according to
    the 'R_' indicator columns -- confirm against the parcs documentation.

    Returns
    -------
    tuple
        `(samples[v_names], converted[v_names])`: the sampled variable columns
        before and after `m_graph_convert`.
    """
    # 3. write GDF for R
    n_indicators = len(r_names)
    r_adjacency = np.zeros(shape=(n_indicators, n_indicators))
    indicator_graph_description_file(
        adj_matrix=r_adjacency,
        node_names=data.columns,
        prefix='R',
        miss_ratio=miss_ratio,
        supress_asteriks=False,
        subscript_only=True,
        file_dir='./gdf_R.yml'
    )

    # 4. randomize: the mask is all ones for every (variable, indicator) pair
    full_mask = pd.DataFrame(
        np.ones(shape=(N_total, N_total)),
        index=v_names,
        columns=r_names
    )
    randomizer = ConnectRandomizer(
        parent_graph_dir='gdf_Z.yml',
        child_graph_dir='gdf_R.yml',
        guideline_dir='guideline.yml',
        adj_matrix_mask=full_mask,
        delete_temp_graph_description=False
    )

    # 5. samples
    graph_nodes, graph_edges = randomizer.get_graph_params()
    samples = Graph(nodes=graph_nodes, edges=graph_edges).sample(N)

    # outputs: select the variable columns before and after the conversion
    ground_truth = samples[v_names]
    converted = m_graph_convert(samples, missingness_prefix='R_', shared_subscript=True)
    return ground_truth, converted[v_names]

In [21]:
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt
import numpy as np
from hyperimpute.plugins.imputers import Imputers

N_REPEATS = 3  # number of independently sampled datasets to evaluate on


def _masked_rmse(truth, imputed, mask):
    """Return the RMSE between two frames over the entries selected by `mask`.

    Parameters
    ----------
    truth, imputed : pandas.DataFrame
        Frames of identical shape; compared positionally through `.values`.
    mask : numpy.ndarray of bool
        Same shape as the frames; True marks the entries that are scored.

    Returns
    -------
    float
        Root-mean-square error over the masked entries, or NaN when the mask
        selects nothing (avoids a 0/0 division warning).
    """
    n_masked = mask.sum()
    if n_masked == 0:
        return float('nan')
    residuals = truth.values[mask] - imputed.values[mask]
    return np.sqrt(np.square(residuals).sum() / n_masked)


rmse = []    # per-repeat RMSE of the HyperImpute imputer
rmse_k = []  # per-repeat RMSE of the MissForest imputer (name kept as-is)

for it in range(N_REPEATS):
    if it % 2 == 0:
        print(it)
    gt, ds = miss_dataset()
    # Score only the entries that were missing before imputation.
    mask_ = ds.isna().values

    # Fresh, unfitted imputers for every repeat.
    hpi = Imputers().get(
        'hyperimpute',
        optimizer='hyperband',
        classifier_seed=['logistic_regression'],
        regression_seed=['linear_regression']
    )
    mpi = Imputers().get('missforest')

    # Defensive: give each imputer its own copy so the second one always sees
    # the original missing entries, without relying on the first imputer
    # leaving its input untouched.
    imp = hpi.fit_transform(ds.copy())
    imp_k = mpi.fit_transform(ds.copy())

    rmse.append(_masked_rmse(gt, imp, mask_))
    rmse_k.append(_masked_rmse(gt, imp_k, mask_))

# Mean and standard deviation of the RMSE across repeats, per imputer.
print('hyp')
print(np.mean(rmse))
print(np.std(rmse))
print('mis')
print(np.mean(rmse_k))
print(np.std(rmse_k))

0




2




hyp
0.22882583902482756
0.0031686464572957063
mis
0.23726889528003867
0.0038363937271725223


