This notebook demonstrates how to generate missing data using the pattern-based multivariate amputation module available in the mdatagen package. This module is a wrapper around the pyampute package and addresses key challenges in generating artificial missing data under the MAR (Missing At Random) mechanism.

In addition, an excellent video lesson is available on YouTube at the following link:
📺 https://youtu.be/jMEzKFV-ilc?si=bVQ-kYjOelAqdN0h

In [None]:
# Import the libraries
import numpy as np 
import pmlb
from mdatagen.multivariate import mMAR
from mdatagen.plots import PlotMissingData

# Function to help split data
def split_data(data, target_col="target"):
    """Split a dataset into a feature frame and a label array.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset holding the feature columns plus the label column.
        It is left unmodified.
    target_col : str, default "target"
        Name of the label column to separate from the features.

    Returns
    -------
    X : pandas.DataFrame
        New frame containing every column except ``target_col``.
    y : numpy.ndarray
        Values of ``target_col`` as a NumPy array.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    """
    # `drop` already returns a new frame, so copying the whole dataset first
    # (as was done before) only doubles memory use on large tables.
    X = data.drop(columns=[target_col])
    y = np.array(data[target_col])

    return X, y

# Fetch the 'kddcup' dataset from PMLB, then separate features from labels
kddcup = pmlb.fetch_data('kddcup')
X_, y_ = split_data(kddcup)

# Set up the multivariate MAR generator and produce the data with missing
# values; `gen_md` is what the plotting cell receives as `data_missing`
generator = mMAR(
    X=X_,
    y=y_,
    n_Threads=1,
)
gen_md = generator.pattern_missingness()

sepal length (cm)     0
sepal width (cm)      0
petal length (cm)    38
petal width (cm)      0
target                0
dtype: int64


In [None]:
# Pair the generated data (with missing values) with the original features
# so the plot helper can visualize the missingness
miss_plot = PlotMissingData(
    data_original=X_,
    data_missing=gen_md,
)
miss_plot.visualize_miss("normal", save=False)