# Make some import and load data

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    AddMissingIndicator,
    CategoricalImputer,
    MeanMedianImputer
)

In [4]:
data = pd.read_csv('crx.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       689 non-null    object 
 1   30.83   689 non-null    object 
 2   0       689 non-null    float64
 3   u       689 non-null    object 
 4   g       689 non-null    object 
 5   w       689 non-null    object 
 6   v       689 non-null    object 
 7   1.25    689 non-null    float64
 8   t       689 non-null    object 
 9   t.1     689 non-null    object 
 10  1       689 non-null    int64  
 11  f       689 non-null    object 
 12  g.1     689 non-null    object 
 13  202     689 non-null    object 
 14  0.1     689 non-null    int64  
 15  +       689 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.3+ KB


In [6]:
# Create a function to form artificial missing observations in the dataset randomly

def introduce_missing_data(df, proportion=0.1, exclude_columns=None, random_state=None):
    """
    Randomly introduces missing values (NaN) into a dataset.
    
    Parameters:
        df (pd.DataFrame): The dataset to modify.
        proportion (float): Proportion of total observations to set as missing (default: 0.01 or 1%).
        exclude_columns (list): List of columns to exclude from missing data insertion (default: None).
        random_state (int): Seed for reproducibility (default: None).
        
    Returns:
        pd.DataFrame: Dataset with missing values introduced.
    """
    if exclude_columns is None:
        exclude_columns = []
        
    if random_state is not None:
        np.random.seed(random_state)
    
    df = df.copy()  # Make a copy to avoid modifying the original dataset
    total_cells = df.size
    n_missing = int(total_cells * proportion)
    
    # Flatten the DataFrame into row, column index pairs
    valid_columns = [col for col in df.columns if col not in exclude_columns]
    if not valid_columns:
        raise ValueError("All columns are excluded from missing data introduction.")
    
    rows, cols = df.shape
    flat_indices = [(i, j) for i in range(rows) for j in range(cols) if df.columns[j] in valid_columns]
    selected_indices = np.random.choice(len(flat_indices), n_missing, replace=False)
    
    # Introduce missing values
    for index in selected_indices:
        i, j = flat_indices[index]
        df.iat[i, j] = np.nan
    
    return df

modified_data = introduce_missing_data(data, proportion=0.01, exclude_columns=None, random_state=42)
modified_data.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43,560.0,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280,824.0,+
2,b,,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100,3.0,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120,0.0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0.0,t,g,360,0.0,+


In [7]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    modified_data.drop("+", axis=1),
    modified_data['+'],
    test_size=0.3,
    random_state=42
)

In [23]:
# Create a list for the variables
varnames = ['0', '1.25', '1', '0.1']

In [24]:
# Create an object of indicators for the missing values
indicators = [f'{var}_n' for var in varnames]

In [25]:
# Create a copy of both train and test sets from original dataframe

X_train_cp = X_train.copy()
X_test_cp = X_test.copy()
X_train_cp.isnull().sum()

b        5
30.83    7
0        7
u        4
g        5
w        4
v        3
1.25     5
t        4
t.1      2
1        3
f        5
g.1      4
202      8
0.1      3
dtype: int64

In [28]:
# Assign boolean for missing values and convert them to int

X_train[indicators] = X_train[varnames].isna().astype(int)
X_test[indicators] = X_test[varnames].isna().astype(int)

In [31]:
X_train.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,0_n,1.25_n,1_n,0.1_n
178,b,20.17,8.17,u,g,aa,v,1.96,t,t,14.0,f,g,60,158.0,0,0,0,0
265,b,18.33,1.21,y,p,e,dd,0.0,f,f,0.0,f,g,100,0.0,0,0,0,0
352,b,30.75,1.585,u,g,d,v,0.585,f,f,0.0,t,s,0,0.0,0,0,0,0
495,a,25.0,0.875,u,g,x,h,1.04,t,f,0.0,t,,160,5860.0,0,0,0,0
408,b,17.08,0.25,u,g,q,v,0.335,f,t,4.0,f,g,160,8.0,0,0,0,0


In [32]:
# Set up imputer to add a binary indicator to every missing variable
imputer = AddMissingIndicator(variables=None, missing_only=True)

In [33]:
# fit imputer to trainset so that it finds variables with missing data
imputer.fit(X_train)

In [54]:
# Finally, add missing indicator
X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

X_train_t.isnull().sum()

b           5
30.83       7
0           7
u           4
g           5
w           4
v           3
1.25        5
t           4
t.1         2
1           3
f           5
g.1         4
202         8
0.1         3
0_n         0
1.25_n      0
1_n         0
0.1_n       0
b_na        0
30.83_na    0
0_na        0
u_na        0
g_na        0
w_na        0
v_na        0
1.25_na     0
t_na        0
t.1_na      0
1_na        0
f_na        0
g.1_na      0
202_na      0
0.1_na      0
dtype: int64

In [44]:
pipe = Pipeline([
    ("indicators", AddMissingIndicator(missing_only=True)),
    ('categorical', CategoricalImputer(imputation_method='frequent')),
    ('numerical', MeanMedianImputer()),
])

In [49]:
X_train_t = pipe.fit_transform(X_train)
X_test_t = pipe.transform(X_test)

In [51]:
X_train_t.isnull().sum()
# X_test_t.isnull().sum()

b           0
30.83       0
0           0
u           0
g           0
w           0
v           0
1.25        0
t           0
t.1         0
1           0
f           0
g.1         0
202         0
0.1         0
0_n         0
1.25_n      0
1_n         0
0.1_n       0
b_na        0
30.83_na    0
0_na        0
u_na        0
g_na        0
w_na        0
v_na        0
1.25_na     0
t_na        0
t.1_na      0
1_na        0
f_na        0
g.1_na      0
202_na      0
0.1_na      0
dtype: int64

In [52]:
X_train_t.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,...,w_na,v_na,1.25_na,t_na,t.1_na,1_na,f_na,g.1_na,202_na,0.1_na
178,b,20.17,8.17,u,g,aa,v,1.96,t,t,...,0,0,0,0,0,0,0,0,0,0
265,b,18.33,1.21,y,p,e,dd,0.0,f,f,...,0,0,0,0,0,0,0,0,0,0
352,b,30.75,1.585,u,g,d,v,0.585,f,f,...,0,0,0,0,0,0,0,0,0,0
495,a,25.0,0.875,u,g,x,h,1.04,t,f,...,0,0,0,0,0,0,0,1,0,0
408,b,17.08,0.25,u,g,q,v,0.335,f,t,...,0,0,0,0,0,0,0,0,0,0


# Use of scikit learn to impute both categorical and nuerical missing indicators

In [60]:
# Create a list of categorical and numerical variables and convert them in a list

catvars = X_train.select_dtypes(
    include="O"
).columns.to_list()
numvars = X_train.select_dtypes(
    exclude='O'
).columns.to_list()

In [67]:
pipe = ColumnTransformer([
    ('num_imputer', SimpleImputer(strategy='mean', add_indicator=True), numvars),
    ('cat_imputer', SimpleImputer(strategy='most_frequent', add_indicator=True), catvars)
]).set_output(transform="pandas")

In [68]:
# Let perform the imputation
X_train_t = pipe.fit_transform(X_train)
X_test_t =pipe.transform(X_test)

In [69]:
X_train_t.head()

Unnamed: 0,num_imputer__0,num_imputer__1.25,num_imputer__1,num_imputer__0.1,num_imputer__0_n,num_imputer__1.25_n,num_imputer__1_n,num_imputer__0.1_n,num_imputer__missingindicator_0,num_imputer__missingindicator_1.25,...,cat_imputer__missingindicator_30.83,cat_imputer__missingindicator_u,cat_imputer__missingindicator_g,cat_imputer__missingindicator_w,cat_imputer__missingindicator_v,cat_imputer__missingindicator_t,cat_imputer__missingindicator_t.1,cat_imputer__missingindicator_f,cat_imputer__missingindicator_g.1,cat_imputer__missingindicator_202
178,8.17,1.96,14.0,158.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
265,1.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
352,1.585,0.585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
495,0.875,1.04,0.0,5860.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
408,0.25,0.335,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [70]:
X_train_t.isnull().sum()

num_imputer__0                         0
num_imputer__1.25                      0
num_imputer__1                         0
num_imputer__0.1                       0
num_imputer__0_n                       0
num_imputer__1.25_n                    0
num_imputer__1_n                       0
num_imputer__0.1_n                     0
num_imputer__missingindicator_0        0
num_imputer__missingindicator_1.25     0
num_imputer__missingindicator_1        0
num_imputer__missingindicator_0.1      0
cat_imputer__b                         0
cat_imputer__30.83                     0
cat_imputer__u                         0
cat_imputer__g                         0
cat_imputer__w                         0
cat_imputer__v                         0
cat_imputer__t                         0
cat_imputer__t.1                       0
cat_imputer__f                         0
cat_imputer__g.1                       0
cat_imputer__202                       0
cat_imputer__missingindicator_b        0
cat_imputer__mis