In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.imputation import EndTailImputer

In [2]:
# Load the dataset from the working directory and preview its first rows.
# NOTE(review): the preview's column labels (b, 30.83, 0, ...) look like a data
# row that was parsed as the header — confirm whether crx.csv has a header line.
data = pd.read_csv(filepath_or_buffer='crx.csv')
data.head(n=5)

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [3]:
# Define a helper that randomly injects artificial missing values into the dataset

def introduce_missing_data(df, proportion=0.1, exclude_columns=None, random_state=None):
    """
    Randomly replace a share of a DataFrame's cells with missing values (NaN).

    The number of cells blanked is ``int(df.size * proportion)``: the count is
    based on every cell of ``df`` (excluded columns included), while the cells
    themselves are drawn without replacement from the non-excluded columns only.

    Parameters:
        df (pd.DataFrame): The dataset to modify; a copy is returned and the
            input is left untouched.
        proportion (float): Fraction of all cells to set as missing, between
            0 and 1 (default: 0.1, i.e. 10%).
        exclude_columns (list): Column labels that must not receive missing
            values (default: None, meaning every column is eligible).
        random_state (int): Seed for reproducibility (default: None, in which
            case NumPy's global random state is used).

    Returns:
        pd.DataFrame: A new DataFrame with missing values introduced.

    Raises:
        ValueError: If ``proportion`` is outside [0, 1], if every column is
            excluded, or if more missing cells are requested than the eligible
            columns contain.
    """
    if exclude_columns is None:
        exclude_columns = []

    if not 0 <= proportion <= 1:
        raise ValueError(
            f"proportion must be between 0 and 1, got {proportion!r}."
        )

    # Positional indices of the columns that may receive missing values.
    eligible_positions = np.array(
        [pos for pos, col in enumerate(df.columns) if col not in exclude_columns],
        dtype=np.intp,
    )
    if eligible_positions.size == 0:
        raise ValueError("All columns are excluded from missing data introduction.")

    n_eligible_cells = df.shape[0] * eligible_positions.size
    n_missing = int(df.size * proportion)
    if n_missing > n_eligible_cells:
        raise ValueError(
            f"Cannot introduce {n_missing} missing values: only "
            f"{n_eligible_cells} cells are available in the eligible columns."
        )

    # A private RandomState yields the same draws as np.random.seed(random_state)
    # followed by np.random.choice, without reseeding the caller's global RNG.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    chosen = rng.choice(n_eligible_cells, n_missing, replace=False)

    # Eligible cells are numbered row-major, so a flat number k maps to row
    # k // n_eligible_columns and to the (k % n_eligible_columns)-th eligible column.
    row_idx, offset = np.divmod(chosen, eligible_positions.size)
    col_idx = eligible_positions[offset]

    nan_mask = np.zeros(df.shape, dtype=bool)
    nan_mask[row_idx, col_idx] = True

    # mask() upcasts integer columns that receive NaN instead of relying on
    # cell-by-cell assignment of an incompatible value; copy() keeps the result
    # independent of the input even when nothing is masked.
    return df.copy().mask(nan_mask)

# Blank out 1% of all cells with a fixed seed, then preview the result.
# NOTE(review): no column is excluded, so the '+' column that is later used as
# the target can receive missing values too — confirm that is intended.
modified_data = introduce_missing_data(
    df=data,
    proportion=0.01,
    random_state=42,
    exclude_columns=None,
)
modified_data.head(n=5)

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43,560.0,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280,824.0,+
2,b,,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100,3.0,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120,0.0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0.0,t,g,360,0.0,+


In [6]:
# List every non-object column, leaving out the target column '+'
numvars = (
    modified_data
    .select_dtypes(exclude='O')
    .columns
    .drop('+', errors='ignore')
    .to_list()
)

In [8]:
# Hold out 30% of the rows as a test set; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    modified_data.loc[:, numvars],
    modified_data.loc[:, '+'],
    random_state=0,
    test_size=0.3,
)

In [10]:
# Inter-quartile range of each training variable: 75th minus 25th percentile
IQR = X_train.quantile(0.75).sub(X_train.quantile(0.25))
print(IQR)

0         6.480
1.25      2.835
1         3.000
0.1     354.250
dtype: float64


In [11]:
# Replacement value per variable: 75th percentile of the train set plus 1.5 * IQR
imputation_dict = X_train.quantile(0.75).add(IQR * 1.5).to_dict()

In [12]:
# Replacement value per variable: train-set mean plus three standard deviations
imp_dict_normal_distr = X_train.mean().add(X_train.std() * 3).to_dict()

In [14]:
# Fill the gaps in both splits with the IQR-based values computed on the train set
X_train_t, X_test_t = (
    split.fillna(imputation_dict) for split in (X_train, X_test)
)

X_train_t.tail(n=5)

Unnamed: 0,0,1.25,1,0.1
359,5.5,5.5,0.0,0.0
192,1.585,3.085,6.0,0.0
629,1.25,0.25,0.0,809.0
559,12.33,3.5,6.0,458.0
684,10.085,1.25,0.0,0.0


In [18]:
# Count the missing values left in each column after the IQR-based fill
X_train_t.isna().sum()

0       0
1.25    0
1       0
0.1     0
dtype: int64

In [16]:
# Fill the gaps in both splits with the mean + 3 * std values computed on the train set
X_train_t1 = X_train.fillna(imp_dict_normal_distr)
X_test_t1 = X_test.fillna(imp_dict_normal_distr)

# Preview the imputed training frame
X_train_t1.head()

Unnamed: 0,0,1.25,1,0.1
595,3.0,2.375,8.0,4159.0
303,13.665,1.5,0.0,1.0
204,12.0,14.0,8.0,6590.0
643,0.42,0.29,0.0,2.0
118,10.335,0.335,1.0,50.0


In [17]:
# Count the missing values left in each column after the mean + 3 * std fill
X_train_t1.isna().sum()

0       0
1.25    0
1       0
0.1     0
dtype: int64

# Use feature_engine to impute the missing values

In [19]:
# Configure the imputer to take replacement values from the right tail of each
# variable's distribution, using the IQR method with a fold of 3
imputer = EndTailImputer(
    tail='right',
    fold=3,
    imputation_method='iqr',
    variables=None,
)

In [20]:
# Fit the imputer on the training split only
imputer.fit(X=X_train)

In [21]:
# Inspect the per-variable replacement values stored on the fitted imputer
imputer.imputer_dict_

{'0': 26.959999999999997, '1.25': 11.504999999999999, '1': 12.0, '0.1': 1417.0}

In [23]:
# Apply the fitted imputer to the train split and then to the test split
X_train_t2, X_test2 = (
    imputer.transform(split) for split in (X_train, X_test)
)

In [24]:
# Count the missing values left in each column after the feature_engine transform
X_train_t2.isna().sum()

0       0
1.25    0
1       0
0.1     0
dtype: int64