In [9]:
# Import pandas and required functions and packages in scikit learn and feature engine
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from feature_engine.imputation import MeanMedianImputer

In [4]:
data = pd.read_csv('crx.csv')
data.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [5]:
# Give the dataset header names
cols = [
    'Gender','Age','Debt','Married','BankCustomer','EducationLevel','Ethnicity',
        'YearsEmployed','PriorDefault','Employed','CreditScore','DriversLicense','Citizen',
        'ZipCode','Income','Target'
]

data.columns = cols # Pass the cols object to the dataframe to update the heading
data.head()

  Gender    Age   Debt Married BankCustomer EducationLevel Ethnicity  \
0      a  58.67  4.460       u            g              q         h   
1      a   24.5  0.500       u            g              q         h   
2      b  27.83  1.540       u            g              w         v   
3      b  20.17  5.625       u            g              w         v   
4      b  32.08  4.000       u            g              m         v   

   YearsEmployed PriorDefault Employed  CreditScore DriversLicense Citizen  \
0           3.04            t        t            6              f       g   
1           1.50            t        f            0              f       g   
2           3.75            t        t            5              t       g   
3           1.71            t        f            0              f       s   
4           2.50            t        f            0              t       g   

  ZipCode  Income Target  
0      43     560      +  
1     280     824      +  
2     100       3

In [6]:
data.tail() # Check the last 5 mail

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Target
684,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260,0,-
685,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200,394,-
686,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200,1,-
687,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280,750,-
688,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0,0,-


In [10]:
# Create a function to form artificial missing observations in the dataset randomly

def introduce_missing_data(df, proportion=0.1, exclude_columns=None, random_state=None):
    """
    Randomly introduces missing values (NaN) into a dataset.
    
    Parameters:
        df (pd.DataFrame): The dataset to modify.
        proportion (float): Proportion of total observations to set as missing (default: 0.01 or 1%).
        exclude_columns (list): List of columns to exclude from missing data insertion (default: None).
        random_state (int): Seed for reproducibility (default: None).
        
    Returns:
        pd.DataFrame: Dataset with missing values introduced.
    """
    if exclude_columns is None:
        exclude_columns = []
        
    if random_state is not None:
        np.random.seed(random_state)
    
    df = df.copy()  # Make a copy to avoid modifying the original dataset
    total_cells = df.size
    n_missing = int(total_cells * proportion)
    
    # Flatten the DataFrame into row, column index pairs
    valid_columns = [col for col in df.columns if col not in exclude_columns]
    if not valid_columns:
        raise ValueError("All columns are excluded from missing data introduction.")
    
    rows, cols = df.shape
    flat_indices = [(i, j) for i in range(rows) for j in range(cols) if df.columns[j] in valid_columns]
    selected_indices = np.random.choice(len(flat_indices), n_missing, replace=False)
    
    # Introduce missing values
    for index in selected_indices:
        i, j = flat_indices[index]
        df.iat[i, j] = np.nan
    
    return df


In [12]:
modified_dt = introduce_missing_data(data, proportion=0.1, exclude_columns=None, random_state=42) # apppy the function inserting the parameters

modified_dt

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Target
0,a,58.67,4.460,,g,q,h,3.04,t,t,6.0,f,g,43,,+
1,a,24.5,0.500,u,g,q,h,1.50,t,f,0.0,f,g,280,824.0,
2,b,,1.540,,g,w,v,,t,t,5.0,t,g,100,3.0,
3,b,20.17,,u,g,w,v,1.71,t,f,0.0,f,s,120,0.0,+
4,b,32.08,4.000,u,g,m,,2.50,t,f,0.0,t,g,360,0.0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,21.08,10.085,y,p,e,,1.25,f,,0.0,f,g,260,,-
685,a,22.67,0.750,u,g,c,v,2.00,f,t,2.0,t,g,200,,-
686,a,25.25,13.500,,p,ff,ff,,f,t,,,g,200,1.0,-
687,b,,0.205,u,g,,v,0.04,f,f,0.0,f,g,280,750.0,-


In [13]:
# Split the dataset into both train and test sets with respective target

X_train, X_test, y_train, y_test = train_test_split(
    modified_dt.drop('Target', axis=1),
    modified_dt['Target'],
    test_size = 0.2,
    random_state=43
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((551, 15), (138, 15), (551,), (138,))

In [42]:
# Make a list of numerical variable that exclude type object
numeric_vars = X_train.select_dtypes(
    exclude='O'
).columns.to_list()

numeric_vars

['Debt', 'YearsEmployed', 'CreditScore', 'Income']

In [43]:
# Capture the variable median value in a dict
median_vals = X_train[numeric_var].median().to_dict()

median_vals

{'Debt': 2.73, 'YearsEmployed': 1.0, 'CreditScore': 0.0, 'Income': 5.5}

In [44]:
# Fill the missing data in train and test sets with median values
X_train_t = X_train.fillna(value = median_vals)
X_test_t = X_test.fillna(value = median_vals)

In [45]:
# Check the output of both objects
X_train_t, X_test_t 

(    Gender    Age    Debt Married BankCustomer EducationLevel Ethnicity  \
 641      b  31.58   0.750       y          NaN             aa         v   
 547      b  33.17   1.000       u            g            NaN         v   
 398      b     31   2.085       u            g              c         v   
 497      b  25.75   0.500       u            g            NaN         v   
 658      a  28.58   3.750       u            g              c       NaN   
 ..     ...    ...     ...     ...          ...            ...       ...   
 16       a  23.25   5.875     NaN            g              q         v   
 58       b  43.25   3.000       u          NaN              q         h   
 277      b  24.58  13.500       y            p             ff        ff   
 255    NaN     20  11.045       u            g              c         v   
 320      a  18.08   0.375       l           gg             cc        ff   
 
      YearsEmployed PriorDefault Employed  CreditScore DriversLicense Citizen  \
 641 

In [46]:
X_train[numeric_var].isnull().sum(), X_test[numeric_var].isnull().sum()

(Debt             55
 YearsEmployed    60
 CreditScore      68
 Income           55
 dtype: int64,
 Debt             16
 YearsEmployed    18
 CreditScore      10
 Income           17
 dtype: int64)

In [47]:
X_train_t[numeric_var].isnull().sum(), X_test_t[numeric_var].isnull().sum()

(Debt             0
 YearsEmployed    0
 CreditScore      0
 Income           0
 dtype: int64,
 Debt             0
 YearsEmployed    0
 CreditScore      0
 Income           0
 dtype: int64)

# Using Imputer technique to replace missing data by median

In [48]:
# Let's set up imputer first
imputer = SimpleImputer(strategy='median')

Note: that to use the mean instead of median, all you just need to do is set the strategy to **mean**

In [51]:
ct = ColumnTransformer(
    [("imputer", imputer, numeric_vars)],
    remainder = 'passthrough',
    force_int_remainder_cols=False
).set_output(transform='pandas')

In [52]:
# fit the imputer in the dataset to learn the median
ct.fit(X_train)
# Check the learned median values
ct.named_transformers_.imputer.statistics_

array([2.73, 1.  , 0.  , 5.5 ])

In [53]:
# replace missing value with the median
X_train_t = ct.transform(X_train)
X_test_t = ct.transform(X_test)

In [54]:
print(X_train_t.head())

     imputer__Debt  imputer__YearsEmployed  imputer__CreditScore  \
641          0.750                   3.500                   0.0   
547          1.000                   0.750                   7.0   
398          2.085                   0.085                   0.0   
497          0.500                   1.460                   5.0   
658          3.750                   0.250                   1.0   

     imputer__Income remainder__Gender remainder__Age remainder__Married  \
641              0.0                 b          31.58                  y   
547           4071.0                 b          33.17                  u   
398              0.0                 b             31                  u   
497              0.0                 b          25.75                  u   
658            154.0                 a          28.58                  u   

    remainder__BankCustomer remainder__EducationLevel remainder__Ethnicity  \
641                     NaN                        aa   

# Finally, perform median imputation using feature engine technique

In [58]:
# set up feature engine imputer
imputer = MeanMedianImputer(
    imputation_method='median',
    variables=numeric_vars
)

In [59]:
# fit the imputer in the dataset so as to learn the median value from it
imputer.fit(X_train)

# view the learned median
imputer.imputer_dict_

{'Debt': 2.73, 'YearsEmployed': 1.0, 'CreditScore': 0.0, 'Income': 5.5}

In [60]:
X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

In [63]:
X_train_t[numeric_vars].isnull().sum()

Debt             0
YearsEmployed    0
CreditScore      0
Income           0
dtype: int64

In [64]:
X_test_t[numeric_vars].isnull().sum()

Debt             0
YearsEmployed    0
CreditScore      0
Income           0
dtype: int64