In [1]:
# Import libraries to help make the task possible
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from feature_engine.imputation import CategoricalImputer

# Load credit approval datasets and follow the same steps to splitting stage

In [2]:
data = pd.read_csv('crx.csv')

cols = [
    'Gender','Age','Debt','Married','BankCustomer','EducationLevel','Ethnicity',
        'YearsEmployed','PriorDefault','Employed','CreditScore','DriversLicense','Citizen',
        'ZipCode','Income','Target'
]

data.columns = cols
data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Target
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [7]:
# Create a function to form artificial missing observations in the dataset randomly

def introduce_missing_data(df, proportion=0.1, exclude_columns=None, random_state=None):
    """
    Randomly introduces missing values (NaN) into a dataset.
    
    Parameters:
        df (pd.DataFrame): The dataset to modify.
        proportion (float): Proportion of total observations to set as missing (default: 0.01 or 1%).
        exclude_columns (list): List of columns to exclude from missing data insertion (default: None).
        random_state (int): Seed for reproducibility (default: None).
        
    Returns:
        pd.DataFrame: Dataset with missing values introduced.
    """
    if exclude_columns is None:
        exclude_columns = []
        
    if random_state is not None:
        np.random.seed(random_state)
    
    df = df.copy()  # Make a copy to avoid modifying the original dataset
    total_cells = df.size
    n_missing = int(total_cells * proportion)
    
    # Flatten the DataFrame into row, column index pairs
    valid_columns = [col for col in df.columns if col not in exclude_columns]
    if not valid_columns:
        raise ValueError("All columns are excluded from missing data introduction.")
    
    rows, cols = df.shape
    flat_indices = [(i, j) for i in range(rows) for j in range(cols) if df.columns[j] in valid_columns]
    selected_indices = np.random.choice(len(flat_indices), n_missing, replace=False)
    
    # Introduce missing values
    for index in selected_indices:
        i, j = flat_indices[index]
        df.iat[i, j] = np.nan
    
    return df

modified_data = introduce_missing_data(data, proportion=0.01, exclude_columns=None, random_state=42)
modified_data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Target
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43,560.0,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280,824.0,+
2,b,,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100,3.0,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120,0.0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0.0,t,g,360,0.0,+


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    modified_data.drop('Target', axis = 1),
    modified_data['Target'],
    test_size = 0.2,
    random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((551, 15), (138, 15), (551,), (138,))

In [10]:
# capture the categorical variables
cat_vars = X_train.select_dtypes(
    include='O'
).columns.to_list()
cat_vars

['Gender',
 'Age',
 'Married',
 'BankCustomer',
 'EducationLevel',
 'Ethnicity',
 'PriorDefault',
 'Employed',
 'DriversLicense',
 'Citizen',
 'ZipCode']

In [12]:
# store variables most frequent categories in dict
freq_vals = X_train[cat_vars].mode().iloc[0].to_dict()
freq_vals

{'Gender': 'b',
 'Age': '?',
 'Married': 'u',
 'BankCustomer': 'g',
 'EducationLevel': 'c',
 'Ethnicity': 'v',
 'PriorDefault': 't',
 'Employed': 'f',
 'DriversLicense': 'f',
 'Citizen': 'g',
 'ZipCode': '0'}

In [16]:
# replace missing values in frequent categories
X_train_t = X_train.fillna(value=freq_vals)
X_test = X_test.fillna(value=freq_vals)

In [19]:
# Create a categorical variable name as key and arbitrary string as value in dictionary as a method to use and replace missing data
imputation_dict = {
    var: "no data" for var in cat_vars
}
imputation_dict

{'Gender': 'no data',
 'Age': 'no data',
 'Married': 'no data',
 'BankCustomer': 'no data',
 'EducationLevel': 'no data',
 'Ethnicity': 'no data',
 'PriorDefault': 'no data',
 'Employed': 'no data',
 'DriversLicense': 'no data',
 'Citizen': 'no data',
 'ZipCode': 'no data'}

In [24]:
X_train_t['Married'].value_counts()

Married
u    415
y    128
?      6
l      2
Name: count, dtype: int64

In [26]:
X_train['Married'].value_counts()

Married
u    409
y    128
?      6
l      2
Name: count, dtype: int64

# So, let's try scikit learn imputation technique 


In [31]:
# Set up imputer to identify the most frequent category per variable
imputer = SimpleImputer(strategy='most_frequent')

In [32]:
ct = ColumnTransformer(
    [("imputer", imputer, cat_vars)],
    remainder = 'passthrough'
).set_output(transform="pandas")

In [33]:
ct.fit(X_train) # Fit the column transformer to the dataset to learn

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [34]:
ct.named_transformers_.imputer.statistics_ # Most frequent value learned by imputer

array(['b', '?', 'u', 'g', 'c', 'v', 't', 'f', 'f', 'g', '0'],
      dtype=object)

In [36]:
X_train_t = ct.transform(X_train)
X_test_t = ct.transform(X_test)
X_train_t.head()

Unnamed: 0,imputer__Gender,imputer__Age,imputer__Married,imputer__BankCustomer,imputer__EducationLevel,imputer__Ethnicity,imputer__PriorDefault,imputer__Employed,imputer__DriversLicense,imputer__Citizen,imputer__ZipCode,remainder__Debt,remainder__YearsEmployed,remainder__CreditScore,remainder__Income
127,b,34.42,u,g,i,bb,t,t,f,g,274,4.25,3.25,2.0,610.0
390,b,39.92,u,g,i,bb,f,f,f,g,550,5.0,0.21,0.0,0.0
21,a,47.75,u,g,c,v,t,t,t,g,0,8.0,7.875,6.0,1260.0
463,a,23.0,u,g,j,j,f,t,f,g,200,1.835,0.0,1.0,53.0
338,b,28.0,u,g,w,v,f,f,t,g,300,3.0,0.75,0.0,67.0


# Now, Feature Engine Technique

In [40]:
# Set up 
imputer = CategoricalImputer(
    imputation_method='frequent',
    variables=cat_vars
)

In [41]:
# Fit imputer to train set to learn most frequent categories
imputer.fit(X_train)

In [43]:
imputer.imputer_dict_ # checks the imputer dictionary

{'Gender': 'b',
 'Age': '?',
 'Married': 'u',
 'BankCustomer': 'g',
 'EducationLevel': 'c',
 'Ethnicity': 'v',
 'PriorDefault': 't',
 'Employed': 'f',
 'DriversLicense': 'f',
 'Citizen': 'g',
 'ZipCode': '0'}

In [44]:
X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

In [45]:
X_train_t

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income
127,b,34.42,4.250,u,g,i,bb,3.250,t,t,2.0,f,g,274,610.0
390,b,39.92,5.000,u,g,i,bb,0.210,f,f,0.0,f,g,550,0.0
21,a,47.75,8.000,u,g,c,v,7.875,t,t,6.0,t,g,0,1260.0
463,a,23,1.835,u,g,j,j,0.000,f,t,1.0,f,g,200,53.0
338,b,28,3.000,u,g,w,v,0.750,f,f,0.0,t,g,300,67.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,a,30.25,5.500,u,g,k,v,5.500,f,f,0.0,t,s,100,0.0
192,b,22.67,1.585,y,p,w,v,3.085,t,t,6.0,f,g,80,0.0
629,a,22.92,1.250,u,g,q,v,0.250,f,f,0.0,t,g,120,809.0
559,a,25,12.330,u,g,cc,h,3.500,t,t,6.0,f,g,400,458.0
