In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.imputation import ArbitraryNumberImputer

In [7]:
data = pd.read_csv("crx.csv")

In [8]:
data.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [9]:
# Create a function to form artificial missing observations in the dataset randomly

def introduce_missing_data(df, proportion=0.1, exclude_columns=None, random_state=None):
    """
    Randomly introduces missing values (NaN) into a dataset.
    
    Parameters:
        df (pd.DataFrame): The dataset to modify.
        proportion (float): Proportion of total observations to set as missing (default: 0.01 or 1%).
        exclude_columns (list): List of columns to exclude from missing data insertion (default: None).
        random_state (int): Seed for reproducibility (default: None).
        
    Returns:
        pd.DataFrame: Dataset with missing values introduced.
    """
    if exclude_columns is None:
        exclude_columns = []
        
    if random_state is not None:
        np.random.seed(random_state)
    
    df = df.copy()  # Make a copy to avoid modifying the original dataset
    total_cells = df.size
    n_missing = int(total_cells * proportion)
    
    # Flatten the DataFrame into row, column index pairs
    valid_columns = [col for col in df.columns if col not in exclude_columns]
    if not valid_columns:
        raise ValueError("All columns are excluded from missing data introduction.")
    
    rows, cols = df.shape
    flat_indices = [(i, j) for i in range(rows) for j in range(cols) if df.columns[j] in valid_columns]
    selected_indices = np.random.choice(len(flat_indices), n_missing, replace=False)
    
    # Introduce missing values
    for index in selected_indices:
        i, j = flat_indices[index]
        df.iat[i, j] = np.nan
    
    return df

modified_data = introduce_missing_data(data, proportion=0.01, exclude_columns=None, random_state=42)
modified_data.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,1,f,g.1,202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43,560.0,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280,824.0,+
2,b,,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100,3.0,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120,0.0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0.0,t,g,360,0.0,+


In [10]:
modified_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       681 non-null    object 
 1   30.83   676 non-null    object 
 2   0       679 non-null    float64
 3   u       682 non-null    object 
 4   g       683 non-null    object 
 5   w       684 non-null    object 
 6   v       685 non-null    object 
 7   1.25    682 non-null    float64
 8   t       682 non-null    object 
 9   t.1     687 non-null    object 
 10  1       683 non-null    float64
 11  f       682 non-null    object 
 12  g.1     684 non-null    object 
 13  202     678 non-null    object 
 14  0.1     685 non-null    float64
 15  +       681 non-null    object 
dtypes: float64(4), object(12)
memory usage: 86.3+ KB


In [11]:
modified_data.isnull().sum()

b         8
30.83    13
0        10
u         7
g         6
w         5
v         4
1.25      7
t         7
t.1       2
1         6
f         7
g.1       5
202      11
0.1       4
+         8
dtype: int64

In [14]:
# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    modified_data.drop("+", axis = 1),
    modified_data["+"],
    test_size=0.3,
    random_state=0
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((482, 15), (207, 15), (482,), (207,))

In [16]:
# Let's view the numerical variable from the dataset
X_train.select_dtypes(exclude='O')

Unnamed: 0,0,1.25,1,0.1
595,3.000,2.375,8.0,4159.0
303,13.665,1.500,0.0,1.0
204,12.000,14.000,8.0,6590.0
643,0.420,0.290,0.0,2.0
118,10.335,0.335,1.0,50.0
...,...,...,...,...
359,5.500,5.500,0.0,0.0
192,1.585,3.085,6.0,0.0
629,1.250,0.250,0.0,809.0
559,12.330,3.500,6.0,458.0


In [18]:
# Find the value of the four numerical variables
X_train[['0', '1.25', '1', '0.1']].max()

0           28.0
1.25        28.5
1           67.0
0.1     100000.0
dtype: float64

In [22]:
# Let's make a copy of the DataFrame
X_train_cp = X_train.copy()
X_test_cp = X_test.copy()

In [24]:
# We fill the mssing data with 99
X_train_cp[['0', '1.25', '1', '0.1']] = X_train[['0', '1.25', '1', '0.1']].fillna(99)
X_test_cp[['0', '1.25', '1', '0.1']] = X_test[['0', '1.25', '1', '0.1']].fillna(99)

# Method for replacing missing value with an arbitrary number

In [32]:
# Set up imputer to replace missing value with 99
imputer = SimpleImputer(strategy='constant', fill_value=99)

In [34]:
# We fit imputer to the train set containing the numerical variable
var = ['0', '1.25', '1', '0.1']
imputer.fit(X_train[var])

In [35]:
# replace missing value with 99 in the desired variable
X_train[var] = imputer.transform(X_train[var])
X_test[var] = imputer.transform(X_test[var])

In [36]:
X_test_cp[['0', '1.25', '1', '0.1']].isnull().sum()

0       0
1.25    0
1       0
0.1     0
dtype: int64

# Lastly, Imputing missing value using feature engine

In [38]:
# Let set up imputer to replace missing value with arbitrary value 99
imputer = ArbitraryNumberImputer(arbitrary_number=99,
                                variables=['0', '1.25', '1', '0.1']
                                )

In [40]:
# finally, repace missing value with 99
X_train_t = imputer.fit_transform(X_train)
X_test_t = imputer.transform(X_test)

In [42]:
X_train_t.isnull().sum()

b         6
30.83    10
0         0
u         5
g         4
w         5
v         4
1.25      0
t         6
t.1       2
1         0
f         4
g.1       5
202       7
0.1       0
dtype: int64

In [44]:
X_train_t[var].isnull().sum()

0       0
1.25    0
1       0
0.1     0
dtype: int64

In [46]:
X_test_t[var].isnull().sum()

0       0
1.25    0
1       0
0.1     0
dtype: int64