## Steps
* import Libraries
* Load the data
* find the columns with missing values and store in an object
* find the columns based on data type
    - numeric
    - Categoricals
    - Boolean
* Define the function to impute missing values
* apply the function to our dataset with missing values
* check the missing values after imputation

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, r2_score, mean_squared_error

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

In [2]:
# load the dataset
df = pd.read_csv('heart_disease_uci.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [3]:
# lets look at the number of missing columns which are categorical
df.isnull().sum()[(df.isnull().sum() > 0)]

trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
dtype: int64

In [4]:
# find object columns with missing values
missing_cat_col = df.select_dtypes(include=['object']).columns[df.select_dtypes(include=['object']).isnull().any()].to_list()
print(missing_cat_col)
cat_columns = df.select_dtypes(include=['object']).columns.to_list()
print(cat_columns)

['fbs', 'restecg', 'exang', 'slope', 'thal']
['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']


In [5]:
def encode_categorical_data(df, categorical_features):
    """
    Encode categorical data using LabelEncoder for a dataframe having multiple features.

    Parameters:
    df (pandas DataFrame): Input dataframe
    categorical_features (list): List of categorical feature names

    Returns:
    encoded_df (pandas DataFrame): Encoded dataframe
    le_dict (dict): Dictionary of LabelEncoders for each categorical feature
    """
    le_dict = {}
    encoded_df = df.copy()
    for feature in categorical_features:
        le = LabelEncoder()
        encoded_df[feature] = le.fit_transform(encoded_df[feature])
        le_dict[feature] = le
    return encoded_df, le_dict

def inverse_transform_categorical_data(encoded_df, le_dict, categorical_features):
    """
    Inverse transform the encoded categorical data using the stored LabelEncoders.

    Parameters:
    encoded_df (pandas DataFrame): Encoded dataframe
    le_dict (dict): Dictionary of LabelEncoders for each categorical feature
    categorical_features (list): List of categorical feature names

    Returns:
    original_df (pandas DataFrame): Original dataframe with categorical data
    """
    original_df = encoded_df.copy()
    for feature in categorical_features:
        le = le_dict[feature]
        original_df[feature] = le.inverse_transform(original_df[feature])
    return original_df

In [6]:
# data encoding
df_encoded, le_dict = encode_categorical_data(df, categorical_features=cat_columns)
print(df_encoded.head())
print(le_dict)

   id  age  sex  dataset  cp  trestbps   chol  fbs  restecg  thalch  exang  \
0   1   63    1        0   3     145.0  233.0    1        0   150.0      0   
1   2   67    1        0   0     160.0  286.0    0        0   108.0      1   
2   3   67    1        0   0     120.0  229.0    0        0   129.0      1   
3   4   37    1        0   2     130.0  250.0    0        1   187.0      0   
4   5   41    0        0   1     130.0  204.0    0        0   172.0      0   

   oldpeak  slope   ca  thal  num  
0      2.3      0  0.0     0    0  
1      1.5      1  3.0     1    2  
2      2.6      1  2.0     2    1  
3      3.5      0  0.0     1    0  
4      1.4      2  0.0     1    0  
{'sex': LabelEncoder(), 'dataset': LabelEncoder(), 'cp': LabelEncoder(), 'fbs': LabelEncoder(), 'restecg': LabelEncoder(), 'exang': LabelEncoder(), 'slope': LabelEncoder(), 'thal': LabelEncoder()}


In [7]:
# inverse transformation
#df_original = inverse_transform_categorical_data(df_encoded, le_dict, categorical_features=cat_columns)
#print(df_original.head())

In [9]:
# Now we will write a function to impute categorical missing values with random forest classifier

def impute_categorical_missing_values_rf(df, categorical_cols, n_estimators= 100, random_state=20):
    """
    This function imputes categorical missing values using Random Forest Classifier.

    Parameters:
    df (DataFrame): The DataFrame containing missing categorical values.
    categorical_cols (list): A list of categorical columns in the DataFrame.
    n_estimators (int): The number of estimators in the Random Forest Classifier. Default is 100
    random_state (int): The random state for the Random Forest Classifier. Default is 20.

    Returns:
    imputed_df (DataFrame): The DataFrame with imputed categorical missing values.

    Libraries:
    The following libraries must be imported for executing the function
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    
    Data Frame provided as input must be encoded
    """
    # Create a copy of dataframe inorder to avoid modifying it
    imputed_df = df.copy()
    
    # iterate over each categorical column
    for col in categorical_cols:
        # create a mask to identify missing values
        missing_mask = imputed_df[col].isnull()

        # check if there are any missing values in the columns
        if missing_mask.any():
            # create a random forest classifier to impute missing values
            rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

            # fit the random forest to non missing values
            rf.fit(imputed_df[~missing_mask].drop(col, axis=1), imputed_df[~missing_mask][col])

            # predict the missing values
            predicted_values = rf.predict(imputed_df[missing_mask].drop(col, axis=1))

            # replace missing values with predicted values
            imputed_df.loc[missing_mask, col] = predicted_values

    return imputed_df

In [10]:
imputed_data = impute_categorical_missing_values_rf(df_encoded, cat_columns)
print(imputed_data)

      id  age  sex  dataset  cp  trestbps   chol  fbs  restecg  thalch  exang  \
0      1   63    1        0   3     145.0  233.0    1        0   150.0      0   
1      2   67    1        0   0     160.0  286.0    0        0   108.0      1   
2      3   67    1        0   0     120.0  229.0    0        0   129.0      1   
3      4   37    1        0   2     130.0  250.0    0        1   187.0      0   
4      5   41    0        0   1     130.0  204.0    0        0   172.0      0   
..   ...  ...  ...      ...  ..       ...    ...  ...      ...     ...    ...   
915  916   54    0        3   0     127.0  333.0    1        2   154.0      0   
916  917   62    1        3   3       NaN  139.0    0        2     NaN      2   
917  918   55    1        3   0     122.0  223.0    1        2   100.0      0   
918  919   58    1        3   0       NaN  385.0    1        0     NaN      2   
919  920   62    1        3   1     120.0  254.0    0        0    93.0      1   

     oldpeak  slope   ca  t

In [11]:
imputed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    int32  
 3   dataset   920 non-null    int32  
 4   cp        920 non-null    int32  
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       920 non-null    int32  
 8   restecg   920 non-null    int32  
 9   thalch    865 non-null    float64
 10  exang     920 non-null    int32  
 11  oldpeak   858 non-null    float64
 12  slope     920 non-null    int32  
 13  ca        309 non-null    float64
 14  thal      920 non-null    int32  
 15  num       920 non-null    int64  
dtypes: float64(5), int32(8), int64(3)
memory usage: 86.4 KB
