In [2]:
from typing import Union
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
import logging

In [3]:
df = pd.read_csv('../data/bankloans.csv')

In [6]:
miss_impute = MissingValueImputer()

In [7]:
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0


In [8]:
df.isnull().sum()

age           0
ed            0
employ        0
address       0
income        0
debtinc       0
creddebt      0
othdebt       0
default     450
dtype: int64

In [9]:
index = df.index[df['default'].isnull()].tolist()
df.loc[index, 'default_bool'] = None

In [10]:
df.select_dtypes('float').columns

Index(['debtinc', 'creddebt', 'othdebt', 'default', 'default_bool'], dtype='object')

In [11]:
miss_impute.regression_imputation(df.drop(columns = 'default'), target_column='default_bool', param_grid={
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'min_samples_split': [2, 5]
    })

ERROR:root:An error occurred during regression imputation.
ERROR:root:Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=0.


Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default_bool
0,41,3,17,12,176,9.3,11.359392,5.008608,
1,27,1,10,6,31,17.3,1.362202,4.000798,
2,40,1,15,14,55,5.5,0.856075,2.168925,
3,41,1,15,14,120,2.9,2.658720,0.821280,
4,24,2,2,0,28,17.3,1.787436,3.056564,
...,...,...,...,...,...,...,...,...,...
1145,34,1,12,15,32,2.7,0.239328,0.624672,
1146,32,2,12,11,116,5.7,4.026708,2.585292,
1147,48,1,13,11,38,10.8,0.722304,3.381696,
1148,35,2,1,11,24,7.8,0.417456,1.454544,


In [13]:
miss_impute.knn_imputation(df.drop(columns="default_bool"), 8)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41.0,3.0,17.0,12.0,176.0,9.3,11.359392,5.008608,1.000
1,27.0,1.0,10.0,6.0,31.0,17.3,1.362202,4.000798,0.000
2,40.0,1.0,15.0,14.0,55.0,5.5,0.856075,2.168925,0.000
3,41.0,1.0,15.0,14.0,120.0,2.9,2.658720,0.821280,0.000
4,24.0,2.0,2.0,0.0,28.0,17.3,1.787436,3.056564,1.000
...,...,...,...,...,...,...,...,...,...
1145,34.0,1.0,12.0,15.0,32.0,2.7,0.239328,0.624672,0.000
1146,32.0,2.0,12.0,11.0,116.0,5.7,4.026708,2.585292,0.000
1147,48.0,1.0,13.0,11.0,38.0,10.8,0.722304,3.381696,0.125
1148,35.0,2.0,1.0,11.0,24.0,7.8,0.417456,1.454544,0.375


In [49]:
df.default.unique()

array([ 1.,  0., nan])

In [5]:
class MissingValueImputer:
    """Class to impute missing values"""
    def __init__(self, strategy: str = 'mean'):
        self.strategy = strategy

    def mean_imputation(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Perform mean imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using mean imputation.
        """
        imputer = SimpleImputer(strategy='mean')
        dataframe_imputed = pd.DataFrame(imputer.fit_transform(dataframe), columns=dataframe.columns)
        return dataframe_imputed

    def median_imputation(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Perform median imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using median imputation.
        """
        imputer = SimpleImputer(strategy='median')
        dataframe_imputed = pd.DataFrame(imputer.fit_transform(dataframe), columns=dataframe.columns)
        return dataframe_imputed

    def mode_imputation(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Perform mode imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using mode imputation.
        """
        imputer = SimpleImputer(strategy='most_frequent')
        dataframe_imputed = pd.DataFrame(imputer.fit_transform(dataframe), columns=dataframe.columns)
        return dataframe_imputed

    def forward_fill_imputation(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Perform forward fill imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using forward fill imputation.
        """
        dataframe_imputed = dataframe.ffill()
        return dataframe_imputed

    def backward_fill_imputation(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Perform backward fill imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using backward fill imputation.
        """
        dataframe_imputed = dataframe.bfill()
        return dataframe_imputed

    def value_imputation(self, dataframe: pd.DataFrame, value: Union[int, float, str]) -> pd.DataFrame:
        """
        Perform value imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.
            value (int or float or str): The value to be used for imputing missing values.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using value imputation.
        """
        if not isinstance(value, (int, float, str)):
            raise TypeError("Invalid value type. Value should be an int, float, or str.")

        if isinstance(value, (int, float)):
            columns_to_impute = dataframe.select_dtypes(include=[np.number]).columns
        elif isinstance(value, str):
            columns_to_impute = dataframe.select_dtypes(include=[object]).columns

        dataframe_imputed = dataframe.copy()
        dataframe_imputed[columns_to_impute] = dataframe_imputed[columns_to_impute].fillna(value)

        return dataframe_imputed

    def regression_imputation(self, dataframe: pd.DataFrame,
                              target_column: str, param_grid: dict = None
                              ) -> pd.DataFrame:
        """
        Perform regression imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.
            target_column (str): The column with missing values to be imputed.
            param_grid (dict): Optional parameter grid for GridSearchCV.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using regression imputation.
        """
        try:
            dataframe_missing = dataframe[dataframe[target_column].isnull()]
            dataframe_not_missing = dataframe[~dataframe[target_column].isnull()]

            if dataframe_missing.empty:
                logging.info("No missing values found in the target column.")
                return dataframe

            x_train = dataframe_not_missing.drop(target_column, axis=1)
            y_train = dataframe_not_missing[target_column]

            param_grid = param_grid or {
                'n_estimators': [100, 200, 300],
                'max_depth': [3, 5, None],
                'min_samples_split': [2, 5, 10]
            }

            rdf = RandomForestRegressor()
            grid_search = GridSearchCV(rdf, param_grid, scoring='neg_mean_squared_error', cv=5)
            grid_search.fit(x_train, y_train)

            best_rf = grid_search.best_estimator_

            x_missing = dataframe_missing.drop(target_column, axis=1)
            dataframe_missing_imputed = dataframe_missing.copy()
            dataframe_missing_imputed[target_column] = best_rf.predict(x_missing)

            dataframe_imputed = pd.concat([dataframe_not_missing, dataframe_missing_imputed])
            return dataframe_imputed

        except KeyError:
            logging.error(f"The target column '{target_column}' does not exist in the DataFrame.")
            return dataframe

        except Exception as error:
            logging.error("An error occurred during regression imputation.")
            logging.error(str(error))
            return dataframe

    def knn_imputation(self, dataframe: pd.DataFrame,
                    n_neighbors: int = 5) -> pd.DataFrame:
        """
        Perform KNN imputation for missing values in the DataFrame.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.
            n_neighbors (int): The number of neighbors to consider for imputation. Defaults to 5.

        Returns:
            pd.DataFrame: The DataFrame with missing values imputed using KNN imputation.
        """
        try:
            imputer = KNNImputer(n_neighbors=n_neighbors)
            dataframe_imputed = imputer.fit_transform(dataframe)
            # Convert the imputed array back to a DataFrame
            dataframe_imputed = pd.DataFrame(dataframe_imputed, columns=dataframe.columns)
            return dataframe_imputed
        except Exception as e:
            logging.error("An error occurred during KNN imputation.")
            logging.error(str(e))
            return dataframe

    def drop_null_columns(self, dataframe: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
        """
        Drop all columns in the DataFrame with a greater percentage of null values than the specified threshold.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.
            threshold (float): The maximum percentage of null values allowed (between 0 and 1). Defaults to 0.5

        Returns:
            pd.DataFrame: The DataFrame with null columns dropped.
        """
        # Calculate the percentage of null values in each column
        null_percentages = dataframe.isnull().mean()
        # Get the names of columns with null percentages greater than the threshold
        columns_to_drop = null_percentages[null_percentages >= threshold].index
        # Drop the columns from the DataFrame
        dataframe_dropped = dataframe.drop(columns=columns_to_drop)
        return dataframe_dropped

    def drop_null_rows(self, dataframe: pd.DataFrame, threshold: int = None) -> pd.DataFrame:
        """
        Drop rows with missing values based on the threshold.

        Args:
            dataframe (pd.DataFrame): The input DataFrame.
            threshold (int): The minimum number of non-null values required in a row. Defaults to None.

        Returns:
            pd.DataFrame: The DataFrame with null rows dropped.
        """
        if threshold is None:
            logging.error("No threshold for deleting rows with missing values provided.")
            return dataframe

        dataframe_dropped = dataframe.dropna(thresh=threshold)
        return dataframe_dropped


In [17]:
def main():
    # Create a sample DataFrame with missing values
    import numpy as np
    data = {
        'A': [1, 2, np.nan, 4, 5],
        'B': [np.nan, 2, 3, np.nan, 5],
        'C': [1, 2, 3, 4, np.nan]
    }
    df = pd.DataFrame(data)
    print(data)
    # Create an instance of the MissingValueImputer class
    imputer = MissingValueImputer()


    # Test the mean_imputation function
    df_imputed = imputer.mean_imputation(df)
    print("DataFrame after mean imputation:")
    print(df_imputed)
    print()

    # Test the regression_imputation function
    df_imputed = imputer.regression_imputation(df, target_column='A')
    print("DataFrame after regression imputation:")
    print(df_imputed)
    print()

    # Test the knn_imputation function
    df_imputed = imputer.knn_imputation(df,  n_neighbors=3)
    print("DataFrame after KNN imputation:")
    print(df_imputed)
    print()

if __name__ == "__main__":
    main()


ERROR:root:An error occurred during regression imputation.
ERROR:root:Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=4.


{'A': [1, 2, nan, 4, 5], 'B': [nan, 2, 3, nan, 5], 'C': [1, 2, 3, 4, nan]}
DataFrame after mean imputation:
     A         B    C
0  1.0  3.333333  1.0
1  2.0  2.000000  2.0
2  3.0  3.000000  3.0
3  4.0  3.333333  4.0
4  5.0  5.000000  2.5

DataFrame after regression imputation:
     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  2.0
2  NaN  3.0  3.0
3  4.0  NaN  4.0
4  5.0  5.0  NaN

DataFrame after KNN imputation:
          A         B    C
0  1.000000  3.333333  1.0
1  2.000000  2.000000  2.0
2  2.333333  3.000000  3.0
3  4.000000  3.333333  4.0
4  5.000000  5.000000  3.0

