In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, LabelEncoder
from scipy.stats import zscore

In [None]:
#@title preprocessing_classes
class Duplicates:
    """Optional duplicate-row removal step.

    Args:
        duplicates: When truthy, ``handle`` drops duplicated rows; when falsy,
            ``handle`` hands the frame back untouched.
    """

    def __init__(self, duplicates=True):
        self.duplicates = duplicates

    def handle(self, df):
        """Drop duplicated rows from ``df`` in place and return ``df``.

        The index is renumbered from 0 after the drop. The object returned is
        the same one that was passed in.
        """
        if not self.duplicates:
            return df
        df.drop_duplicates(ignore_index=True, inplace=True)
        return df

class MissingValues:
    """Impute missing values in numeric and non-numeric columns.

    Args:
        missing_num: Strategy for numeric columns. ``'auto'`` and ``'knn'``
            run ``KNNImputer``; any other value leaves numeric columns alone.
        missing_categ: Strategy for non-numeric columns. ``'most_frequent'``
            fills with the most frequent value; ``'auto'`` and ``'logreg'``
            both fall back to ``SimpleImputer(strategy='constant')`` (no
            logistic regression is performed); any other value leaves the
            columns alone.
    """

    def __init__(self, missing_num=None, missing_categ=None):
        self.missing_num = missing_num
        self.missing_categ = missing_categ

    def handle(self, df, _n_neighbors=5):
        """Fill missing values in ``df`` and return it.

        Args:
            df: Frame to impute. Columns that receive values are overwritten
                in place.
            _n_neighbors: ``n_neighbors`` passed to ``KNNImputer``.

        Returns:
            The imputed frame (the same object as ``df``).
        """
        if not (self.missing_num or self.missing_categ):
            return df
        if not df.isna().to_numpy().any():
            return df
        if self.missing_num:
            df = self._handle_missing_num(df, _n_neighbors)
        if self.missing_categ:
            df = self._handle_missing_categ(df, _n_neighbors)
        return df

    def _handle_missing_num(self, df, _n_neighbors):
        """Impute numeric columns that actually contain gaps."""
        if self.missing_num not in ('auto', 'knn'):
            return df
        for col in df.select_dtypes(include=np.number).columns:
            missing = df[col].isna()
            # Nothing to fill, or nothing to learn from: leave the column
            # (and its dtype) exactly as it is.
            if not missing.any() or missing.all():
                continue
            # Only integer-valued data is rounded back to integers; genuinely
            # fractional columns must keep their decimals.
            was_integral = bool((df[col].dropna() % 1 == 0).all())
            # NOTE(review): the imputer only sees this one column, so per the
            # KNNImputer docs rows with no defined distance fall back to the
            # column average -- confirm whether joint imputation was intended.
            imputer = KNNImputer(n_neighbors=_n_neighbors)
            df[col] = imputer.fit_transform(df[[col]]).ravel()
            if was_integral:
                df[col] = df[col].round().astype('Int64')
        return df

    def _handle_missing_categ(self, df, _n_neighbors):
        """Impute non-numeric columns that actually contain gaps."""
        if self.missing_categ not in ('auto', 'logreg', 'most_frequent'):
            return df
        strategy = 'most_frequent' if self.missing_categ == 'most_frequent' else 'constant'
        numeric_cols = set(df.select_dtypes(include=np.number).columns)
        # Walk df.columns (not a set) so the processing order is deterministic.
        for col in df.columns:
            if col in numeric_cols:
                continue
            missing = df[col].isna()
            if not missing.any() or missing.all():
                continue
            imputer = SimpleImputer(strategy=strategy)
            # fit_transform returns shape (n, 1); flatten before assigning.
            df[col] = imputer.fit_transform(df[[col]]).ravel()
        return df

class Outliers:
    """Replace numeric outliers (1.5 * IQR rule) with the column mean."""

    def __init__(self):
        pass

    def handle(self, df):
        """Replace outlying numeric cells of ``df`` and return ``df``.

        Only the offending cells are replaced; other values on the same row
        and all non-numeric columns are left untouched. The replacement is the
        mean of the column's remaining values, which also fills any missing
        values that were already present in numeric columns. Affected columns
        are written back as float64.
        """
        numeric = df.select_dtypes(include=np.number)
        if numeric.empty:
            return df
        # Work in float64 so outliers can be blanked with NaN regardless of
        # the column's original integer / nullable dtype.
        values = numeric.astype('float64')
        cleaned = values.mask(self._outlier_cells(values))
        cleaned = cleaned.fillna(cleaned.mean())
        for col in cleaned.columns:
            df[col] = cleaned[col]
        return df

    def detect_outliers(self, df):
        """Return a boolean Series marking rows with at least one outlier.

        Non-numeric columns are ignored.
        """
        numeric = df.select_dtypes(include=np.number).astype('float64')
        return self._outlier_cells(numeric).any(axis=1)

    @staticmethod
    def _outlier_cells(numeric):
        """Return a boolean frame flagging cells outside the 1.5 * IQR fences."""
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        return (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))


class Adjust:
    """Scale numeric columns and optionally extract a datetime component.

    Args:
        scaler: ``'MinMax'``, ``'Standard'`` or ``'Robust'`` (matched
            case-insensitively, so ``'minMax'`` works too). Any other value
            disables scaling.
        extract_datetime: Name of a ``Series.dt`` accessor attribute such as
            ``'year'`` or ``'dayofweek'``. A falsy value disables datetime
            handling; a truthy non-string only converts parseable columns.
    """

    # Lower-cased option name -> prefix of the sklearn ``*Scaler`` class.
    _SCALER_PREFIXES = {'minmax': 'MinMax', 'standard': 'Standard', 'robust': 'Robust'}

    def __init__(self, scaler=None, extract_datetime=False):
        self.scaler = scaler
        self.extract_datetime = extract_datetime

    def handle(self, df):
        """Apply scaling, then datetime extraction, and return the frame."""
        scaler_cls = self._resolve_scaler()
        if scaler_cls is not None:
            # Scalers cannot digest strings, so restrict them to numeric data.
            num_cols = df.select_dtypes(include=np.number).columns
            if len(num_cols):
                df[num_cols] = scaler_cls().fit_transform(df[num_cols])
        if self.extract_datetime:
            df = self._convert_datetime(df)
        return df

    def _resolve_scaler(self):
        """Return the scaler class selected by ``self.scaler``, or ``None``."""
        if not isinstance(self.scaler, str):
            return None
        prefix = self._SCALER_PREFIXES.get(self.scaler.lower())
        if prefix is None:
            return None
        return getattr(preprocessing, prefix + 'Scaler')

    def _convert_datetime(self, df):
        """Convert parseable non-numeric columns to datetime.

        For every converted column ``col`` the requested component is stored
        in a new column named ``f"{col}_{component}"``. Columns that cannot be
        parsed as datetimes are skipped.
        """
        numeric_cols = set(df.select_dtypes(include=np.number).columns)
        component = self.extract_datetime if isinstance(self.extract_datetime, str) else None
        for col in [c for c in df.columns if c not in numeric_cols]:
            try:
                parsed = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                continue  # not a datetime-like column
            df[col] = parsed
            if component is None:
                continue
            try:
                extracted = getattr(parsed.dt, component)
                if callable(extracted):
                    extracted = extracted()
            except (AttributeError, TypeError):
                continue  # unknown component or result without a .dt accessor
            # A distinct name is required: re-using ``col`` would collide with
            # the converted column itself.
            df[f'{col}_{component}'] = extracted
        return df

class EncodeCateg:
    """Encode categorical columns.

    Columns with at most 10 distinct values are one-hot encoded with
    ``pd.get_dummies``; columns with more are label encoded.

    Args:
        encode_categ: ``'auto'`` encodes every object-dtype column; a list
            encodes the named columns that exist in the frame; a falsy value
            disables encoding.
    """

    def __init__(self, encode_categ=None):
        self.encode_categ = encode_categ

    def handle(self, df):
        """Return ``df`` with the configured columns encoded.

        One-hot encoding builds a new frame, so callers must use the return
        value rather than rely on ``df`` being changed in place.
        """
        if not self.encode_categ:
            return df
        if self.encode_categ == 'auto':
            return self._auto_encode(df)
        if isinstance(self.encode_categ, list):
            for col in self.encode_categ:
                if col in df.columns:
                    df = self._auto_encode(df, col)
        return df

    def _auto_encode(self, df, col=None):
        """Encode ``col``, or every object-dtype column when ``col`` is None."""
        if col is not None:
            targets = [col]
        else:
            targets = list(df.select_dtypes(include='object').columns)
        for name in targets:
            if len(df[name].unique()) <= 10:
                # get_dummies returns a new frame; keep the result.
                df = pd.get_dummies(df, columns=[name], prefix=[name])
            else:
                df[name] = LabelEncoder().fit_transform(df[name])
        return df


In [None]:
#@title master_class
class DataCleaner:
    """Configurable, self-contained data-cleaning pipeline.

    Each option may be a plain flag or a named strategy. ``True`` selects the
    default strategy of a step and a falsy value skips the step.

    Args:
        remove_duplicates: Drop duplicated rows.
        handle_missing_num: ``True``/``'auto'``/``'knn'`` impute numeric
            columns with ``KNNImputer``.
        handle_missing_categ: ``'most_frequent'`` fills non-numeric columns
            with the most frequent value; ``True``/``'auto'``/``'logreg'``
            use ``SimpleImputer(strategy='constant')``.
        handle_outliers: Replace numeric cells outside the 1.5 * IQR fences
            with the mean of the column's remaining values.
        scale_numerical: ``'minMax'``, ``'standard'`` or ``'robust'``
            (case-insensitive). ``True`` selects ``'minMax'`` -- this default
            is a choice made here, confirm it suits the data.
        extract_datetime: Name of a ``Series.dt`` attribute (e.g. ``'year'``)
            to extract from columns that parse as datetimes.
        encode_categorical: ``True``/``'auto'`` encodes every object-dtype
            column; a list encodes the named columns.
    """

    def __init__(self,
                 remove_duplicates=True,
                 handle_missing_num=True,
                 handle_missing_categ=True,
                 handle_outliers=True,
                 scale_numerical=True,
                 extract_datetime=False,
                 encode_categorical=True):
        self.remove_duplicates = remove_duplicates
        self.handle_missing_num = handle_missing_num
        self.handle_missing_categ = handle_missing_categ
        self.handle_outliers = handle_outliers
        self.scale_numerical = scale_numerical
        self.extract_datetime = extract_datetime
        self.encode_categorical = encode_categorical

    def clean(self, df, _n_neighbors=5):
        """Run the enabled steps in order and return the cleaned frame.

        Several steps modify ``df`` in place and one-hot encoding builds a new
        frame, so always use the returned value.

        Args:
            df: Frame to clean.
            _n_neighbors: ``n_neighbors`` passed to ``KNNImputer``.
        """
        if self.remove_duplicates:
            df = self._handle_duplicates(df)
        if self.handle_missing_num or self.handle_missing_categ:
            df = self._handle_missing_values(df, _n_neighbors)
        if self.handle_outliers:
            df = self._handle_outliers(df)
        if self.scale_numerical:
            df = self._handle_scaling(df)
        if self.extract_datetime:
            df = self._handle_datetime(df)
        if self.encode_categorical:
            df = self._handle_categorical_encoding(df)
        return df

    @staticmethod
    def _option(value, default):
        """Translate a bare ``True`` flag into the step's default strategy."""
        return default if value is True else value

    def _handle_duplicates(self, df):
        """Drop duplicated rows in place and renumber the index."""
        df.drop_duplicates(inplace=True, ignore_index=True)
        return df

    def _handle_missing_values(self, df, _n_neighbors):
        """Dispatch to the numeric and non-numeric imputers when gaps exist."""
        if not df.isna().to_numpy().any():
            return df
        if self.handle_missing_num:
            df = self._handle_missing_num(df, _n_neighbors)
        if self.handle_missing_categ:
            df = self._handle_missing_categ(df)
        return df

    def _handle_missing_num(self, df, _n_neighbors):
        """Impute numeric columns that contain gaps with ``KNNImputer``."""
        if self._option(self.handle_missing_num, 'auto') not in ('auto', 'knn'):
            return df
        for col in df.select_dtypes(include=np.number).columns:
            missing = df[col].isna()
            if not missing.any() or missing.all():
                continue
            # Round back to integers only when the observed data is integral.
            was_integral = bool((df[col].dropna() % 1 == 0).all())
            imputer = KNNImputer(n_neighbors=_n_neighbors)
            df[col] = imputer.fit_transform(df[[col]]).ravel()
            if was_integral:
                df[col] = df[col].round().astype('Int64')
        return df

    def _handle_missing_categ(self, df):
        """Impute non-numeric columns that contain gaps with ``SimpleImputer``."""
        choice = self._option(self.handle_missing_categ, 'auto')
        if choice not in ('auto', 'logreg', 'most_frequent'):
            return df
        strategy = 'most_frequent' if choice == 'most_frequent' else 'constant'
        numeric_cols = set(df.select_dtypes(include=np.number).columns)
        for col in df.columns:
            if col in numeric_cols:
                continue
            missing = df[col].isna()
            if not missing.any() or missing.all():
                continue
            imputer = SimpleImputer(strategy=strategy)
            df[col] = imputer.fit_transform(df[[col]]).ravel()
        return df

    def _handle_outliers(self, df):
        """Replace numeric cells outside the 1.5 * IQR fences by the column mean."""
        numeric = df.select_dtypes(include=np.number)
        if numeric.empty:
            return df
        values = numeric.astype('float64')
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        cleaned = values.mask((values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr)))
        cleaned = cleaned.fillna(cleaned.mean())
        for col in cleaned.columns:
            df[col] = cleaned[col]
        return df

    def _handle_scaling(self, df):
        """Scale the numeric columns with the configured sklearn scaler."""
        choice = self._option(self.scale_numerical, 'minMax')
        if not isinstance(choice, str):
            return df
        class_names = {'minmax': 'MinMaxScaler',
                       'standard': 'StandardScaler',
                       'robust': 'RobustScaler'}
        class_name = class_names.get(choice.lower())
        if class_name is None:
            return df
        num_cols = df.select_dtypes(include=np.number).columns
        if len(num_cols):
            # The scaler classes are imported at module level.
            df[num_cols] = globals()[class_name]().fit_transform(df[num_cols])
        return df

    def _handle_datetime(self, df):
        """Convert parseable columns to datetime and extract the component.

        The component of column ``col`` is stored as ``f"{col}_{component}"``.
        """
        numeric_cols = set(df.select_dtypes(include=np.number).columns)
        component = self.extract_datetime if isinstance(self.extract_datetime, str) else None
        for col in [c for c in df.columns if c not in numeric_cols]:
            try:
                parsed = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                continue  # not a datetime-like column
            df[col] = parsed
            if component is None:
                continue
            try:
                extracted = getattr(parsed.dt, component)
                if callable(extracted):
                    extracted = extracted()
            except (AttributeError, TypeError):
                continue
            df[f'{col}_{component}'] = extracted
        return df

    def _handle_categorical_encoding(self, df):
        """Encode all object columns (``'auto'``) or the listed columns."""
        choice = self._option(self.encode_categorical, 'auto')
        if isinstance(choice, list):
            for col in choice:
                if col in df.columns:
                    df = self._auto_encode(df, col)
            return df
        if choice == 'auto':
            return self._auto_encode(df)
        return df

    def _auto_encode(self, df, col=None):
        """One-hot encode columns with <= 10 distinct values, else label encode."""
        if col is not None:
            targets = [col]
        else:
            targets = list(df.select_dtypes(include='object').columns)
        for name in targets:
            if len(df[name].unique()) <= 10:
                # get_dummies returns a new frame; keep the result.
                df = pd.get_dummies(df, columns=[name], prefix=[name])
            else:
                df[name] = LabelEncoder().fit_transform(df[name])
        return df


In [None]:
# # Load the dataset into a DataFrame
# df = pd.read_csv('dp_data1.csv')
# df.info()

# # Instantiate DataCleaner with desired configuration
# cleaner = DataCleaner(remove_duplicates=True,
#                       handle_missing_num=True,
#                       handle_missing_categ=True,
#                       handle_outliers=True,
#                       scale_numerical=True,
#                       extract_datetime=False,
#                       encode_categorical=True)

# # Apply data cleaning
# cleaned_df = cleaner.clean(df)

In [None]:
def clean_data(df, config):
    """Run the preprocessing handlers over a copy of ``df``.

    Args:
        df: Input frame; it is copied first and never modified.
        config: Mapping of options. Both the ``handle_*`` spellings and the
            short handler spellings are accepted, the first one present wins:
            ``handle_duplicates``/``duplicates`` (default ``True``),
            ``handle_missing_num``/``missing_num``,
            ``handle_missing_categ``/``missing_categ``,
            ``handle_outliers`` (default ``True``), ``scaler``,
            ``extract_datetime`` and ``encode_categ``.

    Returns:
        The cleaned copy of ``df``.
    """
    def option(*keys, default=None):
        # Return the value of the first key that is present in config.
        for key in keys:
            if key in config:
                return config[key]
        return default

    # Instantiate preprocessing steps in execution order
    preprocessing_steps = [
        Duplicates(option('handle_duplicates', 'duplicates', default=True)),
        MissingValues(option('handle_missing_num', 'missing_num'),
                      option('handle_missing_categ', 'missing_categ')),
    ]
    # Outlier handling stays on unless the config switches it off explicitly.
    if option('handle_outliers', default=True):
        preprocessing_steps.append(Outliers())
    preprocessing_steps.append(Adjust(option('scaler'), option('extract_datetime')))
    preprocessing_steps.append(EncodeCateg(option('encode_categ')))

    # Apply preprocessing steps
    cleaned_df = df.copy()
    for step in preprocessing_steps:
        cleaned_df = step.handle(cleaned_df)

    return cleaned_df

In [None]:
# Load the raw CSV and print its column dtypes and non-null counts.
dataset = pd.read_csv('dp_data1.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Feature_1   1000 non-null   float64
 1   Feature_2   1000 non-null   float64
 2   Feature_3   1000 non-null   float64
 3   Feature_4   1000 non-null   int64  
 4   Feature_5   1000 non-null   int64  
 5   Feature_6   1000 non-null   object 
 6   Feature_7   1000 non-null   int64  
 7   Feature_8   1000 non-null   float64
 8   Feature_9   1000 non-null   float64
 9   Feature_10  1000 non-null   float64
 10  Feature_11  1000 non-null   int64  
 11  Feature_12  1000 non-null   float64
 12  Feature_13  1000 non-null   float64
 13  Feature_14  1000 non-null   float64
 14  Feature_15  1000 non-null   int64  
 15  Feature_16  1000 non-null   float64
 16  Feature_17  782 non-null    float64
 17  Feature_18  1000 non-null   float64
 18  Feature_19  746 non-null    object 
 19  Feature_20  1000 non-null   

In [None]:
# Options passed to clean_data.
# NOTE(review): confirm that every key below matches a name clean_data
# actually looks up in the config, and that the 'scaler' spelling is one the
# Adjust step accepts.
config = {
    'duplicates': True,
    'missing_num': 'auto',
    'missing_categ': 'most_frequent',
    'handle_outliers': True,
    'scaler': 'minMax',
    'extract_datetime': 'dayofweek',
    'encode_categ': 'auto'
}

# clean_data works on a copy, so `dataset` itself is left as loaded.
cleaned_df = clean_data(dataset, config)

  Q1 = df.quantile(0.25)
  Q3 = df.quantile(0.75)
  return ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)
  df.fillna(df.mean(), inplace=True)


In [None]:
# Print dtypes and non-null counts of the cleaned frame for comparison with the raw data.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Feature_1   1000 non-null   float64
 1   Feature_2   1000 non-null   float64
 2   Feature_3   1000 non-null   float64
 3   Feature_4   1000 non-null   float64
 4   Feature_5   1000 non-null   float64
 5   Feature_6   670 non-null    object 
 6   Feature_7   1000 non-null   float64
 7   Feature_8   1000 non-null   float64
 8   Feature_9   1000 non-null   float64
 9   Feature_10  1000 non-null   float64
 10  Feature_11  1000 non-null   float64
 11  Feature_12  1000 non-null   float64
 12  Feature_13  1000 non-null   float64
 13  Feature_14  1000 non-null   float64
 14  Feature_15  1000 non-null   float64
 15  Feature_16  1000 non-null   float64
 16  Feature_17  1000 non-null   float64
 17  Feature_18  1000 non-null   float64
 18  Feature_19  499 non-null    object 
 19  Feature_20  1000 non-null   

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [None]:
#@title Data Cleaning
class DataCleaning:
    """Facade that chains the preprocessing handlers defined in this file.

    Args:
        duplicates: Drop duplicated rows when truthy.
        missing_num: Numeric imputation strategy passed to ``MissingValues``.
        missing_categ: Non-numeric imputation strategy passed to
            ``MissingValues``.
        outliers_method: Any truthy value enables ``Outliers``.
        scaler: Scaler name passed to ``Adjust``.
        extract_datetime: Datetime option passed to ``Adjust``.
        encode_categ: Encoding option passed to ``EncodeCateg``.
    """

    def __init__(self, duplicates=True, missing_num=None, missing_categ=None, outliers_method=None, scaler=None, extract_datetime=False, encode_categ=None):
        self.duplicates = duplicates
        self.missing_num = missing_num
        self.missing_categ = missing_categ
        self.outliers_method = outliers_method
        self.scaler = scaler
        self.extract_datetime = extract_datetime
        self.encode_categ = encode_categ

    def fit_transform(self, df):
        """Return a cleaned copy of ``df``; the input frame is not modified.

        After the enabled steps have run, columns that were float64/int64 in
        the input are cast back to that dtype when this loses no information.
        """
        original_dtypes = df.dtypes
        # The handlers mutate their argument, so work on a private copy.
        df = df.copy()

        if self.duplicates:
            df = Duplicates().handle(df)
        if self.missing_num or self.missing_categ:
            df = MissingValues(self.missing_num, self.missing_categ).handle(df)
        if self.outliers_method:
            df = Outliers().handle(df)
        if self.scaler or self.extract_datetime:
            df = Adjust(self.scaler, self.extract_datetime).handle(df)
        if self.encode_categ:
            df = EncodeCateg(self.encode_categ).handle(df)

        return self._restore_dtypes(df, original_dtypes)

    @staticmethod
    def _restore_dtypes(df, original_dtypes):
        """Cast surviving columns back to their original float64/int64 dtype."""
        for col in df.columns:
            # Columns created by encoding or datetime extraction have no
            # original dtype to go back to.
            if col not in original_dtypes.index:
                continue
            target = original_dtypes[col]
            if target == np.float64:
                df[col] = df[col].astype(target)
            elif target == np.int64:
                values = df[col]
                if values.isna().any():
                    continue  # int64 cannot hold missing values
                if not (values.astype('float64') % 1 == 0).all():
                    continue  # scaled / averaged data: truncating would corrupt it
                df[col] = values.astype(target)
        return df


class Duplicates:
    """Duplicate-row removal step.

    Args:
        duplicates: When truthy (the default) ``handle`` drops duplicated
            rows; when falsy ``handle`` returns the frame untouched. The
            argument is optional, so ``Duplicates()`` keeps working, and
            callers that pass the flag positionally work as well.
    """

    def __init__(self, duplicates=True):
        self.duplicates = duplicates

    def handle(self, df):
        """Drop duplicated rows from ``df`` in place, renumber the index and return ``df``."""
        if self.duplicates:
            df.drop_duplicates(inplace=True, ignore_index=True)
        return df


class MissingValues:
    """Impute missing values in numeric and non-numeric columns.

    Args:
        missing_num: Strategy for numeric columns. ``'auto'`` and ``'knn'``
            run ``KNNImputer``; any other value leaves numeric columns alone.
        missing_categ: Strategy for non-numeric columns. ``'most_frequent'``
            fills with the most frequent value; ``'auto'`` and ``'logreg'``
            both fall back to ``SimpleImputer(strategy='constant')`` (no
            logistic regression is performed); any other value leaves the
            columns alone.
    """

    def __init__(self, missing_num=None, missing_categ=None):
        self.missing_num = missing_num
        self.missing_categ = missing_categ

    def handle(self, df, _n_neighbors=5):
        """Fill missing values in ``df`` and return it.

        Args:
            df: Frame to impute. Columns that receive values are overwritten
                in place.
            _n_neighbors: ``n_neighbors`` passed to ``KNNImputer``.

        Returns:
            The imputed frame (the same object as ``df``).
        """
        if not (self.missing_num or self.missing_categ):
            return df
        if not df.isna().to_numpy().any():
            return df
        if self.missing_num:
            df = self._handle_missing_num(df, _n_neighbors)
        if self.missing_categ:
            df = self._handle_missing_categ(df, _n_neighbors)
        return df

    def _handle_missing_num(self, df, _n_neighbors):
        """Impute numeric columns that actually contain gaps."""
        if self.missing_num not in ('auto', 'knn'):
            return df
        for col in df.select_dtypes(include=np.number).columns:
            missing = df[col].isna()
            # Nothing to fill, or nothing to learn from: leave the column
            # (and its dtype) exactly as it is.
            if not missing.any() or missing.all():
                continue
            # Only integer-valued data is rounded back to integers; genuinely
            # fractional columns must keep their decimals.
            was_integral = bool((df[col].dropna() % 1 == 0).all())
            # NOTE(review): the imputer only sees this one column, so per the
            # KNNImputer docs rows with no defined distance fall back to the
            # column average -- confirm whether joint imputation was intended.
            imputer = KNNImputer(n_neighbors=_n_neighbors)
            df[col] = imputer.fit_transform(df[[col]]).ravel()
            if was_integral:
                df[col] = df[col].round().astype('Int64')
        return df

    def _handle_missing_categ(self, df, _n_neighbors):
        """Impute non-numeric columns that actually contain gaps."""
        if self.missing_categ not in ('auto', 'logreg', 'most_frequent'):
            return df
        strategy = 'most_frequent' if self.missing_categ == 'most_frequent' else 'constant'
        numeric_cols = set(df.select_dtypes(include=np.number).columns)
        # Walk df.columns (not a set) so the processing order is deterministic.
        for col in df.columns:
            if col in numeric_cols:
                continue
            missing = df[col].isna()
            if not missing.any() or missing.all():
                continue
            imputer = SimpleImputer(strategy=strategy)
            # fit_transform returns shape (n, 1); flatten before assigning.
            df[col] = imputer.fit_transform(df[[col]]).ravel()
        return df


class Outliers:
    """Replace numeric outliers (1.5 * IQR rule) column by column."""

    def handle(self, df):
        """Replace outliers in ``df`` with the column medians and return ``df``."""
        df = self.replace_outliers(df)
        return df

    def detect_outliers(self, df):
        """Return a boolean Series marking rows with at least one outlier.

        Only numeric columns are inspected.
        """
        numeric = df.select_dtypes(include=np.number).astype('float64')
        Q1 = numeric.quantile(0.25)
        Q3 = numeric.quantile(0.75)
        IQR = Q3 - Q1
        return ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)

    def replace_outliers(self, df, replacement_value=None):
        """Overwrite outlying cells of numeric columns in place.

        Args:
            df: Frame to clean; its numeric columns are reassigned.
            replacement_value: Value written into outlying cells. ``None``
                uses each column's median; a scalar is used for every column;
                a Series or mapping is looked up per column name.

        Returns:
            ``df`` with outliers replaced. Non-numeric columns (object,
            datetime, category, bool) are left untouched.
        """
        numeric_cols = df.select_dtypes(include=np.number).columns
        if replacement_value is None:
            replacement_value = df[numeric_cols].median()
        for col in numeric_cols:
            values = df[col]
            # Nullable extension dtypes yield <NA> in comparisons, which
            # np.where cannot evaluate; fall back to plain floats for them.
            if not isinstance(values.dtype, np.dtype):
                values = values.astype('float64')
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
            if np.isscalar(replacement_value):
                fill = replacement_value
            else:
                fill = replacement_value[col]
            df[col] = np.where(is_outlier, fill, values)
        return df


class Adjust:
    """Split datetime columns into year/month/day and scale numeric columns.

    Args:
        scaler: ``'MinMax'``, ``'Standard'`` or ``'Robust'`` (matched
            case-insensitively). Any other value disables scaling.
        extract_datetime: Column name, or iterable of column names, to parse
            as datetimes. Values that are neither (``False``, ``None``,
            ``True``) select no columns.
    """

    # Lower-cased option name -> prefix of the sklearn ``*Scaler`` class.
    _SCALER_PREFIXES = {'minmax': 'MinMax', 'standard': 'Standard', 'robust': 'Robust'}

    def __init__(self, scaler=None, extract_datetime=False):
        self.scaler = scaler
        self.extract_datetime = extract_datetime

    def handle(self, df):
        """Expand datetime columns first, then scale, and return the frame.

        Because extraction runs first, the generated year/month/day columns
        are numeric and take part in the scaling.
        """
        if not (self.scaler or self.extract_datetime):
            return df
        df = self._convert_datetime(df)
        scaler_cls = self._resolve_scaler()
        if scaler_cls is not None:
            # Scalers cannot digest strings, so restrict them to numeric data.
            num_cols = df.select_dtypes(include=np.number).columns
            if len(num_cols):
                df[num_cols] = scaler_cls().fit_transform(df[num_cols])
        return df

    def _resolve_scaler(self):
        """Return the scaler class selected by ``self.scaler``, or ``None``."""
        if not isinstance(self.scaler, str):
            return None
        prefix = self._SCALER_PREFIXES.get(self.scaler.lower())
        if prefix is None:
            return None
        return getattr(preprocessing, prefix + 'Scaler')

    def _datetime_columns(self, df):
        """Return the requested datetime columns that exist in ``df``."""
        spec = self.extract_datetime
        if isinstance(spec, str):
            # A bare string is one column name, not a bag of characters.
            spec = [spec]
        try:
            wanted = set(spec)
        except TypeError:
            return []  # e.g. False / None / True: nothing requested
        return [col for col in df.columns if col in wanted]

    def _convert_datetime(self, df):
        """Replace each requested column by ``_year``/``_month``/``_day`` columns.

        Unparseable entries are coerced to ``NaT``. Columns whose conversion
        fails outright are left as they are.
        """
        for col in self._datetime_columns(df):
            try:
                parsed = pd.to_datetime(df[col], errors='coerce')
                parts = {
                    '_year': parsed.dt.year,
                    '_month': parsed.dt.month,
                    '_day': parsed.dt.day,
                }
            except (ValueError, TypeError, AttributeError, OverflowError):
                continue
            for suffix, values in parts.items():
                df[f'{col}{suffix}'] = values
            df.drop(columns=[col], inplace=True)
        return df


class EncodeCateg:
    """Encode object-dtype categorical columns.

    Columns with at most 10 distinct values are one-hot encoded with
    ``pd.get_dummies(..., drop_first=True)``; columns with more are label
    encoded.

    Args:
        encode_categ: ``'auto'`` encodes every object-dtype column; a list
            encodes the named columns that exist and have object dtype; a
            falsy value disables encoding.
    """

    def __init__(self, encode_categ=None):
        self.encode_categ = encode_categ

    def handle(self, df):
        """Return ``df`` with the configured columns encoded.

        One-hot encoding builds a new frame, so callers must use the return
        value rather than rely on ``df`` being changed in place.
        """
        if not self.encode_categ:
            return df
        if self.encode_categ == 'auto':
            return self._auto_encode(df)
        if isinstance(self.encode_categ, list):
            for col in self.encode_categ:
                if col in df.columns:
                    df = self._auto_encode(df, col)
        return df

    def _auto_encode(self, df, col=None):
        """Encode ``col``, or every object-dtype column when ``col`` is None."""
        if col is not None:
            # Explicitly named columns are only encoded when they hold objects.
            targets = [col] if df[col].dtype == 'O' else []
        else:
            targets = list(df.select_dtypes(include='object').columns)
        for name in targets:
            if len(df[name].unique()) <= 10:
                df[name] = df[name].astype('category')
                # get_dummies returns a new frame; keep the result.
                df = pd.get_dummies(df, columns=[name], prefix=[name], drop_first=True)
            else:
                df[name] = LabelEncoder().fit_transform(df[name])
        return df

In [None]:
# Reload the raw CSV into a fresh frame and print its dtypes / non-null counts.
data = pd.read_csv("dp_data1.csv")
data.info()

In [None]:
# Build the DataCleaning facade and run it over the freshly loaded frame.
# NOTE(review): the name `clean_data` is also the name of the function defined
# in an earlier cell; after this assignment that name refers to the
# DataCleaning instance instead.
# NOTE(review): confirm that the 'scaler' spelling and the 'extract_datetime'
# value have the form the Adjust step expects.
clean_data = DataCleaning(duplicates=True,missing_num='knn',missing_categ='most_frequent',outliers_method=True,
                                scaler='minMax',extract_datetime='year',encode_categ='auto')


cleaned_data = clean_data.fit_transform(data)
cleaned_data.info()