#### Installations

In [None]:
!pip install torchviz torch pyarrow==11.0.0 category_encoders

#### Imports

In [None]:
from sklearn import linear_model, metrics, model_selection, svm, neighbors, preprocessing, datasets, pipeline, ensemble
from sklearn.feature_selection import SelectKBest, SelectFromModel, GenericUnivariateSelect
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.datasets import load_diabetes, make_classification
from sklearn.datasets import load_breast_cancer, fetch_openml
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.covariance import empirical_covariance
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import sklearn.feature_selection as fs
import sklearn.preprocessing as skpr
from IPython.display import display
import matplotlib.pyplot as plt
import category_encoders as ce
from tqdm.notebook import tqdm
from sklearn import compose
from sklearn.svm import SVC
import lightgbm as lgb
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import warnings
import pickle
import torch
import copy
import math
import time
import sys

%pylab inline

#### Кодирование признаков (author: github.com/dmforit)

In [None]:
class CircularEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, limits=None, fit_replace=True, tol=1e-8):
        self.limits = limits
        self.fit_replace = fit_replace
        self.tol = tol
        self._shape = (0, 0)

    def __repr__(self):
        return "Circular Encoder"

    def __str__(self):
        return "Circular Encoder"

    def fit(self, X, y=None):

        # Shape setting
        self._shape = self.__set_shape(X)

        # Defining limit
        if self.fit_replace:
            self.limits = self.__set_limits(X)

        return self

    def transform(self, X, y=None):
        if self.limits is None:
            return copy.deepcopy(X)

        # column_names only for DataFrame
        column_names = None

        # cast to numpy array
        if isinstance(X, pd.DataFrame):
            X_ndarray = X.to_numpy().reshape(self._shape)
            column_names = np.zeros(2 * X.columns.shape[0], dtype=object)
        else:
            X_ndarray = X.reshape(self._shape)

        # main encoding
        result_sin = np.sin((2 * np.pi * X_ndarray) / self.limits)
        result_cos = np.cos((2 * np.pi * X_ndarray) / self.limits)

        result_sin[np.abs(result_sin) < self.tol] = 0.0
        result_cos[np.abs(result_cos) < self.tol] = 0.0

        # combine encoded arrays
        result = np.zeros((self._shape[0], self._shape[1] * 2))
        result[:, np.arange(0, result.shape[1], 2)] = result_sin
        result[:, np.arange(1, result.shape[1], 2)] = result_cos

        # set column_names names and return result
        if column_names is not None:
            column_names[np.arange(0, result.shape[1], 2)] = np.array([f'sin_{col}' for col in X.columns])
            column_names[np.arange(1, result.shape[1], 2)] = np.array([f'cos_{col}' for col in X.columns])
            return pd.DataFrame(result, columns=column_names).infer_objects()
        else:
            return result

    @staticmethod
    def __set_shape(X):
        if len(X.shape) == 1:
            return (X.shape[0], 1)
        elif len(X.shape) > 2:
            raise ValueError(f"You need 2 dimensions instead of {len(X.shape)}")
        else:
            return X.shape

    @staticmethod
    def __set_limits(X):
        if isinstance(X, pd.DataFrame):
            return np.max(np.abs(X.to_numpy()), axis=0) + 1
        else:
            return np.max(np.abs(X), axis=0) + 1


class DateTimeEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, fit_replace=False,
                 drop=True, min_rescale=False, fast_mode=True,
                 confidence_level=0.99, worst_proportion=0.01):
        self.cols = np.array(cols) if cols is not None else None
        self.fit_replace = fit_replace
        self.drop = drop
        self.min_rescale = min_rescale

        # stochastic approach
        eps = 1e-6
        confidence_level = max(eps, min(confidence_level, 1 - eps))
        worst_proportion = max(eps, min(worst_proportion, 1 - eps))
        self.fast_random_size = math.ceil(math.log(1 - confidence_level) / math.log(1 - worst_proportion))
        self.fast_mode = fast_mode

        # encoders for months, days and hours
        self.encoders = {'months': CircularEncoder(limits=[12], fit_replace=False),
                         'days': CircularEncoder(limits=[30], fit_replace=False),
                         'hours': CircularEncoder(limits=[24], fit_replace=False)}

    def __repr__(self):
        return "DateTimeEncoder"

    def __str__(self):
        return "DateTimeEncoder"

    def fit(self, X, y=None):
        if self.cols is None or self.fit_replace:
            if self.fast_mode and self.fast_random_size < X.shape[0]:
                self.cols = self.define_cols_fast(X)
            else:
                self.cols = self.define_cols(X)

        return self

    def transform(self, X, y=None):
        try:
            if self.cols is None or not self.cols.shape[0]:
                return copy.deepcopy(X)
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return copy.deepcopy(X)

        column_names = None

        # cast to pandas DataFrame
        if isinstance(X, pd.DataFrame):
            column_names = X.columns.to_numpy(dtype=object).copy()
            X_df = X
        else:
            X_df = pd.DataFrame(X)

        # shift for dropping columns
        shift = 0 if self.drop else 1

        # creating an empty result matrix
        result = np.zeros((X_df.shape[0], X_df.shape[1] + (6 + shift) * len(self.cols)), dtype=object)
        column_names_zeros = np.zeros(X_df.shape[1] + (6 + shift) * len(self.cols), dtype=object)

        # creating indices for fast numpy operation
        modified_inds = np.array([self.cols[i] + i * (6 + shift) for i in range(len(self.cols))])
        not_cols = list(set(np.arange(X.shape[1])) - set(self.cols))
        unmodified_inds = np.array([i + (7 + shift) * (not_cols[i] - i) for i in range(len(not_cols))])

        # setting not date columns
        if unmodified_inds.shape[0] > 0:
            result[:, unmodified_inds] = X_df.to_numpy()[:, not_cols]
            if not self.drop:
                result[:, modified_inds] = X_df.to_numpy()[:, self.cols]
            if column_names is not None:
                column_names_zeros[unmodified_inds] = column_names[not_cols]
                if not self.drop:
                    column_names_zeros[modified_inds] = column_names[self.cols]

        # for fast mode
        preventive_delete = []

        # tranform each datetime column
        for num, col in enumerate(self.cols):
            transformed_column = self.column_transform(X_df.iloc[:, col])
            # for fast mode
            if transformed_column is None:
                result[:, modified_inds[num] + 6 + shift] = X_df.iloc[:, col]
                column_names_zeros[modified_inds[num] + 6 + shift] = X_df.iloc[:, col].name
                preventive_delete += list(np.arange(modified_inds[num] + shift, modified_inds[num] + 6 + shift))
                continue
            result[:, np.arange(modified_inds[num] + shift, modified_inds[num] + 7 + shift)] = transformed_column.to_numpy()
            column_names_zeros[np.arange(modified_inds[num] + shift, modified_inds[num] + 7 + shift)] = transformed_column.columns.to_numpy(dtype=object).copy()

        # delete unsuccessful transformations
        if len(preventive_delete) > 0:
            result = np.delete(result, preventive_delete, 1)
            column_names_zeros = np.delete(column_names_zeros, preventive_delete)

        # self.cols backup
        self.cols = saved_cols

        if column_names is not None:
            # saving dtypes
            saved_dtypes = X_df.dtypes.to_dict()
            unmodified_dtypes = dict([(X.columns[col], saved_dtypes[X.columns[col]]) for col in not_cols])

            return pd.DataFrame(result, columns=column_names_zeros).infer_objects().astype(dtype=unmodified_dtypes)
        else:
            return result

    def column_transform(self, column):
        column_name = None

        # cast to pandas Series
        if not isinstance(column, pd.Series):
            pd_column = copy.deepcopy(pd.Series(column))
        else:
            pd_column = copy.deepcopy(column)
            column_name = column.name

        try:
            pd_column = pd.to_datetime(pd_column.astype(str))
        except (ValueError, TypeError):
            return None

        # rescaling by the start time
        if self.min_rescale:
            pd_column = pd_column.astype(int)
            min_seconds = pd_column.min()
            pd_column = pd_column.apply(lambda val: val - min_seconds)
            pd_column = pd.to_datetime(pd_column)

        # defining main columns
        years = pd_column.dt.year.to_numpy()
        months = self.encoders['months'].fit_transform(pd_column.dt.month.to_numpy())
        days = self.encoders['days'].fit_transform(pd_column.dt.day.to_numpy())
        hours = self.encoders['hours'].fit_transform(pd_column.dt.hour.to_numpy())
        result = np.array([years, months.T[0], months.T[1], days.T[0],
                           days.T[1], hours.T[0], hours.T[1]]).T

        # cast to the original type
        if column_name is not None:
            column_names = np.array([f'{column_name}_year', f'{column_name}_month_sin',
                                     f'{column_name}_month_cos', f'{column_name}_day_sin',
                                     f'{column_name}_day_cos', f'{column_name}_hour_sin',
                                     f'{column_name}_hour_cos'])
            result = pd.DataFrame(result, columns=column_names).infer_objects()

        return result

    def cols_to_numeric(self, X):
        if self.cols is None or not self.cols.shape[0]:
            return np.array([])

        inds = self.cols.copy()

        try:
            inds = inds.astype(int)
            return inds
        except (ValueError, TypeError):
            if not isinstance(X, pd.DataFrame):
                return np.array([])
            else:
                columns_array = X.columns.to_numpy(dtype=object).copy()
                names_dict = dict((columns_array[i], i) for i in range(columns_array.shape[0]))
                inds = np.array([names_dict[name] for name in inds if name in names_dict])
                return inds

    def define_cols(self, X):
        # cast to pandas DataFrame
        if isinstance(X, pd.DataFrame):
            X_df = X
        else:
            X_df = pd.DataFrame(X)

        # check all the columns
        datetime_cols = []
        for num, col in enumerate(X_df):
            if not pd.api.types.is_categorical_dtype(X_df[col]) and not pd.api.types.is_numeric_dtype(X_df[col]):
                if self.is_datetime(X_df[col].iloc[np.random.randint(0, X_df.shape[0])]):
                    datetime_cols.append(num)
        return np.array(datetime_cols)

    def define_cols_fast(self, X):
        # cast to pandas DataFrame
        if isinstance(X, pd.DataFrame):
            X_df = X
        else:
            X_df = pd.DataFrame(X)

        # check all the columns
        datetime_cols = []
        for num, col in enumerate(X_df):
            if not pd.api.types.is_categorical_dtype(X_df[col]) and not pd.api.types.is_numeric_dtype(X_df[col]):
                append_need = True
                for i in range(self.fast_random_size):
                    if not self.is_datetime(X_df[col].iloc[np.random.randint(0, X_df.shape[0])]):
                        append_need = False
                        break
                if append_need:
                    datetime_cols.append(num)
        return np.array(datetime_cols)

    def get_cols(self):
        if self.cols is not None:
            return self.cols.copy()
        else:
            return np.array([])

    def is_datetime(self, col):
        arg = None
        if not isinstance(col, pd.Series):
            arg = col
        else:
            arg = col.astype(str)

        try:
            pd.to_datetime(arg, errors='raise')
            return True
        except (ValueError, TypeError):
            return False


class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, fit_replace=False, encoder='binary',
                 category_rate=0.1, rated_search=True, fast_mode=True,
                 confidence_level=0.99, worst_proportion=0.01, **encoder_params):

        self._hashing_enc_name = 'hashing'
        self._encoders_list = {'onehot': ce.OneHotEncoder,
                               'target_loo': ce.LeaveOneOutEncoder,
                               self._hashing_enc_name: ce.HashingEncoder,
                               'binary': ce.BinaryEncoder}
        self._target_encoders_list = np.array(['target_loo'])

        # stochastic approach
        eps = 1e-6
        confidence_level = max(eps, min(confidence_level, 1 - eps))
        worst_proportion = max(eps, min(worst_proportion, 1 - eps))
        self.fast_random_size = math.ceil(math.log(1 - confidence_level) / math.log(1 - worst_proportion))
        self.fast_mode = fast_mode

        self.cols = np.array(cols) if cols is not None else None
        self.fit_replace = fit_replace
        self.encoder_name = encoder
        self.category_rate = category_rate
        self.rated_search = rated_search
        self.encoder_params = encoder_params
        self._encoder = None

    def __repr__(self):
        return "CategoricalEncoder"

    def __str__(self):
        return "CategoricalEncoder"

    def fit(self, X, y=None, **fit_params):
        # defining categorical columns
        if self.cols is None or self.fit_replace:
            self.cols = self.define_cols(X)

        try:
            if not self.cols.shape[0]:
                return self
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return self

        # defining the encoder
        self.encoder_params['cols'] = X.columns[self.cols] if isinstance(X, pd.DataFrame) else self.cols
        self._encoder = self._encoders_list[self.encoder_name](**self.encoder_params)

        # transforming y if it's not numeric
        y_copy = None

        if y is not None:
            y_copy = copy.deepcopy(y)
            if not self.is_y_approved(y_copy):
                y_copy = skpr.LabelEncoder().fit_transform(y_copy)

        # fitting the encoder
        self._encoder.fit(pd.DataFrame(X), y_copy, **fit_params)

        # self.cols backup
        self.cols = saved_cols

        return self

    def transform(self, X, y=None):
        try:
            if self.cols is None or not self.cols.shape[0] or self._encoder is None:
                return copy.deepcopy(X)
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return copy.deepcopy(X)

        # transforming y if it's not numeric
        y_copy = None

        if y is not None:
            y_copy = copy.deepcopy(y)
            if not self.is_y_approved(y_copy):
                y_copy = skpr.LabelEncoder().fit_transform(y_copy)

        # checking whether it's a target encoder or not
        if self.encoder_name in self._target_encoders_list:
            result = self._encoder.transform(pd.DataFrame(X), y_copy, override_return_df=True)
        else:
            result = self._encoder.transform(pd.DataFrame(X), override_return_df=True)

        # adjusting the column names
        cols_before = self._encoder.get_feature_names_in()
        cols_after = self._encoder.get_feature_names_out()
        new_columns = self._rename_transformed_cols(cols_before, cols_after)

        result = result.rename(columns={cols_after[i]: new_columns[i] for i in range(cols_after.shape[0])})

        if isinstance(X, pd.DataFrame):
            # saving X dtypes
            not_cols = list(set(np.arange(X.shape[1])) - set(self.cols))
            saved_dtypes = X.dtypes.to_dict()
            unmodified_dtypes = dict([(X.columns[col], saved_dtypes[X.columns[col]]) for col in not_cols])

            # self.cols backup
            self.cols = saved_cols

            return result.astype(dtype=unmodified_dtypes)
        else:
            # self.cols backup
            self.cols = saved_cols

            return result.to_numpy()

    def fit_transform(self, X, y=None, **fit_params):
        # transforming y if it's not numeric
        y_copy = None

        if y is not None:
            y_copy = copy.deepcopy(y)
            if not self.is_y_approved(y_copy):
                y_copy = skpr.LabelEncoder().fit_transform(y_copy)

        self.fit(X, y_copy, **fit_params)
        return self.transform(X, y_copy)

    def _rename_transformed_cols(self, before, after):
        if after is None:
            return np.array([])
        if self._encoder is None:
            return copy.deepcopy(after)

        result = copy.deepcopy(after)

        # hashing encoder unique renaming
        if self.encoder_name == self._hashing_enc_name:
            for i in range(self._encoder.n_components):
                result[i] = f'{self._hashing_enc_name}_{i}'
        else:
            # getting the columns transformed
            set_before = set(before)
            set_after = set(after)
            sample_names = set_before.intersection(set_after)

            # checking whether set is not empty
            if bool(sample_names):
                # renaming
                for num, col in enumerate(result):
                    if col not in sample_names:
                        result[num] = f'{self.encoder_name}_{result[num]}'

        return result

    def is_y_approved(self, y):
        try:
            y.astype(float)
            return True
        except (ValueError, TypeError):
            return False

    def cols_to_numeric(self, X):
        if self.cols is None or not self.cols.shape[0]:
            return np.array([])

        inds = self.cols.copy()

        try:
            inds = inds.astype(int)
            return inds
        except (ValueError, TypeError):
            if not isinstance(X, pd.DataFrame):
                return np.array([])
            else:
                columns_array = X.columns.to_numpy(dtype=object).copy()
                names_dict = dict((columns_array[i], i) for i in range(columns_array.shape[0]))
                inds = np.array([names_dict[name] for name in inds if name in names_dict])
                return inds

    def define_cols(self, X):
        # cast to pandas DataFrame with inferring object types
        if not isinstance(X, pd.DataFrame):
            X_df = pd.DataFrame(X).infer_objects()
        else:
            X_df = X.infer_objects()

        # check all the columns
        category_cols = []
        for num, col in enumerate(X_df):
            # if dtype is 'category'
            if pd.api.types.is_categorical_dtype(X_df[col]):
                category_cols.append(num)
            # checking object type (strings)
            elif pd.api.types.is_object_dtype(X_df[col]):
                # stochastic approach
                if self.fast_mode:
                    append_need = True
                    for i in range(self.fast_random_size):
                        if not self.is_one_word(X_df[col].iloc[np.random.randint(0, X_df.shape[0])]):
                            append_need = False
                            break
                    if append_need:
                        category_cols.append(num)
                else:
                    # basic approach
                    if np.all(np.vectorize(self.is_one_word)(X_df[col])):
                        category_cols.append(num)
            # checking numeric columns
            elif self.rated_search and pd.api.types.is_numeric_dtype(X_df[col]) and not pd.api.types.is_float_dtype(X_df[col]):
                if X_df[col].nunique() < self.category_rate * X_df.shape[0]:
                    category_cols.append(num)
        return np.array(category_cols)

    def is_one_word(self, s):
        if s is None or not isinstance(s, str):
            return False

        stripped_string = s.strip()
        if not stripped_string or ' ' in stripped_string:
            return False
        else:
            return True

    def get_cols(self):
        if self.cols is not None:
            return self.cols.copy()
        else:
            return np.array([])

    def get_encoder(self):
        return self.encoder_name

    def get_available_encoders(self):
        return np.array(list(self._encoders_list.keys()))


class NumericalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, fit_replace=False, encoder='standard', numeric_rate=0.1,
                 rated_search=True, only_float=True, **encoder_params):

        self._encoders_list = {'standard': skpr.StandardScaler,
                          'min_max': skpr.MinMaxScaler,
                          'normalizer': skpr.Normalizer,
                          'max_abs': skpr.MaxAbsScaler}

        self.only_float = only_float
        self.cols = np.array(cols) if cols is not None else None
        self.fit_replace = fit_replace
        self.encoder_name = encoder
        self.numeric_rate = numeric_rate
        self.rated_search = rated_search
        self.encoder_params = encoder_params
        self._encoder = None

    def __repr__(self):
        return "NumericalEncoder"

    def __str__(self):
        return "NumericalEncoder"

    def fit(self, X, y=None, **fit_params):
        # defining numerical columns
        if self.cols is None or self.fit_replace:
            self.cols = self.define_cols(X)

        try:
            if not self.cols.shape[0]:
                return self
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return self

        # defining the encoder
        self._encoder = self._encoders_list[self.encoder_name](**self.encoder_params)

        # fitting the encoder
        X_ndarray = X.to_numpy()[:, self.cols] if isinstance(X, pd.DataFrame) else X[:, self.cols]
        self._encoder.fit(X_ndarray, y, **fit_params)

        # self.cols backup
        self.cols = saved_cols

        return self

    def transform(self, X, y=None):
        try:
            if self.cols is None or not self.cols.shape[0] or self._encoder is None:
                return copy.deepcopy(X)
        except IndexError:
            raise IndexError("cols should be a numpy array")

        # save self.cols and get indices for X
        saved_cols = self.cols.copy()
        self.cols = self.cols_to_numeric(X)

        if not self.cols.shape[0]:
            self.cols = saved_cols
            return copy.deepcopy(X)

        column_names = None

        # cast to numpy array
        if isinstance(X, pd.DataFrame):
            X_ndarray = X.iloc[:, self.cols].to_numpy().astype(float)
            column_names = X.columns.to_numpy(dtype=object).copy()
        else:
            X_ndarray = X[:, self.cols].astype(float)

        # getting the resulting column names
        if column_names is not None:
            column_names[self.cols] = np.array([f'{self.encoder_name}_{name}' for name in column_names[self.cols]])

        # only numerical columns here
        result = self._encoder.transform(X_ndarray)

        if isinstance(X, pd.DataFrame):
            # saving X dtypes
            not_cols = list(set(np.arange(X.shape[1])) - set(self.cols))
            saved_dtypes = X.dtypes.to_dict()
            unmodified_dtypes = dict([(X.columns[col], saved_dtypes[X.columns[col]]) for col in not_cols])

            X_ndarray = copy.deepcopy(X.to_numpy(dtype=object))
            X_ndarray[:, self.cols] = result
            result = pd.DataFrame(X_ndarray, columns=column_names).astype(dtype=unmodified_dtypes).infer_objects()
        else:
            X_ndarray = copy.deepcopy(X)
            X_ndarray[:, self.cols] = result
            result = X_ndarray

        # self.cols backup
        self.cols = saved_cols

        return result

    def define_cols(self, X):
        # cast to pandas DataFrame with inferring object types
        if not isinstance(X, pd.DataFrame):
            X_df = pd.DataFrame(X).infer_objects()
        else:
            X_df = X.infer_objects()

        # check all the columns
        numeric_cols = []
        for num, col in enumerate(X_df):
            # if dtype is 'numeric'
            if pd.api.types.is_numeric_dtype(X_df[col]):
                # appending if float
                if pd.api.types.is_float_dtype(X_df[col]):
                    numeric_cols.append(num)
                elif not self.only_float:
                    # checking rated_search
                    if self.rated_search:
                        if X_df[col].nunique() >= self.numeric_rate * X_df.shape[0]:
                            numeric_cols.append(num)
                    else:
                        numeric_cols.append(num)
        return np.array(numeric_cols)

    def cols_to_numeric(self, X):
        if self.cols is None or not self.cols.shape[0]:
            return np.array([])

        inds = self.cols.copy()

        try:
            inds = inds.astype(int)
            return inds
        except (ValueError, TypeError):
            if not isinstance(X, pd.DataFrame):
                return np.array([])
            else:
                columns_array = X.columns.to_numpy(dtype=object).copy()
                names_dict = dict((columns_array[i], i) for i in range(columns_array.shape[0]))
                inds = np.array([names_dict[name] for name in inds if name in names_dict])
                return inds

    def get_cols(self):
        if self.cols is not None:
            return self.cols.copy()
        else:
            return np.array([])

    def get_encoder(self):
        return self.encoder_name

    def get_available_encoders(self):
        return np.array(list(self._encoders_list.keys()))


class CustomEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_args=None,
                 numerical_args=None, datetime_args=None, categorical_enabled=True,
                 numerical_enabled=True, datetime_enabled=True, target_encoder='label',
                 target_encoding=False):
        self.target_encoder = target_encoder
        self.target_encoding = target_encoding
        self._target_encoders_list = {'label': skpr.LabelEncoder}

        self.categorical_args = categorical_args if categorical_args is not None else dict()
        self.numerical_args = numerical_args if numerical_args is not None else dict()
        self.datetime_args = datetime_args if datetime_args is not None else dict()
        self.categorical_enc = CategoricalEncoder(**self.categorical_args)
        self.numerical_enc = NumericalEncoder(**self.numerical_args)
        self.datetime_enc = DateTimeEncoder(**self.datetime_args)

        self.categorical_enabled = categorical_enabled
        self.numerical_enabled = numerical_enabled
        self.datetime_enabled = datetime_enabled

    def fit(self, X, y=None, **fit_params):
        if self.categorical_enabled:
            self.categorical_enc.fit(X, y, **fit_params)

        if self.numerical_enabled:
            self.numerical_enc.fit(X, y, **fit_params)

        if self.datetime_enabled:
            self.datetime_enc.fit(X, y, **fit_params)

        cat_cols = self.cols_to_numeric(X, self.categorical_enc.get_cols())
        num_cols = self.cols_to_numeric(X, self.numerical_enc.get_cols())
        dt_cols = self.cols_to_numeric(X, self.datetime_enc.get_cols())

        if isinstance(X, pd.DataFrame):
            X_columns = X.columns.to_numpy(dtype=object).copy()
            self.categorical_enc.cols = X_columns[cat_cols] if len(cat_cols) > 0 else np.array([])
            self.numerical_enc.cols = X_columns[num_cols] if len(num_cols) > 0 else np.array([])
            self.datetime_enc.cols = X_columns[dt_cols] if len(dt_cols) > 0 else np.array([])

        return self

    def transform(self, X, y=None):
        # target encoding
        y_copy = None
        if y is not None:
            X_result, y_copy = self.fill_omissions(X, y)
            if self.target_encoding:
                if self.target_encoder in self._target_encoders_list:
                    y_copy = self._target_encoders_list[self.target_encoder].fit_transform(y_copy)
                else:
                    y_copy = skpr.LabelEncoder().fit_transform(y_copy)
        else:
            X_result = self.fill_omissions(X)

        # main encoding
        if not isinstance(X_result, pd.DataFrame):
            X_result = pd.DataFrame(X_result)

        if self.numerical_enabled:
            X_result = self.numerical_enc.transform(X_result, y_copy)

        if self.categorical_enabled:
            X_result = self.categorical_enc.transform(X_result, y_copy)

        if self.datetime_enabled:
            X_result = self.datetime_enc.transform(X_result, y_copy)

        if not isinstance(X, pd.DataFrame):
            X_result = X_result.to_numpy()

        # getting the result
        if self.target_encoding:
            return X_result, y_copy
        else:
            return X_result

    def fill_omissions(self, X, y=None):
        if y is not None:
            return copy.deepcopy(X), copy.deepcopy(y)
        else:
            return copy.deepcopy(X)

    def print_cols(self):
        print('Categorical columns: ', sep=', ', end='')
        print(*self.categorical_enc.get_cols(), sep=', ')
        print('Numerical columns: ', sep=', ', end='')
        print(*self.numerical_enc.get_cols(), sep=', ')
        print('Datetime columns: ', sep=', ', end='')
        print(*self.datetime_enc.get_cols(), sep=', ')

    def cols_to_numeric(self, X, cols):
        if cols is None or not cols.shape[0]:
            return np.array([])

        inds = cols.copy()

        try:
            inds = inds.astype(int)
            return inds
        except (ValueError, TypeError):
            if not isinstance(X, pd.DataFrame):
                return np.array([])
            else:
                columns_array = X.columns.to_numpy(dtype=object).copy()
                names_dict = dict((columns_array[i], i) for i in range(columns_array.shape[0]))
                inds = np.array([names_dict[name] for name in inds if name in names_dict])
                return inds

    def get_cols(self):
        return {'categorical': self.categorical_enc.get_cols(),
                'numerical': self.numerical_enc.get_cols(),
                'datetime': self.datetime_enc.get_cols()}

#### Отбор признаков (author: github.com/EnriFermi)

##### Basic encoder

In [3]:
class SelectNullDistributionBaseline(BaseEstimator, TransformerMixin):
  def __get_feature_importances(s, X, y, shuffle, seed=None):
    f_num = X.shape[1]
    if shuffle:
        np.random.shuffle(y)

    #TODO hardcode
    dtrain = lgb.Dataset(X, y,params={'verbose': -1}, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'seed': seed,
        'bagging_freq': 1,
        'n_jobs': 4,
        'verbose': -1
    }
    categorical_feats = []
    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200, categorical_feature=categorical_feats)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = np.arange(f_num)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(X))

    return imp_df

  def __get_null_feature_importances(s, X, y):
    null_imp_df = pd.DataFrame()
    dsp = ''
    for i in range(s.n_runs):
        X_train, y_train = resample(X, y)
        imp_df = s.__get_feature_importances(X_train, y_train, shuffle=True)
        imp_df['run'] = i + 1
        null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    return null_imp_df
  
  def __create_mask(s):
    n_feats = len(s.feat_scores)
    n_feat_selected = n_feats if s.max_features is None else min(s.max_features, n_feats)
    if s.threshold == 'median':
      s.feat_mask = s.feat_scores >= np.median(s.feat_scores)
    elif s.threshold == 'mean':
      s.feat_mask = s.feat_scores >= np.mean(s.feat_scores)
    else:
      s.feat_mask = s.feat_scores >= s.threshold
    
    s.feats_ind = np.arange(n_feats)[s.feat_mask][:n_feat_selected]

  def __init__(self, typ = 'class', threshold=None, max_features=None, n_runs=80, n_bins=20, n_gauss_components=5, score_func=(lambda x: 0.5 * x) ):
    self.typ = typ
    self.threshold = threshold if threshold is not None else 'median'
    self.max_features = max_features
    self.n_runs = n_runs
    self.n_bins = n_bins
    self.n_gauss_components = n_gauss_components
    self.score_func = score_func

  def fit(self, X, y=None):
    actual_imp_df = self.__get_feature_importances(X, y, shuffle=False)
    null_imp_df = self.__get_null_feature_importances(X, y)

    self.feat_scores = []
    for feat in range(X.shape[1]):
      actual_feat_imp = actual_imp_df.loc[actual_imp_df['feature'] == feat]
      null_feat_imp = null_imp_df.loc[null_imp_df['feature'] == feat]
      actual_imp = actual_feat_imp['importance_gain'].iloc[0]
      null_imp = null_feat_imp['importance_gain']
      #TODO choose bin strategy
      # act_len = np.max(actual_imp) - np.min(actual_imp)
      # null_len  = np.max(null_imp) - np.min(null_imp)
      # total_len = max(np.max(null_imp), np.max(actual_imp)) - min(np.min(null_imp), np.min(actual_imp))
      # bin_count = max(total_len // null_len * n_bins, total_len // act_len * n_bins)
      _sf_p = np.percentile(null_imp, 75)
      print(_sf_p, actual_imp)
      # print(actual_imp)
      # print(null_imp)
      # print(substr_imp)
      # plt.hist(substr_imp, bins=20, label='diff', density=True)
      # plt.hist(null_imp, bins=20, label='null', density=True)
      # plt.hist(actual_imp, bins=20, label='actual', density=True)
      # plt.legend(loc='upper right')
      # plt.show()

      feat_score = np.log(actual_imp/_sf_p)
      print('f', feat_score, 'g', type(feat_score))
      self.feat_scores.append(feat_score)
    # print(self.feat_scores)
    # print(np.array(self.feat_scores).argsort().argsort())
    self.__create_mask()
    
  def transform(self, X):
    return X[:, self.feats_ind]
  def get_support(self, indices=False):
    return self.feats_ind if indices else self.feat_mask

##### Modified encoder

In [4]:
class SelectNullDistribution(BaseEstimator, TransformerMixin):
  def __get_feature_importances(s, X, y, shuffle, seed=None):
    f_num = X.shape[1]
    if shuffle:
        np.random.shuffle(y)

    #TODO hardcode
    dtrain = lgb.Dataset(X, y,params={'verbose': -1}, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'seed': seed,
        'bagging_freq': 1,
        'n_jobs': 4,
        'verbose': -1
    }
    categorical_feats = []
    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200, categorical_feature=categorical_feats)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = np.arange(f_num)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(X))

    return imp_df
  
  def __get_real_feature_importances(s, X, y):
    real_imp_df = pd.DataFrame()
    dsp = ''
    for run in range(s.n_runs):
      # skf = StratifiedKFold(n_splits=n_splits, random_state=None, shuffle=True)
      # for i, (train_index, _) in enumerate(skf.split(X, y)):
      #     imp_df = s.__get_feature_importances(X[train_index, :], y[train_index], shuffle=False)
      #     imp_df['run'] = run * n_splits + i + 1
      #     real_imp_df = pd.concat([real_imp_df, imp_df], axis=0)
      X_train, y_train = resample(X, y)
      imp_df = s.__get_feature_importances(X_train, y_train, shuffle=False)
      imp_df['run'] = run + 1
      real_imp_df = pd.concat([real_imp_df, imp_df], axis=0)
    return real_imp_df

  def __get_null_feature_importances(s, X, y):
    null_imp_df = pd.DataFrame()
    dsp = ''
    for i in range(s.n_runs):
        X_train, y_train = resample(X, y)
        imp_df = s.__get_feature_importances(X_train, y_train, shuffle=True)
        imp_df['run'] = i + 1
        null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    return null_imp_df
  
  def __create_mask(s):
    n_feats = len(s.feat_scores)
    n_feat_selected = n_feats if s.max_features is None else min(s.max_features, n_feats)
    if s.threshold == 'median':
      s.feat_mask = s.feat_scores >= np.median(s.feat_scores)
    elif s.threshold == 'mean':
      s.feat_mask = s.feat_scores >= np.mean(s.feat_scores)
    else:
      s.feat_mask = s.feat_scores >= s.threshold
    
    s.feats_ind = np.arange(n_feats)[s.feat_mask][:n_feat_selected]

  def __init__(self, typ = 'class', threshold=None, max_features=None, n_runs=300, n_bins=20, n_gauss_components=5, score_func=(lambda x: 0.5 * x) ):
    self.typ = typ
    self.threshold = threshold if threshold is not None else 'median'
    self.max_features = max_features
    self.n_runs = n_runs
    self.n_bins = n_bins
    self.n_gauss_components = n_gauss_components
    self.score_func = score_func

  def fit(self, X, y=None):
    actual_imp_df = self.__get_real_feature_importances(X, y)
    null_imp_df = self.__get_null_feature_importances(X, y)

    self.feat_scores = []
    for feat in range(X.shape[1]):
      actual_feat_imp = actual_imp_df.loc[actual_imp_df['feature'] == feat]
      null_feat_imp = null_imp_df.loc[null_imp_df['feature'] == feat]
      actual_imp = actual_feat_imp['importance_gain']
      null_imp = null_feat_imp['importance_gain']
      #TODO choose bin strategy
      # act_len = np.max(actual_imp) - np.min(actual_imp)
      # null_len  = np.max(null_imp) - np.min(null_imp)
      # total_len = max(np.max(null_imp), np.max(actual_imp)) - min(np.min(null_imp), np.min(actual_imp))
      # bin_count = max(total_len // null_len * n_bins, total_len // act_len * n_bins)
      substr_imp = np.sum(np.array(np.meshgrid(actual_imp, -null_imp)).T.reshape(-1,2), axis=1)
      # print(actual_imp)
      # print(null_imp)
      # print(substr_imp)
      # plt.hist(substr_imp, bins=20, label='diff', density=True)
      # plt.hist(null_imp, bins=20, label='null', density=True)
      # plt.hist(actual_imp, bins=20, label='actual', density=True)
      # plt.legend(loc='upper right')
      # plt.show()
      gm_model = GaussianMixture(n_components=self.n_gauss_components)
      gm_model.fit(substr_imp.reshape(-1, 1))
      means_ = gm_model.means_
      score_func_vf = np.vectorize(self.score_func)
      feat_score = np.sum(score_func_vf(means_))
      self.feat_scores.append(feat_score)
    # print(self.feat_scores)
    # print(np.array(self.feat_scores).argsort().argsort())
    self.__create_mask()
    
  def transform(self, X):
    return X[:, self.feats_ind]
  def get_support(self, indices=False):
    return self.feats_ind if indices else self.feat_mask

##### BaseSelection encoder

In [5]:
class FeatureSelectionTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, typ='skb_mutual_info_classif', k=50, task_type = 'class', n_estimators = 500):
      if task_type == 'class':
        if typ[0:3] == 'skb':
            func = getattr(fs, typ[4:])
            self.predictor = SelectKBest(score_func=mutual_info_classif, k=k)
        elif typ == 'rtree':
            self.predictor = SelectFromModel(ensemble.RandomForestClassifier(n_estimators = n_estimators), max_features=k)
        elif typ == 'lsvm':
            self.predictor = SelectFromModel(svm.LinearSVC(C=0.1, penalty="l1", dual=False), max_features=k)
        elif typ == 'null_importance':
            self.predictor = SelectNullDistributionBaseline()
        elif typ == 'null_importance_mod':
            self.predictor = SelectNullDistribution()
      elif task_type == 'reg':
        if typ[0:2] == 'skb':
            func = getattr(fs, typ[3:])
            self.predictor = SelectKBest(score_func=mutual_info_classif, k=k)

    def fit(self, X, y=None):
        return self.predictor.fit(X, y)

    def transform(self, X):
        return self.predictor.transform(X)

    def get_support(self, indices):
        return self.predictor.get_support(indices=indices)


#### Генерация признаков (author: github.com/EgorSWEB)

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.covariance import empirical_covariance
from sklearn. preprocessing import normalize
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import pandas as pd
from itertools import combinations


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import time

import numpy as np

EPS = 1e-20
EPS_LOG = 1e-3
THRESHOLD = 0.2

class FeatureGenerationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, thr=THRESHOLD, features_mask=None, important_features=None, stand_gen=True, corr_gen=True):
        self.thr = thr
        self._features_mask = features_mask
        self._important_features = important_features
        self._stand_gen = stand_gen
        self._corr_gen = corr_gen
        self.desc_dict = {}

    #Construction of a matrix of correlation coefficients of features
    def _correlation_create(self, X):
        return np.corrcoef(X.T)

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X = np.array(X)

        self._cnt_columns = X.shape[1]

        if self._features_mask is None:
            self._features_mask = np.arange(X.shape[1])
        else:
            self._features_mask = np.array(self._features_mask)

        if self._important_features is None:
            self._important_features = self._features_mask
        else:
            self._important_features = np.array(self._important_features)

        if self._important_features.size < 2:
            self._corr_gen = False

        if self._corr_gen:
            self._corr_mat = self._correlation_create(X[:, self._important_features])

        if self._stand_gen:
            self._cnt_columns = self._standard_generation(X, self._cnt_columns)
        if self._corr_gen:
            self._cnt_columns = self._correlation_generation(X, self._cnt_columns)

        return self

    #Generating features using standard functions
    def _standard_generation(self, X, cnt_columns=None):
        features_mask = self._features_mask
        important_features = self._important_features

        if cnt_columns is None:
            cnt_columns = X.shape[1]

        if features_mask.size:
            #exponent
            self.desc_dict['s_exp'] = [features_mask, np.arange(cnt_columns, cnt_columns + features_mask.shape[0])]
            cnt_columns += features_mask.shape[0]

            #x^2
            self.desc_dict['s_^2'] = [features_mask, np.arange(cnt_columns, cnt_columns + features_mask.shape[0])]
            cnt_columns += features_mask.shape[0]

            #x^3
            self.desc_dict['s_^3'] = [features_mask, np.arange(cnt_columns, cnt_columns + features_mask.shape[0])]
            cnt_columns += features_mask.shape[0]


        if important_features.size:
            #logarithm
            self.desc_dict['s_log'] = [important_features, np.arange(cnt_columns, cnt_columns + important_features.shape[0])]
            cnt_columns += important_features.shape[0]

            #x^0.5
            self.desc_dict['s_^0.5'] = [important_features, np.arange(cnt_columns, cnt_columns + important_features.shape[0])]
            cnt_columns += important_features.shape[0]

        return cnt_columns

    #Generating features from two that have a correlation coefficient less than the threshold
    def _correlation_generation(self, X, cnt_columns=None):
        if cnt_columns is None:
            cnt_columns = X.shape[1]

        important_features = self._important_features

        pairs_indxs_mat = np.array(list(combinations(range(self._corr_mat.shape[0]), 2)))

        #x1 * x2
        self.desc_dict['p_*'] = [important_features[pairs_indxs_mat], np.arange(cnt_columns, cnt_columns + pairs_indxs_mat.shape[0])]
        cnt_columns += pairs_indxs_mat.shape[0]

        #x1 / x2, x2 / x1
        self.desc_dict['p_/'] = [np.hstack([important_features[pairs_indxs_mat], important_features[pairs_indxs_mat][:,::-1]]).reshape(-1, 2),
                               np.arange(cnt_columns, cnt_columns + 2 * pairs_indxs_mat.shape[0])]
        cnt_columns += 2 * pairs_indxs_mat.shape[0]

        pairs_indxs_mat = np.array([[[i, j] for j in range(self._corr_mat.shape[1])] for i in range(self._corr_mat.shape[0])])
        pairs_indxs_mat = pairs_indxs_mat[abs(self._corr_mat) <= self.thr]
        pairs_indxs_mat = pairs_indxs_mat[pairs_indxs_mat[:, 0] > pairs_indxs_mat[:, 1]]

        if pairs_indxs_mat.size:
            #x1 + x2
            self.desc_dict['p_+'] = [important_features[pairs_indxs_mat], np.arange(cnt_columns, cnt_columns + pairs_indxs_mat.shape[0])]
            cnt_columns += pairs_indxs_mat.shape[0]

            #x1 - x2, x2 - x1
            self.desc_dict['p_-'] = [np.hstack([important_features[pairs_indxs_mat], important_features[pairs_indxs_mat][:,::-1]]).reshape(-1, 2),
                                   np.arange(cnt_columns, cnt_columns + 2 * pairs_indxs_mat.shape[0])]
            cnt_columns += 2 * pairs_indxs_mat.shape[0]

        return cnt_columns

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = np.array(X)

        cnt_columns = 0
        # print(self.desc_dict)
        for k, v in self.desc_dict.items():
            if k != '_':
              cnt_columns += self.desc_dict[k][1].shape[0]

        X = np.hstack([X, np.zeros((X.shape[0], cnt_columns))])

        #exponent
        if 's_exp' in self.desc_dict.keys():
            X[:, self.desc_dict['s_exp'][1]] =  np.exp(np.clip(X[:,self.desc_dict['s_exp'][0]], -750, 700))

        #x^2
        if 's_^2' in self.desc_dict.keys():
            X[:, self.desc_dict['s_^2'][1]] =  np.power(X[:,self.desc_dict['s_^2'][0]], 2)

        #x^3
        if 's_^3' in self.desc_dict.keys():
            X[:, self.desc_dict['s_^3'][1]] =  np.power(X[:,self.desc_dict['s_^3'][0]], 3)

        #logarithm
        if 's_log' in self.desc_dict.keys():
            X[:, self.desc_dict['s_log'][1]] =  np.where(X[:, self.desc_dict['s_log'][0]] <= -1, np.log(EPS_LOG),
                                                         np.log(X[:,self.desc_dict['s_log'][0]] + 1, where=X[:,self.desc_dict['s_log'][0]] > -1))

        #x^0.5
        if 's_^0.5' in self.desc_dict.keys():
            X[:, self.desc_dict['s_^0.5'][1]] = np.where(X[:,self.desc_dict['s_^0.5'][0]] < 0, -np.power(-X[:,self.desc_dict['s_^0.5'][0]], 0.5,
                                                          where=X[:,self.desc_dict['s_^0.5'][0]] < 0), np.power(X[:,self.desc_dict['s_^0.5'][0]], 0.5,
                                                          where=X[:,self.desc_dict['s_^0.5'][0]] >= 0))
        #x1 * x2
        if 'p_*' in self.desc_dict.keys():
            X[:, self.desc_dict['p_*'][1]] = np.prod(X.T[self.desc_dict['p_*'][0]], axis=1).T

        #x1 / x2, x2 / x1
        if 'p_/' in self.desc_dict.keys():
            X[:, self.desc_dict['p_/'][1]] = X.T[self.desc_dict['p_/'][0][:, 0]].T / (X.T[self.desc_dict['p_/'][0][:, 1]].T + EPS)

        #x1 + x2
        if 'p_+' in self.desc_dict.keys():
            X[:, self.desc_dict['p_+'][1]] = np.sum(X.T[self.desc_dict['p_+'][0]], axis=1).T

        #x1 - x2, x2 - x1
        if 'p_-' in self.desc_dict.keys():
            X[:, self.desc_dict['p_-'][1]] = X.T[self.desc_dict['p_-'][0][:, 0]].T - X.T[self.desc_dict['p_-'][0][:, 1]].T

        INF = 1e30
        X = np.clip(X, -INF, INF)
        return X


# if __name__ == '__main__':
#     X, y = make_classification(
#     n_samples=100000, n_features=100, n_informative=80, n_redundant=2,
#     random_state=42)

#     X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42)
#     scaler = StandardScaler()

#     result = scaler.fit_transform(X_train)
#     feat_gen = FeatureGenerationTransformer(thr=0.01,  important_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 92])
#     start_time = time.time()
#     result_2 = feat_gen.fit_transform(result)
#     end_time = time.time()
#     elapsed_time = end_time - start_time
#     print(f'Время генерации: {elapsed_time:.5f}')

#### Алгоритм отбора признаков с генерацией (author: github.com/EgorSWEB, github.com/EnriFermi)

In [7]:
def merge_desc_dict(dict1, dict2, bias=0):
  rdict = {}
  pointer = bias
  for op in dict1.keys():
    val = np.unique(np.concatenate((dict1[op][0],dict2[op][0]), axis=0), axis=0)
    rdict[op] = [val, np.arange(pointer, pointer+val.shape[0])]
    if op != "_":
      pointer += val.shape[0]
  return rdict

In [21]:
class FeatProcessingMod(BaseEstimator, TransformerMixin):
    def __init__(self, task_type, n_fold_splits, gen_kwargs={}, sel_for_gen_kwargs={}, sel_from_gen_kwargs={}, for_k_limit=50):
      self.n_fold_splits = n_fold_splits
      self.gen_kwargs = gen_kwargs
      self.sel_for_gen_kwargs = sel_for_gen_kwargs
      self.sel_from_gen_kwargs = sel_from_gen_kwargs
      self.for_k_limit = for_k_limit

      self.desc_dict = {}
    def fit(self, X, y=None):
      print(X.shape)


      if 'k' not in self.sel_for_gen_kwargs.keys():
        for_k = min(X.shape[1], self.for_k_limit)
        self.sel_for_gen_kwargs['k'] = for_k
      else:
        for_k = self.sel_for_gen_kwargs['k']

      if 'k' not in self.sel_from_gen_kwargs.keys():
        from_k = X.shape[1]
        self.sel_from_gen_kwargs['k'] = from_k
        # self.sel_from_gen_kwargs['thr'] = -np.inf
      else:
        from_k = self.sel_from_gen_kwargs['k']


      kf = KFold(n_splits=self.n_fold_splits)

      for i, (train_index, val_index) in enumerate(kf.split(X)):

        X_train, X_val, y_train, y_val = X[train_index, :], X[val_index, :], y[train_index], y[val_index]
        obj_count, f_dim = X_train.shape[0], X_train.shape[1]
        #1 SEL
        self.sel_for_gen = FeatureSelectionTransformer(**self.sel_for_gen_kwargs)
        self.sel_for_gen.fit(X_train, y_train)
        important_features = self.sel_for_gen.get_support(indices=True)[:for_k]

        # print('for_k:', for_k)
        #Константа просчитывается с учетом того, что под вычисления отводится 8 гб ОЗУ
        for_k_limit_important = int(((1073741824 / (X.shape[0] / 0.8) - X.shape[1] - for_k * 9) / 9) ** 0.5)
        # print('for_k_limit_important:', for_k_limit_important)
        # print('abs:', abs(min(for_k, for_k_limit_important)))

        #GEN
        self.gen = FeatureGenerationTransformer(features_mask=important_features, important_features=important_features[:min(abs(for_k), abs(for_k_limit_important))], **self.gen_kwargs)
        self.gen.fit(X_train, y_train)
        X_gen = self.gen.transform(X_val)

        pipe_sc = pipeline.Pipeline([('std_scaler',preprocessing.StandardScaler())])
        X_gen[:, X_val.shape[1]:] = pipe_sc.fit_transform(X_gen[:,  X_val.shape[1]:])
        #2 SEL
        self.sel_from_gen = FeatureSelectionTransformer(**self.sel_from_gen_kwargs)
        self.sel_from_gen.fit(X_gen, y_val)
        #Prepare indicies
        fos = important_features # индексы в исходном векторе,
        # выбранных признаков
        frs = self.sel_from_gen.get_support(indices=True)[:from_k] # индексы в сгенерированном векторе,
        # выбранных признаков


        desc =  self.gen.desc_dict
        # print(desc)
        desc['_'] = [np.arange(0, f_dim), np.arange(0, f_dim)]
        sort_desc = {}
        map = np.zeros(X_gen.shape[0])

        #for keys
        for op in desc.keys():
           #Начинается с 0
          # print(frs, fos, desc[op][0])
          mask = np.in1d(desc[op][1], frs) #remove generating which are not selected
          sort_desc[op] = [desc[op][0][mask], desc[op][1][mask]]
          # print(sort_desc[op])

          # if op != '_':
          #   sort_desc[op][0] = fos[sort_desc[op][0]]
        if len(self.desc_dict.keys()) == 0:
          self.desc_dict = sort_desc
        else:
          self.desc_dict = merge_desc_dict(self.desc_dict, sort_desc, bias=f_dim) # union algorythms
        # print(self.desc_dict)

      self.gen.desc_dict = self.desc_dict #!!!!!

    def transform(self, X):

      X_res = self.gen.transform(X)

      # for i in list(set(range(X.shape[1])) - set(self.desc_dict['_'][0])):

      #   X_res = np.delete(X_res, i, 1)
      return X_res



In [8]:
class FeatProcessing(BaseEstimator, TransformerMixin):
    def __init__(self, task_type, n_fold_splits, gen_kwargs={}, sel_for_gen_kwargs={}, sel_from_gen_kwargs={}):
      self.n_fold_splits = n_fold_splits
      self.gen_kwargs = gen_kwargs
      self.sel_for_gen_kwargs = sel_for_gen_kwargs
      self.sel_from_gen_kwargs = sel_from_gen_kwargs

      self.desc_dict = {}
    def fit(self, X, y=None):
      print(X.shape)


      if 'k' not in self.sel_for_gen_kwargs.keys():
        for_k = X.shape[1] // 2
        self.sel_for_gen_kwargs['k'] = for_k
      else:
        for_k = self.sel_for_gen_kwargs['k']

      if 'k' not in self.sel_from_gen_kwargs.keys():
        from_k = X.shape[1]
        self.sel_for_gen_kwargs['k'] = from_k
      else:
        from_k = self.sel_from_gen_kwargs['k']


      kf = KFold(n_splits=self.n_fold_splits)

      for i, (train_index, val_index) in enumerate(kf.split(X)):

        X_train, X_val, y_train, y_val = X[train_index, :], X[val_index, :], y[train_index], y[val_index]
        obj_count, f_dim = X_train.shape[0], X_train.shape[1]
        #1 SEL
        self.sel_for_gen = FeatureSelectionTransformer(**self.sel_for_gen_kwargs)
        self.sel_for_gen.fit(X_train, y_train)
        important_features = self.sel_for_gen.get_support(indices=True)[:for_k]
        #GEN
        self.gen = FeatureGenerationTransformer(features_mask=important_features, important_features=important_features, **self.gen_kwargs)
        self.gen.fit(X_train, y_train)
        X_gen = self.gen.transform(X_val)

        #2 SEL
        self.sel_from_gen = FeatureSelectionTransformer(**self.sel_from_gen_kwargs)
        self.sel_from_gen.fit(X_gen, y_val)
        #Prepare indicies
        fos = important_features # индексы в исходном векторе,
        # выбранных признаков
        frs = self.sel_from_gen.get_support(indices=True)[:from_k] # индексы в сгенерированном векторе,
        # выбранных признаков


        desc =  self.gen.desc_dict
        # print(desc)
        desc['_'] = [np.arange(0, f_dim), np.arange(0, f_dim)]
        sort_desc = {}
        map = np.zeros(X_gen.shape[0])

        #for keys
        for op in desc.keys():
           #Начинается с 0
          # print(frs, fos, desc[op][0])
          mask = np.in1d(desc[op][1], frs) #remove generating which are not selected
          sort_desc[op] = [desc[op][0][mask], desc[op][1][mask]]
          # print(sort_desc[op])

          # if op != '_':
          #   sort_desc[op][0] = fos[sort_desc[op][0]]
        if len(self.desc_dict.keys()) == 0:
          self.desc_dict = sort_desc
        else:
          self.desc_dict = merge_desc_dict(self.desc_dict, sort_desc, bias=f_dim) # union algorythms
        # print(self.desc_dict)

      self.gen.desc_dict = self.desc_dict #!!!!!

    def transform(self, X):

      X_res = self.gen.transform(X)

      # for i in list(set(range(X.shape[1])) - set(self.desc_dict['_'][0])):

      #   X_res = np.delete(X_res, i, 1)
      return X_res



#### Final build

In [9]:
class ValidTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, task_type, **kwargs):
      self.task_type = task_type
      self.kwargs = kwargs
      if 'work_time' in kwargs.keys():
        self.work_time = kwargs['work_time']
      else:
        self.work_time = 1
      if 'val_size' in kwargs.keys():
        self.val_size = kwargs['val_size']
      else:
        self.val_size = 0.2
      if 'random_state' in kwargs.keys():
        self.random_state = kwargs['random_state']
      else:
        self.random_state = 42
    def fit(self, X, y=None):
      obj_count, f_dim = X.shape[0], X.shape[1]

      if 'encoder' in self.kwargs.keys():
        self.encoder = CategoricalEncoder(**self.kwargs['encoder'])
      else:
        self.encoder = CategoricalEncoder()
      self.encoder = preprocessing.StandardScaler() # TODO add normalization after generation
      if 'processor' in self.kwargs.keys():
        self.trans = FeatProcessing(**self.kwargs['processor'])
      else:
        self.trans = FeatProcessing(self.task_type, 2)
      # self.encoder.fit(X, y)
      self.trans.fit(X, y)

    def transform(self, X, y=None):
      # Perform arbitary transformation
      X = self.encoder.transform(X)
      return self.trans.transform(X)

#### Testing

##### Try with AutoML Banchmark

In [None]:
!git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1
!cd automlbenchmark

In [None]:
!python -m pip install --upgrade pip
!python -m pip install -r automlbenchmark/requirements.txt

In [None]:
!python automlbenchmark/runbenchmark.py benchmark

##### Testing with OpenML

##### Utils

In [10]:
def calc_accuracy(X, y, X_t, y_t):
  models = [('linear_svm', svm.LinearSVC(max_iter=2000, random_state=42), 'decision_function'),
            ('logistic', linear_model.LogisticRegression(solver = 'lbfgs', random_state=42), 'predict_proba'),
            ('linear_discriminant_analysis', LinearDiscriminantAnalysis(), 'predict_proba'),
            ('gradboost', ensemble.GradientBoostingClassifier(random_state=42), 'predict_proba'),
            ('randomforest', ensemble.RandomForestClassifier(random_state=42), 'predict_proba')]
  ans = {}
  for name, model, loss_func in models:
      model.fit(X, y)
      # print(name, ': ', model.score(X_t, y_t))
      ans[name] = model.score(X_t, y_t)
  return ans

In [None]:
!pip install openml

In [None]:
import openml

dataset = openml.datasets.get_dataset(1471)
print(dataset)

In [None]:
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
X.info()

In [13]:
# Получение меток классов и их строковое представление
class_labels = set(y)
# Создание словаря для преобразования строковых меток в числовые индексы
class_labels_map = {label: i for i, label in enumerate(class_labels)}
y_numeric = [class_labels_map[label] for label in y]

X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), np.array(y_numeric), test_size=0.2, random_state=42)

In [14]:
pipe_sc = pipeline.Pipeline([('std_scaler',preprocessing.StandardScaler())])
pipe_sc.fit(X_train, y_train)
X_train = pipe_sc.transform(X_train)
X_test = pipe_sc.transform(X_test)

In [None]:
trans_full = FeatProcessingMod('class', 2, {'thr': 0.5},
                       sel_for_gen_kwargs={'typ': 'null_importance'}, sel_from_gen_kwargs={})
trans_full.fit(X_train, y_train)
# print(X_train.shape)
X_train_chgd_norm = trans_full.transform(X_train)
# print(X_traоin.shape)
X_test_chgd_norm = trans_full.transform(X_test)
calc_accuracy(X_train_chgd_norm, y_train, X_test_chgd_norm, y_test)

In [31]:
X_train_chgd_norm = trans_full.transform(X_train)

In [None]:
calc_accuracy(X_train, y_train, X_test, y_test)

In [None]:
# trans = FeatProcessing('class', 2, {'thr': 0.5},
#                        sel_for_gen_kwargs={'typ': 'lsvm'}, sel_from_gen_kwargs={})
trans_null = FeatureSelectionTransformer(typ='null_importance')
trans_null.fit(X_train, y_train)

trans_null_mod = FeatureSelectionTransformer(typ='null_importance_mod')
trans_null_mod.fit(X_train, y_train)
trans_baseline = FeatureSelectionTransformer(typ='rtree', k = X_train.shape[1])
trans_baseline.fit(X_train, y_train)
print(trans_baseline.predictor.estimator_.feature_importances_)
print(trans_baseline.predictor.estimator_.feature_importances_.argsort().argsort())

# print(X_train.shape)
X_train_chgd_null_mod = trans_null_mod.transform(X_train)
# print(X_train.shape)
X_test_chgd_null_mod = trans_null_mod.transform(X_test)

X_train_chgd_base = trans_baseline.transform(X_train)
# print(X_train.shape)
X_test_chgd_base = trans_baseline.transform(X_test)

X_train_chgd_null = trans_null.transform(X_train)
X_test_chgd_null = trans_null.transform(X_test)

In [None]:
calc_accuracy(X_train, y_train, X_test, y_test)

In [None]:
calc_accuracy(X_train_chgd_null, y_train, X_test_chgd_null, y_test)

In [None]:
calc_accuracy(X_train_chgd_null_mod, y_train, X_test_chgd_null_mod, y_test)

In [None]:
print(X_test_chgd_base.shape)

In [None]:
calc_accuracy(X_train_chgd_base, y_train, X_test_chgd_base, y_test)

In [None]:
trans_null_mod = FeatProcessing('class', 2, {'thr': 0.5},
                       sel_for_gen_kwargs={'typ': 'lsvm'}, sel_from_gen_kwargs={})
trans_null_mod.fit(X_train, y_train)
# print(X_train.shape)
X_train_chgd = trans_null_mod.transform(X_train)
# print(X_train.shape)
X_test_chgd = trans_null_mod.transform(X_test)
calc_accuracy(X_train_chgd, y_train, X_test_chgd, y_test)

In [None]:
X_train_chgd.shape

In [None]:
X_train_chgd[:3, :]

In [None]:
X_train[:3, :]

In [None]:
# Получение меток классов и их строковое представление
class_labels = set(y)
# Создание словаря для преобразования строковых меток в числовые индексы
class_labels_map = {label: i for i, label in enumerate(class_labels)}
y_numeric = [class_labels_map[label] for label in y]

X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, test_size=0.2, random_state=42)

In [192]:
def calc_accuracy(X, y, X_t, y_t):
  models = [('linear_svm', svm.LinearSVC(max_iter=2000, random_state=42), 'decision_function'), ('ridge', linear_model.Ridge(random_state=42), 'predict'),
            ('gradboost', ensemble.GradientBoostingClassifier(random_state=42), 'predict_proba'), ('logistic', linear_model.LogisticRegression(tol = 1e-4, solver = 'newton-cholesky', random_state=42), 'predict_proba'),
            ('randomforest', ensemble.RandomForestClassifier(random_state=42), 'predict_proba')]
  for name, model, loss_func in models:
    model.fit(X, y)
    print(name, ': ', model.score(X_t, y_t))

In [None]:
calc_accuracy(X_train, y_train, X_test, y_test)

In [None]:
feat_gen = FeatureGenerationTransformer(thr=0.5)
start_time = time.time()
X_train_gen = feat_gen.fit_transform(np.array(X_train))
X_test_gen = feat_gen.transform(np.array(X_test))
end_time = time.time()
elapsed_time = end_time - start_time
print(f'Время генерации: {elapsed_time:.5f}')
print('Размер до генерации', X_train.shape)
print('Размер после генерации', X_train_gen.shape)

In [None]:
INF = 1e30
X_train_gen = np.clip(X_train_gen, -INF, INF)
X_test_gen = np.clip(X_test_gen, -INF, INF)

In [None]:
calc_accuracy(X_train_gen, y_train, X_test_gen, y_test)

linear_svm :  0.45714285714285713<br>
ridge :  -3537520734.514072<br>
gradboost :  0.8<br>
logistic :  0.8571428571428571<br>
randomforest :  0.8

In [None]:
pipe_sc = pipeline.Pipeline([('std_scaler',preprocessing.StandardScaler())])
pipe_sc.fit(X_train, y_train)
X_train_sc = pipe_sc.transform(X_train)
X_test_sc = pipe_sc.transform(X_test)

In [None]:
calc_accuracy(X_train_sc, y_train, X_test_sc, y_test)

In [None]:
pipe_m = pipeline.Pipeline([('std_scaler',preprocessing.StandardScaler()), ('feat_gen', FeatureGenerationTransformer(thr=0.5)), ('feat_sel', FeatureSelectionTransformer('rtree', X_train.shape[1]))])
pipe_m.fit(X_train, y_train)
X_train_m = pipe_m.transform(X_train)
X_test_m = pipe_m.transform(X_test)

In [None]:
X_test_m.shape

In [None]:
calc_accuracy(X_train_m, y_train, X_test_m, y_test)