In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin

class Binning(BaseEstimator, TransformerMixin):
    """Discretise continuous features into bins.

    Four strategies are available through ``method``:

    * ``'uniform'``, ``'quantile'``, ``'kmeans'`` -- delegated to
      :class:`sklearn.preprocessing.KBinsDiscretizer`.
    * ``'tree'`` -- supervised binning: one single-feature decision tree is
      fitted per column and its split thresholds become the bin edges.

    Parameters
    ----------
    method : str, default='uniform'
        One of ``'uniform'``, ``'quantile'``, ``'kmeans'``, ``'tree'``.
    n_bins : int or sequence of int, default=5
        Requested number of bins (for ``'tree'``: ``max_leaf_nodes``), either
        shared by all features or given per feature.
    encode : str, default='ordinal'
        ``'ordinal'``, ``'onehot'`` (sparse output) or ``'onehot-dense'``.
    config : dict, optional
        Options for ``method='tree'``. Recognised keys: ``task``
        (``'regression'`` selects a regressor, anything else a classifier),
        ``criterion``, ``min_samples_split``, ``min_samples_leaf``,
        ``max_depth``, ``random_state``.
    target_column : str or int, optional
        Column of ``X`` holding the target. A column label when ``X`` is a
        DataFrame, an integer position when ``X`` is an array. When set, it
        overrides any ``y`` passed to :meth:`fit` and the column is excluded
        from the binned features.

    Attributes
    ----------
    binner_ : KBinsDiscretizer or None
        Fitted discretiser for the non-tree methods.
    tree_binners_ : list of ndarray
        Per-feature bin edges ``[-inf, t_1, ..., t_k, inf]`` (tree method only).
    feature_names_in_ : list of str or None
        Names of the binned features (target column excluded).
    fitted : bool
        Whether :meth:`fit` has completed successfully.
    """

    _KBINS_METHODS = ('uniform', 'quantile', 'kmeans')
    _TREE_ENCODINGS = ('ordinal', 'onehot', 'onehot-dense')

    def __init__(self, method='uniform', n_bins=5, encode='ordinal', config=None, target_column=None):
        self.method = method
        self.n_bins = n_bins
        self.encode = encode
        self.config = config if config is not None else {}
        self.target_column = target_column
        self.binner_ = None
        self.fitted = False
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        """Learn the bin edges from ``X`` (and the target for ``'tree'``).

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_columns)
            Training data; may include the target column when
            ``target_column`` is set.
        y : array-like, optional
            Target values. Ignored when ``target_column`` is set.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            Unknown ``method``; ``'tree'`` without any target; a
            ``target_column`` that cannot be located in ``X``.
        """
        if self.method not in self._KBINS_METHODS and self.method != 'tree':
            raise ValueError(f"Method '{self.method}' not supported. Choose from 'uniform', 'quantile', 'kmeans', 'tree'.")

        if self.method == 'tree' and y is None and self.target_column is None:
            raise ValueError("Method 'tree' requires target values (y) or target_column to be specified.")

        features, y = self._split_target(X, y)

        # Feature names are recorded AFTER the target has been removed so that
        # they line up one-to-one with the columns that actually get binned.
        if isinstance(features, pd.DataFrame):
            self.feature_names_in_ = features.columns.tolist()
            X_values = features.values
        else:
            X_values = features
            self.feature_names_in_ = [f"feature_{i}" for i in range(X_values.shape[1])]

        if self.method == 'tree':
            self._fit_tree(X_values, y)
        else:
            self._fit_kbins(X_values)

        self.fitted = True
        return self

    def _split_target(self, X, y):
        """Separate the target column from ``X`` when ``target_column`` is set.

        Returns ``(features, y)`` where ``features`` is a DataFrame if ``X``
        was one and an ndarray otherwise. Also remembers the positional index
        of the target so :meth:`transform` can drop it from array input.
        """
        self._target_index = None

        if self.target_column is None:
            return (X if isinstance(X, pd.DataFrame) else np.asarray(X)), y

        if isinstance(X, pd.DataFrame):
            if self.target_column not in X.columns:
                raise ValueError(f"Target column '{self.target_column}' not found in DataFrame.")
            self._target_index = list(X.columns).index(self.target_column)
            target = X[self.target_column].values
            return X.drop(columns=[self.target_column]), target

        X_values = np.asarray(X)
        try:
            idx = int(self.target_column)
            target = X_values[:, idx]
            X_values = np.delete(X_values, idx, axis=1)
        except (TypeError, ValueError, IndexError) as err:
            raise ValueError("When X is array, target_column must be an integer index.") from err
        self._target_index = idx
        return X_values, target

    def transform(self, X):
        """Replace every feature value by its bin.

        If ``target_column`` was used during :meth:`fit` and the same column
        is present in ``X``, it is dropped before binning.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_columns)

        Returns
        -------
        pandas.DataFrame, numpy.ndarray or scipy sparse matrix
            A DataFrame (same index as ``X``) when ``X`` is a DataFrame;
            otherwise the raw encoded array, which is sparse for
            ``encode='onehot'``.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        ValueError
            If ``method`` is not supported.
        """
        if not self.fitted:
            raise RuntimeError("Binning must be fitted before transform.")

        is_frame = isinstance(X, pd.DataFrame)
        if is_frame:
            if self.target_column is not None and self.target_column in X.columns:
                X = X.drop(columns=[self.target_column])
            X_values = X.values
        else:
            X_values = np.asarray(X)
            target_idx = getattr(self, '_target_index', None)
            # Only strip the target when the input is exactly one column wider
            # than what was fitted, i.e. it has the same layout as in fit().
            if (target_idx is not None and X_values.ndim == 2
                    and X_values.shape[1] == len(self.feature_names_in_) + 1):
                X_values = np.delete(X_values, target_idx, axis=1)

        if self.method in self._KBINS_METHODS:
            X_trans = self.binner_.transform(X_values)
        elif self.method == 'tree':
            X_trans = self._transform_tree(X_values)
        else:
            raise ValueError(f"Method '{self.method}' not supported.")

        if not is_frame:
            return X_trans

        from scipy import sparse

        new_columns = self._output_names(X.columns)
        if sparse.issparse(X_trans):
            # The plain DataFrame constructor does not accept scipy sparse input.
            return pd.DataFrame.sparse.from_spmatrix(X_trans, index=X.index, columns=new_columns)
        return pd.DataFrame(X_trans, columns=new_columns, index=X.index)

    def _fit_kbins(self, X):
        """Fit a ``KBinsDiscretizer`` using ``method`` as its strategy."""
        self.binner_ = KBinsDiscretizer(
            n_bins=self.n_bins,
            encode=self.encode,
            strategy=self.method
        )
        self.binner_.fit(X)

    def _fit_tree(self, X, y):
        """Fit one decision tree per column and keep its thresholds as edges.

        Raises
        ------
        ValueError
            If ``encode`` is not supported by the tree method, or a
            per-feature ``n_bins`` has the wrong length.
        """
        if self.encode not in self._TREE_ENCODINGS:
            raise ValueError(f"Unsupported encode: {self.encode}")

        n_features = X.shape[1]

        task = self.config.get('task', 'regression')
        criterion = self.config.get('criterion', None)
        if criterion is None:
            criterion = 'squared_error' if task == 'regression' else 'gini'

        tree_cls = DecisionTreeRegressor if task == 'regression' else DecisionTreeClassifier
        tree_kwargs = {
            'criterion': criterion,
            'min_samples_split': self.config.get('min_samples_split', 2),
            'min_samples_leaf': self.config.get('min_samples_leaf', 1),
            'max_depth': self.config.get('max_depth', None),
            'random_state': self.config.get('random_state', None),
        }

        if isinstance(self.n_bins, (int, np.integer)):
            n_bins_list = [self.n_bins] * n_features
        else:
            n_bins_list = self.n_bins
            if len(n_bins_list) != n_features:
                raise ValueError("Length of n_bins list must match number of features.")

        self.tree_binners_ = []
        for i, nb in enumerate(n_bins_list):
            tree = tree_cls(max_leaf_nodes=nb, **tree_kwargs)
            tree.fit(X[:, i].reshape(-1, 1), y)

            # Leaf nodes carry feature == -2; only internal nodes hold a
            # meaningful split threshold. np.unique sorts and de-duplicates.
            is_split = tree.tree_.feature != -2
            thresholds = np.unique(tree.tree_.threshold[is_split])

            self.tree_binners_.append(np.concatenate(([-np.inf], thresholds, [np.inf])))

    def _transform_tree(self, X):
        """Assign tree-derived bin indices to ``X`` and encode them.

        Bin ``k`` of a feature holds the values ``t_k < x <= t_{k+1}``, which
        matches the decision tree's own ``x <= threshold`` split rule.
        """
        if self.encode not in self._TREE_ENCODINGS:
            raise ValueError(f"Unsupported encode: {self.encode}")

        n_fitted = len(self.tree_binners_)
        if X.ndim != 2 or X.shape[1] != n_fitted:
            raise ValueError(
                f"X has shape {X.shape}, but Binning was fitted with {n_fitted} feature(s)."
            )

        codes = np.zeros(X.shape, dtype=int)
        for j, edges in enumerate(self.tree_binners_):
            # Search only the finite inner thresholds so the result is always
            # in [0, n_bins - 1], including for -inf and +inf inputs.
            # NOTE(review): NaN appears to sort last here and so falls into
            # the final bin -- confirm that is the desired handling.
            codes[:, j] = np.searchsorted(edges[1:-1], X[:, j], side='left')

        if self.encode == 'ordinal':
            return codes

        from scipy import sparse

        # Column layout is derived from the fitted edges, not from the values
        # present in this batch, so every call yields the same columns.
        n_rows = codes.shape[0]
        row_idx = np.arange(n_rows)
        ones = np.ones(n_rows)
        blocks = [
            sparse.csr_matrix((ones, (row_idx, codes[:, j])), shape=(n_rows, len(edges) - 1))
            for j, edges in enumerate(self.tree_binners_)
        ]
        stacked = sparse.hstack(blocks, format='csr')
        return stacked if self.encode == 'onehot' else stacked.toarray()

    def _bin_counts(self, n_features):
        """Return the number of bins per feature.

        Uses the fitted state when available (the realised number of bins can
        differ from the requested ``n_bins``); otherwise falls back to the
        configured ``n_bins``.
        """
        if self.fitted:
            if self.method == 'tree':
                return [len(edges) - 1 for edges in self.tree_binners_]
            return [int(nb) for nb in self.binner_.n_bins_]

        if isinstance(self.n_bins, (int, np.integer)):
            return [int(self.n_bins)] * n_features
        return list(self.n_bins)

    def _output_names(self, input_features):
        """Build output column names for the given input feature names."""
        names = list(input_features)
        if self.encode == 'ordinal':
            return [f"{name}_binned" for name in names]

        counts = self._bin_counts(len(names))
        return [f"{name}_bin_{i}" for name, nb in zip(names, counts) for i in range(nb)]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the columns produced by :meth:`transform`.

        Parameters
        ----------
        input_features : sequence of str, optional
            Input feature names; defaults to the names seen during fit.

        Returns
        -------
        list of str
            ``"<name>_binned"`` per feature for ordinal encoding, otherwise
            ``"<name>_bin_<i>"`` for each bin of each feature.

        Raises
        ------
        RuntimeError
            If ``input_features`` is omitted and no names were recorded yet.
        """
        if input_features is None:
            input_features = self.feature_names_in_
            if input_features is None:
                raise RuntimeError("Binning must be fitted before get_feature_names_out.")

        return self._output_names(input_features)