In [259]:
from itertools import product
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import param
#from pycaret.utils import check_metric

#from feature_selection.run_pycaret_setup import run_pycaret_setup


class FeatureSelection(param.Parameterized):

    # Class attributes
    model_class_to_name = {
        "RidgeClassifier": "ridge",
        "LogisticRegression": "lr",
        "LinearDiscriminantAnalysis": "lda",
        "GradientBoostingClassifier": "gbc",
        "QuadraticDiscriminantAnalysis": "qda",
        "LGBMClassifier": "lightgbm",
        "AdaBoostClassifier": "ada",
        "RandomForestClassifier": "rf",
        "ExtraTreesClassifier": "et",
        "GaussianNB": "nb",
        "DecisionTreeClassifier": "dt",
        "KNeighborsClassifier": "knn",
        "SGDClassifier": "svm",
        "CatBoostClassifier": "catboost",
        "SVC": "rbfsvm",
        "GaussianProcessClassifier": "gpc",
        "MLPClassifier": "mlp",
        "XGBClassifier": "xgboost",
    }

    metrics_list = ["Accuracy", "AUC", "Recall", "Precision", "F1", "Kappa", "MCC"]

    # Private class attributes
    _filter_metric = {
        "Accuracy": 0.5,
        "AUC": 0.5,
        "Recall": 0.6,
        "Precision": 0.6,
        "F1": 0.6,
        "Kappa": 0.1,
        "MCC": 0.1,
    }

    _setup_kwargs = dict(
        preprocess=True,
        train_size=0.75,
        # test_data=test_data,
        session_id=123,
        normalize=True,
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.4,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        ignore_features=None,
        fold_strategy="timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=4,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.4,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="kernel",
        pca_components=30,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.01,
        trigonometry_features=False,
        remove_outliers=False,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.01,
        fix_imbalance=False,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
        html=False,
    )

    _numerics = ["int16", "int32", "int64", "float16", "float32", "float64", "int", "float"]

    # Init values
    ## Feature selection parameters
    target = param.String("goal_2.5")
    number_features = param.Number(
        0.5,
        bounds=(0, 1),
        inclusive_bounds=(False, False),
        doc="Number of features (percentage) selected each iteration. Only the first nth "
        "features will be kept for the next iteration.",
    )
    target_features = param.Number(
        0.3,
        bounds=(0, None),
        inclusive_bounds=(False, True),
        doc="Final total number of features. The goal of the package is to reduce "
        "the incoming columns of the dataset to this 'target_features' number.",
    )
    ## Metric parameters
    filter_metrics = param.Dict(_filter_metric)
    ## Model setup and model optimization parameters
    numerics = param.List(_numerics)
    ignore_features = param.List(default=[], allow_None=True)
    setup_kwargs = param.Dict(_setup_kwargs)
    include = param.List(default=None, item_type=str, allow_None=True)
    exclude = param.List(["qda", "knn", "nb"], item_type=str)
    sort = param.String("AUC")
    number_models = param.Integer(10, bounds=(2, 13))
    top_models = param.List(default=None, allow_None=True)
    optimize = param.Boolean(False)
    opt_list = param.List(["Accuracy", "Precision", "Recall", "F1", "AUC"], item_type=str)
    ## Class selectors
    dataset = param.ClassSelector(class_=pd.DataFrame)
    dict_models = param.ClassSelector(class_=dict)
    tune_dict_models = param.ClassSelector(class_=dict)
    x_train = param.ClassSelector(class_=pd.DataFrame)
    x_df = param.DataFrame(pd.DataFrame())
    model_df = param.ClassSelector(class_=pd.DataFrame)
    model_tuned_df = param.ClassSelector(class_=pd.DataFrame)
    features_df = param.ClassSelector(class_=pd.DataFrame)

    def __init__(self, dataset: pd.DataFrame, **kwargs):
        # Copy of the incoming dataset
        dataset = dataset.copy()
        # Compute the upper bound of number_features, target_features, number_models
        total_features = dataset.shape[1]
        self.param.target_features.bounds = (0, total_features)
        if "include" in kwargs:
            print('hola')
            self.param.number_models.default = len(kwargs["include"])
            self.param.number_models.bounds = (0, len(kwargs["include"]))
            print(self.param.number_models.bounds)
        # Call super
        super(FeatureSelection, self).__init__(dataset=dataset, **kwargs)
        # Get the features of the dataframe
        self.feature_list = self.dataset.columns.tolist()
        self.feature_list.remove(self.target)  # target column should not be counted
        # Compute target features
        self.target_features = self.calculate_number_features(
            number_features=self.target_features, features=self.feature_list
        )
        # Get the evaluator and the arguments. Depends on the "include" parameter
        self._training_function, self._args = self._decide_model_eval()
        # Get all the columns whose type is numeric
        self.numeric_features = self._compute_numeric_features(df=self.dataset[self.feature_list])

    def _compute_numeric_features(self, df: pd.DataFrame):
        """Return those columns from the given dataset whose data type is numeric."""
        return df.select_dtypes(include=self.numerics).columns.tolist()

    def _decide_model_eval(self):
        """
        Define the pycaret model evaluator depending on the number of included models.

        If the 'include' list parameter equals 1, the method will return
        the 'create_models' pycaret object.
        If 'include' parameter list is greatear than 1, the method will
        return the 'compare_model' pycaret object and its arguments.
        If 'include' parameter equals None, the method will return the
        'compare_models' pycaret object, where all possible models are
        considered for evaluation, except those included within the 'exclude'
        list.
        """
        args = {"n_select": self.number_models, "sort": self.sort, "verbose": False}
        training_function = np.random.randint
        if not self.include:
            args["exclude"] = self.exclude
        elif len(self.include) == 1:
            training_function = lambda *rgs, **kwargs: [range(*rgs, **kwargs)]
            args = {"estimator": self.include[0], "verbose": False}
        else:
            args["include"] = self.include
        return training_function, args

    @staticmethod
    def calculate_number_features(
        number_features: Union[int, float], features: Union[pd.DataFrame, List]
    ) -> int:
        n_features = (
            int(number_features)
            if (number_features >= 1)
            else int(number_features * len(features))
        )
        return n_features

In [260]:
dataset = pd.DataFrame({'a': [1,3,3,4,5], 'b': [13,3,4,4,5], 'c': ['a', 'b', 'c', 'd', 'd']})
pepe = FeatureSelection(dataset, target='c', target_features=1)

In [139]:
a.feature_list


['a', 'b']

In [2]:
predictions = pd.read_csv('predictions.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
import panel as pn 
import param 

pn.extension()

In [2]:
class Mem(param.Parameterized):
    single_file = param.FileSelector(path='../../*/*/*', precedence=0.5)


In [24]:
a = Mem()

In [25]:
pn.Row(a.param)

In [None]:
'lr', 'dt', 'svm','rbfsvm','gpc','mlp','ridge','rf','ada','gbc','lda','et','xgboost','lightgbm' ,'catboost'

In [270]:
data = predictions.iloc[:500, np.random.randint(0,2000,size=150).tolist()]

In [284]:
import param 
import panel as pn 
import pandas as pd
import numpy as np 
pn.extension()
import time


class Widget(param.Parameterized):
    
    _available_models = ['lr', 'dt', 'svm','rbfsvm','gpc','mlp','ridge','rf','ada','gbc','lda','et','xgboost','lightgbm' ,'catboost']
    
    _filter_metric = {
        "Accuracy": 0.5,
        "AUC": 0.5,
        "Recall": 0.6,
        "Precision": 0.6,
        "F1": 0.6,
        "Kappa": 0.1,
        "MCC": 0.1,
    }
    
    #opt_kwargs = param.Dict({})
    # 1st param layer 
    target = param.String(default='goals_2.5')
    # 2nd param layer 
    ## Feature selection parameters 
    number_features = param.Number(0.5, bounds=(0, 1), inclusive_bounds=(False, False) )
    target_features = param.Number(0.3, bounds=(0, None), inclusive_bounds=(False, True) )
    ## Metric parameters
    acu = param.Number(0.6, bounds=(0, 1), label='Accuracy')
    auc = param.Number(0.6, bounds=(0, 1), label='AUC')
    rec = param.Number(0.6, bounds=(0, 1), label='Recall')
    pre = param.Number(0.6, bounds=(0, 1), label='Precision')
    f1 = param.Number(0.6, bounds=(0, 1), label='F-score')
    kappa = param.Number(0.1, bounds=(0, 1), label='Kappa')
    mcc = param.Number(0.1, bounds=(0, 1), label='MCC')
    ## Model setup and model optimization parameters
    ignore_features = param.ListSelector(default=None, objects=[1, 2], allow_None=True )
    sort = param.ObjectSelector(default="AUC", objects=_filter_metric.keys() )
    include = param.ListSelector(default=None, objects= [None] + _available_models, allow_None=True )
    number_models = param.Integer(10, bounds=(2, len(_available_models)) )
    optimize = param.Boolean(False, label='Tuning process', doc='Call tuning process during feature evaluation.')
    opt_list = param.ListSelector(default=['AUC'], objects=list(_filter_metric.keys())[:-2], precedence=-1, label='Tuning list')
    # Hidden layers
    dataset = param.ClassSelector(class_=pd.DataFrame, precedence=-1)
    filter_metrics = param.Dict(default=_filter_metric, precedence=-1)
    
    def __init__(self, dataset, **kwargs):
        upper_bound = dataset.shape[1]
        self.param.target_features.bounds = (0, upper_bound)
        self.param.ignore_features.objects = [None] + dataset.columns.to_list()
        self.param.target.default = dataset.columns.tolist()[-1]
        super(Widget, self).__init__(dataset=dataset, **kwargs)
        self.button = pn.widgets.Button(name='Start process!', button_type="success") 
        self.button.on_click(self.run_process)
    
    @param.depends('optimize', watch=True)
    def update_precedence(self):
        self.param.opt_list.precedence = 1 if self.optimize else -1
        
    @param.depends("acu", "auc", "rec", "pre", "f1", "kappa", "mcc", watch=True)
    def _define_filter_metrics(self):
        legend = ["acu", "auc", "rec", "pre", "f1", "kappa", "mcc"]
        self.filter_metrics = {key: getattr(self, key) for key in legend} 
    
    
    def _discrete(self):
        options = np.linspace(0,1,100,endpoint=False)[1:].tolist() + np.linspace(1,self.dataset.shape[1],200, dtype=int).tolist()
        return {'widget_type': pn.widgets.DiscreteSlider,
                'name': 'Target features', 
                'options': options, 
                'value': 0.3,
                'width': 280}

        
    def interactive_panel(self):
        first_col = ['target', 'number_features', 'target_features', 'ignore_features', 'sort', 'include', 'number_models']
        second_col = ['acu', 'auc', 'rec', 'pre', 'f1', 'kappa', 'mcc', 'optimize', 'opt_list']
        view = pn.Row(
            pn.Column(
                pn.panel(self.param,
                         parameters=[c for c in first_col],
                         name='Feature parameters and models included',
                         widgets={'target_features': self._discrete()}
                        )
            ),
            pn.Column(
                pn.panel(self.param,
                         parameters=[c for c in second_col],
                         name='Metric and optimize parameters')
            ),
            pn.Column(
                pn.panel(self.button, name="Run the library to select relevant features")
            ),
        )
        return view
    
    def cosita(self):
        return range(10, **self.opt_kwargs)

    def run_process(self):
        inst = FeatureSelection(dataset = self.dataset, target=self.target, number_features=self.nu)
        

In [285]:
a = Widget(dataset=predictions)#pd.DataFrame({'a': [1,23,4,4], 'b': [1,2,2,2]}))
a.interactive_panel()

In [287]:
a.filter_metrics

{'acu': 0.6,
 'auc': 0.6,
 'rec': 0.6,
 'pre': 0.6,
 'f1': 0.9,
 'kappa': 0.1,
 'mcc': 0.1}

In [264]:

if not a.include:
    print('ll\ ')

ll\ 


In [296]:
translate = {'acu': "Accuracy",
  'auc': "AUC",
  'rec': "Recall",
  'pre': "Precision",
  'f1': "F1",
  'kappa': "Kappa", 
  'mcc': "MCC"}

In [298]:
not_list = ['name', "acu", "auc", "rec", "pre", "f1", "kappa", "mcc", 'dataset']
if (not a.include or a.include==[None]):
    not_list = not_list + ['include']
kwargs = {c: getattr(a, c) for c in a.param if c not in not_list}
#FeatureSelection(**{c: getattr(a, c) for c in a.param if c not in not_list})
not_list

['name',
 'acu',
 'auc',
 'rec',
 'pre',
 'f1',
 'kappa',
 'mcc',
 'dataset',
 'include']

In [291]:
kwargs['filter_metrics'] = {}

{'target': 'Score',
 'number_features': 0.5,
 'target_features': 0.3,
 'ignore_features': None,
 'sort': 'AUC',
 'number_models': 10,
 'optimize': False,
 'opt_list': ['AUC'],
 'filter_metrics': {'acu': 0.6,
  'auc': 0.6,
  'rec': 0.6,
  'pre': 0.6,
  'f1': 0.9,
  'kappa': 0.1,
  'mcc': 0.1}}

In [292]:
FeatureSelection(predictions, **{'target': 'Score',
 'number_features': 0.5,
 'target_features': 0.3,
 'ignore_features': None,
 'sort': 'AUC',
 'number_models': 10,
 'optimize': False,
 'opt_list': ['AUC'],
 'filter_metrics': {'Accuracy': 0.5,
  'AUC': 0.5,
  'Recall': 0.6,
  'Precision': 0.6,
  'F1': 0.6,
  'Kappa': 0.1,
  'MCC': 0.1}})

FeatureSelection(dataset=                           match_id  win_pct_h  draw_pct_h  loss_pct_h  \
0           2220927_mexican_primera        0.2         0.2         0.6   
1           2219375_spanish_la_liga        0.6         0.0         0.4   
2           2229005_italian_serie_a        0.2         0.2         0.6   
3           2219371_spanish_la_liga        0.0         0.4         0.6   
4    2210370_english_premier_league        0.6         0.2         0.2   
..                              ...        ...         ...         ...   
495       2215358_german_bundesliga        0.2         0.4         0.4   
496       2215359_german_bundesliga        0.4         0.4         0.2   
497         2229119_italian_serie_a        0.4         0.2         0.4   
498         2229114_italian_serie_a        0.6         0.2         0.2   
499          2215697_french_ligue_1        0.6         0.2         0.2   

     win_streak_h  loss_streak_h  draw_streak_h  points_h  ranking_h  ATT_h  \
0      

In [239]:
pepe.ignore_features

[]

In [72]:
pn.panel(a.param.number_features).width = 500
#np.linspace(1,a.dataset.shape[1],200, dtype=int)

In [62]:
pn.panel(a.param.number_features)

In [54]:
pn.widgets.DiscreteSlider(name='Discrete Slider', options=[2, 4, 8, 16, 32, 64, 128], value=32, tooltips=True)


In [9]:
class mem: 
    def __init__(self):
        self.button = pn.widgets.Button(name='Click me', button_type='primary')
        self.text = pn.widgets.TextInput(value='Ready')
        self.button.on_click(self.b)
        
    def b(self, event):
        self.text.value = 'Clicked {0} times'.format(self.button.clicks)
    
    def view(self):
        return pn.Row(self.button, self.text)


In [10]:
cosita = mem()

In [11]:
cosita.view()

In [12]:
cosita.button.clicks

10

In [3]:
pene = a._button()
pene

In [42]:
button = pn.widgets.Button(name='Click me', button_type='primary')
pn.Row(button, button.clicks)

In [44]:
button.clicks

10

In [28]:
list(range(1,a.dataset.shape[1]))[0]

1

In [30]:
np.linspace(0,1,100,endpoint=False)[1:].tolist() + list(range(1,a.dataset.shape[1]))

[0.01,
 0.02,
 0.03,
 0.04,
 0.05,
 0.06,
 0.07,
 0.08,
 0.09,
 0.1,
 0.11,
 0.12,
 0.13,
 0.14,
 0.15,
 0.16,
 0.17,
 0.18,
 0.19,
 0.2,
 0.21,
 0.22,
 0.23,
 0.24,
 0.25,
 0.26,
 0.27,
 0.28,
 0.29,
 0.3,
 0.31,
 0.32,
 0.33,
 0.34,
 0.35000000000000003,
 0.36,
 0.37,
 0.38,
 0.39,
 0.4,
 0.41000000000000003,
 0.42,
 0.43,
 0.44,
 0.45,
 0.46,
 0.47000000000000003,
 0.48,
 0.49,
 0.5,
 0.51,
 0.52,
 0.53,
 0.54,
 0.55,
 0.56,
 0.5700000000000001,
 0.58,
 0.59,
 0.6,
 0.61,
 0.62,
 0.63,
 0.64,
 0.65,
 0.66,
 0.67,
 0.68,
 0.6900000000000001,
 0.7000000000000001,
 0.71,
 0.72,
 0.73,
 0.74,
 0.75,
 0.76,
 0.77,
 0.78,
 0.79,
 0.8,
 0.81,
 0.8200000000000001,
 0.8300000000000001,
 0.84,
 0.85,
 0.86,
 0.87,
 0.88,
 0.89,
 0.9,
 0.91,
 0.92,
 0.93,
 0.9400000000000001,
 0.9500000000000001,
 0.96,
 0.97,
 0.98,
 0.99,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 

In [12]:
data = pd.DataFrame({'a': [1,23,4,4], 'b': [1,2,2,2]})

In [37]:
a = FeatureSelection(dataset=data, target='a', target_features=1)

In [44]:
for a in FeatureSelection.param:
    print(FeatureSelection.param[a])

<param.parameterized.String object at 0x114982ae0>
<param.parameterized.String object at 0x11491c220>
<param.Number object at 0x114b1ef40>
<param.Number object at 0x114b1ee80>
<param.Number object at 0x114b1ea00>
<param.Dict object at 0x11491c2c0>
<param.List object at 0x11486f040>
<param.List object at 0x114b6d0f0>
<param.Dict object at 0x11491c360>
<param.List object at 0x114b6d510>
<param.List object at 0x114b6d9e0>
<param.parameterized.String object at 0x1145f5f40>
<param.Integer object at 0x114b1eac0>
<param.List object at 0x114b6d250>
<param.Boolean object at 0x114982040>
<param.List object at 0x114b6d1a0>
<param.ClassSelector object at 0x1149820e0>
<param.ClassSelector object at 0x1149822c0>
<param.ClassSelector object at 0x114982360>
<param.ClassSelector object at 0x114982400>
<param.ClassSelector object at 0x114982900>
<param.ClassSelector object at 0x1149829a0>
<param.ClassSelector object at 0x114982a40>


In [40]:
pn.Row(FeatureSelection.param)