In [4]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from collections import defaultdict

In [5]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import json
from google.cloud import storage


# Root of the data area in the bucket; every path below is built from it.
input_bucket_path = 'gs://berkabank/production/data/'

# Resolve the two input locations first, then read each CSV into `data`
# under the key the rest of the notebook looks it up by.
training_features_uri = f"{input_bucket_path}05_features/training_features.csv"
core_training_uri = f"{input_bucket_path}04_processing/core_training.csv"

data = {
    "training_features": pd.read_csv(training_features_uri),
    "core_training": pd.read_csv(core_training_uri),
}

In [6]:
from dataclasses import dataclass
from typing import Union
import numpy as np
import pandas as pd
from probatus.feature_elimination import ShapRFECV
from sklearn.base import BaseEstimator
from sklearn.model_selection import RandomizedSearchCV


@dataclass
class FeatureEliminationShap:
    """Configurable wrapper around probatus' ``ShapRFECV`` feature elimination.

    The attributes are stored on the instance and forwarded, unchanged, to
    ``ShapRFECV`` (constructor) and to ``get_reduced_features_set`` when
    ``run`` is called.

    Attributes:
        model (Union[BaseEstimator, RandomizedSearchCV]): model to use for
            feature elimination; passed as ``ShapRFECV(model=...)``.
        step (float): step for feature elimination; passed as ``step``.
        cv (int): number of cross-validation folds; passed as ``cv``.
        scoring (str): scoring metric; passed as ``scoring``.
        n_jobs (int): number of parallel jobs; passed as ``n_jobs``.
        standard_error_threshold (float): standard error threshold handed to
            ``get_reduced_features_set``.
        return_type (str): return type handed to ``get_reduced_features_set``.
        num_features (Union[int, str]): number of features to return, handed
            to ``get_reduced_features_set``.
        random_state (Union[int, None]): optional seed. When left as ``None``
            (the default) nothing extra is passed to ``ShapRFECV``; otherwise
            it is forwarded as ``random_state`` so repeated runs can be
            reproduced.

    Methods:
        run(X, y): fit the elimination and return the reduced feature set.
    """

    model: Union[BaseEstimator, RandomizedSearchCV]
    step: float = 0.2
    cv: int = 10
    scoring: str = "roc_auc"
    n_jobs: int = -1
    standard_error_threshold: float = 0.5
    return_type: str = "feature_names"
    num_features: Union[int, str] = "best"
    # Declared last, with a default, so existing positional/keyword callers
    # keep working unchanged.
    random_state: Union[int, None] = None

    def run(self, X: pd.DataFrame, y: np.array) -> list:
        """Run the feature elimination process.

        Args:
            X (pd.DataFrame): input features
            y (np.array): target variable

        Returns:
            list: reduced feature set, in the form selected by
                ``return_type``.
        """
        elimination_kwargs = {
            "model": self.model,
            "step": self.step,
            "cv": self.cv,
            "scoring": self.scoring,
            "n_jobs": self.n_jobs,
        }
        # Only forward the seed when the caller asked for one, so the default
        # construction is exactly what it was before the field existed.
        if self.random_state is not None:
            elimination_kwargs["random_state"] = self.random_state

        shap_elimination = ShapRFECV(**elimination_kwargs)

        # The value returned by fit() is what the reduced set is queried on.
        fitted_elimination = shap_elimination.fit(X, y)

        return fitted_elimination.get_reduced_features_set(
            num_features=self.num_features,
            standard_error_threshold=self.standard_error_threshold,
            return_type=self.return_type,
        )


  from .autonotebook import tqdm as notebook_tqdm


In [7]:


# Hyper-parameter candidates sampled by the randomized search
params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5, 10],
}
# Settings forwarded to FeatureEliminationShap
step = 0.2
cv = 10
scoring = "roc_auc"
n_jobs = -1
standard_error_threshold = 0.5
return_type = "feature_names"
num_features = "best_coherent"
# Fixed seed for the forest and for the parameter sampling, so a
# Restart & Run All does not change the model being searched.
random_state = 42

# Model: randomized hyper-parameter search over a random forest
model = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=random_state),
    param_distributions=params,
    random_state=random_state,
)

# Feature-elimination runner
features_processor = FeatureEliminationShap(
    model=model,
    step=step,
    cv=cv,
    scoring=scoring,
    n_jobs=n_jobs,
    standard_error_threshold=standard_error_threshold,
    return_type=return_type,
    num_features=num_features,
)

# One row per account: every feature column is summed within account_id.
# NOTE(review): this also sums days_since_account_creation across an
# account's rows — confirm a sum (rather than e.g. max) is intended.
X = data["training_features"].groupby("account_id").sum()
y = data["core_training"].set_index("account_id")["target"]

# X comes out of groupby ordered by account_id while y keeps the file order
# of core_training. scikit-learn estimators consume rows positionally, so
# both are restricted to the shared accounts and put in the same order
# before fitting; otherwise labels could be attached to the wrong account.
assert y.index.is_unique, "core_training has more than one row per account_id"
shared_accounts = X.index.intersection(y.index)
assert len(shared_accounts) > 0, "no account_id shared by features and target"
X = X.loc[shared_accounts]
y = y.loc[shared_accounts]

drivers = features_processor.run(X=X, y=y)

In [8]:
# Display the part of X selected by `drivers`, the result of the
# feature-elimination run above.
X[drivers]

Unnamed: 0_level_0,n_transactions
account_id,Unnamed: 1_level_1
1,4858.0
2,39309.0
4,2582.0
6,23927.0
9,8810.0
...,...
11320,62533.0
11333,32853.0
11349,1.0
11362,3956.0


In [9]:
# Display the full per-account feature table, for comparison with the
# reduced selection.
X

Unnamed: 0_level_0,n_transactions,days_since_account_creation
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4858.0,44717
2,39309.0,200270
4,2582.0,37413
6,23927.0,177388
9,8810.0,69553
...,...,...
11320,62533.0,362963
11333,32853.0,163370
11349,1.0,91
11362,3956.0,31216


In [14]:
# Write the selected driver columns next to the other feature files in the
# bucket, keeping account_id (the index) as the first CSV column.
drivers_uri = f"{input_bucket_path}05_features/training_drivers.csv"
X[drivers].to_csv(drivers_uri, index=True)

In [12]:
# Read the exported drivers file back from the bucket and display it, using
# the first CSV column as the index.
# NOTE(review): this cell's execution count is lower than that of the export
# cell above it — re-run top to bottom to confirm the file shown is the one
# just written.
drivers_uri = f"{input_bucket_path}05_features/training_drivers.csv"
pd.read_csv(drivers_uri, index_col=0)

Unnamed: 0_level_0,n_transactions
account_id,Unnamed: 1_level_1
1,4858.0
2,39309.0
4,2582.0
6,23927.0
9,8810.0
...,...
11320,62533.0
11333,32853.0
11349,1.0
11362,3956.0


In [15]:
# Also keep a copy of the selected driver columns in the working directory,
# with account_id (the index) written as the first CSV column.
local_drivers_path = "./training_drivers.csv"
X[drivers].to_csv(local_drivers_path, index=True)