In [None]:
### Função para calcular a métrica pedida pela competição
def _as_item_set(value):
    """Return ``value`` as a set: a lone integer becomes a one-element set."""
    if isinstance(value, (np.integer, int)):
        return {value}
    return set(value)


def instacart_f1_score(data, user_id_col, true_col, pred_col):
    """Compute the mean per-user F1 score between two set-valued columns.

    Only the FIRST row of each distinct ``user_id_col`` value is scored; any
    further rows for the same user are ignored.

    Args:
        data: DataFrame holding the three columns below.
        user_id_col: Name of the column that identifies the user.
        true_col: Name of the column with the true items. Each cell is either
            a single integer or an iterable of items.
        pred_col: Name of the column with the predicted items, same cell
            format as ``true_col``.

    Returns:
        The mean of the per-user F1 scores (``np.mean``). A user whose true
        and predicted sets are both empty scores 1.0; a user with exactly one
        empty set scores 0.0. For an empty ``data`` the result is NaN.

    Raises:
        TypeError: If a cell is neither an integer nor an iterable.
    """
    # One pass over the frame instead of re-filtering it once per user.
    # Rows come out in order of first appearance of each user id. A missing
    # (NaN) user id is scored like any other id instead of raising IndexError.
    first_rows = data.drop_duplicates(subset=user_id_col, keep="first")

    f1_scores = []
    for true_value, pred_value in zip(first_rows[true_col].values, first_rows[pred_col].values):
        y_true = _as_item_set(true_value)
        y_pred = _as_item_set(pred_value)

        if not y_true and not y_pred:
            f1_scores.append(1.0)
        elif not y_true or not y_pred:
            f1_scores.append(0.0)
        else:
            # Both sets are non-empty here, so neither denominator can be 0:
            # tp + fp == len(y_pred) and tp + fn == len(y_true).
            tp = len(y_true & y_pred)
            precision = tp / len(y_pred)
            recall = tp / len(y_true)
            # precision + recall is zero exactly when there is no overlap.
            f1_scores.append(2 * precision * recall / (precision + recall) if tp else 0.0)

    return np.mean(f1_scores)

# Função de Avaliação
def get_ks(y, y_pred):
    """Return the two-sample Kolmogorov-Smirnov statistic between class scores.

    The scores in ``y_pred`` are split into those whose label equals 1 and
    all the others, and the two groups are compared with ``ks_2samp``.

    Args:
        y: Array-like of labels; entries equal to 1 form the first sample.
        y_pred: Array-like of scores, matched to ``y`` by position.

    Returns:
        The ``statistic`` attribute of the ``ks_2samp`` result.
    """
    # Work on plain arrays so lists are accepted and the masks are applied
    # by position, never by pandas index label.
    labels = np.asarray(y)
    scores = np.asarray(y_pred)
    return ks_2samp(scores[labels == 1], scores[labels != 1]).statistic

# Função de Avaliação
def metricas_validacao(model, data, target_, threshold=0.5):
    """Score a fitted binary classifier on ``data`` and print the metrics.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the positive-class probability.
        data: DataFrame with the feature columns plus the ``target_`` column.
            It is not modified.
        target_: Name of the label column in ``data``.
        threshold: Probabilities strictly greater than this value are labelled
            1 for the accuracy, precision, recall and F1 metrics.

    Returns:
        Tuple ``(auc, ks, logloss, accuracy, precision, recall, f1)``.
    """
    y_true = data[target_]
    # drop() already returns a new frame, so no defensive copy is needed.
    prob = model.predict_proba(data.drop(columns=[target_]))[:, 1]
    # Threshold once and share the hard labels across the label-based metrics.
    y_hat = (prob > threshold).astype(int)

    auc = metrics.roc_auc_score(y_true, prob)
    ks = get_ks(y_true, prob)
    logloss = metrics.log_loss(y_true, prob)
    accuracy = metrics.accuracy_score(y_true, y_hat)
    precision = metrics.precision_score(y_true, y_hat)
    recall = metrics.recall_score(y_true, y_hat)
    f1 = metrics.f1_score(y_true, y_hat)

    print(f'AUC: {auc:.4f}, KS: {ks:.4f}, Log Loss: {logloss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
    return auc, ks, logloss, accuracy, precision, recall, f1

# Função de Plotagem
def save_plot(plot_func, filename, path='.', format='pdf'):
    """Run ``plot_func`` on a fresh figure and save the current figure to disk.

    Args:
        plot_func: Zero-argument callable that draws with pyplot.
        filename: Output file name without extension.
        path: Directory the file is written to.
        format: File format; used both as the extension and as the
            ``format`` argument of ``plt.savefig``.

    NOTE(review): the figure that is current after ``plot_func`` returns is
    the one saved. If ``plot_func`` displays or closes its own figure (for
    example by calling ``plt.show()``), the saved file may be blank.
    """
    # Remember which figures already existed so only ours are closed later.
    figures_before = set(plt.get_fignums())
    # Start from a new figure so the drawing does not land on an existing one.
    plt.figure()
    try:
        plot_func()
        full_path = os.path.join(path, f"{filename}.{format}")
        plt.savefig(full_path, format=format)
    finally:
        # plot_func may open figures of its own; close every figure created
        # during this call (even on error) so none of them leaks.
        for number in set(plt.get_fignums()) - figures_before:
            plt.close(number)
    print(f"Gráfico salvo como {full_path}")

# Função de Conversão
def convert_data_types():
    """Add the ``np.int``, ``np.float`` and ``np.bool`` aliases when missing.

    Each alias is set on the ``numpy`` module only if the running NumPy does
    not already provide that attribute, so an existing definition is never
    overwritten. Missing aliases are bound to ``np.int32``, ``np.float64``
    and ``np.bool_`` respectively. The change is process-wide.
    """
    import warnings

    aliases = (("int", np.int32), ("float", np.float64), ("bool", np.bool_))
    with warnings.catch_warnings():
        # Probing a deprecated alias may emit a warning; it is not relevant here.
        warnings.simplefilter("ignore")
        for alias, replacement in aliases:
            if not hasattr(np, alias):
                setattr(np, alias, replacement)

# Função de modelagem
class SimplifiedLGBMModel:
    """Workflow wrapper around ``LGBMClassifier``.

    The instance splits ``data`` into train/test sets on construction and
    then offers feature filtering and selection, training, evaluation,
    plotting and persistence helpers that all operate on ``self.features``.
    """

    def __init__(self, data, features, target, categorical_features=None, test_flag_label=None, col_safra=None, test_size=0.3, random_state=42):
        """Store the configuration and build the train/test split.

        Args:
            data: DataFrame containing the features, the target and, when
                used, the test-flag column.
            features: Iterable of feature column names.
            target: Name of the label column.
            categorical_features: Categorical feature names forwarded to
                ``LGBMClassifier.fit``. ``None`` means no categorical features.
            test_flag_label: Optional column name; rows where it equals 0 are
                the training set and rows where it equals 1 are the test set.
            col_safra: Stored as-is; not used by any method of this class.
            test_size: Test fraction for the random split (used only when
                ``test_flag_label`` is not given).
            random_state: Seed passed to the split, the models and the CV.
        """
        self.data = data
        # Own copy as a plain list so truthiness checks and .copy() are safe.
        self.features = list(features)
        self.target = target
        # A fresh list per instance; a shared mutable default would leak
        # changes from one instance to the next.
        self.categorical_features = [] if categorical_features is None else categorical_features
        self.test_flag_label = test_flag_label
        self.col_safra = col_safra
        self.test_size = test_size
        self.random_state = random_state
        self.model = None
        self.params = {
            'force_col_wise': True,
            'min_child_samples': 20,
            'max_depth': 10,
            'min_split_gain': 0.01,
            'num_leaves': 100,
            'n_estimators': 100
        }
        self.X_train, self.X_test, self.y_train, self.y_test = self.preprocess()

    def preprocess(self):
        """Return ``(X_train, X_test, y_train, y_test)``.

        Uses the 0/1 values of ``test_flag_label`` when it is set, otherwise
        a stratified random split of ``self.data``.
        """
        if self.test_flag_label:
            train_data = self.data[self.data[self.test_flag_label] == 0]
            test_data = self.data[self.data[self.test_flag_label] == 1]
            X_train, y_train = train_data[self.features], train_data[self.target]
            X_test, y_test = test_data[self.features], test_data[self.target]
        else:
            X = self.data[self.features]
            y = self.data[self.target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state, stratify=y)
        return X_train, X_test, y_train, y_test

    def find_irrelevant_features(self):
        """Drop features that carry a single distinct value or no value at all."""
        # nunique() ignores missing values, so an all-missing column counts 0
        # and is just as uninformative as a constant one.
        const_features = [col for col in self.features if self.data[col].nunique() <= 1]
        self.features = [col for col in self.features if col not in const_features]
        self.update_datasets()
        print(f"Removed constant features: {const_features}")

    def find_correlated_features(self, threshold=0.85):
        """Drop numeric features highly correlated with an earlier feature.

        For every pair whose absolute correlation exceeds ``threshold`` the
        feature that comes later in ``self.features`` is removed. Non-numeric
        features are left untouched.
        """
        # Restrict to numeric/bool columns: corr() cannot handle other dtypes.
        numeric = self.data[self.features].select_dtypes(include=["number", "bool"])
        corr_features = []
        if numeric.shape[1] > 1:
            corr_matrix = numeric.corr().abs()
            # Keep only the strict upper triangle so each pair is seen once
            # and a column is only compared with the columns before it.
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            corr_features = [column for column in upper.columns if (upper[column] > threshold).any()]
        self.features = [col for col in self.features if col not in corr_features]
        self.update_datasets()
        print(f"Removed correlated features: {corr_features}")

    def _active_categorical_features(self):
        """Return the categorical features that are still in ``self.features``."""
        categorical = self.categorical_features
        if isinstance(categorical, str):
            # A plain string is forwarded untouched rather than being
            # iterated character by character.
            return categorical
        # Entries that are not column names (e.g. integers) are kept as given.
        return [col for col in categorical if not isinstance(col, str) or col in self.features]

    def train(self):
        """Fit a new ``LGBMClassifier`` with ``self.params`` on the training set.

        Prints a message and returns without training when no feature is
        selected.
        """
        if not self.features:
            print("Nenhuma característica selecionada para treinamento.")
            return
        self.model = LGBMClassifier(random_state=self.random_state, **self.params)
        # Feature selection may have removed some categorical columns; only
        # pass the ones that are still among the training features.
        self.model.fit(self.X_train[self.features], self.y_train, categorical_feature=self._active_categorical_features())

    def evaluate(self):
        """Print and return ``(train_metrics, test_metrics)``.

        Each element is the tuple produced by ``metricas_validacao``. Returns
        ``(None, None)`` when the model has not been trained.
        """
        if self.model is None:
            print("Modelo não foi treinado.")
            return None, None
        print("Training Metrics:")
        train_metrics = metricas_validacao(self.model, pd.concat([self.X_train[self.features], self.y_train], axis=1), self.target)
        print("Testing Metrics:")
        test_metrics = metricas_validacao(self.model, pd.concat([self.X_test[self.features], self.y_test], axis=1), self.target)
        return train_metrics, test_metrics

    def plot_roc_curve(self, show=True):
        """Draw the test-set ROC curve on a new figure.

        Args:
            show: When true (default) the figure is displayed with
                ``plt.show()``. Pass ``False`` to leave the figure open, for
                example so that a caller can save it.
        """
        if self.model is None:
            print("Modelo não foi treinado.")
            return
        prob_test = self.model.predict_proba(self.X_test[self.features])[:, 1]
        fpr, tpr, _ = metrics.roc_curve(self.y_test, prob_test)
        auc = metrics.roc_auc_score(self.y_test, prob_test)

        plt.figure(dpi=300)
        plt.plot(fpr, tpr, color="darkorange", label="AUC = %0.2f" % auc)
        # Diagonal reference line from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1], color="navy", linestyle="--")
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend(loc="lower right")
        if show:
            plt.show()

    def plot_feature_importance(self, show=True):
        """Draw a bar plot of the model's feature importances on a new figure.

        Args:
            show: When true (default) the figure is displayed with
                ``plt.show()``. Pass ``False`` to leave the figure open.
        """
        if self.model is None:
            print("Modelo não foi treinado.")
            return
        feature_importances = pd.DataFrame(self.model.feature_importances_, index=self.features, columns=['importance']).sort_values('importance', ascending=False)
        plt.figure(dpi=300)
        sns.barplot(x=feature_importances['importance'], y=feature_importances.index)
        plt.ylabel("Feature")
        plt.title("Feature Importances")
        if show:
            plt.show()

    def borutapy_features_select(self, max_iter=100):
        """Keep the features BorutaPy marks in ``support_``, then retrain."""
        # Called before BorutaPy is used; it sets NumPy type aliases.
        convert_data_types()
        rf = RandomForestClassifier(n_jobs=-1, random_state=self.random_state, n_estimators=5)
        boruta = BorutaPy(estimator=rf, n_estimators='auto', max_iter=max_iter, verbose=1, random_state=self.random_state)
        boruta.fit(self.X_train[self.features].values, self.y_train.values)
        green_area = self.X_train[self.features].columns[boruta.support_].tolist()
        self.features = green_area
        print(f"Selected features (BorutaPy): {self.features}")
        self.update_datasets()
        self.train()

    def mrmr_features_select(self, k):
        """Keep the ``k`` features returned by ``mrmr.mrmr_classif``, then retrain."""
        selected_features_mrmr = mrmr.mrmr_classif(X=self.X_train[self.features], y=self.y_train, K=k)
        self.features = selected_features_mrmr
        print(f"Selected features (MRMR): {self.features}")
        self.update_datasets()
        self.train()

    def forward_features_select(self, k_folds=5, max_time=3600, min_improvement=None):
        """Greedy forward selection by cross-validated AUC, then retrain.

        At each round every remaining feature is tried on top of the features
        already selected and the one with the best mean AUC is added.

        Args:
            k_folds: Number of stratified folds.
            max_time: Time budget in seconds, checked at the start of each
                round (a round in progress is not interrupted).
            min_improvement: Optional stopping rule. When given, selection
                stops as soon as the best candidate raises the mean AUC of
                the previous round by less than this amount. With the default
                ``None`` no such rule applies and rounds continue until the
                features or the time budget run out.
        """
        start_time = time.time()
        skf = StratifiedKFold(n_splits=k_folds, random_state=self.random_state, shuffle=True)
        remaining = list(self.features)
        # Build the folds once so every candidate is scored on the same splits.
        folds = list(skf.split(self.X_train, self.y_train)) if remaining else []
        selected_features = []
        previous_score = None
        model = LGBMClassifier(random_state=self.random_state)
        while remaining:
            if time.time() - start_time > max_time:
                print("Reached maximum time limit for feature selection.")
                break

            best_score = 0
            best_feature = None
            for feature in remaining:
                current_features = selected_features + [feature]
                scores = []
                for train_idx, val_idx in folds:
                    model.fit(self.X_train.iloc[train_idx][current_features], self.y_train.iloc[train_idx])
                    pred = model.predict_proba(self.X_train.iloc[val_idx][current_features])[:, 1]
                    scores.append(metrics.roc_auc_score(self.y_train.iloc[val_idx], pred))
                mean_score = np.mean(scores)
                if mean_score > best_score:
                    best_score = mean_score
                    best_feature = feature
            if best_feature is None:
                break
            if (
                min_improvement is not None
                and previous_score is not None
                and best_score - previous_score < min_improvement
            ):
                print(f"Stopping: adding {best_feature} improves AUC by less than {min_improvement}.")
                break
            selected_features.append(best_feature)
            remaining.remove(best_feature)
            previous_score = best_score
            print(f"Selected feature: {best_feature} with AUC: {best_score:.4f}")
        self.features = selected_features
        self.update_datasets()
        self.train()

    def full_optuna(self, k_folds=5, n_trials=50):
        """Tune hyperparameters with Optuna, then retrain with the best ones.

        Args:
            k_folds: Number of stratified folds used to score each trial.
            n_trials: Number of Optuna trials to run.

        ``self.params`` is REPLACED by the best trial's parameters; entries
        that are not part of the search space are not carried over.
        """
        def objective(trial):
            # Mean cross-validated AUC for one sampled configuration.
            param_grid = {
                'n_estimators': trial.suggest_int('n_estimators', 10, 50),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
                'num_leaves': trial.suggest_int('num_leaves', 20, 80),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
                'subsample': trial.suggest_float('subsample', 0.6, 0.9),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0)
            }
            scores = []
            model = LGBMClassifier(**param_grid, random_state=self.random_state)
            for train_idx, val_idx in StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=self.random_state).split(self.X_train, self.y_train):
                model.fit(self.X_train.iloc[train_idx], self.y_train.iloc[train_idx])
                preds = model.predict_proba(self.X_train.iloc[val_idx])[:, 1]
                score = metrics.roc_auc_score(self.y_train.iloc[val_idx], preds)
                scores.append(score)
            return np.mean(scores)

        study = create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)
        self.params = study.best_params
        print(f"Best parameters (Optuna): {self.params}")
        self.update_datasets()
        self.train()

    def update_datasets(self):
        """Restrict the train and test feature frames to ``self.features``."""
        self.X_train = self.X_train[self.features]
        self.X_test = self.X_test[self.features]

    def predict(self, X):
        """Return the trained model's class predictions for ``X``.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("O modelo não foi treinado ainda.")
        return self.model.predict(X[self.features])

    def predict_proba(self, X):
        """Return column 1 of the trained model's ``predict_proba`` for ``X``.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("O modelo não foi treinado ainda.")
        return self.model.predict_proba(X[self.features])[:, 1]

    def save_model(self, filename, path='.'):
        """Serialize ``self.model`` with joblib to ``path/filename``."""
        full_path = os.path.join(path, filename)
        joblib.dump(self.model, full_path)
        print(f"Modelo salvo como {full_path}")

    def save_metrics(self, filename, train_metrics, test_metrics, path='.'):
        """Write the train and test metrics as JSON to ``path/filename``."""
        full_path = os.path.join(path, filename)
        # Named 'payload' so the module-level 'metrics' name is not shadowed.
        payload = {
            'train_metrics': train_metrics,
            'test_metrics': test_metrics
        }
        with open(full_path, 'w') as f:
            json.dump(payload, f)
        print(f"Métricas salvas como {full_path}")

    def save_feature_importances(self, filename, path='.'):
        """Write the feature importances, sorted descending, as CSV."""
        full_path = os.path.join(path, filename)
        feature_importances = pd.DataFrame(self.model.feature_importances_, index=self.features, columns=['importance']).sort_values('importance', ascending=False)
        feature_importances.to_csv(full_path)
        print(f"Importância das features salva como {full_path}")

    def save_features(self, filename='features.txt', path='.'):
        """Write the selected feature names, one per line, to ``path/filename``."""
        full_path = os.path.join(path, filename)
        # Explicit encoding so non-ASCII names are written the same everywhere.
        with open(full_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{feature}\n" for feature in self.features)
        print(f"Features salvas em {full_path}")

    def save_plots(self, path='.'):
        """Save both the ROC curve and the feature-importance plot to ``path``."""
        self.save_roc_curve(path)
        self.save_feature_importance_plot(path)

    def save_roc_curve(self, path='.'):
        """Save the ROC curve as ``curva_roc.pdf`` in ``path``."""
        # show=False keeps the figure open until save_plot has written it.
        save_plot(lambda: self.plot_roc_curve(show=False), 'curva_roc', path, format='pdf')

    def save_feature_importance_plot(self, path='.'):
        """Save the feature-importance plot as ``importancia_das_features.pdf``."""
        # show=False keeps the figure open until save_plot has written it.
        save_plot(lambda: self.plot_feature_importance(show=False), 'importancia_das_features', path, format='pdf')

    def run(self):
        """Filter features, train, evaluate and display both plots."""
        self.find_irrelevant_features()
        self.find_correlated_features()
        self.train()
        train_metrics, test_metrics = self.evaluate()
        if train_metrics is None or test_metrics is None:
            print("Não foi possível avaliar o modelo.")
            return
        self.plot_roc_curve()
        self.plot_feature_importance()
        return
