In [64]:
import pandas as pd
import numpy as np
import warnings  
import matplotlib.pyplot as plt 
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV 
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression  
from pycaret.classification import *
from skopt.space import Real, Categorical, Integer

# Notebook-wide display settings: show every DataFrame column and never
# truncate the text inside a cell.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
)
# Silence every Python warning raised from here on.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings emitted later in the notebook.
warnings.filterwarnings('ignore')

In [None]:
def show_importance_features(model:GridSearchCV, X_train):
    """Plot per-feature importance for the best estimator of a fitted search.

    The estimator is read from the ``'model'`` step of
    ``model.best_estimator_`` (a Pipeline). Importance values come from:

    * ``coef_`` (flattened) for ``LogisticRegression`` / ``SVC``; an ``SVC``
      without ``coef_`` only prints a message;
    * ``feature_importances_`` for any other estimator exposing it (the
      original code tested ``isinstance(..., DecisionTreeClassifier)``, a name
      that is never imported in this notebook and therefore raised NameError).

    Parameters
    ----------
    model : GridSearchCV
        Fitted search whose ``best_estimator_`` is a Pipeline with a step
        named ``'model'``.
    X_train : DataFrame or array-like
        Training features. Column names (with any ``"prefix__"`` part
        stripped) label the bars; inputs without usable column names fall
        back to ``feature_0 .. feature_{n-1}``.

    Returns
    -------
    None
        A bar chart is shown, or an explanatory message is printed.
    """
    # The search wraps a Pipeline; the actual estimator is its 'model' step.
    best_model = model.best_estimator_.named_steps['model']
    model_name = best_model.__class__.__name__

    try:
        # Keep only the part after the transformer prefix, e.g. "num_continuas__x" -> "x".
        feature_names = [name.split("__")[-1] for name in X_train.columns.tolist()]
    except AttributeError:
        # No .columns (e.g. ndarray) or non-string column labels: use generic names.
        feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]

    if isinstance(best_model, (LogisticRegression, SVC)):
        if not hasattr(best_model, "coef_"):
            print(f"O modelo {model_name} usa um kernel que não fornece coeficientes.")
            return
        values = best_model.coef_.flatten()
    elif hasattr(best_model, "feature_importances_"):
        values = best_model.feature_importances_
    else:
        print(f"O modelo {model_name} não fornece coeficientes de importância.")
        return

    # Guard against a mismatch (e.g. multi-class coef_ flattened to
    # n_classes * n_features) instead of failing inside DataFrame construction.
    if len(values) != len(feature_names):
        print(f"Erro: O número de coeficientes ({len(values)}) não bate com o número de atributos ({len(feature_names)})")
        return

    _plot_feature_importance(feature_names, values, model_name)


def _plot_feature_importance(feature_names, values, model_name):
    """Show a horizontal bar chart of ``values`` per feature, sorted ascending."""
    imp = pd.DataFrame({"atributos": feature_names, "importancia": values})
    imp = imp.sort_values(by="importancia", ascending=True)
    plt.figure(figsize=(10, 6))
    plt.barh(y=imp['atributos'], width=imp['importancia'])
    plt.xlabel("Importância")
    plt.ylabel("Atributos")
    plt.title(f"Importância dos Atributos - {model_name}")
    plt.show()

# Função para exibir os resultados do modelo
 # Esta função tem como objetivo verificar se o modelo está generalizando bem
 # Ela avalia cada iteração do Kfolds e calcula a média para cada K e soma no final;
 # Ao final, conseguimos ver se o modelo está generalizando bem as informações se os marcadores médios do K forem próximos aos marcadores da base de teste completa
def show_results_models(model: GridSearchCV, x_train, x_test, y_train, y_test, exec_train=True):
    """Print a generalisation report for a fitted multi-metric GridSearchCV.

    The report contains, in order: best parameters and best score, the
    classification report on the training set (optional) and on the test
    set, a table with the cumulative mean/std of each CV metric over folds
    1..k for the best candidate, the test-set metrics (macro and weighted),
    and the overall CV mean/std per metric. Finally the feature-importance
    plot is produced via ``show_importance_features``.

    Parameters
    ----------
    model : GridSearchCV
        Fitted search. ``cv_results_`` must contain the per-split test
        scores for the scorer names ``accuracy``, ``precision``, ``recall``
        and ``f1``.
    x_train, y_train : training features / labels.
    x_test, y_test : hold-out features / labels.
    exec_train : bool, default True
        When True, also print the classification report on the training set.

    Returns
    -------
    None
    """
    print('Best params ==>', model.best_params_)
    print('Best score:', model.best_score_)

    best_estimator = model.best_estimator_
    if exec_train:
        yhat_predicted_train = best_estimator.predict(x_train)
        print('Desempenho - Base de Treino')
        print(classification_report(y_train, yhat_predicted_train))

    yhat_predicted_test = best_estimator.predict(x_test)
    print('Desempenho - Base de Teste')
    print(classification_report(y_test, yhat_predicted_test))

    results = model.cv_results_
    best_index = model.best_index_
    # Use the fold count recorded by the search itself. The previous
    # "count 'split*' keys // 4" heuristic silently broke whenever the number
    # of scorers changed or return_train_score=True added extra split keys.
    n_splits = model.n_splits_
    fold_numbers = range(1, n_splits + 1)

    # Display label -> scorer name used in cv_results_ keys.
    scorer_names = {
        "Accuracy": "accuracy",
        "Precision": "precision",
        "Recall": "recall",
        "F1-score": "f1",
    }
    # Per-fold test scores of the best candidate, one list per metric.
    fold_scores = {
        label: [results[f"split{i}_test_{scorer}"][best_index] for i in range(n_splits)]
        for label, scorer in scorer_names.items()
    }

    # Cumulative mean/std over the first k folds, for k = 1..n_splits.
    cumulative = {"Fold": fold_numbers}
    for label, scores in fold_scores.items():
        cumulative[f"{label} Mean"] = [np.mean(scores[:k]) for k in fold_numbers]
        cumulative[f"{label} Std"] = [np.std(scores[:k]) for k in fold_numbers]
    metrics_df = pd.DataFrame(cumulative)

    print("\n========== Tabela de métricas por iteração do K-Fold ==========")
    print(metrics_df.to_string(index=False))

    # Overall mean and standard deviation across all folds.
    mean_values = {label: np.mean(scores) for label, scores in fold_scores.items()}
    std_values = {label: np.std(scores) for label, scores in fold_scores.items()}
    summary_df = pd.DataFrame([mean_values, std_values], index=["Média", "Desvio Padrão"])

    print("\n========== Comparação direta das métricas no conjunto de teste ==========")
    print("Accuracy:", accuracy_score(y_test, yhat_predicted_test))
    print("Precision (macro):", precision_score(y_test, yhat_predicted_test, average="macro"))
    print("Recall (macro):", recall_score(y_test, yhat_predicted_test, average="macro"))
    print("F1-score (macro):", f1_score(y_test, yhat_predicted_test, average="macro"))

    print("Precision (weighted):", precision_score(y_test, yhat_predicted_test, average="weighted"))
    print("Recall (weighted):", recall_score(y_test, yhat_predicted_test, average="weighted"))
    print("F1-score (weighted):", f1_score(y_test, yhat_predicted_test, average="weighted"))

    print("\n========== Estatísticas finais das métricas ==========")
    print(summary_df.to_string(index=True))
    show_importance_features(model, x_train)

## Criando Pipeline para o modelo de Regressão Logística

In [83]:
# Load the pre-split train/test tables from the data catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the Kedro Jupyter session. Confirm before running standalone.
train_features = catalog.load('shots_train')
test_features = catalog.load('shots_test')

# Separate predictors (X) from the target column (y) for each split.
x_train_features, y_train_features = (
    train_features.drop(columns=['shot_made_flag']),
    train_features['shot_made_flag'],
)
x_test_features, y_test_features = (
    test_features.drop(columns=['shot_made_flag']),
    test_features['shot_made_flag'],
)

# Reassemble each split as one frame: predictors first, target as last column.
train_df = pd.concat([x_train_features, y_train_features], axis=1)
test_df = pd.concat([x_test_features, y_test_features], axis=1)

In [None]:
# Logistic-regression pipeline. The single step is named 'model' because
# show_importance_features looks the estimator up via named_steps['model'].
reglog = Pipeline([
    # The saga solver shuffles the data while fitting, so random_state is
    # pinned (same seed as the splitter) to make the grid search reproducible.
    ('model', LogisticRegression(solver='saga', random_state=2))
])

# Stratified, shuffled 10-fold splitter with a fixed seed.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

In [None]:
 #TREINAMENTO DO MODELO DE REGRESSÃO LOGISTICA
# Hyper-parameter grid, split per penalty so that every candidate is a
# distinct model:
#   * l1_ratio is only used by the 'elasticnet' penalty;
#   * C has no effect when penalty is None.
# A single crossed dict produced 120 candidates, most of them duplicates;
# these sub-grids cover the same distinct models with 52 candidates.
params_grid_reglog = [
    {
        'model__penalty': ['l1', 'l2'],
        'model__C': [0.001, 0.01, 0.1, 1, 10],
        'model__class_weight': ['balanced', None],
    },
    {
        'model__penalty': ['elasticnet'],
        'model__C': [0.001, 0.01, 0.1, 1, 10],
        'model__l1_ratio': [0.25, 0.5, 0.75],
        'model__class_weight': ['balanced', None],
    },
    {
        'model__penalty': [None],
        'model__class_weight': ['balanced', None],
    },
]

# Multi-metric scoring. The scorer names are the keys that
# show_results_models reads back from cv_results_ (split{i}_test_<name>).
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

# Exhaustive search over the grid; the final model is refit on the candidate
# with the best weighted F1.
# NOTE(review): error_score=0 scores a failed fit as 0 instead of raising, so
# failures are silent while warnings are suppressed.
search_reglog = GridSearchCV(
    estimator=reglog,
    param_grid=params_grid_reglog,
    scoring=scoring_metrics,
    cv=splitter,
    refit="f1",
    error_score=0,
    n_jobs=-1
)

search_reglog.fit(x_train_features, y_train_features)

In [None]:
# Report cross-validation vs. hold-out metrics for the tuned logistic
# regression; exec_train=False skips the training-set classification report.
show_results_models(
    search_reglog,
    x_train_features,
    x_test_features,
    y_train_features,
    y_test_features,
    exec_train=False,
)

In [72]:
# Seed for the experiment, read from the catalog's parameters.
session_id = catalog.load('params:session_id')

# Configure a PyCaret classification experiment on the training table, with
# MLflow logging enabled.
# NOTE(review): no test_data is passed, so PyCaret makes its own hold-out
# split of train_features (the setup summary shows 11359 / 4869 rows);
# test_features is not used by this experiment. Confirm this is intended.
exp = ClassificationExperiment()
exp.setup(
    data=train_features,
    target='shot_made_flag',
    session_id=session_id,
    n_jobs=-1,
    use_gpu=True,
    log_experiment='mlflow',
    experiment_name='kobe_shots_pycaret',
)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> inits

Unnamed: 0,Description,Value
0,Session id,54321
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(16228, 14)"
4,Transformed data shape,"(16228, 14)"
5,Transformed train set shape,"(11359, 14)"
6,Transformed test set shape,"(4869, 14)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> inits

[1m<[0m[1;95mpycaret.classification.oop.ClassificationExperiment[0m[39m object at [0m[1;36m0x0000028094E0F110[0m[1m>[0m

In [79]:
# Baseline logistic regression from PyCaret (verbose=False hides its output).
lr = exp.create_model('lr', verbose=False)

# Search space for the Bayesian tuner (scikit-optimize dimensions).
# 'elasticnet' is one of the candidate penalties and only the saga solver
# supports it, so the solver is pinned to saga and l1_ratio is always part of
# the space. The previous code expressed this through an `if` that was always
# true plus a solver assignment that was immediately overwritten; the
# resulting space (and key order) is unchanged.
lr_search_space = {
    'penalty': Categorical(['l1', 'l2', 'elasticnet']),
    'C': Real(0.001, 100, prior='log-uniform'),
    'class_weight': Categorical(['balanced', None]),
    'max_iter': Integer(100, 1000),
    'tol': Real(1e-4, 1e-2, prior='log-uniform'),
    'solver': Categorical(['saga']),
    'l1_ratio': Real(0.1, 0.9),
}

# Bayesian optimisation of F1 over the space above; choose_better=True keeps
# the baseline model if tuning does not improve on it.
tuned_lr = exp.tune_model(
    lr,
    custom_grid=lr_search_space,
    n_iter=100,
    optimize='F1',
    search_library='scikit-optimize',
    search_algorithm='bayesian',
    choose_better=True,
    early_stopping=True,
    early_stopping_max_iters=10,
    verbose=False
)





In [80]:
# Open PyCaret's interactive evaluation widget for the logistic regression
# returned by tune_model.
exp.evaluate_model(tuned_lr)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [73]:
# Baseline decision tree from PyCaret (verbose=False hides its output).
dt = exp.create_model('dt', verbose=False)

# Bayesian tuning of F1, with a search space trimmed for speed.
tuned_dt = exp.tune_model(
    dt,
    custom_grid={
        'criterion': Categorical(['gini', 'entropy']),  # log_loss left out (similar to entropy)
        'splitter': Categorical(['best']),  # 'random' left out (usually worse)
        'max_depth': Integer(3, 20),  # sampled range instead of a fixed list
        'min_samples_split': Integer(2, 10),  # narrowed interval
        'min_samples_leaf': Integer(1, 5),
        'max_features': Categorical(['sqrt', 'log2', 0.5]),  # None and 0.8 left out
        'ccp_alpha': Real(0.0, 0.1),  # focus on the most useful values
        'max_leaf_nodes': Integer(10, 50)  # tighter interval
    },
    n_iter=50,  # fewer iterations without losing much quality
    optimize='F1',
    search_library='scikit-optimize',
    search_algorithm='bayesian',
    early_stopping=True,
    early_stopping_max_iters=10,
)



Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6056,0.5997,0.4549,0.619,0.5244,0.2008,0.2078
1,0.5819,0.5851,0.4217,0.5872,0.4909,0.152,0.158
2,0.5563,0.5443,0.3635,0.5534,0.4388,0.0973,0.1031
3,0.6056,0.5998,0.4465,0.6205,0.5193,0.1998,0.2076
4,0.5731,0.572,0.4022,0.5752,0.4734,0.1329,0.139
5,0.5836,0.5779,0.4188,0.5896,0.4898,0.1548,0.1613
6,0.603,0.599,0.4446,0.6164,0.5166,0.1945,0.202
7,0.6109,0.6092,0.4244,0.6389,0.51,0.2086,0.2206
8,0.603,0.6024,0.4446,0.6164,0.5166,0.1945,0.202
9,0.5947,0.5959,0.4207,0.6096,0.4978,0.1768,0.1854


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).




In [81]:
# Compare the candidate models in the experiment and keep the one ranked
# first by F1.
best_model = exp.compare_models(sort='f1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5418,0.5386,0.5476,0.519,0.5328,0.0838,0.084,0.049
et,Extra Trees Classifier,0.5664,0.5908,0.4963,0.5506,0.5218,0.1273,0.1279,0.18
rf,Random Forest Classifier,0.567,0.5925,0.4887,0.5521,0.5183,0.1279,0.1287,0.205
lightgbm,Light Gradient Boosting Machine,0.589,0.6077,0.4626,0.5883,0.5176,0.1687,0.1724,0.962
knn,K Neighbors Classifier,0.5399,0.5588,0.4812,0.5196,0.4996,0.075,0.0752,0.086
gbc,Gradient Boosting Classifier,0.5979,0.6149,0.3989,0.6225,0.4861,0.1812,0.1933,0.621
ada,Ada Boost Classifier,0.5971,0.6108,0.3986,0.6213,0.4854,0.1797,0.1917,0.224
catboost,CatBoost Classifier,0.5995,0.6129,0.3934,0.628,0.4835,0.1841,0.1975,18.967
lr,Logistic Regression,0.5999,0.6164,0.3871,0.6315,0.4798,0.1844,0.1989,0.571
ridge,Ridge Classifier,0.6001,0.6166,0.3853,0.6328,0.4788,0.1848,0.1997,0.02
































In [84]:
# Open PyCaret's interactive evaluation widget for the model selected by
# compare_models.
exp.evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…