# Análise Comparativa de Modelos

In [39]:
from IPython.display import display, Markdown
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

## 1. Obtenção do Conjunto de Dados

In [22]:
# Obtemos os arquivos brutos de dados e o dicionário antes de iniciar as etapas do pré-processamento.
df = pd.read_csv("../data/raw/data.csv")
df_dict = pd.read_csv("../data/external/dictionary.csv")
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,total_bill,valor total da conta em dólares,quantitativa,continua
1,tip,valor da gorjeta em dólares,quantitativa,continua
2,sex,sexo do cliente,qualitativa,nominal
3,smoker,se o cliente fuma,qualitativa,nominal
4,day,dia da semana,qualitativa,ordinal
5,time,horario,qualitativa,ordinal
6,size,pessoas na mesa,quantitativa,discreta


## 2. Limpeza de Dados:

Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.

### 2.1. Tratamento de dados faltantes:

In [23]:
# Contagem total de dados  faltantes
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

**Observação**: No conjunto de dados não há dados faltantes.

### 2.2. Tratamento de dados discrepantes:

In [24]:
# Separar as variáveis.
target_column = 'tip'

# Organizar as colunas.
continuous_columns = (
    df_dict
    .query("subtipo == 'continua'")
    .variavel
    .to_list()
)
nominal_columns = (
    df_dict
    .query("subtipo == 'nominal' and variavel != @target_column")
    .variavel
    .to_list()
)
ordinal_day_columns = (
    df_dict
    .query("variavel in ['day']")
    .variavel
    .to_list()
)
ordinal_time_columns = (
    df_dict
    .query("variavel in ['time']")
    .variavel
    .to_list()
)
discrete_columns = (
    df_dict
    .query("subtipo == 'discreta'")
    .variavel
    .to_list()
)

X = df.drop(columns=[target_column], axis=1)
y = df[target_column]

In [25]:
continuous_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')),  # Tratamento de dados faltantes
    ('normalization', StandardScaler())  # Normalização de dados contínuos
])
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), # Codificação de variáveis
    ('normalization', StandardScaler()) # Normalização de dados
])
ordinal_day_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), # Codificação de variáveis
    ('normalization', StandardScaler()) # Normalização de dados
])
ordinal_time_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), # Codificação de variáveis
    ('normalization', StandardScaler()) # Normalização de dados
])
discrete_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')),  # Tratamento de dados faltantes
    ('normalization', StandardScaler()) # Normalização de dados
])
preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns),
    ('ordinal_day', ordinal_day_preprocessor, ordinal_day_columns),
    ('ordinal_time', ordinal_time_preprocessor, ordinal_time_columns),
    ('discrete', discrete_preprocessor, discrete_columns)
])

model = LogisticRegression()

## 3. Seleção de Modelos

Iremos análisar quatro modelos, que serão testados utilizando um método de validação, a saber:

- K-Nearest-Neighbors
- Decision Tree
- Random Forest

Além disso, cada um desses algoritmos será testado com diferentes hiper-parametros, para que possamos encontrar o melhor modelo e a melhor configuração possível para esse modelo.

Utilizaremos as seguintes métricas para análise:

- Acurácia (accuracy): proporção entre os dados que foram corretamente previstos (como positivos ou negativos) com o total de dados observados.
- Precisão (precision): proporção entre dados corretamente previstos como positivos e o total de observações positivas.
- Recall: proporção entre dados corretamente previstos como positivos com o total de observações.
- F1-score: média entre precision e recall, portanto levando em conta tanto falsos positivos quanto falsos negativos.

In [40]:
# experiment settings
n_splits_comparative_analysis = 10
n_folds_grid_search = 5
test_size = .2
random_state = 0
scoring = 'accuracy'
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# model settings
max_iter = 1000
models = [
    ('K-Nearest Neighbors', KNeighborsRegressor(), {"n_neighbors": range(3, 20, 2), 'weights': ['uniform', 'distance']}),
    ('Decision Tree',  DecisionTreeRegressor(random_state=random_state), {'criterion':['squared_error', 'friedman_mse'],'max_depth': [3, 25, 40]}),
    ('Random Forest',  RandomForestRegressor(random_state=random_state), {'criterion':['squared_error', 'friedman_mse'],'max_depth': [3, 25, 40], 'n_estimators': [10, 50]}),
]

In [41]:
#Não rodou.
results = pd.DataFrame({})
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=2,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=2,
        scoring=metrics,
        return_train_score=False
    )
    scores['model_name'] = [model_name] * n_splits_comparative_analysis
    df_scores = pd.DataFrame(scores)
    df_scores =  df_scores.drop(columns=['model_name'])
    df_scores = df_scores.agg(['mean', 'std'])
    display(df_scores)
    results = pd.concat([results, pd.DataFrame(scores)], ignore_index=True)

running K-Nearest Neighbors...


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tip'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 447, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'tip'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 740, in fit_transform
    self._validate_column_callables(X)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 448, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/mnt/c/Users/Samsung/avanti-bootcamp-cdd/.venv/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 455, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
