In [3]:
import pandas as pd
import sys
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [4]:
# Make the sibling ``../src`` directory importable so the project-local
# module imported in the following cell can be resolved. The path is resolved
# against the kernel's current working directory.
src_dir = os.path.abspath(os.path.join('..', 'src'))

# Guard the append: without it every re-run of this cell would push another
# copy of the same directory onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
from preprocessamento import processamento

In [6]:
# Read the restaurant customer satisfaction CSV into a DataFrame.
# The path is relative, so it depends on the kernel's working directory.
caminho_csv = "../data/restaurant_customer_satisfaction.csv"
df = pd.read_csv(caminho_csv)

In [7]:
# Run the project's `processamento` routine (imported from the local
# `preprocessamento` module) on the loaded frame and rebind `df` to the result.
# NOTE(review): because `df` is overwritten, re-running only this cell feeds
# the already-processed frame back into `processamento`; re-run the read_csv
# cell first to start from the original data.
df = processamento(df)

In [8]:
# Separate the target column from the remaining columns:
# `y` is the label series, `X` is every other column of `df`.
coluna_alvo = 'HighSatisfaction'
y = df[coluna_alvo]
X = df.drop(columns=[coluna_alvo])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [10]:
def avaliar_modelo(modelo, X_train, X_test, y_train, y_test):
    """
    Fit ``modelo`` on the training split and score its predictions on both splits.

    The estimator is trained by calling ``modelo.fit(X_train, y_train)``, then
    used to predict the training and the testing features. Accuracy, precision,
    recall and F1 are computed for each split; precision, recall and F1 use
    ``average='weighted'``.

    Parameters:
    modelo: Object exposing ``fit(X, y)`` and ``predict(X)`` (e.g. a scikit-learn
        estimator or Pipeline). It is fitted as a side effect of this call.
    X_train (array-like): The training dataset features.
    X_test (array-like): The testing dataset features.
    y_train (array-like): The training dataset labels.
    y_test (array-like): The testing dataset labels.

    Returns:
    dict: Metric values keyed ``"<Metric>_<split>"`` in this order:
        Accuracy_train, Accuracy_test, Precision_train, Precision_test,
        Recall_train, Recall_test, F1_train, F1_test.
    """
    modelo.fit(X_train, y_train)

    # (key suffix, true labels, predicted labels) for each split; the training
    # split is predicted first, then the testing split.
    particoes = (
        ("train", y_train, modelo.predict(X_train)),
        ("test", y_test, modelo.predict(X_test)),
    )

    # (key prefix, scoring function, extra keyword arguments) for each metric.
    pontuadores = (
        ("Accuracy", accuracy_score, {}),
        ("Precision", precision_score, {"average": 'weighted'}),
        ("Recall", recall_score, {"average": 'weighted'}),
        ("F1", f1_score, {"average": 'weighted'}),
    )

    # Metric-major iteration yields the train/test pairs grouped per metric.
    return {
        f"{rotulo}_{sufixo}": pontuar(y_real, y_previsto, **opcoes)
        for rotulo, pontuar, opcoes in pontuadores
        for sufixo, y_real, y_previsto in particoes
    }

In [11]:
# Candidate classifiers compared in the experiment, keyed by the display name
# used in the printed report.
#
# The tree-based and boosted learners are seeded so that a fresh
# Restart & Run All reproduces the same scores; the value matches the seed
# used for the train/test split. LogisticRegression is left at its defaults
# (per the scikit-learn docs, its default solver does not use random_state).
SEED_MODELOS = 101

modelos = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED_MODELOS),
    "Random Forest": RandomForestClassifier(random_state=SEED_MODELOS),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED_MODELOS),
    "XGBoost": XGBClassifier(random_state=SEED_MODELOS)
}

In [12]:
# Tracker for the best result seen per metric: each entry holds the label of
# the winning combination ('nome') and its score ('valor'). Entries start
# empty, with a score of 0. Every metric gets its own inner dict.
melhores_modelos = {
    metrica: {'nome': None, 'valor': 0}
    for metrica in ('accuracy', 'precision', 'recall', 'f1')
}

In [13]:
# Scalers to try, as (display name, instance) pairs. The display name is the
# class name itself, so it is taken from the class rather than typed twice.
scalers = [
    (classe_scaler.__name__, classe_scaler())
    for classe_scaler in (RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler)
]

In [14]:
# One row per tracked metric:
# (key in `melhores_modelos`, key in the dict returned by avaliar_modelo,
#  label used in the printed report).
metricas = [
    ('accuracy', 'Accuracy_test', 'Accuracy'),
    ('precision', 'Precision_test', 'Precision'),
    ('recall', 'Recall_test', 'Recall'),
    ('f1', 'F1_test', 'F1 Score'),
]

# Reset the tracker here so the cell is idempotent: re-running it must not
# compare new scores against winners left over from a previous execution.
melhores_modelos = {chave: {'nome': None, 'valor': 0} for chave, _, _ in metricas}

# Evaluate every scaler/classifier combination with the same pipeline layout.
for scaler_nome, scaler in scalers:
    for nome, modelo in modelos.items():
        pipeline = Pipeline([
            ('scaler', scaler),
            ('pca', PCA(n_components=15)),
            # NOTE(review): k equals PCA's n_components, so this selector
            # appears to keep every component - confirm whether a smaller k
            # was intended.
            ('selector', SelectKBest(score_func=f_classif, k=15)),
            ('model', modelo)
        ])

        resultados = avaliar_modelo(pipeline, X_train, X_test, y_train, y_test)
        combinacao = f'{nome} com {scaler_nome}'

        # Report the test-split metrics for this combination.
        print(f'\n{combinacao}:')
        for _, chave_resultado, rotulo in metricas:
            print(f'{rotulo}: {resultados[chave_resultado]}')
        print()
        print('-' * 60)

        # Keep the first combination that strictly beats the current best.
        # NOTE(review): winners are chosen on the test split, so the "best"
        # scores reported below are likely optimistic.
        for chave, chave_resultado, _ in metricas:
            if resultados[chave_resultado] > melhores_modelos[chave]['valor']:
                melhores_modelos[chave]['nome'] = combinacao
                melhores_modelos[chave]['valor'] = resultados[chave_resultado]

print("\nMelhores modelos:")
for chave, _, rotulo in metricas:
    melhor = melhores_modelos[chave]
    print(f'Melhor modelo para {rotulo}: {melhor["nome"]} com valor {melhor["valor"]}')


Logistic Regression com RobustScaler:
Accuracy: 0.8733333333333333
Precision: 0.8405727376861397
Recall: 0.8733333333333333
F1 Score: 0.8371920289855073

------------------------------------------------------------

Decision Tree com RobustScaler:
Accuracy: 0.8066666666666666
Precision: 0.7979407008086253
Recall: 0.8066666666666666
F1 Score: 0.8021765491727468

------------------------------------------------------------

Random Forest com RobustScaler:
Accuracy: 0.8766666666666667
Precision: 0.8919798657718121
Recall: 0.8766666666666667
F1 Score: 0.8250979536628997

------------------------------------------------------------

Gradient Boosting com RobustScaler:
Accuracy: 0.86
Precision: 0.8057044673539518
Recall: 0.86
F1 Score: 0.820054347826087

------------------------------------------------------------

XGBoost com RobustScaler:
Accuracy: 0.88
Precision: 0.856395092796477
Recall: 0.88
F1 Score: 0.8494545454545456

------------------------------------------------------------

Log