In [20]:
import pandas as pd
import sys
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [2]:
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [4]:
from src.preprocessor import preprocessamento

In [7]:
df = pd.read_csv("./data/dataset.csv")

In [8]:
df = preprocessamento(df)

In [9]:
X = df.drop('conversion_status', axis=1)
y = df['conversion_status']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [11]:
def avaliar_modelo(modelo, X_train, X_test, y_train, y_test):
    modelo.fit(X_train, y_train)

    train_pred = modelo.predict(X_train)
    test_pred = modelo.predict(X_test)

    accuracy_train = accuracy_score(y_train, train_pred)
    precision_train = precision_score(y_train, train_pred, average='weighted')
    recall_train = recall_score(y_train, train_pred, average='weighted')
    f1_train = f1_score(y_train, train_pred, average='weighted')

    accuracy_test = accuracy_score(y_test, test_pred)
    precision_test = precision_score(y_test, test_pred, average='weighted')
    recall_test = recall_score(y_test, test_pred, average='weighted')
    f1_test = f1_score(y_test, test_pred, average='weighted')

    return {
        "Accuracy_train": accuracy_train,
        "Accuracy_test": accuracy_test,
        "Precision_train": precision_train,
        "Precision_test": precision_test,
        "Recall_train": recall_train,
        "Recall_test": recall_test,
        "F1_train": f1_train,
        "F1_test": f1_test,
    }

In [13]:
modelos = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier()
}

In [17]:
melhores_modelos = {
    'accuracy': {'nome': None, 'valor': 0},
    'precision': {'nome': None, 'valor': 0},
    'recall': {'nome': None, 'valor': 0},
    'f1': {'nome': None, 'valor': 0}
}

In [21]:
scalers = [
    ('RobustScaler', RobustScaler()),
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('MaxAbsScaler', MaxAbsScaler())
]

In [23]:
for scaler_nome, scaler in scalers:
    for nome, modelo in modelos.items():
        pipeline = Pipeline([
            ('scaler', scaler),
            ('pca', PCA(n_components=10)),
            ('selector', SelectKBest(score_func=f_classif, k=10)),
            ('model', modelo)
        ])
        
        resultados = avaliar_modelo(pipeline, X_train, X_test, y_train, y_test)
        
        print(f'\n{nome} com {scaler_nome}:')
        print(f'Accuracy: {resultados["Accuracy_test"]}')
        print(f'Precision: {resultados["Precision_test"]}')
        print(f'Recall: {resultados["Recall_test"]}')
        print(f'F1 Score: {resultados["F1_test"]}\n')
        print('-' * 30)
        
        if resultados["Accuracy_test"] > melhores_modelos['accuracy']['valor']:
            melhores_modelos['accuracy']['nome'] = f'{nome} com {scaler_nome}'
            melhores_modelos['accuracy']['valor'] = resultados["Accuracy_test"]
        
        if resultados["Precision_test"] > melhores_modelos['precision']['valor']:
            melhores_modelos['precision']['nome'] = f'{nome} com {scaler_nome}'
            melhores_modelos['precision']['valor'] = resultados["Precision_test"]
        
        if resultados["Recall_test"] > melhores_modelos['recall']['valor']:
            melhores_modelos['recall']['nome'] = f'{nome} com {scaler_nome}'
            melhores_modelos['recall']['valor'] = resultados["Recall_test"]
        
        if resultados["F1_test"] > melhores_modelos['f1']['valor']:
            melhores_modelos['f1']['nome'] = f'{nome} com {scaler_nome}'
            melhores_modelos['f1']['valor'] = resultados["F1_test"]

print("\nMelhores modelos:")
print(f'Melhor modelo para Accuracy: {melhores_modelos["accuracy"]["nome"]} com valor {melhores_modelos["accuracy"]["valor"]}')
print(f'Melhor modelo para Precision: {melhores_modelos["precision"]["nome"]} com valor {melhores_modelos["precision"]["valor"]}')
print(f'Melhor modelo para Recall: {melhores_modelos["recall"]["nome"]} com valor {melhores_modelos["recall"]["valor"]}')
print(f'Melhor modelo para F1 Score: {melhores_modelos["f1"]["nome"]} com valor {melhores_modelos["f1"]["valor"]}')


Logistic Regression com RobustScaler:
Accuracy: 0.8904124737365918
Precision: 0.8669648764923487
Recall: 0.8904124737365918
F1 Score: 0.8679379976268952

------------------------------

Decision Tree com RobustScaler:
Accuracy: 0.8609974565962623
Precision: 0.863900802310378
Recall: 0.8609974565962623
F1 Score: 0.8624135762383106

------------------------------

Random Forest com RobustScaler:
Accuracy: 0.8993696782041358
Precision: 0.8871126254636635
Recall: 0.8993696782041358
F1 Score: 0.8906405451711885

------------------------------

Gradient Boosting com RobustScaler:
Accuracy: 0.9001437576025655
Precision: 0.8863651707072634
Recall: 0.9001437576025655
F1 Score: 0.889332389304742

------------------------------

XGBoost com RobustScaler:
Accuracy: 0.9016919163994249
Precision: 0.8910630805640329
Recall: 0.9016919163994249
F1 Score: 0.8943738316828895

------------------------------

Logistic Regression com StandardScaler:
Accuracy: 0.8895278115669578
Precision: 0.865269546259086