In [117]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

# Project root used to locate config/.env and the data/ output directory.
# Set the WINE_QUALITY_HOME environment variable to run the notebook on another
# machine; when it is unset the original local path is used.
base_path = os.getenv(
    "WINE_QUALITY_HOME", "/media/bruno/Arquivos/Desenvolvimento/WineQuality"
)
# Load the DB_* settings read by get_engine(); the call's return value is the
# cell's displayed output.
load_dotenv(f"{base_path}/config/.env")

True

In [2]:
def get_engine() -> Engine:
    """Build a SQLAlchemy engine for the MySQL database described by the environment.

    Reads DB_USER, DB_PASS, DB_NAME, DB_HOST and DB_PORT from the process
    environment and assembles a ``mysql+pymysql://`` URL from them.

    Returns:
        The engine returned by ``create_engine`` for that URL.

    Raises:
        RuntimeError: if any of the five variables is unset.
    """
    names = ("DB_USER", "DB_PASS", "DB_NAME", "DB_HOST", "DB_PORT")
    settings = {name: os.getenv(name) for name in names}

    # Fail here with the names of the unset variables instead of letting the
    # string "None" end up inside the URL. An empty value (e.g. an empty
    # password) is still accepted.
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(f"Missing database settings: {', '.join(missing)}")

    # User and password may contain characters such as '@', ':' or '/' that
    # would otherwise be parsed as URL delimiters, so they are percent-encoded.
    user = quote_plus(settings["DB_USER"])
    pw = quote_plus(settings["DB_PASS"])
    db = settings["DB_NAME"]
    host = settings["DB_HOST"]
    port = settings["DB_PORT"]
    return create_engine(f"mysql+pymysql://{user}:{pw}@{host}:{port}/{db}")

In [123]:
def _ratio(numerator, denominator) -> float:
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def _predict_at(classifier, features, threshold):
    """Label a row 1 when its predicted probability for class 1 reaches `threshold`."""
    return (classifier.predict_proba(features)[:, 1] >= threshold).astype(int)


def train(thresholds=(0.3, 0.4, 0.5, 0.6, 0.7), executions: int = 10, seed=None) -> None:
    """Train logistic-regression wine classifiers and persist the best one.

    Loads four features plus a binary label (1 when quality >= 7) from the
    ``wine_results`` table. For every decision threshold, `executions` models
    are fitted, each on a freshly undersampled set (all positive rows plus a
    random draw of negative rows) with a 70/30 train/test split and
    standardized features. Each run is scored with
    ``0.5 * f1 + 0.5 * test accuracy``; the best run's metrics are printed and
    its classifier and scaler are written to ``{base_path}/data``.

    Args:
        thresholds: probability cut-offs to evaluate (any sized sequence).
        executions: number of resampled runs per threshold.
        seed: optional seed for the sampling and the train/test split; ``None``
            keeps the runs non-deterministic.

    Raises:
        ValueError: if no run is requested, or if the table lacks rows of
            either class.
    """
    if len(thresholds) == 0 or executions < 1:
        raise ValueError("train() needs at least one threshold and one execution")

    engine = get_engine()
    with engine.connect() as conn:
        query = """
            select
                w.volatile_acidity,
                w.citric_acid,
                w.free_sulfur_dioxide,
                w.alcohol,
                if(w.quality >= 7, 1, 0) as quality
            from wine_results w
        """
        dataset = pd.read_sql(query, conn)
    # The connection is only needed for the read, so it is released before training.

    # Loop-invariant class split, done once instead of on every run.
    positives = dataset[dataset["quality"] == 1]
    negatives = dataset[dataset["quality"] != 1]
    if positives.empty or negatives.empty:
        raise ValueError("wine_results must contain rows of both quality classes")
    # Capping the draw keeps the sampling well-defined when negatives are the
    # scarcer class (an uncapped rejection loop would never finish).
    n_draw = min(len(positives), len(negatives))

    # One RandomState drives both the undersampling and the split so that a
    # given `seed` reproduces the whole experiment.
    rng = np.random.RandomState(seed)

    classifiers, scalers, metrics = [], [], []
    for threshold in thresholds:
        for _ in range(executions):
            # Undersample: every positive row plus n_draw distinct negative rows.
            picked = rng.choice(len(negatives), size=n_draw, replace=False)
            sample = pd.concat([positives, negatives.iloc[picked]], ignore_index=True)
            X = sample.drop(columns="quality").to_numpy(dtype=float)
            y = sample["quality"].to_numpy().astype(int)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3, random_state=rng
            )

            # The scaler is fitted on the training rows only.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            classifier = LogisticRegression()
            classifier.fit(X_train, y_train)

            y_pred = _predict_at(classifier, X_test, threshold)
            # Explicit labels keep the matrix 2x2 even if a class is absent
            # from both y_test and y_pred.
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
            test_acc = accuracy_score(y_test, y_pred)
            train_acc = accuracy_score(y_train, _predict_at(classifier, X_train, threshold))

            # Zero denominators (e.g. no positive predictions) score 0.0 rather than NaN.
            precision = _ratio(tp, tp + fp)
            recall = _ratio(tp, tp + fn)
            f1 = _ratio(2 * precision * recall, precision + recall)

            metrics.append({
                "train_acc": train_acc,
                "test_acc": test_acc,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "threshold": threshold,
                "execution": len(classifiers),  # position in classifiers/scalers
                "decision": f1 * 0.5 + test_acc * 0.5
            })
            classifiers.append(classifier)
            scalers.append(scaler)

    # NOTE(review): the winner is chosen by its test-set score, so the printed
    # test metrics are optimistic; a separate validation split would avoid that.
    # NOTE(review): the winning threshold is not persisted with the model, so a
    # consumer calling classifier.predict() uses the default cut-off instead.
    best = max(metrics, key=lambda run: run["decision"])
    print("RESULTADO DA MODELAGEM")
    print(f"Acurácia no treino: {best['train_acc']}")
    print(f"Acurácia no teste: {best['test_acc']}")
    print(f"Precisão do modelo: {best['precision']}")
    print(f"Recall do modelo: {best['recall']}")
    print(f"F1-score do modelo: {best['f1']}")

    dump(classifiers[best["execution"]], f"{base_path}/data/model.joblib")
    dump(scalers[best["execution"]], f"{base_path}/data/scaler.joblib")


# Run the training pipeline: prints the selected model's metrics and writes
# model.joblib and scaler.joblib under {base_path}/data.
train()

RESULTADO DA MODELAGEM
Acurácia no treino: 0.7821782178217822
Acurácia no teste: 0.8549618320610687
Precisão do modelo: 0.8157894736842105
Recall do modelo: 0.9253731343283582
F1-score do modelo: 0.8671328671328671
