In [2]:
import os
import numpy as np
import pandas as pd
from joblib import dump
from dotenv import load_dotenv
from sqlalchemy.engine import Engine
from sqlalchemy import create_engine
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Project root directory. Set the WINE_QUALITY_BASE_PATH environment variable
# (in the real environment, not in the .env file, which is located *through*
# this path) to run the notebook on another machine; otherwise the original
# location is used.
base_path = os.getenv(
    "WINE_QUALITY_BASE_PATH",
    "/media/bruno/Arquivos/Desenvolvimento/WineQuality",
)
# Load <base_path>/config/.env into the process environment -- presumably the
# source of the DB_* variables read later via os.getenv (confirm against the
# .env file). Kept as the last expression so the cell displays its result.
load_dotenv(f"{base_path}/config/.env")

True

In [3]:
def get_engine() -> Engine:
    """Create a SQLAlchemy engine for the MySQL database named in the environment.

    Reads ``DB_USER``, ``DB_PASS``, ``DB_NAME``, ``DB_HOST`` and ``DB_PORT``
    with ``os.getenv`` and assembles a ``mysql+pymysql://`` URL from them.

    Returns:
        Engine: the object returned by ``create_engine`` for that URL.

    Raises:
        RuntimeError: if any of the five variables is not set. Without this
            check the missing value would be interpolated into the URL as the
            literal text ``None``.
    """
    from urllib.parse import quote

    names = ("DB_USER", "DB_PASS", "DB_NAME", "DB_HOST", "DB_PORT")
    settings = {name: os.getenv(name) for name in names}
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            "Missing database environment variable(s): " + ", ".join(missing)
        )

    # Percent-encode the credentials so characters such as '@', ':', '/' or '%'
    # cannot be mistaken for URL delimiters. safe="" also encodes '/', and
    # quote() (unlike quote_plus) never turns a space into '+'.
    user = quote(settings["DB_USER"], safe="")
    pw = quote(settings["DB_PASS"], safe="")
    db = settings["DB_NAME"]
    host = settings["DB_HOST"]
    port = settings["DB_PORT"]
    return create_engine(f"mysql+pymysql://{user}:{pw}@{host}:{port}/{db}")

In [11]:
def _undersample_majority(dataset: pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Return all ``quality == 1`` rows plus an equally sized random draw of the rest.

    The draw is without replacement. If there are fewer ``quality != 1`` rows
    than ``quality == 1`` rows, every ``quality != 1`` row is kept (the result
    is then not perfectly balanced, but the function always terminates).

    Raises:
        ValueError: if either group is empty, since a binary classifier
            cannot be fitted on a single class.
    """
    positives = dataset[dataset["quality"] == 1]
    negatives = dataset[dataset["quality"] != 1]
    if positives.empty or negatives.empty:
        raise ValueError(
            "Both classes are required to train: found "
            f"{len(positives)} rows with quality == 1 and "
            f"{len(negatives)} rows with quality != 1."
        )
    n_draw = min(len(positives), len(negatives))
    drawn = negatives.sample(n=n_draw, random_state=random_state)
    # Same row order as before: positives first, then the sampled negatives.
    return pd.concat([positives, drawn], ignore_index=True)


def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or ``0.0`` when the denominator is zero."""
    return float(numerator / denominator) if denominator else 0.0


def train(random_state: "int | None" = None) -> None:
    """Fit, evaluate and persist the wine-quality logistic regression.

    Steps:
        1. Read four features and a binary target (``quality >= 7`` -> 1,
           else 0) from the ``wine_results`` table.
        2. Undersample the ``quality != 1`` rows to the number of
           ``quality == 1`` rows.
        3. Split 70/30, standardise with a scaler fitted on the training
           split only, and fit ``LogisticRegression``.
        4. Print train/test accuracy, precision, recall and F1.
        5. Write the unscaled test rows with true and predicted labels to
           ``data/data_test.csv`` and dump the model and scaler with joblib
           under ``{base_path}/data``.

    Args:
        random_state: optional seed passed to both the undersampling draw and
            ``train_test_split``. ``None`` (the default) keeps the run
            unseeded, as before.

    Raises:
        ValueError: if the query returns no rows for one of the two classes.
    """
    engine = get_engine()
    # Only the read needs the connection; release it before training.
    with engine.connect() as conn:
        query = """
            select
                w.volatile_acidity,
                w.citric_acid,
                w.free_sulfur_dioxide,
                w.alcohol,
                if(w.quality >= 7, 1, 0) as quality
            from wine_results w
        """
        dataset = pd.read_sql(query, conn)

    balanced = _undersample_majority(dataset, random_state)
    # The target is the last selected column; everything before it is a feature.
    X, y = balanced.values[:, :-1], balanced.values[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    classifier = LogisticRegression()
    classifier.fit(X_train_scaled, y_train)

    y_pred = classifier.predict(X_test_scaled)
    # Pin the label order so the matrix is always 2x2, even if one class is
    # absent from the test split or from the predictions.
    _, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    test_acc = accuracy_score(y_test, y_pred)
    train_acc = accuracy_score(y_train, classifier.predict(X_train_scaled))

    # Report 0.0 instead of dividing by zero when a denominator is empty.
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)

    # Unscaled test features followed by the true label and the prediction.
    df_test = pd.DataFrame(
        np.column_stack([X_test, y_test, y_pred]),
        columns=[*dataset.columns, "predict"],
    )

    print("RESULTADO DA MODELAGEM")
    print(f"Acurácia no treino: {train_acc}")
    print(f"Acurácia no teste: {test_acc}")
    print(f"Precisão do modelo: {precision}")
    print(f"Recall do modelo: {recall}")
    print(f"F1-score do modelo: {f1}")

    df_test.to_csv(f"{base_path}/data/data_test.csv", index=False)
    dump(classifier, f"{base_path}/data/model.joblib")
    dump(scaler, f"{base_path}/data/scaler.joblib")


# Run the pipeline: query the data, fit the model, print the metrics and
# write the CSV and joblib files.
train()

RESULTADO DA MODELAGEM
Acurácia no treino: 0.7821782178217822
Acurácia no teste: 0.8091603053435115
Precisão do modelo: 0.7936507936507936
Recall do modelo: 0.8064516129032258
F1-score do modelo: 0.7999999999999999
