In [258]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from omegaconf import OmegaConf
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import mlflow

In [259]:
file_path = os.getcwd()

# Read configuration file
conf = OmegaConf.load(os.path.join(file_path, "..", "src", "config.yml"))

mlflow.set_experiment(conf["tracking_uri"]["experiment_name"])

# Load data
data_path = os.path.join(file_path, "..", "data", "UCI_Credit_Card.csv")

df = pd.read_csv(data_path)

In [260]:
def train(df, params):
    with mlflow.start_run():

        # Get features and target name
        # features = df.columns.to_list()[1:-1]
        # target = df.columns.to_list()[-1]

        features = df.iloc[:, 0:-1].values

        target = df.iloc[:, -1].values

        # Train test split
        df_train, df_test, dfy_train, dfy_test  = train_test_split(features, target, test_size=0.3, random_state=0)

        print("Tamanhos dos conjuntos de treino e teste:")
        print("Treino:", df_train.shape, dfy_train.shape)
        print("Teste:", df_test.shape, dfy_test.shape)
        
        # Normalizar os dados
        modelo_credito = StandardScaler()
        df_train = modelo_credito.fit_transform(df_train)
        df_test = modelo_credito.transform(df_test)

        # Train model
        clf = RandomForestClassifier(**params)

        clf.fit(df_train, dfy_train)

        y_pred_test = clf.predict(df_test)

        mlflow.log_params(params)

        signature = mlflow.models.infer_signature(df_train, clf.predict(df_train))

        mlflow.sklearn.log_model(
            clf,
            "model",
            signature=signature,
            input_example=df_train
        )

        # Evaluate
        gini_train = (
            2 * roc_auc_score(dfy_train, clf.predict_proba(df_train)[:, 1])
            - 1
        )
        gini_test = (
            2 * roc_auc_score(dfy_test, clf.predict_proba(df_test)[:, 1])
            - 1
        )

        mlflow.log_metrics({"gini_train": gini_train})
        mlflow.log_metrics({"gini_test": gini_test})

        # Show results
        print(f"Gini train: {gini_train:.3f}")
        print(f"Gini test:  {gini_test:.3f}")

        print(classification_report(dfy_test, y_pred_test))

In [261]:
def main():

    # for col in df.select_dtypes(include=['int']).columns:
    #     df[col] = df[col].astype('float64')

    # Train model
    train(df, conf["parameters"])

if __name__ == "__main__":
    main()


Tamanhos dos conjuntos de treino e teste:
Treino: (21000, 24) (21000,)
Teste: (9000, 24) (9000,)
Gini train: 0.996
Gini test:  0.559
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      7060
           1       0.67      0.36      0.47      1940

    accuracy                           0.82      9000
   macro avg       0.76      0.66      0.68      9000
weighted avg       0.81      0.82      0.80      9000

