In [4]:
import os
import mlflow
import mlflow.sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, roc_auc_score, roc_curve, confusion_matrix, accuracy_score
)
from mlflow.models.signature import infer_signature


In [3]:
# Load the raw training table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../raw_data/train.csv')

In [8]:


# Point the MLflow client at the tracking server on localhost and select
# the experiment that subsequent runs are recorded under.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="test-mlflow")

def train_with_mlflow(df: pd.DataFrame, target_col: str):
    """Fit a preprocessing + logistic-regression pipeline and record it in MLflow.

    Rows containing any missing value are dropped. Object-dtype feature columns
    are one-hot encoded, numeric feature columns are standardized, and any other
    column is dropped. A ``LogisticRegression`` is then fitted on a stratified
    80/20 train/test split (``random_state=42``). Inside a single MLflow run the
    held-out accuracy is logged as ``test_accuracy`` and the fitted pipeline is
    logged and registered under the name ``test-logistic``.

    Args:
        df: Table holding the feature columns and ``target_col``. The caller's
            frame is not modified; cleaning happens on a copy.
        target_col: Name of the label column in ``df``.

    Returns:
        None. All results are recorded through the MLflow client.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
        ValueError: If no rows remain after dropping rows with missing values.
    """
    # Fail early with a clear message instead of deep inside pandas/sklearn.
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in DataFrame")

    complete_rows = df.dropna(axis=0).reset_index(drop=True)
    if complete_rows.empty:
        raise ValueError("no rows left after dropping rows with missing values")

    X = complete_rows.drop(columns=[target_col])
    y = complete_rows[target_col]

    # Column lists are resolved once, from the dtypes of the cleaned features.
    cat_cols = selector(dtype_include=object)(X)
    num_cols = selector(dtype_include=np.number)(X)

    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ("num", StandardScaler(), num_cols),
        ],
        remainder="drop"
    )

    pipe = Pipeline(steps=[
        ("prep", preprocess),
        ("clf", LogisticRegression(max_iter=1000, class_weight=None))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    with mlflow.start_run(run_name="logreg_ohe_pipeline"):
        mlflow.sklearn.autolog(log_input_examples=True, silent=True)

        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_test)

        # Previously this dict was built and then discarded; log it explicitly
        # so the held-out score is stored with the run.
        metrics = {
            "test_accuracy": float(accuracy_score(y_test, y_pred))
        }
        mlflow.log_metrics(metrics)

        # Attach an explicit signature and a small input example so the
        # registered model documents the schema it expects.
        mlflow.sklearn.log_model(
            sk_model=pipe,
            artifact_path="model",
            signature=infer_signature(X_test, y_pred),
            input_example=X_train.head(5),
            registered_model_name="test-logistic"
        )


In [None]:

# GPU (RAPIDS cuML/cuDF) version of the same preprocessing + logistic regression.
# The cuML classes are imported under GPU-prefixed aliases and the toy frame is
# named `gpu_df` so that this cell does not overwrite the pandas `df` or shadow
# the scikit-learn names that `train_with_mlflow` looks up when it is called.
import cupy as cp
import cudf
from cuml.compose import make_column_transformer, make_column_selector
from cuml.preprocessing import OneHotEncoder as GpuOneHotEncoder
from cuml.preprocessing import StandardScaler as GpuStandardScaler
from cuml.linear_model import LogisticRegression as GpuLogisticRegression
from cuml.model_selection import train_test_split as gpu_train_test_split
from cuml.metrics import accuracy_score as gpu_accuracy_score
from cuml.metrics import roc_auc_score as gpu_roc_auc_score

# ---- 1) Example data: numeric columns and categorical columns
# In practice, replace this with your own cudf.DataFrame.
gpu_df = cudf.DataFrame({
    "age": [23, 45, 31, 52, 41, 22, 36, 28],
    "income": [10.2, 25.5, 15.3, 40.1, 27.2, 9.8, 18.6, 12.0],
    "city": ["Hanoi", "Saigon", "Danang", "Hanoi", "Saigon", "Danang", "Hanoi", "Saigon"],
    "tier": ["A", "B", "A", "C", "B", "A", "B", "C"],
    "label": [0, 1, 0, 1, 1, 0, 1, 0],  # binary label
})

# X (features) & y (target)
X = gpu_df.drop(columns=["label"])
y = gpu_df["label"]

# ---- 2) Build the column transformer: numeric -> scaler, categorical -> one-hot
# Numeric columns are selected with dtype_include=cp.number (per the cuML docs);
# categorical columns with dtype_include=object.
# Reference for the selector/transformer example (compose section):
# https://docs.rapids.ai/api/cuml/stable/api/
preprocess = make_column_transformer(
    (GpuStandardScaler(), make_column_selector(dtype_include=cp.number)),
    (GpuOneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object)),
)

# ---- 3) Train/test split (GPU)
X_train, X_test, y_train, y_test = gpu_train_test_split(X, y, train_size=0.8, random_state=42)

# ---- 4) Fit the transformer on the training split, then transform both splits
X_train_t = preprocess.fit_transform(X_train)   # scaled + one-hot encoded features
X_test_t  = preprocess.transform(X_test)

# ---- 5) Logistic regression (GPU)
# Note: cuML's solver differs slightly from scikit-learn's, so results may
# differ a little.
# Docs for LogisticRegression and predict_proba/decision_function/predict_log_proba:
# https://docs.rapids.ai/api/cuml/stable/api/
logit = GpuLogisticRegression(max_iter=1000)
logit.fit(X_train_t, y_train)

# ---- 6) Evaluation
y_pred = logit.predict(X_test_t)
# Positive-class probabilities, needed for the AUC:
y_proba = logit.predict_proba(X_test_t)[:, 1]

acc = float(gpu_accuracy_score(y_test, y_pred))
auc = float(gpu_roc_auc_score(y_test, y_proba))

print(f"Accuracy: {acc:.4f}")
print(f"ROC AUC : {auc:.4f}")

# ---- 7) Inference on new data
new_df = cudf.DataFrame({
    "age": [30, 50],
    "income": [16.0, 35.0],
    "city": ["Danang", "Hanoi"],
    "tier": ["B", "C"],
})
new_X = preprocess.transform(new_df)
new_proba = logit.predict_proba(new_X)[:, 1]
print("Predicted probabilities:", new_proba)


In [9]:
# Train and register the model, using the 'target' column as the label.
train_with_mlflow(df=df, target_col='target')


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2806.90it/s] 
Registered model 'test-logistic' already exists. Creating a new version of this model...
2025/11/04 21:35:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: test-logistic, version 1
Created version '1' of model 'test-logistic'.


🏃 View run logreg_ohe_pipeline at: http://127.0.0.1:5000/#/experiments/487761078381125904/runs/10fbef67202c4a5294859e16482b433a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/487761078381125904
