# ML_Ops POC

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
#import mlflow.tensorflow
#import tensorflow as tf
import joblib
from sklearn.pipeline import Pipeline
#from tensorflow import keras
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from pathlib import Path
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer




In [4]:
load_dotenv(override=True)

script_dir = Path(__file__).resolve().parent if "__file__" in globals() else Path(os.getcwd())

print(f"Aktuelles Arbeitsverzeichnis: {os.getcwd()}")

root_dir = script_dir.parent.parent

print(root_dir)

data_path = os.getenv("DATA_PATH")
mlflowUri = os.getenv("MLFLOW_TRACKING_URI")
print('Origin from env: ' , data_path)
print('Origin from env: ' , mlflowUri)


data_path = Path(data_path)
mlflowUri = Path(mlflowUri)

print(data_path)
print(mlflowUri)

if not data_path.is_absolute():
    data_path = root_dir / data_path

if not mlflowUri.is_absolute():
    mlflowUri = root_dir / mlflowUri

print(data_path)
print(mlflowUri)

columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]


mlflow.set_tracking_uri(mlflowUri)
df = pd.read_csv(data_path / "adult.csv", names=columns, na_values=" ?", header=0)


Aktuelles Arbeitsverzeichnis: c:\Zeug Hendrik\Studium\Master FH Meschede\5 Semester\MLOps\Project\FHSWF-MLOps\backend\notebooks
c:\Zeug Hendrik\Studium\Master FH Meschede\5 Semester\MLOps\Project\FHSWF-MLOps
Origin from env:  data/origin
Origin from env:  backend/mlruns
data\origin
backend\mlruns
c:\Zeug Hendrik\Studium\Master FH Meschede\5 Semester\MLOps\Project\FHSWF-MLOps\data\origin
c:\Zeug Hendrik\Studium\Master FH Meschede\5 Semester\MLOps\Project\FHSWF-MLOps\backend\mlruns


In [6]:
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

mlflow.set_tracking_uri(mlflowUri)
df = pd.read_csv(data_path / "adult.csv", names=columns, na_values=" ?", header=0)

# Fehlende Werte entfernen
df.dropna(inplace=True)

# Zielvariable in boolschen Wert umwandeln
df["income"] = df["income"].apply(lambda x: 1 if x == ">50K" else 0)

# Kategorische Variablen kodieren
categorical_features = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]


X = df.drop("income", axis=1)
y = df["income"]

# Daten in Trainings- und Testset aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), X_train.select_dtypes(include=['int64', 'float64']).columns),  # Numerische Variablen skalieren
            ('cat', Pipeline([
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_features)  # Kategorische Variablen
        ]
    )),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))  # Klassifikator
])

# MLflow Experiment: Random Forest
mlflow.set_experiment("Random_Forest")
with mlflow.start_run() as run:
    
    pipeline.fit(X_train, y_train)
    
    # Vorhersagen & Metriken berechnen
    y_pred_rf = pipeline.predict(X_test)

    acc_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf)

    signature = infer_signature(X_train, pipeline.predict(X_train))
    input_example = pd.DataFrame(X_train[:5], columns=X.columns).to_dict(orient="records")

    mlflow.log_param("model", "RandomForestClassifier")
    mlflow.log_metric("accuracy", acc_rf)
    mlflow.log_metric("f1_score", f1)
    mlflow.sklearn.log_model(pipeline, "random_forest_model", signature=signature, input_example=input_example)
    
    model_uri = f"runs:/{run.info.run_id}/random_forest_pipeline"
    
    modelversion = mlflow.register_model(model_uri=model_uri, name="random_forest_pipeline")
    
    client = MlflowClient()
    client.set_registered_model_alias(name='random_forest_pipeline', alias='newest', version=modelversion.version)
    
    best_model = None
    best_f1 = 0
    for mv in client.search_model_versions(f"name='random_forest_pipeline'"):
        metrics = client.get_run(mv.run_id).data.metrics
        if metrics["f1_score"] > best_f1:
            best_f1 = metrics["f1_score"]
            best_model = mv
    
    if best_model:
        client.set_registered_model_alias(name='random_forest_pipeline', alias='best', version=best_model.version)
    

    print(f"Accuracy: {acc_rf}, F1 Score: {f1}")



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'random_forest_pipeline' already exists. Creating a new version of this model...
Created version '2' of model 'random_forest_pipeline'.


Accuracy: 0.8601064555752695, F1 Score: 0.6805858522904331


In [5]:
import requests
import json

# URL mit den Parametern model und alias für lokalen FastAPI-Testserver
url = "http://127.0.0.1:8000/predict"
params = {
    "model": "RandomForestAdultIncome",
    "alias": "newest"
}

# Beispiel-Daten aus dem Adult-Income-Dataset
body = {
    "age": 39,
    "workclass": "State-gov",
    "fnlwgt": 77516,
    "education": "Bachelors",
    "education-num": 13,
    "marital-status": "Never-married",
    "occupation": "Adm-clerical",
    "relationship": "Not-in-family",
    "race": "White",
    "sex": "Male",
    "capital-gain": 500000,
    "capital-loss": 0,
    "hours-per-week": 40,
    "native-country": "United-States"
}

# Request senden
response = requests.post(url, params=params, json=body)

# Antwort ausgeben
print(response.status_code)
print(response.json())


200
{'prediction': [1]}


In [16]:
import mlflow
import pandas as pd

# FastAPI Initialisierung
#app = FastAPI()

# MLflow Client
client = mlflow.tracking.MlflowClient()

# Bestes Modell aus MLflow Production laden
model_name = "random_forest_pipeline"
try:
    #model_uri = f"models:/{model_name}/Production"
    run_id = client.get_model_version_by_alias('random_forest_pipeline','best').run_id
    model_uri = "runs:/{}/model".format(run_id)

    pymodel = mlflow.sklearn.load_model(model_uri)
except Exception as e:
    model = None
    print(f"Fehler beim Laden des Modells: {e}")

Fehler beim Laden des Modells: No such file or directory: '/Users/developerhhotels/Documents/Projekte/Own Projects/MLOps/FHSWF-MLOps/backend/mlruns/747985668469723700/24dad1cdeb1740ba8e0ae3c95343bd64/artifacts/model'
