# ML_Ops POC

In [26]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
#import mlflow.tensorflow
#import tensorflow as tf
import joblib
from sklearn.pipeline import Pipeline
#from tensorflow import keras
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from pathlib import Path


In [27]:


print(os.getcwd()) 
root_dir = Path(os.getcwd()).resolve().parent.parent
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
test = root_dir / "data" / "origin" / "adult.csv"
print(test)
df = pd.read_csv(root_dir / "data" / "origin" / "adult.csv", names=columns, na_values=" ?", header=0)


c:\Zeug Hendrik\Studium\Master FH Meschede\5 Semester\MLOps\Project\FHSWF-MLOps\backend\notebooks
C:\Zeug Hendrik\Studium\Master FH Meschede\5 Semester\MLOps\Project\FHSWF-MLOps\data\origin\adult.csv


In [28]:
root_dir = Path(os.getcwd()).resolve().parent.parent
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

#df_read = pd.read_csv('Data/Income/adult.csv', names=columns, na_values=" ?", header=0)
df = pd.read_csv(root_dir / "data" / "origin" / "adult.csv", names=columns, na_values=" ?", header=0)

# Fehlende Werte entfernen
df.dropna(inplace=True)

# Zielvariable in boolschen Wert umwandeln
df["income"] = df["income"].apply(lambda x: 1 if x == ">50K" else 0)

# Kategorische Variablen kodieren
categorical_features = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
for col in categorical_features:
    df[col] = LabelEncoder().fit_transform(df[col])

df

# Features und Zielvariable definieren
X = df.drop("income", axis=1)
y = df["income"]

# Daten in Trainings- und Testset aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)


pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])

#mlflow.set_experiment("Logistic_Regression")
#with mlflow.start_run():
    #model_lr = LogisticRegression(class_weight='balanced')
    #model_lr.fit(X_train, y_train)
    #y_pred_lr = model_lr.predict(X_test)
    
    #acc_lr = accuracy_score(y_test, y_pred_lr)
    #mlflow.log_param("model", "LogisticRegression")
    #mlflow.log_metric("accuracy", acc_lr)
    #mlflow.sklearn.log_model(model_lr, "model")
    
    #print(f"Logistic Regression Accuracy: {acc_lr:.4f}")

# MLflow Experiment: Random Forest
mlflow.set_experiment("Random_Forest")
with mlflow.start_run() as run:
    
    pipeline.fit(X_train, y_train)
    
    # Vorhersagen & Metriken berechnen
    y_pred_rf = pipeline.predict(X_test)

    #model_rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
    #model_rf.fit(X_train, y_train)
    #y_pred_rf = model_rf.predict(X_test)
    
    acc_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf)

    signature = infer_signature(X_train, pipeline.predict(X_train))
    input_example = pd.DataFrame(X_train[:5], columns=X.columns).to_dict(orient="records")

    mlflow.log_param("model", "RandomForestClassifier")
    mlflow.log_metric("accuracy", acc_rf)
    mlflow.log_metric("f1_score", f1)
    mlflow.sklearn.log_model(pipeline, "random_forest_model", signature=signature, input_example=input_example)
    model_uri = f"runs:/{run.info.run_id}/random_forest_pipeline"
    modelversion = mlflow.register_model(model_uri=model_uri, name="random_forest_pipeline")
    
    client = MlflowClient()
    client.set_registered_model_alias(name='random_forest_pipeline', alias='testalias', version=modelversion.version)
    #client.transition_model_version_stage(name="random_forest_pipeline", version=1, stage="Staging")

    
    best_model = None
    best_f1 = 0
    for mv in client.search_model_versions(f"name='random_forest_pipeline'"):
        if mv.current_stage == "Staging":
            metrics = client.get_run(mv.run_id).data.metrics
            if metrics["f1_score"] > best_f1:
                best_f1 = metrics["f1_score"]
                best_model = mv
    
    if best_model:
        client.transition_model_version_stage(name="random_forest_pipeline", version=best_model.version, stage="Production")
    

    print(f"Accuracy: {acc_rf}, F1 Score: {f1}")

joblib.dump(pipeline, "model.joblib")
print("Model saved as model.joblib")

#mlflow.set_experiment("TensorFlow_NN")
#with mlflow.start_run() as run:
    #model_tf = keras.Sequential([
    #    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    #    keras.layers.Dense(32, activation='relu'),
    #    keras.layers.Dense(1, activation='sigmoid')
    #])
    
    #model_tf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    #history = model_tf.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=0)
    
    #test_loss, test_acc = model_tf.evaluate(X_test, y_test, verbose=0)
    
    #mlflow.log_param("model", "TensorFlow_NN")
    #mlflow.log_metric("accuracy", test_acc)
    #mlflow.tensorflow.log_model(model_tf, "model")
    
    #print(f"TensorFlow Neural Network Accuracy: {test_acc:.4f}")

    #model_name = "TensorFlow_NN_Model"
    #model_uri = f"runs:/{run.info.run_id}/model"
    #mlflow.register_model(model_uri, model_name)
    
    # Model Stage auf Staging setzen
    #client = MlflowClient()
    #model_version = client.get_latest_versions(model_name, stages=["None"])[0].version
    #client.transition_model_version_stage(name=model_name, version=model_version, stage="Staging")
    
    #print(f"Model {model_name} Version {model_version} moved to Staging")


2025/02/27 20:00:44 INFO mlflow.tracking.fluent: Experiment with name 'Random_Forest' does not exist. Creating a new experiment.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Accuracy: 0.8651562713252354, F1 Score: 0.6873417721518987
Model saved as model.joblib


Successfully registered model 'random_forest_pipeline'.
Created version '1' of model 'random_forest_pipeline'.


In [13]:
import mlflow
import pandas as pd

# FastAPI Initialisierung
#app = FastAPI()

# MLflow Client
client = mlflow.tracking.MlflowClient()

# Bestes Modell aus MLflow Production laden
model_name = "random_forest_pipeline"
try:
    #model_uri = f"models:/{model_name}/Production"
    model = client.get_model_version_by_alias('random_forest_pipeline','testalias')
except Exception as e:
    model = None
    print(f"Fehler beim Laden des Modells: {e}")