In [6]:
# mlflow
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# other
import pandas as pd
import numpy as np
import os
import boto3
import tempfile
# End whatever run may still be active in this kernel (presumably left over
# from an earlier execution of this cell) before a new one is started below.
mlflow.end_run()

# Select the experiment that will receive the run; the captured log output
# shows MLflow creating it when it does not exist yet.
mlflow.set_experiment('mlflow-irissignur')

# ---- data -------------------------------------------------------------------
# `datasets` is not part of the import block at the top of this cell, so it is
# imported here. `train_test_split` already is, and is not re-imported.
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# reproducible between executions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# ---- model ------------------------------------------------------------------
# Shallow random forest, seeded so repeated fits give the same estimator.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# ---- evaluation -------------------------------------------------------------
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# ---- signature --------------------------------------------------------------
# Derived from the training features and the model's predictions on them, so
# the logged model records its expected input and output schema.
signature = infer_signature(X_train, model.predict(X_train))

# start mlflow run
# Record the hyperparameters, the hold-out metric, the fitted model and the
# supporting artifacts in one MLflow run. The context manager ends the run
# when the block exits.
with mlflow.start_run() as run:
    # Hyperparameters: every constructor argument reported by the estimator.
    mlflow.log_params(model.get_params())

    # Accuracy on the held-out split, computed before the run was opened.
    mlflow.log_metric('accuracy', accuracy)

    # Fitted estimator together with its inferred input/output signature.
    mlflow.sklearn.log_model(model, 'model', signature=signature)

    # Environment spec with boto3 as an extra pip dependency, stored as
    # 'conda.yaml' at the root of the run's artifacts.
    # NOTE(review): _mlflow_conda_env is a private MLflow helper (leading
    # underscore) and may change between releases. This file is also logged
    # separately from the model under 'model/'; if boto3 is needed wherever
    # the model is loaded, confirm whether it should instead be passed to
    # log_model as a requirement.
    mlflow.log_dict(
        _mlflow_conda_env(
            None,
            additional_conda_deps=None,
            additional_pip_deps=['boto3'],
            additional_conda_channels=None),
        'conda.yaml')

    # Training features as a CSV artifact. The columns carry no names, so the
    # header row is the positional index of each feature.
    X_train_df = pd.DataFrame(X_train)

    # Stage the file in a throwaway directory rather than the kernel's working
    # directory, so running the cell does not leave (or silently overwrite) an
    # X_train.csv next to the notebook. log_artifact stores the file under its
    # base name, so the artifact is still called 'X_train.csv'.
    with tempfile.TemporaryDirectory() as staging_dir:
        csv_path = os.path.join(staging_dir, "X_train.csv")
        X_train_df.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path)

2024/11/03 15:22:23 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-irissignur' does not exist. Creating a new experiment.




In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import train_test_split
# Reload iris and split it again, this time holding out 20% for testing.
# NOTE: this rebinds X, y, X_train, X_test, y_train and y_test, replacing the
# 33% hold-out split created in the first cell; objects fitted or computed
# from the earlier split are not updated by this cell.
dataset = datasets.load_iris()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

