In [3]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn

  This is separate from the ipykernel package so we can avoid doing imports until


# Entrainement et enregistrement d'un modèle avec mlflow

## Chargement du jeu de données sklearn sur le cancer du sein.

In [4]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [5]:
# Build one frame holding the feature columns plus a 'target' label column,
# then preview its first rows.
df = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names']).assign(target=cancer['target'])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
df.shape

(569, 31)

In [7]:
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


In [14]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


## Extraire quelques données de test à utiliser avec l'API de modèle mlflow.

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as an external test set for the served model.
# The split is seeded so that test.csv, the fitted model and every number
# shown further down are reproduced on a fresh "Restart & Run All"; without
# a seed each execution shuffles the rows differently.
train, test = train_test_split(df, test_size=0.2, random_state=123)

In [17]:
# Detach the label column from the hold-out frame, then write the labels and
# the remaining feature columns to two separate CSV files (no index column).
test_target = test.pop('target')
test_target.to_frame().to_csv('test-target.csv', index=False)
test.to_csv('test.csv', index=False)

## Diviser les données d'entraînement, construire un pipeline et entraîner un modèle.

In [18]:
# Every column except the label is used as a model input.
features = [column for column in train.columns if column != 'target']
x_raw, y_raw = train[features], train['target']

# Seeded 80/20 split of the training rows, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    x_raw,
    y_raw,
    test_size=.20,
    random_state=123,
    stratify=y_raw,
)

In [19]:
# Preprocessing stage: a single standard-scaling step wrapped in its own pipeline.
preprocessor = Pipeline(steps=[('scaler', StandardScaler())])

# Seeded random forest with balanced class weights.
clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=123,
)

# Full model: preprocessing followed by the classifier.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('randomforestclassifier', clf),
])


In [20]:
model.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
           ...imators=100, n_jobs=1, oob_score=False, random_state=123,
            verbose=0, warm_start=False))])

## Vérifier quelques métriques du modèle.

In [21]:
accuracy_train = model.score(x_train, y_train)
accuracy_train

1.0

In [22]:
accuracy_test = model.score(x_test, y_test)
accuracy_test

0.945054945054945

In [23]:
model.get_params()

{'memory': None, 'preprocessor': Pipeline(memory=None,
      steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), 'preprocessor__memory': None, 'preprocessor__scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'preprocessor__scaler__copy': True, 'preprocessor__scaler__with_mean': True, 'preprocessor__scaler__with_std': True, 'preprocessor__steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True))], 'randomforestclassifier': RandomForestClassifier(bootstrap=True, class_weight='balanced',
             criterion='gini', max_depth=None, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=2,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_jobs=1, oob_score=False, random_state=123,
             verbose=0, warm_start=False), 'randomforestclassifier__bootstrap': True, 'randomforestclassifier__class

## Mise à jour du modèle pour donner des probabilités au lieu d'une cible binaire.

In [24]:
def overwrite_predict(func):
    """Wrap ``func`` so that it returns rounded values from column 1 of its result.

    The returned callable forwards all positional and keyword arguments to
    ``func``, selects column index 1 of the 2-D array it returns, and gives
    those values back as a plain list, each rounded to 4 decimal places.
    """
    def wrapper(*args, **kwargs):
        second_column = func(*args, **kwargs)[:, 1]
        return [round(value, 4) for value in second_column]
    return wrapper

model.predict = overwrite_predict(model.predict_proba)

## Assurons-nous que nous pouvons exécuter/accéder au serveur mlflow.

## Définir la configuration de mlflow.

In [25]:
# Point MLflow at the tracking server and select the experiment used by the
# runs started below.
# NOTE(review): assumes a tracking server is already listening on
# localhost:5000 - confirm before running.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("my-experiment")


<Experiment: artifact_location='./mlflow-artifact-root/1', experiment_id='1', lifecycle_stage='active', name='my-experiment', tags={}>

In [26]:
from mlflow.tracking import MlflowClient
client = MlflowClient()


In [27]:
# Used both as the artifact path inside the run and as the registered model name.
model_name = "clf-model"

with mlflow.start_run() as run:
  run_num = run.info.run_id
  # URI of the artifact logged below, in the "runs:/<run_id>/<artifact_path>" form.
  model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run_num, artifact_path=model_name)

  # Record the accuracies computed in the earlier cells, then log the pipeline itself.
  mlflow.log_metric('accuracy_train', accuracy_train)
  mlflow.log_metric('accuracy_test', accuracy_test)
  mlflow.sklearn.log_model(model, model_name)
  #mlflow.sklearn.save_model(model, "clf-model")

  # Register the logged artifact in the model registry under model_name.
  model_details = mlflow.register_model(
    model_uri=model_uri,
    name=model_name)


Registered model 'clf-model' already exists. Creating a new version of this model...
2022/11/11 13:37:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: clf-model, version 4
Created version '4' of model 'clf-model'.


In [28]:
# Look up every registered version of the model and keep the most recent one.
# The registry reports version numbers as strings, so they are compared as
# integers: a plain max() over strings is lexicographic and would rank '9'
# above '10'. The result is converted back to a string, the type the
# registry itself uses for versions.
model_version_infos = client.search_model_versions("name = '%s'" % model_name)
new_model_version = str(max(int(model_version_info.version) for model_version_info in model_version_infos))


In [29]:
from mlflow.entities.model_registry.model_version_status import ModelVersionStatus

def wait_model_transition(model_name, model_version, stage, max_attempts=10, poll_interval=1):
  """Poll a registered model version until it is READY, then move it to ``stage``.

  Args:
    model_name: Name of the registered model.
    model_version: Version of the model to transition.
    stage: Target stage passed to ``transition_model_version_stage``.
    max_attempts: Maximum number of status checks before giving up.
    poll_interval: Seconds to sleep between two status checks.

  The status is printed on every check. If the version is still not READY
  after ``max_attempts`` checks, a message is printed and the function
  returns without changing the stage.
  """
  # Imported locally because the notebook never imports ``time``; without it
  # the sleep below raised NameError whenever the version was not READY on
  # the first check.
  import time

  client = MlflowClient()
  for _ in range(max_attempts):
    model_version_details = client.get_model_version(
      name=model_name,
      version=model_version,
    )
    status = ModelVersionStatus.from_string(model_version_details.status)
    print("Model status: %s" % ModelVersionStatus.to_string(status))
    if status == ModelVersionStatus.READY:
      client.transition_model_version_stage(
          name=model_name,
          version=model_version,
          stage=stage,
        )
      break
    time.sleep(poll_interval)
  else:
    # Loop finished without ``break``: the version never became READY.
    print("Model version %s of %s not READY after %d checks; stage left unchanged."
          % (model_version, model_name, max_attempts))

In [30]:
# Best effort: move the previous version back to the "None" stage before
# promoting the new one. A failure here (presumably when no previous version
# exists - confirm) must not abort the notebook, so it is reported instead of
# silently discarded. Catching Exception rather than using a bare except lets
# KeyboardInterrupt and SystemExit propagate.
try:
  wait_model_transition(model_name, int(new_model_version)-1, "None")
except Exception as exc:
  print("Could not reset the previous model version: %s" % exc)

# Promote the freshly registered version.
wait_model_transition(model_name, new_model_version, "Staging")

Model status: READY
Model status: READY


In [31]:
new_model_version

'4'

In [32]:
client.update_model_version(
  name=model_name,
  version=new_model_version,
  description="This model is a random forest classifier for the breast cancer dataset from sklearn."
)

<ModelVersion: creation_timestamp=1668173870201, current_stage='Staging', description=('This model is a random forest classifier for the breast cancer dataset from '
 'sklearn.'), last_updated_timestamp=1668173902931, name='clf-model', run_id='66f2e59c848d4895bfb1fb5831669c6f', run_link='', source='./mlflow-artifact-root/1/66f2e59c848d4895bfb1fb5831669c6f/artifacts/clf-model', status='READY', status_message='', tags={}, user_id='', version='4'>

# Extraction d'un modèle à partir du registre.

In [33]:
import mlflow.pyfunc

# Registered model name and the registry stage to load it from.
model_name, stage = "clf-model", 'Staging'

# Load the model through a "models:/<name>/<stage>" registry URI.
loaded_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{stage}")

In [34]:
predicted_probs = loaded_model.predict(test[features])
predicted_probs

[0.9097,
 0.9584,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0817,
 1.0,
 0.9269,
 0.8988,
 0.0064,
 1.0,
 0.9891,
 1.0,
 0.0,
 0.9548,
 0.3868,
 0.7488,
 1.0,
 0.9764,
 0.986,
 1.0,
 0.98,
 0.6269,
 0.9694,
 0.9923,
 1.0,
 0.9884,
 0.0184,
 1.0,
 0.8303,
 0.0,
 1.0,
 0.99,
 0.9745,
 0.1165,
 0.0,
 0.7972,
 0.0074,
 1.0,
 1.0,
 0.2898,
 1.0,
 0.3441,
 0.0,
 1.0,
 0.0,
 1.0,
 0.99,
 1.0,
 1.0,
 1.0,
 0.8125,
 1.0,
 0.9028,
 1.0,
 0.0,
 0.2735,
 0.7313,
 0.0037,
 0.9923,
 1.0,
 0.2105,
 1.0,
 1.0,
 0.9954,
 0.0906,
 0.4076,
 0.0037,
 0.4379,
 0.1264,
 0.9937,
 0.1284,
 1.0,
 1.0,
 0.99,
 0.4399,
 0.99,
 0.9777,
 0.0,
 0.9937,
 1.0,
 0.9268,
 0.0144,
 0.0,
 0.0201,
 0.0,
 0.2202,
 0.9864,
 0.0037,
 0.01,
 0.0,
 1.0,
 0.0174,
 0.9647,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0037,
 1.0,
 0.9015,
 0.0431,
 0.0,
 0.0,
 0.0,
 0.2198,
 0.9923,
 1.0,
 0.0091,
 0.9923,
 0.1663]

## Assurons-nous que le modèle est servi sur le port 1234

In [36]:
# POST the held-out feature rows (test.csv) as text/csv to the /invocations
# endpoint on port 1234 and capture curl's output as a list of lines. The
# captured lines include curl's progress meter as well as the response body.
api_response = !curl http://localhost:1234/invocations  -H 'Content-Type: text/csv' --data-binary @test.csv
api_response

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '100 25256  100   749  100 24507   7884   251k --:--:-- --:--:-- --:--:--  262k',
 '[0.9097, 0.9584, 1.0, 1.0, 1.0, 1.0, 0.0817, 1.0, 0.9269, 0.8988, 0.0064, 1.0, 0.9891, 1.0, 0.0, 0.9548, 0.3868, 0.7488, 1.0, 0.9764, 0.986, 1.0, 0.98, 0.6269, 0.9694, 0.9923, 1.0, 0.9884, 0.0184, 1.0, 0.8303, 0.0, 1.0, 0.99, 0.9745, 0.1165, 0.0, 0.7972, 0.0074, 1.0, 1.0, 0.2898, 1.0, 0.3441, 0.0, 1.0, 0.0, 1.0, 0.99, 1.0, 1.0, 1.0, 0.8125, 1.0, 0.9028, 1.0, 0.0, 0.2735, 0.7313, 0.0037, 0.9923, 1.0, 0.2105, 1.0, 1.0, 0.9954, 0.0906, 0.4076, 0.0037, 0.4379, 0.1264, 0.9937, 0.1284, 1.0, 1.0, 0.99, 0.4399, 0.99, 0.9777, 0.0, 0.9937, 1.0, 0.9268, 0.0144, 0.0, 0.0201, 0.0, 0.2202, 0.9864, 0.0037, 0.01, 0.0, 1.0, 0.0174, 0.9647, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 

In [37]:
import ast

# Element 5 of the captured curl output is the response body; the elements
# before it are curl's progress meter. The body comes from an HTTP response,
# so it is parsed with ast.literal_eval, which only accepts Python literals,
# rather than eval, which would execute arbitrary code returned by the server.
api_probs = ast.literal_eval(api_response[5])

In [38]:
# Threshold the served probabilities at 0.5 to obtain 0/1 class labels.
api_target = [int(float(prob) >= 0.5) for prob in api_probs]

In [39]:
api_target

[1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0]

In [40]:
# true target
test_target

151    1
191    1
420    1
37     1
422    1
      ..
383    1
324    1
33     0
310    1
22     0
Name: target, Length: 114, dtype: int64

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
accuracy_score(test_target, api_target)

0.9649122807017544

In [43]:
mlflow.end_run()