In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier
from dotenv import load_dotenv
import mlflow
from mlflow import MlflowClient
import os
from datetime import datetime


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
data = pd.read_csv("data/titanic_data.csv")
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Jerry,male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,Jerry,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Jerry,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,Jerry,female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,Jerry,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# 將 Age 的缺失值補 Age 的平均數
data['Age'].fillna(data['Age'].mean(), inplace = True) # inplace=True 表示覆蓋掉原資料
#data['Age'] = data['Age'].fillna(data['Age'].mean(), inplace = False)

# 將 Embarked 的缺失值補 Embarked 的眾數
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace = True)

# 將 Fare 的缺失值補 Fare 的中位數
data['Fare'].fillna(data['Fare'].median(), inplace = True)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# drop 較為不重要的欄位
drop_column = ['PassengerId','Cabin', 'Ticket', 'Name']
data.drop(drop_column, axis=1, inplace = True)

In [6]:
# 資料切分，並紀錄random_state
random_state = 311
test_size = 0.2
data_train, data_val = train_test_split(data,
                                        test_size=test_size,
                                        shuffle=True, 
                                        random_state=random_state)

In [7]:
# 資料 Ground Truth 設定
y_train = data_train.Survived
X_train = data_train.drop(columns='Survived')

y_val = data_train.Survived
X_val = data_train.drop(columns='Survived')

X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
742,1,female,21.000000,2,2,262.3750,C
608,2,female,22.000000,1,2,41.5792,C
232,2,male,59.000000,0,0,13.5000,S
849,1,female,29.699118,1,0,89.1042,C
239,2,male,33.000000,0,0,12.2750,S
...,...,...,...,...,...,...,...
347,3,female,29.699118,1,0,16.1000,S
553,3,male,22.000000,0,0,7.2250,C
312,2,female,26.000000,1,1,26.0000,S
93,3,male,26.000000,1,2,20.5750,S


In [8]:
categorical_features = ['Pclass', 'Sex', 'Embarked']
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']
X_train.Pclass = X_train.Pclass.astype('object')
X_val.Pclass = X_val.Pclass.astype('object')

# 2. 將類別變項進行one-hot編碼(OneHotEncoder)並紀錄到新表格
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

X_train_onehot = pd.DataFrame(encoder.fit_transform(X_train[categorical_features]), # 將類別變項進行轉換
                              columns=encoder.get_feature_names_out(categorical_features), # 輸入新的欄位名稱
                              index=X_train.index) # 紀錄資料順序
                              
X_val_onehot = pd.DataFrame(encoder.transform(X_val[categorical_features]), # 將類別變項進行轉換
                            columns=encoder.get_feature_names_out(categorical_features), # 輸入新的欄位名稱
                            index=X_val.index) # 紀錄資料順序

# 3. 將連續變項歸一化(MinMaxScaler): 將數值壓縮到0~1之間
scaler = MinMaxScaler()

X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

# 4. 合併(concat)處理後的 類別變項欄位、連續變項欄位
X_train = pd.concat([X_train_onehot, X_train[numerical_features]], axis=1)
X_val = pd.concat([X_val_onehot, X_val[numerical_features]], axis=1) 

print(X_train.shape)
X_train

(712, 12)




Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age,SibSp,Parch,Fare
742,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.258608,0.250,0.333333,0.512122
608,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.271174,0.125,0.333333,0.081157
232,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.736115,0.000,0.000000,0.026350
849,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.367921,0.125,0.000000,0.173920
239,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.409399,0.000,0.000000,0.023959
...,...,...,...,...,...,...,...,...,...,...,...,...
347,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.367921,0.125,0.000000,0.031425
553,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.271174,0.000,0.000000,0.014102
312,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.321438,0.125,0.166667,0.050749
93,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.321438,0.125,0.333333,0.040160


In [9]:
# 建立模型
model_svc = SVC(C=1.0,        # Regularization parameter
                kernel='rbf') # kernel

model_xgb = XGBClassifier(max_depth=2,
                          learning_rate=0.1)


In [10]:
# 訓練模型
model_svc.fit(X_train, y_train)

In [11]:
model_xgb.fit(X_train, y_train,
            eval_set=[(X_val, y_val)])

[0]	validation_0-logloss:0.65420
[1]	validation_0-logloss:0.62228
[2]	validation_0-logloss:0.59583
[3]	validation_0-logloss:0.57351
[4]	validation_0-logloss:0.55476
[5]	validation_0-logloss:0.53874
[6]	validation_0-logloss:0.52513
[7]	validation_0-logloss:0.51349
[8]	validation_0-logloss:0.50343
[9]	validation_0-logloss:0.49486
[10]	validation_0-logloss:0.48737
[11]	validation_0-logloss:0.48098
[12]	validation_0-logloss:0.47536
[13]	validation_0-logloss:0.47040
[14]	validation_0-logloss:0.46602
[15]	validation_0-logloss:0.46216
[16]	validation_0-logloss:0.45853
[17]	validation_0-logloss:0.45399
[18]	validation_0-logloss:0.45074
[19]	validation_0-logloss:0.44793
[20]	validation_0-logloss:0.44510
[21]	validation_0-logloss:0.44149
[22]	validation_0-logloss:0.43923
[23]	validation_0-logloss:0.43686
[24]	validation_0-logloss:0.43394
[25]	validation_0-logloss:0.43192
[26]	validation_0-logloss:0.42983
[27]	validation_0-logloss:0.42729
[28]	validation_0-logloss:0.42511
[29]	validation_0-loglos

In [12]:
# 評估指標
y_pred = model_svc.predict(X_val)
accuracy_svc = (y_pred == y_val).sum()/y_val.shape[0]
accuracy_svc

0.8061797752808989

In [13]:
y_pred = model_xgb.predict(X_val)
accuracy_xgb = (y_pred == y_val).sum()/y_val.shape[0]
accuracy_xgb

0.8651685393258427

In [14]:
load_dotenv('.env')
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv('MINIO_ROOT_USER')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('MINIO_ROOT_PASSWORD')
os.environ["MLFLOW_S3_ENDPOINT_URL"] = os.getenv('MLFLOW_S3_ENDPOINT_URL')

mlflow.set_tracking_uri(os.getenv('MLFLOW_TRACKING_URI'))

experiment_name = 'Titanic'
existing_exp = mlflow.get_experiment_by_name(experiment_name)

if not existing_exp:
    mlflow.create_experiment(experiment_name, "s3://mlflow/")
mlflow.set_experiment(experiment_name)


now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H-%M-%S")
with mlflow.start_run(run_name='Run_%s' % dt_string):
    mlflow.set_experiment_tag('developer', 'GU')

    mlflow.log_params({
        'Model': "XGboost",
        'Learning rate': 0.1,
        'data random state': random_state,
        'data split rate': test_size
    })

    mlflow.log_metric("Test Accuracy", accuracy_xgb)

    mlflow.xgboost.log_model(model_xgb, artifact_path='Model')

now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H-%M-%S")
with mlflow.start_run(run_name='Run_%s' % dt_string):
    mlflow.set_experiment_tag('developer', 'GU')

    mlflow.log_params({
        'Model': 'SVC',
        'C': 1,
        'kernel':'rbf',
        'data random state': random_state,
        'data split rate': test_size
    })

    mlflow.log_metric("Test Accuracy", accuracy_svc)

    mlflow.sklearn.log_model(model_svc, artifact_path='Model')



## 前言
此部署階段主要跟大家分享如何將訓練好的模型進行部署，一般來說會有兩道手續：
1. 從眾多實驗中找出要將哪個模型進行部署，需要對該模型進行"註冊"(Register)
2. 使用註冊後的進行部署，並實際進行資料推論

* 因為部署階段需要使用到前面安裝步驟的相關套件，所以請先確保有確實完成快速安裝
* 此階段需要幾個訓練完成的模型並上傳至 MLflow，也請確定"開發實驗階段"有確實完成

## 功能介紹

1. 註冊模型(Register model)
2. 模型部署預測

### 註冊評估指標最高的模型

In [15]:
'''
獲得實驗編號
'''
target_experiments = {}
for rm in mlflow.search_experiments(filter_string="name = 'Titanic'"):
    target_experiments = dict(rm)

experiment_id = target_experiments['experiment_id']

experiment_id

'6'

In [16]:
'''
透過實驗編號取得每一次的模型紀錄
'''
runs_df = mlflow.search_runs(experiment_ids=experiment_id)
runs_df = runs_df.sort_values(by=['metrics.Test Accuracy'], ascending=False)
runs_df.reset_index(inplace=True)
runs_df

Unnamed: 0,index,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Test Accuracy,params.data random state,params.C,params.Model,params.kernel,params.data split rate,params.Learning rate,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.runName
0,1,5ba07b0fbe4448aca61fb664d4d61667,6,FINISHED,s3://mlflow/5ba07b0fbe4448aca61fb664d4d61667/a...,2023-10-25 07:03:25.453000+00:00,2023-10-25 07:03:28.287000+00:00,0.865169,311,,XGboost,,0.2,0.1,shlongkuu,"[{""run_id"": ""5ba07b0fbe4448aca61fb664d4d61667""...",LOCAL,/Users/shlongkuu/miniconda3/envs/torch/lib/pyt...,Run_2023-10-25 15-03-25
1,0,5da4a241b6564071910212307bba34d8,6,FINISHED,s3://mlflow/5da4a241b6564071910212307bba34d8/a...,2023-10-25 07:03:28.329000+00:00,2023-10-25 07:03:30.405000+00:00,0.80618,311,1.0,SVC,rbf,0.2,,shlongkuu,"[{""run_id"": ""5da4a241b6564071910212307bba34d8""...",LOCAL,/Users/shlongkuu/miniconda3/envs/torch/lib/pyt...,Run_2023-10-25 15-03-28


In [17]:
'''
將評估指標表現最好的模型進行”註冊“
'''
best_run = runs_df.iloc[0]
best_run_id = best_run["run_id"]
mv = mlflow.register_model(model_uri="runs:/%s/Model"%best_run_id, 
                           name="Titanic_model")
mv

Successfully registered model 'Titanic_model'.
2023/10/25 15:03:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Titanic_model, version 1
Created version '1' of model 'Titanic_model'.


<ModelVersion: aliases=[], creation_timestamp=1698217410614, current_stage='None', description='', last_updated_timestamp=1698217410614, name='Titanic_model', run_id='5ba07b0fbe4448aca61fb664d4d61667', run_link='', source='s3://mlflow/5ba07b0fbe4448aca61fb664d4d61667/artifacts/Model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [18]:
'''
將註冊後的模型加入版本號(Staging, Production, Archived)
'''
client = MlflowClient(tracking_uri=os.getenv('MLFLOW_TRACKING_URI'))
client.transition_model_version_stage(
    name="Titanic_model", version=int(mv.version), stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1698217410614, current_stage='Production', description='', last_updated_timestamp=1698217410641, name='Titanic_model', run_id='5ba07b0fbe4448aca61fb664d4d61667', run_link='', source='s3://mlflow/5ba07b0fbe4448aca61fb664d4d61667/artifacts/Model', status='READY', status_message='', tags={}, user_id='', version='1'>

### 取得註冊後的模型並進行推論

In [19]:
import mlflow.pyfunc
import numpy as np

In [20]:
'''
下載註冊後的模型, 並使用MLflow 讀取模型
'''
model_name = "Titanic_model"
stage = "Production"

model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{stage}")
model

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 343.90it/s]  


mlflow.pyfunc.loaded_model:
  artifact_path: Model
  flavor: mlflow.xgboost
  run_id: 5ba07b0fbe4448aca61fb664d4d61667

In [24]:
'''
建立一筆測試資料，並進行預測
'''

result = model.predict(X_val[:1])
result

array([1])