# 08_modeling_mlflow.ipynb

# 🧠 Emotion Recognition - Modeling with MLflow Tracking

"""
This notebook trains multiple machine learning models to classify facial emotions using the extracted image features.
The training process includes:

- ✅ Training a Random Forest (RF) model
- ✅ Training a Deep Neural Network (DNN) using Keras
- ✅ (Optional) Training XGBoost for comparison
- ✅ Tracking all metrics, hyperparameters, and artifacts using MLflow
- ✅ Saving models locally (.pkl, .h5) and exporting logs to JSON

📦 Inputs:
- `image_vectors.npy`: Extracted features from ResNet50 (saved in previous step)
- `final_emotion_dataset.parquet`: Annotated labels (emotion, age, gender, etc.)

📤 Output:
- Trained models (locally and optionally to GCS)
- `model_monitor_log.json`: Performance summary per model
- MLflow logs (if local/remote MLflow server is connected)

💡 This step is crucial for comparing modeling strategies and tracking experiments reliably.
"""

In [2]:
!pip install -q mlflow
!pip install -q xgboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.2/28.2 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.0/681.0 kB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# 08_modeling_mlflow.ipynb

# 🧠 Emotion Recognition - Modeling with MLflow Tracking

# ✅ Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
import xgboost as xgb
import json
import os


In [5]:
# ✅ Load data
from google.colab import auth
from google.cloud import storage

auth.authenticate_user()
project_id = "exalted-summer-454012-d2"
bucket_name = "boothill2001-dataset"

# Load features
client = storage.Client(project=project_id)
bucket = client.bucket(bucket_name)
bucket.blob("features/image_vectors.npy").download_to_filename("image_vectors.npy")
bucket.blob("dataset/final_emotion_dataset.parquet").download_to_filename("final_emotion_dataset.parquet")

In [6]:
X = np.load("image_vectors.npy")
df = pd.read_parquet("final_emotion_dataset.parquet")
y = df['dominant_emotion']

# ✅ Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ✅ Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# ✅ Start MLflow run
mlflow.set_experiment("Emotion_Recognition")

2025/03/26 07:46:43 INFO mlflow.tracking.fluent: Experiment with name 'Emotion_Recognition' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/100354367244192138', creation_time=1742975203648, experiment_id='100354367244192138', last_update_time=1742975203648, lifecycle_stage='active', name='Emotion_Recognition', tags={}>

In [7]:
# ✅ Random Forest
with mlflow.start_run(run_name="RandomForest"):
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_param("model", "RandomForest")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(rf, "rf_model")

    print("🔍 RF Accuracy:", acc)

# ✅ Deep Neural Network
num_classes = len(np.unique(y_encoded))
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)



🔍 RF Accuracy: 0.24115148655025956


In [8]:
with mlflow.start_run(run_name="DNN"):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X.shape[1],)),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train_cat, epochs=10, batch_size=64, validation_split=0.1, verbose=0)

    loss, acc = model.evaluate(X_test, y_test_cat, verbose=0)
    mlflow.log_param("model", "DNN")
    mlflow.log_metric("accuracy", acc)
    mlflow.keras.log_model(model, "dnn_model")

    print("🔍 DNN Accuracy:", acc)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


🔍 DNN Accuracy: 0.24634261429309845


In [9]:
# ✅ XGBoost (optional)
with mlflow.start_run(run_name="XGBoost"):
    xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train, y_train)
    xgb_preds = xgb_model.predict(X_test)
    xgb_acc = accuracy_score(y_test, xgb_preds)

    mlflow.log_param("model", "XGBoost")
    mlflow.log_metric("accuracy", xgb_acc)
    mlflow.xgboost.log_model(xgb_model, "xgb_model")

    print("🔍 XGBoost Accuracy:", xgb_acc)

# ✅ Save monitoring log
model_log = {
    "RandomForest": {"accuracy": float(acc)},
    "DNN": {"accuracy": float(acc)},
    "XGBoost": {"accuracy": float(xgb_acc)}
}

os.makedirs("monitoring", exist_ok=True)
with open("monitoring/model_monitor_log.json", "w") as f:
    json.dump(model_log, f, indent=2)

print("✅ All models trained and logged to MLflow. Monitoring log saved.")


Parameters: { "use_label_encoder" } are not used.



🔍 XGBoost Accuracy: 0.2298253893345918
✅ All models trained and logged to MLflow. Monitoring log saved.


In [10]:
# ✅ GCS Upload Utility
from google.cloud import storage

def upload_model_to_gcs(local_path, gcs_path, bucket_name, project_id):
    """Uploads a local model file to GCS."""
    client = storage.Client(project=project_id)
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(gcs_path)
    blob.upload_from_filename(local_path)
    print(f"✅ Uploaded {local_path} to gs://{bucket_name}/{gcs_path}")

# ✅ Example usage:
# upload_model_to_gcs("random_forest.pkl", "models/random_forest.pkl", bucket_name="boothill2001-dataset", project_id="exalted-summer-454012-d2")
