# Pipeline

This Python script sets up and submits an Azure Machine Learning training pipeline. It authenticates with an Azure ML workspace and generates a training script that preprocesses data, runs a grid search with cross-validation over a Random Forest or Logistic Regression model, evaluates the best model with accuracy, precision, recall, and F1-score, and logs the results with MLflow. Preprocessing converts time-formatted columns (`HH:MM:SS,fff`) to seconds, label-encodes the target and categorical features, and scales the features; model assessment includes a confusion matrix and, for Random Forest, a feature-importance plot. If the new model's F1-score is higher than that of every previously registered model of the same name, the model is registered in Azure ML for future use.

In [1]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment, Model
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import CommandComponent, Data
from azure.ai.ml import command, Input, Output
from azure.identity import InteractiveBrowserCredential
from azure.ai.ml.constants import AssetTypes
import os
import time

# Connect to the Azure ML workspace that hosts the data assets and compute
# used below. The workspace coordinates are kept together so they can be
# changed in one place.
_WORKSPACE_SCOPE = dict(
    subscription_id="0a94de80-6d3b-49f2-b3e9-ec5818862801",
    resource_group_name="buas-y2",
    workspace_name="NLP6-2025",
)

# NOTE(review): InteractiveBrowserCredential presumably needs an interactive
# sign-in, so this cell is not suited to unattended runs — confirm.
credential = InteractiveBrowserCredential()
ml_client = MLClient(credential=credential, **_WORKSPACE_SCOPE)

def create_training_script():
    os.makedirs("pipeline_scripts", exist_ok=True)
    
    training_script = '''
import pandas as pd 
import joblib 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import argparse
import os
import glob
import numpy as np
import mlflow
from azureml.core import Run, Model
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import datetime
import json

def find_csv_file(path):
    if os.path.isdir(path):
        csv_files = glob.glob(os.path.join(path, "*.csv"))
        if not csv_files:
            raise FileNotFoundError(f"No CSV files found in directory: {path}")
        return csv_files[0]
    return path

def convert_time_to_seconds(time_str):
    """Convert time string (HH:MM:SS,fff) to total seconds"""
    try:
        hh_mm_ss, millis = time_str.split(',')
        h, m, s = hh_mm_ss.split(':')
        return float(h) * 3600 + float(m) * 60 + float(s) + (float(millis)/1000)
    except:
        return np.nan

def preprocess_data(train_path, test_path): 
    train_file = find_csv_file(train_path)
    test_file = find_csv_file(test_path)
    
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    
    # Identify time columns and convert to seconds
    time_cols = [col for col in train_df.columns 
                if any(x in str(train_df[col].dtype) for x in ['time', 'object'])]
    
    for col in time_cols:
        if train_df[col].astype(str).str.match(r'\d{2}:\d{2}:\d{2},\d{3}').any():
            train_df[col] = train_df[col].astype(str).apply(convert_time_to_seconds)
            test_df[col] = test_df[col].astype(str).apply(convert_time_to_seconds)
    
    target_columns = ['target', 'Target', 'label', 'Label', 'emotion', 'Emotion']
    target_col = next((col for col in target_columns if col in train_df.columns), None)
    
    if target_col is None:
        raise ValueError(f"No target column found. Available columns: {train_df.columns.tolist()}")
    
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(train_df[target_col])
    y_test = label_encoder.transform(test_df[target_col])
    
    X_train = train_df.drop(columns=[target_col])
    X_test = test_df.drop(columns=[target_col])
    
    # Convert remaining string columns to numeric or categorical
    for col in X_train.select_dtypes(include=['object']).columns:
        try:
            X_train[col] = pd.to_numeric(X_train[col], errors='raise')
            X_test[col] = pd.to_numeric(X_test[col], errors='raise')
        except:
            X_train[col] = LabelEncoder().fit_transform(X_train[col].astype(str))
            X_test[col] = LabelEncoder().fit_transform(X_test[col].astype(str))
    
    # Handle missing values
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(scaler, 'outputs/scaler.pkl')
    joblib.dump(label_encoder, 'outputs/label_encoder.pkl')
    
    return X_train_scaled, X_test_scaled, y_train, y_test

def get_param_grid(model_name):
    """Define parameter grids for different models"""
    if model_name == "random_forest":
        return {
            'n_estimators': [50, 100, 150, 200],
            'max_depth': [10, 15, 20, 25, None],
            'min_samples_split': [2, 3, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    elif model_name == "logistic_regression":
        return {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'penalty': ['l1', 'l2', 'elasticnet'],
            'solver': ['liblinear', 'saga'],
            'max_iter': [1000, 2000]
        }
    else:
        raise ValueError(f"Unsupported model: {model_name}")

def train_with_grid_search(args):
    run = Run.get_context()
    mlflow.start_run()
    
    try:
        X_train, X_test, y_train, y_test = preprocess_data(args.train_data, args.test_data)
        
        # Model selection
        if args.model_name == "random_forest":
            base_model = RandomForestClassifier(random_state=42)
        elif args.model_name == "logistic_regression":
            base_model = LogisticRegression(random_state=42)
        else:
            raise ValueError(f"Unsupported model: {args.model_name}")
        
        # Get parameter grid
        param_grid = get_param_grid(args.model_name)
        
        # Setup Grid Search
        cv = StratifiedKFold(n_splits=args.cv_folds, shuffle=True, random_state=42)
        scoring = 'f1_weighted'  # Using F1 as primary metric
        
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=cv,
            scoring=scoring,
            n_jobs=args.n_jobs,
            verbose=1,
            return_train_score=True
        )
        
        print(f"Starting Grid Search with {len(param_grid)} parameter combinations...")
        grid_search.fit(X_train, y_train)
        
        # Get best model and predictions
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_cv_score = grid_search.best_score_
        
        preds = best_model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        precision = precision_score(y_test, preds, average="weighted")
        recall = recall_score(y_test, preds, average="weighted")
        f1 = f1_score(y_test, preds, average="weighted")

        os.makedirs(args.model_output, exist_ok=True)
        model_path = os.path.join(args.model_output, 'model.pkl')
        joblib.dump(best_model, model_path)
        
        # Save grid search results
        grid_results_path = os.path.join(args.model_output, 'grid_search_results.json')
        grid_results = {
            'best_params': best_params,
            'best_cv_score': best_cv_score,
            'cv_results': {
                'mean_test_score': grid_search.cv_results_['mean_test_score'].tolist(),
                'std_test_score': grid_search.cv_results_['std_test_score'].tolist(),
                'params': [str(p) for p in grid_search.cv_results_['params']]
            }
        }
        
        with open(grid_results_path, 'w') as f:
            json.dump(grid_results, f, indent=2)

        # Log metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("best_cv_score", best_cv_score)
        mlflow.log_params(best_params)
        mlflow.log_param("model_name", args.model_name)
        mlflow.log_param("cv_folds", args.cv_folds)
        
        # Log grid search results as artifact
        mlflow.log_artifact(grid_results_path)

        # Log F1 vs threshold for models with predict_proba
        if hasattr(best_model, "predict_proba"):
            proba = best_model.predict_proba(X_test)
            if proba.shape[1] > 1:  # Multi-class
                proba = proba[:, 1] if proba.shape[1] == 2 else np.max(proba, axis=1)
            else:
                proba = proba[:, 0]
                
            thresholds = np.linspace(0, 1, 50)
            for i, thresh in enumerate(thresholds):
                preds_thresh = (proba >= thresh).astype(int)
                try:
                    f1_step = f1_score(y_test, preds_thresh, average="weighted", zero_division=0)
                except ValueError:
                    f1_step = 0
                mlflow.log_metric("f1_vs_threshold", f1_step, step=i)

        # Log confusion matrix
        cm_display = ConfusionMatrixDisplay.from_predictions(y_test, preds)
        plt.title(f"Confusion Matrix - {args.model_name}")
        conf_matrix_path = os.path.join(args.model_output, "confusion_matrix.png")
        plt.savefig(conf_matrix_path, dpi=300, bbox_inches='tight')
        plt.close()
        mlflow.log_artifact(conf_matrix_path)

        # Log parameter importance plot for Random Forest
        if args.model_name == "random_forest" and hasattr(best_model, 'feature_importances_'):
            plt.figure(figsize=(10, 6))
            feature_importance = best_model.feature_importances_
            indices = np.argsort(feature_importance)[::-1][:20]  # Top 20 features
            
            plt.title("Top 20 Feature Importances")
            plt.bar(range(len(indices)), feature_importance[indices])
            plt.xlabel("Feature Index")
            plt.ylabel("Importance")
            
            feature_importance_path = os.path.join(args.model_output, "feature_importance.png")
            plt.savefig(feature_importance_path, dpi=300, bbox_inches='tight')
            plt.close()
            mlflow.log_artifact(feature_importance_path)

        # Model registration logic (same as before)
        run.upload_file(name='model/model.pkl', path_or_stream=model_path)
        workspace = run.experiment.workspace
        models = Model.list(workspace, name=args.model_name)
        best_f1_registered = -1.0
        for m in models:
            try:
                f1_val = float(m.properties.get("f1_score", -1.0))
                if f1_val > best_f1_registered:
                    best_f1_registered = f1_val
            except Exception:
                continue

        if f1 > best_f1_registered:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            run.register_model(
                model_name=args.model_name,
                model_path='model/model.pkl',
                tags={
                    "framework": "sklearn", 
                    "f1_score": f"{f1:.4f}",
                    "best_cv_score": f"{best_cv_score:.4f}",
                    "grid_search": "true"
                },
                properties={
                    "accuracy": f"{acc:.4f}", 
                    "f1_score": f"{f1:.4f}", 
                    "best_cv_score": f"{best_cv_score:.4f}",
                    "best_params": str(best_params),
                    "registered_at_utc": timestamp
                }
            )
            print(f"Model registered as {args.model_name} with F1 score: {f1:.4f} (CV: {best_cv_score:.4f})")
        else:
            print(f"Model NOT registered: F1 score {f1:.4f} is not better than previous best {best_f1_registered:.4f}.")

        # Print best parameters
        print(f"\\nBest parameters for {args.model_name}:")
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"Best CV Score: {best_cv_score:.4f}")
        print(f"Test F1 Score: {f1:.4f}")

        return acc, best_params, best_cv_score
        
    finally:
        mlflow.end_run()

def train_and_evaluate(args):
    """Original training function for backward compatibility"""
    run = Run.get_context()
    mlflow.start_run()
    
    try:
        X_train, X_test, y_train, y_test = preprocess_data(args.train_data, args.test_data)
        
        # Model selection with manual parameters
        if args.model_name == "random_forest":
            params = {
                "n_estimators": args.n_estimators, 
                "max_depth": args.max_depth,
                "min_samples_split": args.min_samples_split,
                "random_state": 42
            }
            model = RandomForestClassifier(**params)
        elif args.model_name == "logistic_regression":
            params = {
                "C": args.C,
                "max_iter": 1000,
                "random_state": 42
            }
            model = LogisticRegression(**params)
        else:
            raise ValueError(f"Unsupported model: {args.model_name}")
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        precision = precision_score(y_test, preds, average="weighted")
        recall = recall_score(y_test, preds, average="weighted")
        f1 = f1_score(y_test, preds, average="weighted")

        os.makedirs(args.model_output, exist_ok=True)
        model_path = os.path.join(args.model_output, 'model.pkl')
        joblib.dump(model, model_path)

        # Log scalar metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_params(params)
        mlflow.log_param("model_name", args.model_name)

        # Log F1 vs threshold for custom chart
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_test)[:, 1]
            thresholds = np.linspace(0, 1, 50)
            for i, thresh in enumerate(thresholds):
                preds_thresh = (proba >= thresh).astype(int)
                try:
                    f1_step = f1_score(y_test, preds_thresh, average="weighted", zero_division=0)
                except ValueError:
                    f1_step = 0
                mlflow.log_metric("f1_vs_threshold", f1_step, step=i)

        # Log confusion matrix as artifact
        cm_display = ConfusionMatrixDisplay.from_predictions(y_test, preds)
        plt.title("Confusion Matrix")
        conf_matrix_path = os.path.join(args.model_output, "confusion_matrix.png")
        plt.savefig(conf_matrix_path)
        plt.close()
        mlflow.log_artifact(conf_matrix_path)

        # Register model only if F1 is better than previous versions
        run.upload_file(name='model/model.pkl', path_or_stream=model_path)
        workspace = run.experiment.workspace
        models = Model.list(workspace, name=args.model_name)
        best_f1_registered = -1.0
        for m in models:
            try:
                f1_val = float(m.properties.get("f1_score", -1.0))
                if f1_val > best_f1_registered:
                    best_f1_registered = f1_val
            except Exception:
                continue

        if f1 > best_f1_registered:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            run.register_model(
                model_name=args.model_name,
                model_path='model/model.pkl',
                tags={"framework": "sklearn", "f1_score": f"{f1:.4f}"},
                properties={"accuracy": f"{acc:.4f}", "f1_score": f"{f1:.4f}", "registered_at_utc": timestamp}
            )
            print(f"Model registered as {args.model_name} with F1 score: {f1:.4f}")
        else:
            print(f"Model NOT registered: F1 score {f1:.4f} is not better than previous best {best_f1_registered:.4f}.")

        return acc
    finally:
        mlflow.end_run()

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_data', type=str, required=True)
    parser.add_argument('--test_data', type=str, required=True)
    parser.add_argument('--model_output', type=str, required=True)
    parser.add_argument('--model_name', type=str, required=True)
    
    # Grid search parameters
    parser.add_argument('--use_grid_search', type=bool, default=True)
    parser.add_argument('--cv_folds', type=int, default=5)
    parser.add_argument('--n_jobs', type=int, default=-1)
    
    # Manual parameters (used when grid search is disabled)
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--max_depth', type=int, default=20)
    parser.add_argument('--min_samples_split', type=int, default=2)
    parser.add_argument('--C', type=float, default=1.0)
    
    args = parser.parse_args()
    
    if args.use_grid_search:
        accuracy, best_params, cv_score = train_with_grid_search(args)
        print(f"Grid Search completed. Best CV score: {cv_score:.4f}, Test accuracy: {accuracy:.4f}")
    else:
        accuracy = train_and_evaluate(args)
        print(f"Model trained with accuracy: {accuracy:.4f}")

if __name__ == "__main__":
    main()
'''
    
    with open("pipeline_scripts/train_model.py", "w", encoding='utf-8') as f:
        f.write(training_script)

def create_training_component():
    """Build the Azure ML command component that runs ``train_model.py``.

    The training script is (re)generated first, because the component uses
    the ``./pipeline_scripts`` folder as its code.

    Returns:
        The object returned by ``command(...)``, with two folder inputs
        (train/test data), the model name, the grid-search switches, the
        manual hyperparameters, and one folder output (``model_output``).
    """
    create_training_script()

    # One CLI fragment per argument; joined with single spaces below.
    cli_parts = [
        "python train_model.py",
        "--train_data ${{inputs.train_data}}",
        "--test_data ${{inputs.test_data}}",
        "--model_output ${{outputs.model_output}}",
        "--model_name ${{inputs.model_name}}",
        "--use_grid_search ${{inputs.use_grid_search}}",
        "--cv_folds ${{inputs.cv_folds}}",
        "--n_jobs ${{inputs.n_jobs}}",
        "--n_estimators ${{inputs.n_estimators}}",
        "--max_depth ${{inputs.max_depth}}",
        "--min_samples_split ${{inputs.min_samples_split}}",
        "--C ${{inputs.C}}",
    ]

    component_inputs = {
        "train_data": Input(type=AssetTypes.URI_FOLDER),
        "test_data": Input(type=AssetTypes.URI_FOLDER),
        "model_name": Input(type="string"),
        "use_grid_search": Input(type="boolean", default=True),
        "cv_folds": Input(type="integer", default=5),
        "n_jobs": Input(type="integer", default=-1),
        # Manual parameters (fallback when grid search is disabled)
        "n_estimators": Input(type="integer", default=100),
        "max_depth": Input(type="integer", default=20),
        "min_samples_split": Input(type="integer", default=2),
        "C": Input(type="number", default=1.0),
    }
    component_outputs = {
        "model_output": Output(type=AssetTypes.URI_FOLDER),
    }

    return command(
        name="emotion_model_training",
        display_name="Emotion Classification Training with Grid Search",
        description="Trains emotion classification models with grid search optimization and time feature handling",
        code="./pipeline_scripts",
        command=" ".join(cli_parts),
        environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
        inputs=component_inputs,
        outputs=component_outputs,
    )

@pipeline()
def emotion_training_pipeline(train_data, test_data, use_grid_search=True):
    """Pipeline with two training steps: Random Forest and Logistic Regression.

    Both steps receive the same train/test data and grid-search settings and
    run on the same compute target.

    Args:
        train_data: Training data input passed to both steps.
        test_data: Test data input passed to both steps.
        use_grid_search: Forwarded to both steps as ``use_grid_search``.

    Returns:
        dict with keys ``random_forest_output`` and
        ``logistic_regression_output`` holding each step's ``model_output``.
    """
    train_component = create_training_component()
    compute_target = "adsai-lambda-0"

    # Arguments that are identical for both training steps.
    shared_inputs = dict(
        train_data=train_data,
        test_data=test_data,
        use_grid_search=use_grid_search,
        cv_folds=5,
        n_jobs=-1,
    )

    # NOTE(review): the step variables keep their original names (rf_train,
    # lr_train) in case the pipeline DSL derives node names from them — confirm.

    # Random Forest; the manual parameters are used only if grid search is disabled.
    rf_train = train_component(
        model_name="random_forest",
        n_estimators=150,
        max_depth=15,
        min_samples_split=3,
        **shared_inputs,
    )
    rf_train.compute = compute_target

    # Logistic Regression; C is used only if grid search is disabled.
    lr_train = train_component(
        model_name="logistic_regression",
        C=1.0,
        **shared_inputs,
    )
    lr_train.compute = compute_target

    return {
        "random_forest_output": rf_train.outputs.model_output,
        "logistic_regression_output": lr_train.outputs.model_output
    }

def submit_pipeline(use_grid_search=True):
    """Build the training pipeline and submit it to the workspace.

    Args:
        use_grid_search (bool): Whether the training steps should run a grid
            search. Also selects the experiment name.

    Returns:
        The submitted job as returned by ``ml_client.jobs.create_or_update``,
        or ``None`` if anything failed (the error is printed, not raised).
    """
    try:
        # Latest versions of the registered train/test data assets.
        train_asset = ml_client.data.get(name="emotion-raw-train", label="latest")
        test_asset = ml_client.data.get(name="emotion-raw-test", label="latest")

        train_input = Input(type=AssetTypes.URI_FOLDER, path=train_asset.path)
        test_input = Input(type=AssetTypes.URI_FOLDER, path=test_asset.path)

        pipeline_job = emotion_training_pipeline(
            train_data=train_input,
            test_data=test_input,
            use_grid_search=use_grid_search,
        )
        pipeline_job.settings.default_compute = "adsai-lambda-0"

        # Separate experiments keep grid-search and manual runs apart.
        if use_grid_search:
            experiment_name = "emotion-classification-gridsearch"
        else:
            experiment_name = "emotion-classification-manual"

        submitted_job = ml_client.jobs.create_or_update(
            pipeline_job,
            experiment_name=experiment_name,
        )
    except Exception as e:
        # Best effort for interactive use: report the problem and signal
        # failure through the return value instead of raising.
        print(f"Error submitting pipeline: {e}")
        return None
    else:
        print(f"Pipeline submitted: {submitted_job.studio_url}")
        print(f"Grid Search: {'Enabled' if use_grid_search else 'Disabled'}")
        return submitted_job

def submit_grid_search_pipeline():
    """Submit the pipeline with grid search turned on.

    Returns:
        Whatever ``submit_pipeline`` returns: the submitted job, or ``None``
        on failure.
    """
    submitted_job = submit_pipeline(use_grid_search=True)
    return submitted_job

def submit_manual_pipeline():
    """Submit the pipeline with grid search turned off (manual hyperparameters).

    Returns:
        Whatever ``submit_pipeline`` returns: the submitted job, or ``None``
        on failure.
    """
    submitted_job = submit_pipeline(use_grid_search=False)
    return submitted_job

if __name__ == "__main__":
    # Default entry point: submit the pipeline with grid search enabled.
    submit_pipeline(use_grid_search=True)

  training_script = '''
Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading

Pipeline submitted: https://ml.azure.com/runs/silver_whistle_nlmtch7gyl?wsid=/subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP6-2025&tid=0a33589b-0036-4fe8-a829-3ed0926af886
Grid Search: Enabled
