# Pipeline

In [1]:
from azure.identity import InteractiveBrowserCredential
from azure.ai.ml import MLClient

# Authenticate interactively (replaces InteractiveLoginAuthentication)
credential = InteractiveBrowserCredential()

# Connect to the workspace through the SDK v2 client
ml_client = MLClient(
    credential=credential,
    subscription_id="0a94de80-6d3b-49f2-b3e9-ec5818862801",
    resource_group_name="buas-y2",
    workspace_name="NLP6-2025"
)

print("Workspace loaded successfully:", ml_client.workspace_name)

# Retrieve the registered environment by name and explicit version
retrieved_env = ml_client.environments.get(name="emotion-clf-pipeline-env", version="21")  # version is pinned to "21"; update when a newer one is registered

print("Environment retrieved:", retrieved_env.name, retrieved_env.version)


Workspace loaded successfully: NLP6-2025
Environment retrieved: emotion-clf-pipeline-env 21


In [2]:
from azureml.core import Workspace, Environment

# Load the workspace directly from explicit parameters (no config.json is read here)
ws = Workspace(
    subscription_id="0a94de80-6d3b-49f2-b3e9-ec5818862801",
    resource_group="buas-y2",
    workspace_name="NLP6-2025"
)

print("Workspace loaded successfully:", ws.name)

# Get the environment by name and version (SDK v1).
# NOTE: this rebinds `retrieved_env`, replacing the object fetched through
# `ml_client` in the previous cell; later cells see this Environment instance.
retrieved_env = Environment.get(workspace=ws, name="emotion-clf-pipeline-env", version="21")

print("Environment retrieved:", retrieved_env.name, retrieved_env.version)




Workspace loaded successfully: NLP6-2025
Environment retrieved: emotion-clf-pipeline-env 21


In [3]:
import pandas as pd 
import joblib 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from azureml.core import Workspace, Dataset, Experiment, ScriptRunConfig, Environment, Run
from azureml.core.authentication import InteractiveLoginAuthentication 
from azureml.core.compute import ComputeTarget
import os


### TRAINING SCRIPT FOR AZURE ML COMPUTE ###
def create_training_script():
    """Write the remote training script to disk and return its path.

    The script text is held in a string literal and written to
    ``azure_scripts/train_model.py`` (the directory is created if missing).
    The generated script:

    * loads two registered datasets by name from the run's workspace,
    * splits off the ``target`` column and standard-scales the features,
    * trains either a random forest or a logistic regression,
    * logs accuracy on the test dataset and saves the model to
      ``outputs/model.pkl``.

    The generated script imports ``azureml.core``, so the environment used
    for the run has to provide that package. The run recorded in this
    notebook failed with ``ModuleNotFoundError: No module named 'azureml'``.

    Returns:
        str: The relative path ``'azure_scripts/train_model.py'``.
    """
    training_script = '''
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from azureml.core import Dataset, Run
import argparse
import os

def preprocess_data(train_dataset, test_dataset):
    # Convert to pandas DataFrames
    train_df = train_dataset.to_pandas_dataframe()
    test_df = test_dataset.to_pandas_dataframe()

    X_train = train_df.drop('target', axis=1)
    y_train = train_df['target']
    X_test = test_df.drop('target', axis=1)
    y_test = test_df['target']

    # Fit the scaler on the training split only, then reuse it for the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

def get_model(name, params=None):
    if name == "random_forest":
        return RandomForestClassifier(**(params or {}))
    elif name == "logistic_regression":
        return LogisticRegression(**(params or {}))
    else:
        raise ValueError("Model not supported.")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dataset_name', type=str, required=True)
    parser.add_argument('--test_dataset_name', type=str, required=True)
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--max_depth', type=int, default=20)
    parser.add_argument('--threshold', type=float, default=0.8)
    args = parser.parse_args()

    # Get the run context
    run = Run.get_context()

    # Get datasets
    train_dataset = Dataset.get_by_name(run.experiment.workspace, args.train_dataset_name)
    test_dataset = Dataset.get_by_name(run.experiment.workspace, args.test_dataset_name)

    # Preprocess data
    X_train, X_test, y_train, y_test = preprocess_data(train_dataset, test_dataset)

    # The tree hyperparameters only apply to the random forest; passing them
    # to LogisticRegression would raise a TypeError in its constructor.
    if args.model_name == "random_forest":
        params = {"n_estimators": args.n_estimators, "max_depth": args.max_depth}
    else:
        params = None

    # Train model
    model = get_model(args.model_name, params)
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy on the test split
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Log metrics
    run.log("accuracy", acc)
    run.log("n_estimators", args.n_estimators)
    run.log("max_depth", args.max_depth)

    # Save model
    os.makedirs('outputs', exist_ok=True)
    model_path = 'outputs/model.pkl'
    joblib.dump(model, model_path)

    # Evaluate model
    passed = acc >= args.threshold
    run.log("evaluation_passed", passed)

    print(f"Test accuracy: {acc}")
    if passed:
        print("CORRECT Model passed evaluation and is ready for use.")
    else:
        print("ERROR Model did not meet performance threshold.")

    return acc

if __name__ == "__main__":
    main()
'''

    # Write the script to a file; the encoding is explicit so the result does
    # not depend on the platform's default locale.
    os.makedirs('azure_scripts', exist_ok=True)
    with open('azure_scripts/train_model.py', 'w', encoding='utf-8') as f:
        f.write(training_script)

    return 'azure_scripts/train_model.py'

### COMPUTE TARGET MANAGEMENT ###
def list_available_compute_targets():
    """Print every compute target of the module-level workspace ``ws``.

    Each target is printed as ``  - <name>: <type>`` under a header line.

    Returns:
        list: The compute target names, in the order the workspace mapping
        yields them.
    """
    targets_by_name = ws.compute_targets
    print("Available compute targets:")
    for target_name in targets_by_name:
        print(f"  - {target_name}: {targets_by_name[target_name].type}")
    return [*targets_by_name]

def get_or_create_compute_target(compute_target_name="cpu-cluster"):
    """Resolve a compute target in the module-level workspace ``ws``.

    Resolution order:

    1. The target called ``compute_target_name``, if constructing it does not
       raise ``ComputeTargetException``.
    2. Otherwise the first name returned by
       ``list_available_compute_targets()``, chosen without prompting.
    3. Otherwise a new ``AmlCompute`` cluster named ``compute_target_name``
       (``Standard_D2_v2``, 0-2 nodes, 300 s idle scale-down), returned after
       ``wait_for_completion`` finishes.

    Args:
        compute_target_name: Name of the preferred compute target.

    Returns:
        The resolved or newly created compute target object.
    """
    from azureml.core.compute import AmlCompute, ComputeTarget
    from azureml.core.compute_target import ComputeTargetException

    # Enumerate (and print) what the workspace already offers before the lookup.
    existing_names = list_available_compute_targets()

    try:
        resolved = ComputeTarget(workspace=ws, name=compute_target_name)
        print(f"Using existing compute target: {compute_target_name}")
        return resolved
    except ComputeTargetException:
        print(f"Compute target {compute_target_name} not found.")

        if not existing_names:
            # Nothing to fall back on, so provision a fresh cluster.
            print(f"Creating new compute target: {compute_target_name}")
            provisioning = AmlCompute.provisioning_configuration(
                vm_size="Standard_D2_v2",  # Basic VM size
                min_nodes=0,
                max_nodes=2,
                idle_seconds_before_scaledown=300,
            )
            created = ComputeTarget.create(ws, compute_target_name, provisioning)
            created.wait_for_completion(show_output=True)
            return created

        # Existing targets are listed, then the first one is picked
        # automatically so unattended runs never block on input.
        print("Would you like to use one of the existing compute targets?")
        print("Available options:")
        for position, candidate in enumerate(existing_names, start=1):
            print(f"  {position}. {candidate}")

        fallback_name = existing_names[0]
        print(f"Automatically selecting: {fallback_name}")
        return ComputeTarget(workspace=ws, name=fallback_name)

### MAIN AZURE ML TRAINING FUNCTION ###
def train_and_save_azure(train_dataset_name, test_dataset_name, model_name, params, compute_target_name="cpu-cluster", threshold=0.8):
    """Submit the generated training script as an Azure ML run and wait for it.

    Reads the module-level ``ws`` (workspace) and ``retrieved_env``
    (environment) defined in earlier cells.

    Args:
        train_dataset_name: Registered dataset name passed to the script as
            ``--train_dataset_name``.
        test_dataset_name: Registered dataset name passed as
            ``--test_dataset_name``.
        model_name: Model identifier passed as ``--model_name``.
        params: Mapping that may hold ``n_estimators`` and ``max_depth``;
            missing keys default to 100 and 20. ``None`` is treated as an
            empty mapping.
        compute_target_name: Preferred compute target, resolved through
            ``get_or_create_compute_target``.
        threshold: Accuracy threshold passed as ``--threshold``. Defaults to
            0.8, the value that was previously hard-coded.

    Returns:
        tuple: ``(accuracy, run_id, evaluation_passed)``. ``accuracy`` and
        ``evaluation_passed`` come from the run's logged metrics and default
        to ``0`` and ``False`` when the metric is absent.

    Raises:
        Whatever ``run.wait_for_completion`` raises for a failed run. The run
        recorded in this notebook surfaced an ``ActivityFailedException``.
    """
    # Accept params=None instead of failing on `.get`.
    params = params or {}

    # Generate the script and derive the submission directory and file name
    # from the returned path rather than repeating them as literals.
    script_path = create_training_script()
    source_directory, script_name = os.path.split(script_path)

    # Get or create compute target
    compute_target = get_or_create_compute_target(compute_target_name)

    # Create experiment
    experiment = Experiment(workspace=ws, name="emotion-classification-training")

    # Configure the script run
    script_config = ScriptRunConfig(
        source_directory=source_directory,
        script=script_name,
        arguments=[
            '--train_dataset_name', train_dataset_name,
            '--test_dataset_name', test_dataset_name,
            '--model_name', model_name,
            '--n_estimators', params.get('n_estimators', 100),
            '--max_depth', params.get('max_depth', 20),
            '--threshold', threshold
        ],
        compute_target=compute_target,
        environment=retrieved_env
    )

    # Submit the job
    run = experiment.submit(script_config)
    print(f"Submitted run: {run.id}")

    # Wait for completion (optional - remove if you want async)
    run.wait_for_completion(show_output=True)

    # Fetch the metrics once and read both values from the same snapshot.
    metrics = run.get_metrics()
    accuracy = metrics.get('accuracy', 0)
    evaluation_passed = metrics.get('evaluation_passed', False)

    return accuracy, run.id, evaluation_passed

### AIRFLOW COMPATIBLE FUNCTION ###
def train_and_save(train_dataset_name="train_data", test_dataset_name="test_data", model_name="random_forest", params=None, model_output=None, compute_target_name="cpu-cluster"):
    """Airflow-compatible wrapper around ``train_and_save_azure``.

    Supplies default hyperparameters when ``params`` is ``None``, delegates
    the run, and prints the run id and accuracy. ``model_output`` is accepted
    but not used by this function.

    Returns:
        tuple: ``(accuracy, run_id, evaluation_passed, None)``.
    """
    hyperparams = {"n_estimators": 100, "max_depth": 20} if params is None else params

    outcome = train_and_save_azure(
        train_dataset_name=train_dataset_name,
        test_dataset_name=test_dataset_name,
        model_name=model_name,
        params=hyperparams,
        compute_target_name=compute_target_name,
    )
    accuracy, run_id, evaluation_passed = outcome

    print(f"Azure ML Run ID: {run_id}")
    print(f"Training completed with accuracy: {accuracy}")

    # The trailing None keeps the four-element return shape.
    return accuracy, run_id, evaluation_passed, None

### MAIN EXECUTION ### 
if __name__ == "__main__":
    # Names of the registered Azure ML datasets used for this run
    train_dataset_name = "emotion-raw-train"
    test_dataset_name = "emotion-raw-test"

    # Model choice and its hyperparameters
    model_name = "random_forest"
    params = {"n_estimators": 100, "max_depth": 20}

    # Compute target to look up in the workspace
    compute_target_name = "adsai-lambda-0"

    # Submit the training run to Azure ML and wait for its metrics
    accuracy, run_id, evaluation_passed = train_and_save_azure(
        train_dataset_name=train_dataset_name,
        test_dataset_name=test_dataset_name,
        model_name=model_name,
        params=params,
        compute_target_name=compute_target_name,
    )

    print("Final Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Run ID: {run_id}")
    print(f"Evaluation Passed: {evaluation_passed}")

Class KubernetesCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Available compute targets:
  - adsai-lambda-0: Kubernetes


Class KubernetesCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Using existing compute target: adsai-lambda-0
Submitted run: emotion-classification-training_1749111219_f9ebbe89
RunId: emotion-classification-training_1749111219_f9ebbe89
Web View: https://ml.azure.com/runs/emotion-classification-training_1749111219_f9ebbe89?wsid=/subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP6-2025&tid=0a33589b-0036-4fe8-a829-3ed0926af886

Streaming user_logs/std_log.txt

Traceback (most recent call last):
  File "<string>", line 197, in <module>
  File "<string>", line 193, in main
  File "<frozen runpy>", line 291, in run_path
  File "<frozen runpy>", line 98, in _run_module_code
  File "<frozen runpy>", line 88, in _run_code
  File "train_model.py", line 8, in <module>
    from azureml.core import Dataset, Run
ModuleNotFoundError: No module named 'azureml'


Execution Summary
RunId: emotion-classification-training_1749111219_f9ebbe89
Web View: https://ml.azure.com/runs/emotion-classification-training_1749111219_f9ebbe89?ws

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "{\"NonCompliant\":\"Execution failed. User process '/azureml-envs/azureml_e51ad04b8b37e0f6931f0efa6adce3ea/bin/python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\\n  File \\\"<string>\\\", line 197, in <module>\\n  File \\\"<string>\\\", line 193, in main\\n  File \\\"<frozen runpy>\\\", line 291, in run_path\\n  File \\\"<frozen runpy>\\\", line 98, in _run_module_code\\n  File \\\"<frozen runpy>\\\", line 88, in _run_code\\n  File \\\"train_model.py\\\", line 8, in <module>\\n    from azureml.core import Dataset, Run\\nModuleNotFoundError: No module named 'azureml'\\n\\n\"}\n{\n  \"code\": \"ExecutionFailed\",\n  \"target\": \"\",\n  \"category\": \"UserError\",\n  \"error_details\": [\n    {\n      \"key\": \"exit_codes\",\n      \"value\": \"1\"\n    }\n  ]\n}",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"{\\\"NonCompliant\\\":\\\"Execution failed. User process '/azureml-envs/azureml_e51ad04b8b37e0f6931f0efa6adce3ea/bin/python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error: Traceback (most recent call last):\\\\n  File \\\\\\\"<string>\\\\\\\", line 197, in <module>\\\\n  File \\\\\\\"<string>\\\\\\\", line 193, in main\\\\n  File \\\\\\\"<frozen runpy>\\\\\\\", line 291, in run_path\\\\n  File \\\\\\\"<frozen runpy>\\\\\\\", line 98, in _run_module_code\\\\n  File \\\\\\\"<frozen runpy>\\\\\\\", line 88, in _run_code\\\\n  File \\\\\\\"train_model.py\\\\\\\", line 8, in <module>\\\\n    from azureml.core import Dataset, Run\\\\nModuleNotFoundError: No module named 'azureml'\\\\n\\\\n\\\"}\\n{\\n  \\\"code\\\": \\\"ExecutionFailed\\\",\\n  \\\"target\\\": \\\"\\\",\\n  \\\"category\\\": \\\"UserError\\\",\\n  \\\"error_details\\\": [\\n    {\\n      \\\"key\\\": \\\"exit_codes\\\",\\n      \\\"value\\\": \\\"1\\\"\\n    }\\n  ]\\n}\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}