# Pipeline

In [1]:
from azure.ai.ml import MLClient, command
from azure.ai.ml.entities import Model
from azure.ai.ml.entities import ModelPackage
from azure.identity import InteractiveBrowserCredential
from azure.ai.ml.constants import AssetTypes
import os
import time

# Step 1: Connect to Azure ML workspace
# `ml_client` is a module-level handle: the helper functions below read it as a
# global for dataset listing, dataset lookup, job submission and status checks.
# NOTE(review): InteractiveBrowserCredential presumably requires an interactive
# browser sign-in the first time it is used -- confirm before running this in a
# headless or CI environment.
credential = InteractiveBrowserCredential()
# NOTE(review): the subscription, resource group and workspace are hard-coded,
# so this script only targets this one workspace as written.
ml_client = MLClient(
    credential=credential,
    subscription_id="0a94de80-6d3b-49f2-b3e9-ec5818862801",
    resource_group_name="buas-y2",
    workspace_name="NLP6-2025"
)

# Step 1.5: List available datasets and get details
def list_available_datasets():
    """Print and collect the data assets returned by ``ml_client.data.list()``.

    Returns:
        A ``(names, assets_by_name)`` tuple: the asset names in listing order
        and a dict mapping each name to the listed asset object. If anything
        goes wrong while listing, the error is printed and ``([], {})`` is
        returned instead (partial results are discarded).
    """
    print("Available datasets in workspace:")
    names = []
    assets_by_name = {}
    try:
        for asset in ml_client.data.list():
            names.append(asset.name)
            assets_by_name[asset.name] = asset
            print(f"  - {asset.name} (version: {asset.version}, type: {asset.type})")
    except Exception as err:
        # Best-effort listing: report the problem and hand back empty results.
        print(f"Error listing datasets: {err}")
        return [], {}
    return names, assets_by_name

# Step 2: Create the enhanced training script with model registration
def create_training_script():
    """Write the remote training script to ``azure_scripts/train_model.py``.

    Creates the ``azure_scripts`` directory (relative to the current working
    directory) if it does not exist and overwrites ``train_model.py`` with the
    template below on every call, using UTF-8 encoding. Prints a confirmation
    line and returns nothing.

    What the generated script does (all of it is defined in the template):

    * CLI: ``--train_data`` / ``--test_data`` (a CSV file, or a directory in
      which the first ``*.csv`` returned by ``glob`` is used), ``--model_name``
      (``random_forest`` or ``logistic_regression``), ``--threshold`` and the
      model hyperparameters. ``--learning_rate`` is parsed but never used.
    * Preprocessing: picks the first column named target/Target/label/Label/
      emotion/Emotion as the target and label-encodes it; each feature column
      is converted to numeric or, failing that, label-encoded; columns that
      fail both are skipped; the remaining features are standard-scaled. The
      scaler and target label encoder are saved under ``outputs/``.
    * Training/evaluation: fits the selected scikit-learn model and computes
      accuracy on the test set.
    * Logging: tries MLflow against a local ``file:./mlruns`` store and falls
      back to ``outputs/metrics.json`` / ``outputs/params.json`` if MLflow
      raises.
    * "Registration": only when accuracy >= threshold, writes
      ``outputs/model.pkl``, ``outputs/model_metadata.json`` and
      ``outputs/model_registered.txt``.
    """
    os.makedirs("azure_scripts", exist_ok=True)
    
    # Enhanced training script with model registration capability (Unicode characters removed)
    # The template is a plain (non-raw) triple-quoted string, so the doubled
    # backslashes in it (``\\n``) are written to the generated file as ``\n``.
    training_script = '''
import pandas as pd 
import joblib 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler 
import argparse
import os
import glob
from sklearn.preprocessing import LabelEncoder
import json
import time

def find_csv_file(path):
    """Find CSV file in directory or return path if it's a file"""
    if os.path.isdir(path):
        csv_files = glob.glob(os.path.join(path, "*.csv"))
        if not csv_files:
            raise FileNotFoundError(f"No CSV files found in directory: {path}")
        print(f"Found CSV file: {csv_files[0]}")
        return csv_files[0]
    return path

def preprocess_data(train_path, test_path): 
    print("Loading data...")
    print(f"Train path: {train_path}")
    print(f"Test path: {test_path}")
    
    # Find CSV files in directories or use direct file paths
    train_file = find_csv_file(train_path)
    test_file = find_csv_file(test_path)
    
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    
    print("Dataset Info:")
    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print("Column data types:")
    print(train_df.dtypes)
    
    # Find target column
    target_columns = ['target', 'Target', 'label', 'Label', 'emotion', 'Emotion']
    target_col = None
    for col in target_columns:
        if col in train_df.columns:
            target_col = col
            break
    
    if target_col is None:
        raise ValueError(f"No target column found. Available columns: {train_df.columns.tolist()}")
    
    print(f"Using target column: {target_col}")
    
    # Extract and encode target variables
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(train_df[target_col])
    y_test = label_encoder.transform(test_df[target_col])
    
    print(f"Unique target values: {label_encoder.classes_}")
    
    # Drop target column and process features
    X_train = train_df.drop(target_col, axis=1)
    X_test = test_df.drop(target_col, axis=1)
    
    feature_columns = []
    for column in X_train.columns:
        print(f"Processing column: {column}")
        try:
            # Try numeric conversion
            X_train[column] = pd.to_numeric(X_train[column], errors='raise')
            X_test[column] = pd.to_numeric(X_test[column], errors='raise')
            feature_columns.append(column)
            print(f"Converted to numeric: {column}")
        except (ValueError, TypeError):
            # For non-numeric columns, try encoding
            try:
                label_enc = LabelEncoder()
                X_train[column] = label_enc.fit_transform(X_train[column].astype(str))
                X_test[column] = label_enc.transform(X_test[column].astype(str))
                feature_columns.append(column)
                print(f"Encoded categorical: {column}")
            except Exception as e:
                print(f"Skipping column {column}: {str(e)}")
    
    if not feature_columns:
        print("Column details:")
        for col in X_train.columns:
            print(f"{col}: {X_train[col].dtype}")
            print(f"Sample values: {X_train[col].head()}")
        raise ValueError("No usable features found for training!")
    
    X_train = X_train[feature_columns]
    X_test = X_test[feature_columns]
    
    print(f"Selected features: {feature_columns}")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Save the scaler and label encoder for model registration
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(scaler, 'outputs/scaler.pkl')
    joblib.dump(label_encoder, 'outputs/label_encoder.pkl')
    
    return X_train_scaled, X_test_scaled, y_train, y_test

def get_model(name, params=None): 
    if name == "random_forest": 
        return RandomForestClassifier(**(params or {})) 
    elif name == "logistic_regression": 
        return LogisticRegression(**(params or {})) 
    else: 
        raise ValueError("Model not supported.") 

def log_metrics_to_file(metrics, params):
    """Log metrics and parameters to files instead of MLflow"""
    os.makedirs('outputs', exist_ok=True)
    
    # Save metrics
    with open('outputs/metrics.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    
    # Save parameters  
    with open('outputs/params.json', 'w') as f:
        json.dump(params, f, indent=2)
    
    print("Metrics and parameters saved to outputs/")

def safe_mlflow_logging(metrics, params):
    """Safely attempt MLflow logging with fallback"""
    try:
        import mlflow
        # Try to set a simple tracking URI to avoid Azure ML registry issues
        mlflow.set_tracking_uri("file:./mlruns")
        
        for key, value in metrics.items():
            mlflow.log_metric(key, value)
        
        for key, value in params.items():
            mlflow.log_param(key, value)
            
        print("Successfully logged to MLflow")
        return True
    except Exception as e:
        print(f"MLflow logging failed: {str(e)}")
        print("Falling back to file-based logging...")
        log_metrics_to_file(metrics, params)
        return False

def register_model_locally(model, accuracy, threshold, model_name, hyperparams):
    """Register model locally if it passes the threshold"""
    if accuracy >= threshold:
        print(f"Model passed evaluation with accuracy {accuracy:.4f} >= {threshold}")
        print("Registering model locally...")
        
        # Create model metadata
        model_metadata = {
            "accuracy": accuracy,
            "threshold": threshold,
            "model_type": model_name,
            "hyperparameters": hyperparams,
            "training_timestamp": time.time(),
            "passed_evaluation": True
        }
        
        # Save model metadata
        with open('outputs/model_metadata.json', 'w') as f:
            json.dump(model_metadata, f, indent=2)
        
        # Save the trained model
        joblib.dump(model, 'outputs/model.pkl')
        
        # Create a model registration file
        with open('outputs/model_registered.txt', 'w') as f:
            f.write(f"Model registered successfully\\n")
            f.write(f"Accuracy: {accuracy:.4f}\\n")
            f.write(f"Model Type: {model_name}\\n")
            f.write(f"Hyperparameters: {hyperparams}\\n")
            f.write(f"Registration Time: {time.ctime()}\\n")
        
        print("[SUCCESS] Model registered locally in outputs/ directory")
        print(f"   - Model file: outputs/model.pkl")
        print(f"   - Metadata: outputs/model_metadata.json")
        print(f"   - Registration info: outputs/model_registered.txt")
        return True
    else:
        print(f"[FAILED] Model did not pass evaluation: {accuracy:.4f} < {threshold}")
        print("Model will not be registered.")
        return False

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_data', type=str, required=True)
    parser.add_argument('--test_data', type=str, required=True)
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--max_depth', type=int, default=20)
    parser.add_argument('--threshold', type=float, default=0.8)
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--min_samples_split', type=int, default=2)
    parser.add_argument('--C', type=float, default=1.0)  # For logistic regression
    args = parser.parse_args()
    
    X_train, X_test, y_train, y_test = preprocess_data(args.train_data, args.test_data)
    
    # Prepare hyperparameters based on model type
    if args.model_name == "random_forest":
        params = {
            "n_estimators": args.n_estimators, 
            "max_depth": args.max_depth,
            "min_samples_split": args.min_samples_split,
            "random_state": 42
        }
    elif args.model_name == "logistic_regression":
        params = {
            "C": args.C,
            "max_iter": 1000,
            "random_state": 42
        }
    else:
        params = {"n_estimators": args.n_estimators, "max_depth": args.max_depth}
    
    model = get_model(args.model_name, params)
    
    print("Training model...")
    print(f"Model: {args.model_name}")
    print(f"Hyperparameters: {params}")
    model.fit(X_train, y_train)
    
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    
    # Prepare metrics and parameters for logging
    metrics = {
        "accuracy": acc,
        "evaluation_passed": 1 if acc >= args.threshold else 0
    }
    
    log_params = {
        "model_name": args.model_name,
        "threshold": args.threshold,
        **params  # Include all model-specific parameters
    }
    
    # Try MLflow logging with fallback
    safe_mlflow_logging(metrics, log_params)
    
    # Register model locally if it passes the threshold
    model_registered = register_model_locally(model, acc, args.threshold, args.model_name, params)
    
    print(f"\\n=== Training Results ===")
    print(f"Model: {args.model_name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Threshold: {args.threshold}")
    print(f"Model Registered: {'Yes' if model_registered else 'No'}")
    print(f"Hyperparameters: {params}")

if __name__ == "__main__":
    main()
'''
    
    # Write the script with explicit UTF-8 encoding to avoid charset issues.
    # Mode "w" truncates, so an existing train_model.py is replaced.
    with open("azure_scripts/train_model.py", "w", encoding='utf-8') as f:
        f.write(training_script)
    
    print("Training script created successfully with proper encoding")

# Step 3: Define multiple hyperparameter configurations
def get_hyperparameter_configs():
    """Return the hyperparameter configurations to test.

    Returns:
        A new list of six dicts (three random-forest configs followed by three
        logistic-regression configs). Each dict has the keys ``name``
        (``"<model_name>_config_<n>"``, numbered from 1), ``model_name`` and
        ``params``.
    """
    random_forest_grid = [
        {"n_estimators": 100, "max_depth": 10, "min_samples_split": 2},
        {"n_estimators": 200, "max_depth": 15, "min_samples_split": 5},
        {"n_estimators": 150, "max_depth": 20, "min_samples_split": 3},
    ]
    logistic_regression_grid = [{"C": 1.0}, {"C": 0.1}, {"C": 10.0}]

    def _expand(model_name, grid):
        # One config entry per parameter set, numbered in grid order.
        return [
            {
                "name": f"{model_name}_config_{number}",
                "model_name": model_name,
                "params": params,
            }
            for number, params in enumerate(grid, start=1)
        ]

    return (
        _expand("random_forest", random_forest_grid)
        + _expand("logistic_regression", logistic_regression_grid)
    )

# Step 4: Submit training job with specific hyperparameters
def _resolve_dataset(dataset_name):
    """Fetch the registered data asset ``dataset_name`` from the workspace.

    Asks for the asset labelled ``latest`` first and, if that lookup fails,
    retries with the literal version ``"1"``. An error raised by the final
    attempt propagates to the caller.
    """
    try:
        return ml_client.data.get(name=dataset_name, label="latest")
    except Exception as latest_error:
        # Deliberate best-effort fallback, but no longer a bare ``except:`` so
        # KeyboardInterrupt/SystemExit are not swallowed, and the reason for
        # falling back is visible.
        print(f"Could not resolve latest version of '{dataset_name}': {latest_error}")
        print("Falling back to version 1...")
        return ml_client.data.get(name=dataset_name, version="1")


def submit_training_job(train_dataset_name, test_dataset_name, config, dataset_info):
    """Submit one Azure ML command job that trains a model for ``config``.

    Args:
        train_dataset_name: Name of the registered training data asset.
        test_dataset_name: Name of the registered test data asset.
        config: Dict with ``name``, ``model_name`` (``"random_forest"`` or
            ``"logistic_regression"``) and ``params`` (model hyperparameters;
            missing entries fall back to defaults).
        dataset_info: Unused; kept so existing callers keep working.

    Returns:
        The job object returned by ``ml_client.jobs.create_or_update``.

    Raises:
        ValueError: If ``config["model_name"]`` is not a supported model. The
            check happens before anything is submitted, instead of letting the
            remote training script fail.
        Exception: Any error from dataset lookup or job submission propagates.
    """
    from azure.ai.ml import Input
    from azure.ai.ml.constants import AssetTypes

    model_name = config["model_name"]
    if model_name not in ("random_forest", "logistic_regression"):
        raise ValueError(
            f"Unsupported model_name '{model_name}' in configuration '{config['name']}'"
        )
    params = config.get("params") or {}

    create_training_script()  # Ensure script is created before submitting

    # Each dataset is resolved independently so a problem with one asset does
    # not change which version of the other one is used.
    train_dataset = _resolve_dataset(train_dataset_name)
    test_dataset = _resolve_dataset(test_dataset_name)

    print(f"Train dataset path: {train_dataset.path}")
    print(f"Test dataset path: {test_dataset.path}")
    print(f"Configuration: {config['name']}")
    print(f"Model: {model_name}")
    print(f"Parameters: {params}")

    # Determine the correct asset type based on dataset type
    train_asset_type = AssetTypes.URI_FOLDER if train_dataset.type == "uri_folder" else AssetTypes.URI_FILE
    test_asset_type = AssetTypes.URI_FOLDER if test_dataset.type == "uri_folder" else AssetTypes.URI_FILE

    # Arguments shared by every model type; ${{inputs.*}} placeholders are
    # filled from the `inputs` dict passed to command().
    command_args = [
        "python train_model.py",
        "--train_data ${{inputs.train_data}}",
        "--test_data ${{inputs.test_data}}",
        "--model_name ${{inputs.model_name}}",
        "--threshold 0.8"
    ]

    inputs = {
        "train_data": Input(type=train_asset_type, path=train_dataset.path),
        "test_data": Input(type=test_asset_type, path=test_dataset.path),
        "model_name": model_name,
    }

    # Add model-specific parameters
    if model_name == "random_forest":
        command_args.extend([
            "--n_estimators ${{inputs.n_estimators}}",
            "--max_depth ${{inputs.max_depth}}",
            "--min_samples_split ${{inputs.min_samples_split}}"
        ])
        inputs.update({
            "n_estimators": params.get("n_estimators", 100),
            "max_depth": params.get("max_depth", 20),
            "min_samples_split": params.get("min_samples_split", 2)
        })
    else:  # logistic_regression (validated above)
        command_args.append("--C ${{inputs.C}}")
        inputs["C"] = params.get("C", 1.0)

    job = command(
        code="./azure_scripts",  # directory containing the training script
        command=" ".join(command_args),
        environment="emotion-clf-pipeline-env:24",
        inputs=inputs,
        compute="adsai-lambda-0",
        display_name=f"emotion-training-{config['name']}",
        experiment_name="emotion-classification-hyperparameter-testing"
    )

    returned_job = ml_client.jobs.create_or_update(job)
    print(f"Job '{config['name']}' submitted successfully!")
    print(f"Monitor at: {returned_job.studio_url}")
    return returned_job

# Step 5: Submit multiple training jobs with different hyperparameters
def submit_multiple_training_jobs(train_dataset_name, test_dataset_name, dataset_info):
    """Submit one training job per configuration from get_hyperparameter_configs().

    A failed submission is reported and skipped; the remaining configurations
    are still attempted.

    Returns:
        A list with one dict per successfully submitted job, holding the
        ``config``, the returned ``job`` object and its ``job_name``.
    """
    all_configs = get_hyperparameter_configs()
    total = len(all_configs)
    accepted = []

    print(f"Submitting {total} training jobs with different hyperparameters...")
    print("=" * 60)

    for position, cfg in enumerate(all_configs, start=1):
        label = cfg['name']
        print(f"\nSubmitting job {position}/{total}: {label}")
        try:
            handle = submit_training_job(train_dataset_name, test_dataset_name, cfg, dataset_info)
            accepted.append(dict(config=cfg, job=handle, job_name=handle.name))
            print(f"[SUCCESS] Successfully submitted: {label}")

            # Brief pause between submissions to avoid overwhelming the system
            time.sleep(2)
        except Exception as err:
            print(f"[ERROR] Failed to submit {label}: {err}")

    print("\n=== Summary ===")
    print(f"Successfully submitted: {len(accepted)} out of {total} jobs")
    print("\nJob Details:")
    for record in accepted:
        print(f"  - {record['config']['name']}: {record['job_name']}")

    return accepted

# Step 6: Function to check job status and retrieve successful models
def check_job_status_and_get_models(submitted_jobs):
    """Look up each submitted job and return the entries whose status is "Completed".

    Args:
        submitted_jobs: Iterable of dicts with at least ``job_name`` and
            ``config`` (as produced by submit_multiple_training_jobs).

    Returns:
        The subset of ``submitted_jobs`` entries whose job status equals
        ``"Completed"``. Lookup errors are printed and the entry is skipped.
    """
    print("\nChecking job statuses...")
    completed = []

    for entry in submitted_jobs:
        try:
            state = ml_client.jobs.get(entry["job_name"]).status
            print(f"Job {entry['config']['name']}: {state}")
            if state != "Completed":
                continue
            completed.append(entry)
        except Exception as err:
            print(f"Error checking job {entry['job_name']}: {err}")

    print(f"\nCompleted jobs: {len(completed)}")
    return completed

# Step 7: Main execution with error handling and multiple configurations
if __name__ == "__main__":
    # First, list available datasets
    available_datasets, dataset_info = list_available_datasets()

    # Check if the expected datasets exist
    expected_train = "emotion-raw-train"
    expected_test = "emotion-raw-test"

    # Both assets are required by every job, so verify the test dataset too;
    # otherwise a missing test asset would make all submissions fail one by one.
    missing_datasets = [
        name for name in (expected_train, expected_test)
        if name not in available_datasets
    ]

    if missing_datasets:
        for name in missing_datasets:
            print(f"Dataset '{name}' not found!")
        print("Available datasets with 'emotion' in name:")
        emotion_datasets = [d for d in available_datasets if 'emotion' in d.lower()]
        for dataset in emotion_datasets:
            print(f"   - {dataset}")

        if emotion_datasets:
            print("\nSuggestion: Update the dataset names in the script to match available ones.")
        else:
            print("\nSuggestion: You may need to register your datasets first.")
            print("   Check Azure ML Studio > Data > Datasets to see what's available.")
    else:
        # Proceed with multiple job submissions
        try:
            # Option 1: Submit all hyperparameter configurations
            print("Starting hyperparameter testing with multiple configurations...")
            submitted_jobs = submit_multiple_training_jobs(
                train_dataset_name=expected_train,
                test_dataset_name=expected_test,
                dataset_info=dataset_info
            )

            print("\n" + "="*60)
            print("All jobs submitted! You can monitor them in Azure ML Studio.")
            print("Jobs will automatically register models locally if they pass the threshold.")
            print("\nTo check job status later, you can use the check_job_status_and_get_models() function.")

            # Uncomment the following lines if you want to wait and check job status
            # print("\nWaiting 5 minutes before checking job status...")
            # time.sleep(300)  # Wait 5 minutes
            # successful_jobs = check_job_status_and_get_models(submitted_jobs)

        except Exception as e:
            # Top-level boundary: report the failure instead of a raw traceback.
            print(f"Error in main execution: {e}")
            print("Check dataset names and environment availability.")

# Additional utility functions
def list_hyperparameter_configs():
    """Print the name, model and parameters of every configuration.

    The configurations come from get_hyperparameter_configs(); nothing is
    returned.
    """
    entries = get_hyperparameter_configs()
    print("Available Hyperparameter Configurations:")
    print("=" * 50)
    for entry in entries:
        # One print call per entry; sep="\n" yields the same four output lines.
        print(
            f"Name: {entry['name']}",
            f"Model: {entry['model_name']}",
            f"Parameters: {entry['params']}",
            "-" * 30,
            sep="\n",
        )

def submit_single_config(config_name, train_dataset_name="emotion-raw-train", test_dataset_name="emotion-raw-test"):
    """Submit the training job for the configuration called ``config_name``.

    Args:
        config_name: ``name`` of an entry from get_hyperparameter_configs().
        train_dataset_name: Registered training data asset to use.
        test_dataset_name: Registered test data asset to use.

    Returns:
        The submitted job, or ``None`` when the configuration name is unknown
        (the valid names are printed) or when submission raises (the error is
        printed).
    """
    known_configs = get_hyperparameter_configs()
    chosen = next(
        (candidate for candidate in known_configs if candidate['name'] == config_name),
        None,
    )

    if chosen is None:
        print(f"Configuration '{config_name}' not found!")
        print("Available configurations:")
        for candidate in known_configs:
            print(f"  - {candidate['name']}")
        return None

    try:
        # Only the dataset_info half of the listing is passed on.
        _, dataset_info = list_available_datasets()
        submitted = submit_training_job(train_dataset_name, test_dataset_name, chosen, dataset_info)
        print(f"Successfully submitted single configuration: {config_name}")
        return submitted
    except Exception as err:
        print(f"Error submitting single configuration: {err}")
        return None

Available datasets in workspace:
  - emotion-raw-train (version: None, type: uri_folder)
  - emotion-raw-test (version: None, type: uri_folder)
  - emotion-train-data-v2 (version: None, type: uri_file)
  - emotion-test-data-v2 (version: None, type: uri_file)
  - emotion-processed-train (version: None, type: uri_file)
  - emotion-processed-test (version: None, type: uri_file)
  - emotion-encoders (version: None, type: uri_folder)
Starting hyperparameter testing with multiple configurations...
Submitting 6 training jobs with different hyperparameters...

Submitting job 1/6: random_forest_config_1
Training script created successfully with proper encoding
Train dataset path: azureml://subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP6-2025/datastores/workspaceblobstore/paths/LocalUpload/55dd9664b81809ba9dd646fa4024abd5/feedback_upload_39__hn2s/
Test dataset path: azureml://subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/works

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Job 'random_forest_config_1' submitted successfully!
Monitor at: https://ml.azure.com/runs/dreamy_animal_gqnj2my846?wsid=/subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP6-2025&tid=0a33589b-0036-4fe8-a829-3ed0926af886
[SUCCESS] Successfully submitted: random_forest_config_1

Submitting job 2/6: random_forest_config_2
Training script created successfully with proper encoding
Train dataset path: azureml://subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP6-2025/datastores/workspaceblobstore/paths/LocalUpload/55dd9664b81809ba9dd646fa4024abd5/feedback_upload_39__hn2s/
Test dataset path: azureml://subscriptions/0a94de80-6d3b-49f2-b3e9-ec5818862801/resourcegroups/buas-y2/workspaces/NLP6-2025/datastores/workspaceblobstore/paths/LocalUpload/43d17bfd2745a38aac4e0e6ca508aac0/test/
Configuration: random_forest_config_2
Model: random_forest
Parameters: {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 5}
Job 'r