In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC ## Demo_train_Notebook1
# MAGIC This notebook is a minimal prototype for training a sample ML model on Databricks.
# MAGIC 
# MAGIC **Purpose:** Demonstrate full notebook lifecycle for automation & job integration.
# MAGIC 
# MAGIC - Uses mock data
# MAGIC - Trains fast
# MAGIC - Logs to MLflow
# MAGIC - Follows staff-level architecture best practices


In [0]:
# Install the third-party packages this notebook imports (pandas, scikit-learn, mlflow).
# NOTE(review): no versions are pinned, so the installed releases can differ between
# runs — consider pinning (e.g. `pkg==x.y.z`) for reproducible jobs.
# `--quiet` keeps pip's output short.
%pip install pandas scikit-learn mlflow --quiet


In [0]:
# Section 2 - Imports & Setup

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from datetime import datetime
import json



In [0]:
# Build a small synthetic dataset: two uniform features in [0, 1) and a random
# binary label. A dedicated, seeded generator is used instead of the unseeded
# global NumPy RNG so the mock data — and therefore the accuracy logged to
# MLflow — is identical on every "Restart & Run All".
SEED = 42
N_ROWS = 100
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    "feature1": rng.random(N_ROWS),
    "feature2": rng.random(N_ROWS),
    "label": rng.integers(0, 2, N_ROWS),  # upper bound is exclusive -> values in {0, 1}
})


In [0]:
# Separate the target from the model inputs. No scaling or encoding is applied:
# preprocessing is deliberately kept minimal for this demo.
feature_columns = ["feature1", "feature2"]
y = data["label"]
X = data[feature_columns]


In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split itself repeatable from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [0]:
# Fit a logistic regression with default hyperparameters on the training split.
# `fit` returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(X_train, y_train)
model


In [0]:
# Score the held-out split: predict labels, then compare them to the true ones.
# `acc` is reused later when the metric is logged to MLflow.
predictions = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {acc}")


In [0]:
# Section 8 - MLflow Tracking
#
# Derives the experiment path, run name and tags from the Databricks notebook
# context, then logs the trained model, its parameters and its accuracy to MLflow.
# Relies on `model`, `acc`, `X` and `X_train` from the preceding cells and on the
# Databricks-provided `dbutils` object.
from urllib.parse import urlparse

# Read the notebook context through its JSON representation.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
context_json = json.loads(notebook_context.toJson())

# Resolve the current user. `user_email` was previously used without ever being
# assigned, which raised NameError on a fresh kernel.
# NOTE(review): assumes the context exposes the user under tags["user"] — confirm
# for your workspace/runtime.
user_email = (context_json.get("tags") or {}).get("user")
if not user_email:
    raise RuntimeError(
        "Could not determine the current user from the notebook context; "
        "cannot build the MLflow experiment path."
    )

# Extract Git-related info from context (tolerate a missing/null section).
git_info = context_json.get("extraContext") or {}
git_url = git_info.get("vcs.repo.url", "")
branch_name = git_info.get("vcs.branch", "unknown")

# Derive repo name from Git URL: take the last path segment and drop only a
# trailing ".git" (a blanket replace would also mangle names containing ".git").
repo_name = ""
if git_url:
    repo_name = urlparse(git_url).path.strip("/").split("/")[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
if not repo_name:
    repo_name = "unknown-repo"

# Derive environment from branch: only `main` is treated as production.
env = "prod" if branch_name == "main" else "dev"

# Build dynamic experiment path
experiment_path = f"/Users/{user_email}/{repo_name}_train_{env}"

# Ensure experiment exists
client = MlflowClient()
if not client.get_experiment_by_name(experiment_path):
    client.create_experiment(experiment_path)

mlflow.set_experiment(experiment_path)

# Clean up any stale run left active by a previous (possibly failed) execution,
# otherwise start_run below would not open a fresh top-level run.
if mlflow.active_run():
    mlflow.end_run()

# Define dynamic run name
run_name = f"{repo_name}-train-{env}"

# Start MLflow run; the context manager ends the run even if logging fails.
with mlflow.start_run(run_name=run_name):
    mlflow.set_tags({
        "project": repo_name,
        "notebook": "Demo_train_Notebook1",
        "branch": branch_name,
        "env": env,
        "owner": user_email,
        "run_type": "train",
        "date": datetime.today().strftime('%Y-%m-%d')
    })

    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("train_rows", len(X_train))
    mlflow.log_param("features", X.columns.tolist())

    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "model")

    print(f"✅ Model logged to MLflow under run '{run_name}' on branch '{branch_name}' and env '{env}'")


In [0]:
# Emit a final confirmation line for whoever reads the notebook/job output.
completion_message = "✅ Training complete and model logged."
print(completion_message)
