In [1]:
import mlflow
import mlflow.sklearn

# The notebook lives in notebooks/ while the SQLite tracking store sits in the
# project root, hence the "../" in the URI (it is resolved against the kernel's
# working directory).
TRACKING_URI = "sqlite:///../mlflow.db"
EXPERIMENT_NAME = "AI Job Salary Prediction"

mlflow.set_tracking_uri(TRACKING_URI)

# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)


2026/01/25 22:48:26 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/25 22:48:26 INFO mlflow.store.db.utils: Updating database tables
2026/01/25 22:48:26 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/25 22:48:26 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/25 22:48:26 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/25 22:48:26 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='file:///c:/Users/ASUS/Desktop/job-salary-ml/notebooks/mlruns/1', creation_time=1768327109984, experiment_id='1', last_update_time=1768327109984, lifecycle_stage='active', name='AI Job Salary Prediction', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [3]:
# Read the raw postings, derive the regression target as the midpoint of the
# min/max salary columns, then drop the id and the two salary bounds so they
# are not available as features.
df = (
    pd.read_csv("../data/raw/ai_jobs.csv")
    .assign(
        avg_salary=lambda jobs: (jobs["salary_min_usd"] + jobs["salary_max_usd"]) / 2
    )
    .drop(columns=["job_id", "salary_min_usd", "salary_max_usd"])
)

# Target first, then every remaining column as the feature matrix.
y = df["avg_salary"]
X = df.drop(columns="avg_salary")

# 80/20 hold-out split; the fixed seed keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Load the serialized preprocessing step (presumably produced by an earlier
# notebook/script in this project — TODO confirm where it is written).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
# NOTE(review): this single object is used as the "preprocessor" step of both
# pipelines below, and each Pipeline.fit call re-fits it in place on X_train.
preprocessor = joblib.load("../artifacts/preprocessor.pkl")


In [5]:
## Model 1: Ridge Regression with mlflow
# One MLflow run: fit preprocessing + Ridge on the training split, score it on
# the held-out split, and record the hyperparameters and error metrics.
with mlflow.start_run(run_name="Ridge"):
    ridge_alpha = 1.0  # regularization strength, reused for the model and the log

    # fit() returns the fitted pipeline itself, so it can be chained here.
    ridge_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", Ridge(alpha=ridge_alpha)),
    ]).fit(X_train, y_train)

    y_pred = ridge_pipeline.predict(X_test)

    # RMSE is the square root of the mean squared error on the test split.
    mae_ridge = mean_absolute_error(y_test, y_pred)
    rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred))

    mlflow.log_params({"model": "Ridge", "alpha": ridge_alpha})
    mlflow.log_metrics({"MAE": mae_ridge, "RMSE": rmse_ridge})


In [None]:
## Model 2: Random Forest Regressor with mlflow
# One MLflow run: fit preprocessing + RandomForestRegressor on the training
# split, score it on the held-out split, and record hyperparameters and errors.
#
# The hyperparameters live in a single dict that both configures the estimator
# and is logged, so the run record cannot drift from the model that was
# actually trained. random_state is part of it so the seed is tracked as well.
rf_params = {
    "n_estimators": 200,
    "max_depth": 12,
    "min_samples_leaf": 50,
    "random_state": 42,
}

with mlflow.start_run(run_name="RandomForest"):
    rf_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        # n_jobs=-1 uses all available cores; it is a runtime setting rather
        # than a model hyperparameter, so it is left out of the logged params.
        ("model", RandomForestRegressor(**rf_params, n_jobs=-1)),
    ])

    rf_pipeline.fit(X_train, y_train)
    y_pred = rf_pipeline.predict(X_test)

    # RMSE is the square root of the mean squared error on the test split.
    mae_rf = mean_absolute_error(y_test, y_pred)
    rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))

    mlflow.log_params({"model": "RandomForest", **rf_params})
    mlflow.log_metrics({"MAE": mae_rf, "RMSE": rmse_rf})


: 