<a href="https://colab.research.google.com/github/AmitKPandey11/100-plus-python-coding-problems-with-solutions/blob/master/converted_mmm_mlflow_pymc_with_shap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Marketing Mix Modeling (MMM) — MLflow + PyMC + Visualizations

This notebook is a conversion of the enhanced MMM Python script into notebook form. It includes:
- optional Google Colab Drive mount
- data loading and parsing
- per-channel adstock tuning
- RidgeCV baseline with adstock + log1p features
- MLflow logging (if mlflow available)
- rich evaluation charts and contributions
- PyMC Bayesian MMM skeleton (if pymc available)
- portable HTML summary generation

**Notes before running:**
- Ensure the required packages are installed in your environment: pandas, numpy, scikit-learn, matplotlib, seaborn, joblib, mlflow (optional), pymc & arviz (optional).
- Place `Train_Dataset.csv` (and optional `Test_Dataset.csv`) in `/mnt/data` or update the DATA_DIR variable.
- Run cells sequentially.


In [None]:
/content/models/monitoring

In [28]:
!pip install mlflow



In [29]:

# Optional: mount Google Drive when running in Google Colab
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print('Mounted /content/drive')
except Exception as e:
    print('Google Colab drive mount skipped or not available:', e)

# Install notes (uncomment to run in Colab)
# !pip install pandas numpy scikit-learn matplotlib seaborn joblib mlflow pymc arviz


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Mounted /content/drive


In [30]:

# Imports & helper functions
import os, json, warnings
warnings.filterwarnings('ignore')
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import joblib
try:
    import mlflow, mlflow.sklearn
    mlflow_available = True
except Exception as e:
    mlflow_available = False
    print('mlflow not available:', e)
try:
    import pymc as pm, arviz as az
    pymc_available = True
except Exception as e:
    pm = None; az = None; pymc_available = False
    print('PyMC/ArviZ not available:', e)

DATA_DIR = '/content/'
TRAIN_FILE = os.path.join(DATA_DIR, 'Train_Dataset.csv')
TEST_FILE = os.path.join(DATA_DIR, 'Test_Dataset.csv')
os.makedirs('models', exist_ok=True)

def adstock_geometric(series, rate):
    out = np.zeros(len(series))
    for i in range(len(series)):
        out[i] = series.iloc[i] + (out[i-1] * rate if i>0 else 0.0)
    return pd.Series(out, index=series.index)

def ensure_dir(d):
    if not os.path.exists(d):
        os.makedirs(d, exist_ok=True)


In [31]:
TRAIN_FILE

'/content/Train_Dataset.csv'

In [32]:

# Load training data
if not os.path.exists(TRAIN_FILE):
    raise FileNotFoundError(f'Train file not found at {TRAIN_FILE}. Update DATA_DIR if needed.')

train = pd.read_csv(TRAIN_FILE)
print('Train shape:', train.shape)
display(train.head())

# parse date-like columns
for c in train.columns:
    if 'date' in c.lower() or 'week' in c.lower() or 'day' in c.lower():
        try:
            train[c] = pd.to_datetime(train[c])
        except Exception:
            pass

# Prefer ORDERS as target if available
if 'ORDERS' in train.columns:
    target_col = 'ORDERS'
else:
    numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
    exclude = [c for c in train.columns if 'id' in c.lower() or 'date' in c.lower()]
    numeric_cols = [c for c in numeric_cols if c not in exclude]
    target_col = numeric_cols[0] if numeric_cols else None
print('Using target_col =', target_col)

# Detect media columns heuristically
exclude_cols = {target_col, 'NEWS_ANOMALY', 'HOLIDAY_FLAG', 'SALE_FLAG', 'CUSTOM_MONTH', 'WEEK_NO_IN_MONTH'}
media_cols = [c for c in train.select_dtypes(include=[np.number]).columns if c not in exclude_cols]
print('Detected media-like numeric columns (sample):', media_cols[:30])


Train shape: (130, 28)


Unnamed: 0,WEEK_START,ASA_APP,BING_DISPLAY,BING_SEARCH,DV360_DISPLAY,DV360_DISPLAY_OR_OLV,FACEBOOK_SOCIAL,GOOGLE_DISPLAY,GOOGLE_DISPLAY_OR_OLV,GOOGLE_SEARCH,...,BRAND_SPEND,REDBOX_APP,LIFTOFF_APP,ORDERS,NEWS_ANOMALY,HOLIDAY_FLAG,SALE_FLAG,WSJ_EMAILS_TOTAL,CUSTOM_MONTH,WEEK_NO_IN_MONTH
0,2022-01-03,19.78,92.44,34854.24,90.04,85.74,596157.82,71.58,73842.95,151262.48,...,31486.48,54.76,85.77,16612,0,1,1,2110938.44,1,1
1,2022-01-10,68.5,26.03,27042.74,57.47,71.62,665526.63,56.17,77942.52,128322.68,...,29486.99,90.34,67.79,18868,0,1,1,1525313.67,1,2
2,2022-01-17,24.45,59.16,35169.95,54.32,52.17,737263.23,29.59,47934.12,92450.86,...,27281.34,60.28,66.14,14425,0,1,0,2015786.95,1,3
3,2022-01-24,41.46,20.58,30953.54,71.58,84.55,505900.98,94.87,42025.72,109841.72,...,33625.09,27.79,21.02,8621,0,0,1,1742444.38,1,4
4,2022-01-31,78.64,26.52,27245.21,16.47,18.09,400583.55,59.54,60106.38,89608.01,...,46937.51,19.39,67.63,11823,0,1,0,1698713.22,2,1


Using target_col = ORDERS
Detected media-like numeric columns (sample): ['ASA_APP', 'BING_DISPLAY', 'BING_SEARCH', 'DV360_DISPLAY', 'DV360_DISPLAY_OR_OLV', 'FACEBOOK_SOCIAL', 'GOOGLE_DISPLAY', 'GOOGLE_DISPLAY_OR_OLV', 'GOOGLE_SEARCH', 'LINKEDIN_SOCIAL', 'META_APP', 'REDDIT_SOCIAL', 'SNAPCHAT_SOCIAL', 'TAPTICA_APP', 'TWITTER_SOCIAL', 'COMMISSIONS_AFFILIATE', 'PLACEMENT_AFFILIATE', 'BRAND_SPEND', 'REDBOX_APP', 'LIFTOFF_APP', 'WSJ_EMAILS_TOTAL']


In [33]:

# Per-channel adstock tuning (grid) using TimeSeriesSplit CV
import numpy as np
rates = list(np.round(np.linspace(0.1, 0.9, 17), 2))  # finer grid from 0.1 to 0.9 step 0.05

tscv = TimeSeriesSplit(n_splits=5)
best_rates = {}

if mlflow_available:
    mlflow.set_experiment('MMM_with_Adstock_and_PyMC')
    run = mlflow.start_run(run_name='adstock_tuning')
    mlflow.log_param('adstock_grid', rates)

for ch in media_cols:
    best_score = -np.inf
    best_r = 0.0
    for r in rates:
        X = train[media_cols].copy().fillna(0.0)
        X[ch] = adstock_geometric(X[ch], r)
        X_trans = np.log1p(X)
        ridge = Ridge()
        try:
            scores = cross_val_score(ridge, X_trans, train[target_col].values, cv=tscv, scoring='neg_mean_squared_error')
            mean_score = scores.mean()
        except Exception as e:
            mean_score = -np.inf
        if mean_score > best_score:
            best_score = mean_score
            best_r = r
    best_rates[ch] = best_r
    print(f'Channel: {ch:30s} best_rate: {best_r}  cv_neg_mse: {best_score:.2f}')
    if mlflow_available:
        mlflow.log_param(f"{ch}_best_rate", float(best_r))
        mlflow.log_metric(f"{ch}_cv_neg_mse", float(best_score))

with open('best_adstock_rates.json','w') as f:
    json.dump(best_rates, f, indent=2)
if mlflow_available:
    mlflow.log_artifact('best_adstock_rates.json')
    mlflow.end_run()

print('Saved best adstock rates to best_adstock_rates.json')


Channel: ASA_APP                        best_rate: 0.1  cv_neg_mse: -30766659.74
Channel: BING_DISPLAY                   best_rate: 0.9  cv_neg_mse: -31077721.62
Channel: BING_SEARCH                    best_rate: 0.55  cv_neg_mse: -30947260.86
Channel: DV360_DISPLAY                  best_rate: 0.7  cv_neg_mse: -28322480.31
Channel: DV360_DISPLAY_OR_OLV           best_rate: 0.1  cv_neg_mse: -30974137.10
Channel: FACEBOOK_SOCIAL                best_rate: 0.1  cv_neg_mse: -30837805.65
Channel: GOOGLE_DISPLAY                 best_rate: 0.5  cv_neg_mse: -29335068.31
Channel: GOOGLE_DISPLAY_OR_OLV          best_rate: 0.1  cv_neg_mse: -30684959.95
Channel: GOOGLE_SEARCH                  best_rate: 0.1  cv_neg_mse: -31079276.53
Channel: LINKEDIN_SOCIAL                best_rate: 0.1  cv_neg_mse: -31689067.57
Channel: META_APP                       best_rate: 0.8  cv_neg_mse: -29646691.54
Channel: REDDIT_SOCIAL                  best_rate: 0.9  cv_neg_mse: -22269308.64
Channel: SNAPCHAT_SOCIAL   

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:

# Build features using best adstock rates and train RidgeCV
with open('best_adstock_rates.json','r') as f:
    best_rates = json.load(f)

X = train[media_cols].copy().fillna(0.0)
for ch in media_cols:
    r = best_rates.get(ch, 0.5)
    X[ch + '_adstock'] = adstock_geometric(X[ch], r)

X_log = np.log1p(X[[c for c in X.columns if c.endswith('_adstock')]])
X_final = X_log.copy()

alphas = np.logspace(-3, 3, 13)
tscv = TimeSeriesSplit(n_splits=5)

if mlflow_available:
    run = mlflow.start_run(run_name='ridge_final_model')
    mlflow.log_param('model', 'RidgeCV')
    mlflow.log_param('alphas', list(map(float, alphas)))

model = RidgeCV(alphas=alphas, cv=tscv, scoring='neg_mean_squared_error').fit(X_final, train[target_col].values)
preds = model.predict(X_final)
mse = mean_squared_error(train[target_col].values, preds)
r2 = r2_score(train[target_col].values, preds)
print('RidgeCV chosen alpha:', model.alpha_, 'MSE:', mse, 'R2:', r2)

joblib.dump(model, 'models/ridge_cv_adstock_model.pkl')
meta = {'target': target_col, 'media_cols': media_cols, 'best_rates': best_rates, 'alphas': list(map(float, alphas))}
with open('models/metadata.json','w') as f:
    json.dump(meta, f, indent=2)
if mlflow_available:
    mlflow.log_metric('train_mse', float(mse))
    mlflow.log_metric('train_r2', float(r2))
    mlflow.sklearn.log_model(model, artifact_path='ridge_model')
    mlflow.log_artifact('models/metadata.json')
    mlflow.end_run()

print('Model and metadata saved to ./models')



RidgeCV chosen alpha: 3.1622776601683795 MSE: 10829881.666337159 R2: 0.5983090590638344




Model and metadata saved to ./models


In [36]:

# Evaluate model, create charts, compute contributions, and log artifacts
model = joblib.load('models/ridge_cv_adstock_model.pkl')
X_train = X_final
y_train = train[target_col].values
preds = model.predict(X_train)
mse = mean_squared_error(y_train, preds)
r2 = r2_score(y_train, preds)
print(f'Train MSE: {mse:.2f}, R2: {r2:.3f}')

# Actual vs predicted
plt.figure(figsize=(10,4))
plt.plot(train.index, y_train, label='actual')
plt.plot(train.index, preds, label='predicted')
plt.legend(); plt.title('Actual vs Predicted'); plt.tight_layout(); plt.savefig('models/actual_vs_predicted.png'); plt.close()

# Residuals
resid = y_train - preds
plt.figure(figsize=(8,4)); plt.hist(resid, bins=30); plt.title('Residuals'); plt.tight_layout(); plt.savefig('models/residuals_hist.png'); plt.close()

# Channel contributions
coefs = pd.Series(model.coef_, index=X_train.columns)
contrib = (coefs * X_train.mean()).sort_values(ascending=False)
contrib.to_csv('models/channel_contributions.csv', header=['contribution'])
plt.figure(figsize=(10,6)); contrib.head(20).plot(kind='bar'); plt.title('Top 20 channel contributions'); plt.tight_layout(); plt.savefig('models/top20_contributions.png'); plt.close()

# Cumulative actual vs predicted
cum_df = pd.DataFrame({'actual': y_train, 'predicted': preds})
cum_df['actual_cum'] = np.cumsum(cum_df['actual'])
cum_df['pred_cum'] = np.cumsum(cum_df['predicted'])
plt.figure(figsize=(10,4)); plt.plot(cum_df['actual_cum'], label='actual_cum'); plt.plot(cum_df['pred_cum'], label='pred_cum'); plt.legend(); plt.title('Cumulative actual vs predicted'); plt.tight_layout(); plt.savefig('models/cumulative_actual_predicted.png'); plt.close()

# Spend vs orders scatter for top channels
for ch in media_cols[:4]:
    x = train[ch].fillna(0.0)
    y = train[target_col]
    plt.figure(figsize=(6,4)); plt.scatter(x,y, alpha=0.6, s=20); plt.xlabel(ch); plt.ylabel(target_col)
    df_tmp = pd.DataFrame({ch: x, target_col: y}).sort_values(ch)
    df_tmp['y_smooth'] = df_tmp[target_col].rolling(window=5, min_periods=1).mean()
    plt.plot(df_tmp[ch], df_tmp['y_smooth'], color='red')
    plt.title(f'{ch} spend vs {target_col} (with smooth)'); plt.tight_layout(); plt.savefig(f'models/{ch}_spend_vs_orders.png'); plt.close()

# Log evaluation artifacts to MLflow
if mlflow_available:
    run = mlflow.start_run(run_name='eval_and_artifacts')
    mlflow.log_metric('eval_mse', float(mse))
    mlflow.log_metric('eval_r2', float(r2))
    for fname in ['models/actual_vs_predicted.png','models/residuals_hist.png','models/top20_contributions.png','models/cumulative_actual_predicted.png']:
        if os.path.exists(fname):
            mlflow.log_artifact(fname, artifact_path='figures')
    mlflow.log_artifact('models/channel_contributions.csv', artifact_path='artifacts')
    mlflow.end_run()

print('Saved evaluation plots and artifacts to models/')

Train MSE: 10829881.67, R2: 0.598
Saved evaluation plots and artifacts to models/


In [37]:

# Robust PyMC Bayesian MMM cell with ArviZ fallbacks
if not (('pm' in globals() and pm is not None) and ('train' in globals() and 'X_final' in globals())):
    print('PyMC or required data not available in memory; ensure imports and feature prep cells have run.')
else:
    print('Running PyMC Bayesian skeleton (this may be slow).')
    X_bayes = X_final.copy().astype(float)
    y_bayes = train[target_col].values.astype(float)

    with pm.Model() as mmm_model:
        sigma = pm.HalfNormal('sigma', sigma=1e4)
        intercept = pm.Normal('intercept', mu=0, sigma=1e4)
        coefs = pm.Normal('coefs', mu=0, sigma=10, shape=X_bayes.shape[1])
        mu = intercept + pm.math.dot(X_bayes.values, coefs)
        y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_bayes)

        trace = pm.sample(1000, tune=1000, target_accept=0.9)

        ppc = pm.sample_posterior_predictive(trace, var_names=['y_obs'], random_seed=42)

    # Try ArviZ conversion using available helpers
    az_data = None
    if 'az' in globals() and az is not None:
        for fn in ('from_pymc', 'from_pymc3', 'from_pymc4'):
            conv = getattr(az, fn, None)
            if callable(conv):
                try:
                    # different wrappers expect different args
                    az_data = conv(trace=trace, posterior_predictive=ppc)
                    print(f'Converted using az.{fn}()')
                    break
                except Exception as e:
                    print(f'az.{fn} conversion failed: {e}')
                    az_data = None

    # Plot using ArviZ if conversion succeeded, otherwise fallback to pm plotting and manual PPC
    if az_data is not None:
        try:
            az.plot_trace(az_data)
            plt.tight_layout(); plt.savefig('models/pymc_trace.png'); plt.close()
        except Exception as e:
            print('az.plot_trace failed:', e)
        try:
            try:
                az.plot_ppc(az_data)
            except Exception:
                # alternate signatures
                az.plot_ppc(az_data, data_pairs={'y_obs':'y_obs'})
            plt.tight_layout(); plt.savefig('models/pymc_ppc.png'); plt.close()
        except Exception as e:
            print('az.plot_ppc failed:', e)
        try:
            az.to_netcdf(az_data, 'models/pymc_inference.nc')
        except Exception as e:
            print('az.to_netcdf failed:', e)
    else:
        # Fallback plotting
        try:
            pm.plot_trace(trace)
            plt.tight_layout(); plt.savefig('models/pymc_trace.png'); plt.close()
        except Exception as e:
            print('pm.plot_trace failed:', e)
        try:
            import numpy as _np
            y_pred_mean = _np.mean(ppc['y_obs'], axis=0)
            plt.figure(figsize=(8,4))
            plt.hist(y_bayes, bins=30, alpha=0.6, label='observed')
            plt.hist(y_pred_mean, bins=30, alpha=0.6, label='ppc_mean')
            plt.legend(); plt.title('Posterior predictive check (observed vs predictive mean)')
            plt.tight_layout(); plt.savefig('models/pymc_ppc_manual.png'); plt.close()
            print('Saved manual PPC to models/pymc_ppc_manual.png')
        except Exception as e:
            print('Manual PPC failed:', e)

    # Log PyMC artifacts to MLflow if available
    if 'mlflow_available' in globals() and mlflow_available:
        try:
            run = mlflow.start_run(run_name='pymc_bayesian_mmm')
            for fname in ('models/pymc_trace.png','models/pymc_ppc.png','models/pymc_ppc_manual.png','models/pymc_inference.nc'):
                if os.path.exists(fname):
                    try:
                        mlflow.log_artifact(fname, artifact_path='pymc')
                    except Exception as e:
                        print('Failed to log artifact to MLflow:', fname, e)
            mlflow.end_run()
        except Exception as e:
            print('MLflow logging for PyMC failed:', e)

    print('PyMC block finished. Artifacts (if any) saved under models/.')

Running PyMC Bayesian skeleton (this may be slow).


Output()

Output()

Manual PPC failed: 'y_obs'
PyMC block finished. Artifacts (if any) saved under models/.


In [38]:

# Create portable HTML summary (embedded images)
import base64
def embed(path):
    if not os.path.exists(path):
        return ''
    ext = os.path.splitext(path)[1].lstrip('.')
    with open(path,'rb') as f:
        data = f.read()
    return f"data:image/{ext};base64," + base64.b64encode(data).decode('utf-8')

imgs = {name: embed(os.path.join('models', name)) for name in ['actual_vs_predicted.png','residuals_hist.png','top20_contributions.png','cumulative_actual_predicted.png','pymc_trace.png','pymc_ppc.png']}
contrib_head = ''
try:
    contrib_head = pd.read_csv('models/channel_contributions.csv', index_col=0).head(30).to_string()
except Exception:
    try:
        contrib_head = contrib.head(30).to_string()
    except Exception:
        contrib_head = 'N/A'

html = f"""
<html><head><meta charset='utf-8'><title>MMM Results Summary</title></head><body>
<h1>MMM Results Summary</h1>
<p>Target: <b>{target_col}</b></p>
<h2>Train metrics</h2><ul><li>MSE: {mse:.2f}</li><li>R2: {r2:.3f}</li></ul>
<h2>Figures</h2>
{f'<img src="{imgs["actual_vs_predicted.png"]}" width="900"><br>' if imgs['actual_vs_predicted.png'] else '<p>actual_vs_predicted not found</p>'}
{f'<img src="{imgs["residuals_hist.png"]}" width="500"><br>' if imgs['residuals_hist.png'] else ''}
{f'<img src="{imgs["top20_contributions.png"]}" width="700"><br>' if imgs['top20_contributions.png'] else ''}
{f'<img src="{imgs["cumulative_actual_predicted.png"]}" width="700"><br>' if imgs['cumulative_actual_predicted.png'] else ''}
{f'<h2>PyMC outputs</h2><img src="{imgs["pymc_trace.png"]}" width="800"><br><img src="{imgs["pymc_ppc.png"]}" width="800"><br>' if imgs['pymc_trace.png'] else ''}
<h2>Top channel contributions</h2><pre>{contrib_head}</pre>
</body></html>
"""

with open('models/summary_embedded.html','w', encoding='utf-8') as f:
    f.write(html)

print('Wrote models/summary_embedded.html')

Wrote models/summary_embedded.html



## Next steps / customization ideas
- Tune adstock grid more finely and fit saturation (Hill) parameters jointly.
- Add seasonality/trend (STL) controls and holiday effects.
- Extend Bayesian model to jointly estimate adstock + saturation + hierarchical priors.
- Use MLflow Model Registry to register and serve the model.



### MLflow Model Versioning & Monitoring (added/updated)

This section trains multiple candidate models, logs them to MLflow, attempts to register them in the Model Registry, and produces basic monitoring artifacts (data drift, performance over time).

**What it does:**
- Trains Ridge, RandomForest, ElasticNetCV as candidates
- Logs params, metrics, artifacts, model signatures to MLflow
- Attempts to register each model in MLflow Model Registry (if available)
- Computes simple KS drift between train and test features and saves monitoring artifacts under `models/monitoring/`
- Saves a `models/model_candidates_summary.csv` summarising train/test metrics for each model


In [40]:

# Multi-model training, MLflow logging, registration & basic monitoring
import os, time, json
import numpy as np
import pandas as pd
import joblib
import mlflow
from sklearn.linear_model import Ridge, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature

# Ensure X_final and train exist
if 'X_final' not in globals():
    raise RuntimeError('X_final not found. Run feature-prep & model training cells first.')

X_train = X_final.copy()
y_train = train[target_col].values

# Prepare test features if test exists
TEST_PATH = os.path.join(DATA_DIR, 'Test_Dataset.csv')
X_test = None
if os.path.exists(TEST_PATH):
    test = pd.read_csv(TEST_PATH)
    X_test = test[media_cols].copy().fillna(0.0)
    for ch in media_cols:
        r = best_rates.get(ch, 0.5)
        X_test[ch + '_adstock'] = adstock_geometric(X_test[ch], r)
    X_test = np.log1p(X_test[[c for c in X_test.columns if c.endswith('_adstock')]])

models_to_try = {
    'ridge': Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(alpha=1.0))]),
    'rf': Pipeline([('rf', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=2))]),
    'elasticnet_cv': Pipeline([('scaler', StandardScaler()), ('en', ElasticNetCV(cv=5))])
}

mlflow_experiment = 'MMM_MultiModel_Registry'
if mlflow_available:
    mlflow.set_experiment(mlflow_experiment)
client = MlflowClient() if mlflow_available else None

results = []

for name, mdl in models_to_try.items():
    print('\nTraining model:', name)
    if mlflow_available:
        mlflow.start_run(run_name=f'model_{name}_{int(time.time())}')
    try:
        mdl.fit(X_train, y_train)
        preds = mdl.predict(X_train)
        mse = mean_squared_error(y_train, preds)
        r2 = r2_score(y_train, preds)
        print(f'{name} train mse={mse:.2f}, r2={r2:.3f}')

        # Save local model
        model_path = f'models/{name}_model.pkl'
        joblib.dump(mdl, model_path)

        # Save importance/coefs
        try:
            if hasattr(mdl.named_steps, 'ridge'):
                coefs = mdl.named_steps['ridge'].coef_
                pd.Series(coefs, index=X_train.columns).to_csv(f'models/{name}_coefs.csv', header=['coef'])
            elif hasattr(mdl.named_steps, 'en'):
                coefs = mdl.named_steps['en'].coef_
                pd.Series(coefs, index=X_train.columns).to_csv(f'models/{name}_coefs.csv', header=['coef'])
            elif hasattr(mdl.named_steps, 'rf'):
                res = permutation_importance(mdl, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2)
                imp = pd.Series(res.importances_mean, index=X_train.columns).sort_values(ascending=False)
                imp.to_csv(f'models/{name}_perm_importance.csv', header=['importance'])
        except Exception as e:
            print('Feature importance save failed for', name, e)

        # Plot actual vs pred
        try:
            import matplotlib.pyplot as plt
            plt.figure(figsize=(8,3))
            plt.plot(y_train, label='actual')
            plt.plot(preds, label='preds')
            plt.legend()
            plt.title(f'{name} actual vs preds')
            plt.tight_layout()
            plt.savefig(f'models/{name}_actual_vs_preds.png')
            plt.close()
        except Exception as e:
            print('Plot failed for', name, e)

        # Infer signature
        try:
            signature = infer_signature(X_train, mdl.predict(X_train))
        except Exception:
            signature = None

        # --- MLflow logging & registry (Option A safe artifact name) ---
        if mlflow_available:
            try:
                run_id = mlflow.active_run().info.run_id
            except Exception as e:
                print("Unable to get active run id:", e)
                run_id = None

            # safe artifact name (no slashes or bad chars)
            artifact_name = f"{name}_artifacts"

            # Log model
            try:
                mlflow.sklearn.log_model(sk_model=mdl, artifact_path=artifact_name, signature=signature)
                print(f"Model logged to MLflow run {run_id} at artifact path '{artifact_name}'")
            except Exception as e:
                print("Failed to log model to MLflow:", e)

            # Build model_uri
            model_uri = None
            if run_id is not None:
                model_uri = f"runs:/{run_id}/{artifact_name}"

            # Try to register
            if client is not None and model_uri:
                try:
                    reg_name = f"MMM_{name}".upper()
                    existing = [m.name for m in client.list_registered_models()]
                    if reg_name not in existing:
                        client.create_registered_model(reg_name)
                        print(f"Created registered model '{reg_name}'")
                    mv = client.create_model_version(name=reg_name, source=model_uri, run_id=run_id)
                    print('Registered model version:', reg_name, mv.version)
                except Exception as e:
                    print('Model registry step skipped/failed:', e)

            # Log metrics
            mlflow.log_param('model_type', name)
            mlflow.log_metric('train_mse', float(mse))
            mlflow.log_metric('train_r2', float(r2))
        # --------------------------------------------------------------

        results.append({'name': name, 'mse': mse, 'r2': r2, 'model_path': model_path})
    finally:
        if mlflow_available:
            mlflow.end_run()

# Save summary
pd.DataFrame(results).to_csv('models/model_candidates_summary.csv', index=False)
print('Saved models/model_candidates_summary.csv')





Training model: ridge
ridge train mse=10425048.80, r2=0.613




Model logged to MLflow run afe1b15378014d679e5dfa8163fc20f9 at artifact path 'ridge_artifacts'
Model registry step skipped/failed: 'MlflowClient' object has no attribute 'list_registered_models'

Training model: rf
rf train mse=2190776.53, r2=0.919




Model logged to MLflow run 6fb7c99e0fb04fd9bb1ae2d2be029f3e at artifact path 'rf_artifacts'
Model registry step skipped/failed: 'MlflowClient' object has no attribute 'list_registered_models'

Training model: elasticnet_cv
elasticnet_cv train mse=17135055.64, r2=0.364




Model logged to MLflow run 4dd1987fbd094188a825d058f4c72629 at artifact path 'elasticnet_cv_artifacts'
Model registry step skipped/failed: 'MlflowClient' object has no attribute 'list_registered_models'
Saved models/model_candidates_summary.csv


In [41]:

# Basic monitoring helpers: compute KS drift and log monitoring artifacts
import scipy.stats as stats
os.makedirs('models/monitoring', exist_ok=True)

def compute_ks_drift(train_vec, test_vec):
    try:
        ks = stats.ks_2samp(train_vec, test_vec)
        return {'statistic': float(ks.statistic), 'pvalue': float(ks.pvalue)}
    except Exception as e:
        return {'error': str(e)}

monitor_records = []
for r in results:
    name = r['name']
    record = {'model': name, 'train_mse': float(r['mse']), 'train_r2': float(r['r2'])}
    if X_test is not None:
        ks_stats = {}
        for feat in X_train.columns:
            ks_stats[feat] = compute_ks_drift(X_train[feat].values, X_test[feat].values)
        with open(f'models/monitoring/{name}_data_drift.json','w') as f:
            json.dump(ks_stats, f, indent=2)
        if target_col in test.columns:
            y_test = test[target_col].values
            preds_test = joblib.load(r['model_path']).predict(X_test)
            test_mse = mean_squared_error(y_test, preds_test)
            test_r2 = r2_score(y_test, preds_test)
            record.update({'test_mse': float(test_mse), 'test_r2': float(test_r2)})
            perf_path = f'models/monitoring/{name}_perf_over_time.csv'
            pd.DataFrame([{'timestamp': pd.Timestamp.now(), 'test_mse': test_mse, 'test_r2': test_r2}]).to_csv(perf_path, index=False)
    monitor_records.append(record)
    if mlflow_available:
        mlflow.start_run(run_name=f'monitoring_{name}_{int(time.time())}')
        mlflow.log_metric('monitor_train_mse', float(r['mse']))
        if 'test_mse' in record:
            mlflow.log_metric('monitor_test_mse', float(record['test_mse']))
        drift_file = f'models/monitoring/{name}_data_drift.json'
        if os.path.exists(drift_file):
            mlflow.log_artifact(drift_file, artifact_path='monitoring')
        mlflow.end_run()

pd.DataFrame(monitor_records).to_csv('models/monitoring/monitoring_summary.csv', index=False)
print('Wrote models/monitoring/monitoring_summary.csv')


Wrote models/monitoring/monitoring_summary.csv


In [42]:

# SHAP attribution and additional response-curve charts
# Requires shap package (pip install shap). This cell is defensive and will skip steps if shap isn't available.
import os
try:
    import shap
    shap_available = True
except Exception as e:
    shap_available = False
    print('shap not available:', e)

# Choose a model to explain: prefer RandomForest if trained, else the ridge model
expl_model = None
expl_model_name = None
if os.path.exists('models/rf_model.pkl'):
    try:
        expl_model = joblib.load('models/rf_model.pkl')
        expl_model_name = 'rf'
    except Exception:
        expl_model = None
if expl_model is None and os.path.exists('models/ridge_model.pkl'):
    try:
        expl_model = joblib.load('models/ridge_model.pkl')
        expl_model_name = 'ridge'
    except Exception:
        expl_model = None
if expl_model is None and 'model' in globals():
    expl_model = globals().get('model')
    expl_model_name = 'active_model'

print('Explainer model:', expl_model_name)

# Prepare X for SHAP: use X_final if present
if 'X_final' in globals():
    X_explain = X_final.copy()
else:
    raise RuntimeError('X_final not found. Re-run feature preparation cells before SHAP cell.')

# Compute SHAP values if available
if shap_available and expl_model is not None:
    try:
        # For tree models, use TreeExplainer; otherwise KernelExplainer (slower)
        if hasattr(shap, 'TreeExplainer') and ('RandomForest' in type(expl_model).__name__ or 'RandomForest' in str(type(expl_model))):
            expl = shap.TreeExplainer(expl_model)
        else:
            # KernelExplainer needs a background dataset; use a sample of X
            background = X_explain.sample(n=min(50, len(X_explain)), random_state=42)
            expl = shap.KernelExplainer(lambda v: expl_model.predict(pd.DataFrame(v, columns=X_explain.columns)), background)
        shap_values = expl.shap_values(X_explain)
        # Save SHAP summary plot
        try:
            plt.figure(figsize=(8,6))
            shap.summary_plot(shap_values, X_explain, show=False)
            plt.tight_layout(); plt.savefig('models/shap_summary.png', dpi=150); plt.close()
        except Exception as e:
            print('Could not create SHAP summary plot:', e)
        # Save shap values to csv (may be large)
        try:
            if isinstance(shap_values, list):
                # for multi-output models shap_values can be a list; take first
                arr = shap_values[0]
            else:
                arr = shap_values
            pd.DataFrame(arr, columns=X_explain.columns).to_csv('models/shap_values.csv', index=False)
            print('Saved models/shap_values.csv and shap_summary.png')
        except Exception as e:
            print('Saving SHAP values failed:', e)
    except Exception as e:
        print('SHAP explain failed:', e)
else:
    print('Skipping SHAP — shap not installed or model not found.')

# Additional charts: per-channel response curves (vary one channel across its range, keep others at mean)
os.makedirs('models/response_curves', exist_ok=True)
for ch in media_cols[:10]:  # limit to top 10 channels for speed
    try:
        base = X_explain.copy()
        # original (pre-log) values: we reverse log1p and adstock approx - but here we'll simulate by scaling
        ch_range = np.linspace(train[ch].min(), train[ch].max(), 50)
        preds = []
        for v in ch_range:
            temp = base.copy()
            # set the adstocked feature for this channel to log1p(adstock_of_v)
            # approximate adstock of a single value repeated: adstock = v / (1-rate) -> use average best_rate
            r = best_rates.get(ch, 0.5)
            adstock_approx = v / (1 - r) if (1 - r) > 0 else v
            temp[colname := ch + '_adstock'] = np.log1p(adstock_approx)
            preds.append(expl_model.predict(temp.values if hasattr(expl_model, 'predict') else temp))
        # preds may be 2D if model returns array; take mean across rows
        preds = np.array(preds)
        if preds.ndim == 2:
            preds_mean = preds.mean(axis=1)
        else:
            preds_mean = preds
        # Plot response curve
        plt.figure(figsize=(6,4))
        plt.plot(ch_range, preds_mean)
        plt.title(f'Response curve for {ch} (approx)')
        plt.xlabel(ch); plt.ylabel('predicted ' + target_col)
        plt.tight_layout(); plt.savefig(f'models/response_curves/{ch}_response_curve.png', dpi=150); plt.close()
    except Exception as e:
        print('Response curve failed for', ch, e)

print('SHAP + response curves cell finished.')

Explainer model: rf


  0%|          | 0/130 [00:00<?, ?it/s]

Saved models/shap_values.csv and shap_summary.png
SHAP + response curves cell finished.
