# Statsmodels

In [3]:
import pandas as pd

In [36]:
df = pd.read_csv('data/merged_data.csv')

df.drop(columns=["UK L'Oreal Paris Haircare Total Online Sellout Units",
                "UK L'Oreal Paris Haircare Online Average Price (in pound)",
                 "UK L'Oreal Paris Haircare Total Online Sellout Value (in pound)"], inplace=True)

df.rename(columns={"UK L'Oreal Paris Haircare Total Offline Sellout Units": "offline_units",
                   "UK L'Oreal Paris Haircare Offline Average Price (in pound)": "offline_average_price",
                   "UK L'Oreal Paris Haircare Total Offline Sellout Value (in pound)": "offline_sellout_value",
                   "UK L'Oreal Paris Haircare Total Weigheted Promotion Distribution (%)": "weighted_promotion_distribution"},
                    inplace=True)

In [37]:
MMM_CATEGORIES = {
    "search": ["google", "amazon"],
    "retail_media": ["amazon_retail", "tesco", "citrus", "the_hut_group"],
    "video": ["google_video", "youtube", "bvod", "linear"],
    "social": ["meta", "pinterest", "tik_tok", "meta_collab_ads", "influencer_management"],
    "display": ["criteo", "testers_and_merchandising"]
}

In [38]:
growth_columns = ["growth_driver_l1", "growth_driver_l2", "growth_driver_l3", "growth_driver_l4", "growth_driver_l5"]

df['Starting Week'] = pd.to_datetime(df['Starting Week'])

# Initialize columns to store execution and investment for each category
for category in MMM_CATEGORIES.keys():
    df[f"{category}_impression"] = 0
    df[f"{category}_spend"] = 0

for index, row in df.iterrows():
    driver = row["growth_driver_l5"]
    for category, drivers in MMM_CATEGORIES.items():
        if driver in drivers:
            df.at[index, f"{category}_impression"] += row["execution"]
            df.at[index, f"{category}_spend"] += row["investment (in pound)"]

df = df.drop(columns=growth_columns + ["metric", "Year_x", "Year_y"])

df_grouped = df.groupby("Starting Week").agg({
    'execution': 'first',
    'offline_average_price': 'first',
    'weighted_promotion_distribution': 'first',
    'offline_sellout_value': 'first',
    'offline_units': 'first',
    'search_impression': 'sum',
    'search_spend': 'sum',
    'retail_media_impression': 'sum',
    'retail_media_spend': 'sum',
    'video_impression': 'sum',
    'video_spend': 'sum',
    'social_impression': 'sum',
    'social_spend': 'sum',
    'display_impression': 'sum',
    'display_spend': 'sum'
}).reset_index()

  df.at[index, f"{category}_impression"] += row["execution"]
  df.at[index, f"{category}_spend"] += row["investment (in pound)"]
  df.at[index, f"{category}_impression"] += row["execution"]
  df.at[index, f"{category}_spend"] += row["investment (in pound)"]
  df.at[index, f"{category}_impression"] += row["execution"]
  df.at[index, f"{category}_spend"] += row["investment (in pound)"]
  df.at[index, f"{category}_impression"] += row["execution"]
  df.at[index, f"{category}_spend"] += row["investment (in pound)"]
  df.at[index, f"{category}_impression"] += row["execution"]
  df.at[index, f"{category}_spend"] += row["investment (in pound)"]


In [39]:
import holidays

# Get UK holidays for 2022 and 2023
uk_holidays = holidays.UnitedKingdom(years=[2022, 2023])

# Convert to DataFrame for better visualization
holiday_data = pd.DataFrame(list(uk_holidays.items()), columns=["Date", "Holiday"])

# Sort the holidays by date
holiday_data['Date'] = pd.to_datetime(holiday_data['Date'])
holiday_data = holiday_data.sort_values(by='Date')

# Add a week number column
holiday_data['Week'] = holiday_data['Date'].dt.strftime('%Y-W%U')

In [40]:
# Add a column to df_grouped to indicate if there is a holiday that week
df_grouped['is_holiday'] = df_grouped['Starting Week'].dt.strftime('%Y-W%U').isin(holiday_data['Week']).astype(int)

# Display the updated DataFrame
df_grouped["is_holiday"].sum()

12

In [41]:
import statsmodels.api as sm

# Define the target variable and the channels
target = 'execution'
channels = list(MMM_CATEGORIES.keys())

# Prepare the data for the model
X = df_grouped[channels]
y = df_grouped[target]

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Print the model summary
print(model.summary())

ModuleNotFoundError: No module named 'statsmodels'

# Meridian

In [42]:
df_grouped["geo"] = "Geo0"
df_grouped.to_csv("data/offline_binned.csv", index=False)

In [60]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
import arviz as az

import IPython

from meridian import constants
from meridian.data import load
from meridian.data import test_utils
from meridian.model import model
from meridian.model import spec
from meridian.model import prior_distribution
from meridian.analysis import optimizer
from meridian.analysis import analyzer
from meridian.analysis import visualizer
from meridian.analysis import summarizer
from meridian.analysis import formatter

coord_to_columns = load.CoordToColumns(
    time='Starting Week',
    geo='geo',
    controls=['weighted_promotion_distribution'],
    #population='population',
    kpi='offline_units',
    revenue_per_kpi='offline_average_price',
    media=[
        'search_impression',
        'retail_media_impression',
        'video_impression',
        'social_impression',
        'display_impression',
    ],
    media_spend=[
        'search_spend',
        'retail_media_spend',
        'video_spend',
        'social_spend',
        'display_spend',
    ],
    #organic_media=['Organic_channel0_impression'],
    #non_media_treatments=['Promo'],
)

correct_media_to_channel = {
    'search_impression': 'search',
    'retail_media_impression': 'retail_media',
    'video_impression': 'video',
    'social_impression': 'social',
    'display_impression': 'display',
}

correct_media_spend_to_channel = {
    'search_spend' : "search",
    'retail_media_spend': "retail_media",
    'video_spend': "video",
    'social_spend': "social",
    'display_spend': "display",
}

loader = load.CsvDataLoader(
    csv_path="data/offline_binned.csv",
    kpi_type='non_revenue',
    coord_to_columns=coord_to_columns,
    media_to_channel=correct_media_to_channel,
    media_spend_to_channel=correct_media_spend_to_channel,
)
data = loader.load()


  self.df[geo_column_name] = self.df[geo_column_name].replace(
  if (constants.GEO) not in self.dataset.dims.keys():
  if constants.MEDIA_TIME not in self.dataset.dims.keys():


In [69]:
roi_mu = 0.5     # Mu for ROI prior for each media channel.
roi_sigma = 0.5  # Sigma for ROI prior for each media channel.
prior = prior_distribution.PriorDistribution(
    roi_m=tfp.distributions.LogNormal(roi_mu, roi_sigma, name=constants.ROI_M)
)
model_spec = spec.ModelSpec(prior=prior)

mmm = model.Meridian(input_data=data, model_spec=model_spec)



In [None]:
mmm.sample_prior(500)
mmm.sample_posterior(n_chains=7, n_adapt=500, n_burnin=500, n_keep=1000)



In [63]:
model_fit = visualizer.ModelFit(mmm)
model_fit.plot_model_fit()

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [67]:
mmm_summarizer = summarizer.Summarizer(mmm)


In [68]:
mmm_summarizer.output_model_results_summary('summary_output.html', filepath='output')

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
