In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# load data
data=pd.read_csv('E://cv//MMM_test_data.csv', sep=',', encoding='latin-1', header=0)


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   start_of_week    104 non-null    object 
 1   revenue          104 non-null    float64
 2   spend_channel_1  104 non-null    float64
 3   spend_channel_2  104 non-null    float64
 4   spend_channel_3  104 non-null    float64
 5   spend_channel_4  104 non-null    float64
 6   spend_channel_5  104 non-null    float64
 7   spend_channel_6  104 non-null    float64
 8   spend_channel_7  104 non-null    float64
dtypes: float64(8), object(1)
memory usage: 7.4+ KB


In [14]:
data

Unnamed: 0,start_of_week,revenue,spend_channel_1,spend_channel_2,spend_channel_3,spend_channel_4,spend_channel_5,spend_channel_6,spend_channel_7
0,30-08-20,157906.75,2625.48,262.71,12954.12,3609.63,12955.29,12659.12,19379.79
1,06-09-20,186425.68,2634.01,108.66,8760.28,4560.60,12747.70,12338.18,22473.45
2,13-09-20,161607.39,2087.08,110.32,7155.42,4362.96,15015.41,10811.15,22596.05
3,20-09-20,180089.13,1690.70,52.79,15185.22,3883.41,15521.41,12890.22,24728.73
4,27-09-20,217793.98,1547.30,80.56,18524.05,4043.09,15793.74,12642.55,26515.48
...,...,...,...,...,...,...,...,...,...
99,24-07-22,72021.50,0.00,1118.78,22305.10,4566.01,2987.70,0.00,19916.88
100,31-07-22,90786.21,0.00,122.16,15812.65,4795.51,3293.40,0.00,22687.98
101,07-08-22,105929.40,0.00,754.09,12166.85,4961.12,4057.03,0.00,30142.64
102,14-08-22,90506.31,0.00,1043.22,10856.58,4431.13,3086.81,0.00,28622.78


# Modeling the adstock effect
The adstock effect captures the delayed influence of advertising expenditure. We'll use a simple adstock model in which the effect of advertising decreases exponentially over time.

In [15]:
# Function to apply the adstock transformation
def adstock_transform(spend, decay_rate):
    adstock = np.zeros_like(spend)
    for t in range(1, len(spend)):
        adstock[t] = spend[t] + decay_rate * adstock[t-1]
    return adstock

In [16]:
# Apply adstock transformation to each channel (example with a rate of 0.5)
decay_rate = 0.5
adstock_data = np.zeros((len(data), 7))
for i in range(1, 8):
    adstock_data[:, i-1] = adstock_transform(data[f'spend_channel_{i}'].values, decay_rate)


# construction of an extension of classical linear regression
Polynomial regression: polynomial regression is a type of non-linear regression that models the relationship between the input variable and the output variable as a polynomial function of the nth degree.

In [17]:
# Prepare data for linear regression
X = adstock_data  # Explanatory variables (expenditure adstock)
y = data['revenue']  # Target variable (revenues)
# Division of data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [20]:
# Add polynomial terms
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [23]:
# Create linear regression model
model = LinearRegression()

# Model training with cross-validation
scores = cross_val_score(model, X_train_poly, y_train, cv=5, scoring='r2')
print(f'Validation croisée R^2 scores: {scores}')
print(f'Mean cross-validated R^2 score: {scores.mean()}')

# Training on the complete set
model.fit(X_train_poly, y_train)

# Model coefficients
coefficients = dict(zip(['channel_'+str(i) for i in range(1, 8)], model.coef_))
print('Channel coefficients :', coefficients)

Validation croisée R^2 scores: [ -0.35619441  -1.30237142  -1.73866232 -12.40802647  -0.12454024]
Mean cross-validated R^2 score: -3.185958972467641
Channel coefficients : {'channel_1': 15.378211762601307, 'channel_2': -53.3985198083431, 'channel_3': -5.099308258671339, 'channel_4': -19.652272707333275, 'channel_5': -1.0348742545159157, 'channel_6': 6.433994743943118, 'channel_7': 4.527450351853535}


#  Evaluating model performance
We will evaluate the model's performance using metrics such as RMSE (Root Mean Squared Error) and R^2

In [24]:
# Prédictions et évaluation
y_pred = model.predict(X_test_poly)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R^2: {r2}')


RMSE: 27080.19838788203
R^2: 0.6942250854567515


# Conclusions and ROI
We'll analyze the coefficients to understand the impact of each channel on revenues, and calculate the ROI for each channel.

In [25]:
# Display coefficients
for channel, coef in coefficients.items():
    print(f'Impact du canal {channel} : {coef:.2f}')

# Calculate ROI for each channel
roi = {}
for i in range(7):
    total_spend = data[f'spend_channel_{i+1}'].sum()
    total_impact = coefficients[f'channel_{i+1}'] * total_spend
    roi[f'channel_{i+1}'] = total_impact / total_spend

# Display ROI
print(roi)

# Best channel in terms of ROI
best_channel = max(roi, key=roi.get)
print(f'The best channel in terms of ROI is {best_channel} with an ROI of {roi[best_channel]}')

Impact du canal channel_1 : 15.38
Impact du canal channel_2 : -53.40
Impact du canal channel_3 : -5.10
Impact du canal channel_4 : -19.65
Impact du canal channel_5 : -1.03
Impact du canal channel_6 : 6.43
Impact du canal channel_7 : 4.53
{'channel_1': 15.378211762601307, 'channel_2': -53.3985198083431, 'channel_3': -5.099308258671339, 'channel_4': -19.652272707333275, 'channel_5': -1.0348742545159157, 'channel_6': 6.433994743943118, 'channel_7': 4.527450351853535}
The best channel in terms of ROI is channel_1 with an ROI of 15.378211762601307
