In [189]:
import pandas as pd
import numpy as np

In [190]:
# Load csv files
channel_spending = pd.read_csv("../data/channel_spending.csv")
channel = pd.read_csv("../data/channel.csv")
customer = pd.read_csv("../data/customer.csv")
loan = pd.read_csv("../data/loan.csv")
sales_opportunities = pd.read_csv("../data/sales_opportunities.csv")
touchpoints = pd.read_csv("../data/touchpoints.csv")

In [191]:
# Merge touchpoints with channel data for clarity
touchpoints = touchpoints.merge(channel, on="ChannelID", how="left")
touchpoints

Unnamed: 0,TouchpointID,CustomerID,ChannelID,Timestamp,ChannelName
0,T00001,C0001,CH03,2023-02-18 00:51:00,Direct
1,T00002,C0001,CH01,2023-07-17 22:59:00,Paid Search
2,T00003,C0001,CH03,2024-01-21 06:02:00,Direct
3,T00004,C0001,CH05,2024-08-07 13:46:00,Phone Call
4,T00005,C0001,CH02,2024-09-01 13:08:00,Email
...,...,...,...,...,...
5572,T04945,C1000,CH04,2023-08-22 20:48:00,Referral
5573,T04946,C1000,CH03,2023-12-28 07:24:00,Direct
5574,T04947,C1000,CH04,2024-01-29 10:29:00,Referral
5575,T04948,C1000,CH01,2024-10-29 19:21:00,Paid Search


### 1. Model the customer journey based on the conditional probabilities

In [192]:
# Step 1: Create customer journeys (sequences of touchpoints)
# Sort by CustomerID and Timestamp to build ordered journeys
touchpoints = touchpoints.sort_values(by=["CustomerID", "Timestamp"])
journeys = touchpoints.groupby("CustomerID")["ChannelName"].apply(list)

# Add "Start" at the beginning and "Converted" at the end for completed conversions
# Use "NoConversion" for non-converters if needed
def add_start_and_end(journey):
    # Ensure "Start" is at the beginning
    journey = ["Start"] + journey

    # Check if the last element is "Converted"
    if journey[-1] != "Converted":
        # If not, append "NoConversion" at the end
        journey.append("NoConversion")
    return journey

journeys = journeys.apply(add_start_and_end)

# Calculate the baseline conversion rate (i.e., fraction of journeys ending with "Converted")
baseline_conv_rate = journeys.apply(lambda j: j[-1] == "Converted").mean()
baseline_conv_rate


0.628

### 2. Simulate the conversion of users based on the probability matrix to obtain the conversion rates of the entire system

In [193]:
# Step 2: Create a transition matrix
# To build a transition matrix, which represents the probabilities of moving from one state (channel) to another
transitions = {}

for journey in journeys:
    for i in range(len(journey) - 1):
        current_state = journey[i]
        next_state = journey[i + 1]

        if current_state not in transitions:
            transitions[current_state] = {} # Initialize if state doesn't exist
        if next_state not in transitions[current_state]:
            transitions[current_state][next_state] = 0 # Initialize the next state count
        transitions[current_state][next_state] += 1

# Convert the transition dictionary into a DataFrame (transition matrix)
states = list(transitions.keys())
transition_matrix = pd.DataFrame(0, index=states, columns=states)

for current_state, next_states in transitions.items():
    total_transitions = sum(next_states.values()) # Total transitions from this state
    for next_state, count in next_states.items():
        transition_matrix.loc[current_state, next_state] = count / total_transitions

transition_matrix


Unnamed: 0,Start,Direct,Paid Search,Phone Call,Email,Referral,NoConversion,Converted
Start,0,0.198,0.205,0.184,0.208,0.205,,
Direct,0,0.13067,0.158747,0.165227,0.185745,0.171706,0.066955,0.12095
Paid Search,0,0.135961,0.181281,0.172414,0.152709,0.133005,0.077833,0.146798
Phone Call,0,0.145709,0.149701,0.173653,0.169661,0.166667,0.076846,0.117764
Email,0,0.168116,0.172947,0.155556,0.169082,0.142995,0.074396,0.116908
Referral,0,0.15345,0.15448,0.159629,0.159629,0.161689,0.0793,0.131823


In [194]:
# ==============
#     DEBUG
# ==============
print("Total Journeys:", len(journeys))
print("Journeys ending in 'Converted':", sum(journey[-1] == "Converted" for journey in journeys))
print("Baseline Conversion Rate:", baseline_conv_rate)

Total Journeys: 1000
Journeys ending in 'Converted': 628
Baseline Conversion Rate: 0.628


In [195]:
print("Transition Matrix Shape:", transition_matrix.shape)
print("Transition Matrix Columns:", transition_matrix.columns.tolist())
print("Transition Matrix Rows:", transition_matrix.index.tolist())

Transition Matrix Shape: (6, 8)
Transition Matrix Columns: ['Start', 'Direct', 'Paid Search', 'Phone Call', 'Email', 'Referral', 'NoConversion', 'Converted']
Transition Matrix Rows: ['Start', 'Direct', 'Paid Search', 'Phone Call', 'Email', 'Referral']


### 3. Understand the contribution of every channel by removing each of them from the system and re-calculating the conversions, known as the Removal Effect

In [196]:
# Step 3: Calculate the removal effect of each channel
channels = [state for state in states if state not in ["Start", "Converted", "NoConversion"]]
removal_effects = {}

for channel_to_remove in channels:
    # Make a copy of the transition matrix to modify
    temp_matrix = transition_matrix.copy()
    
    # Remove the channel by zeroing out its row (outgoing transitions)
    # and its column (incoming transitions), but keep "Start" and "NoConversion"
    temp_matrix.loc[channel_to_remove, :] = 0
    temp_matrix.loc[:, channel_to_remove] = 0

    # Ensure that "Start" and "NoConversion" are still properly connected
    temp_matrix.loc["Start", :] = 0  # Ensure no transition from "Start" is removed
    temp_matrix.loc[:, "Start"] = 0  # Ensure no incoming transition is removed from "Start"
    
    temp_matrix.loc["NoConversion", :] = 0  # Ensure "NoConversion" isn't removed
    temp_matrix.loc[:, "NoConversion"] = 0  # Ensure no incoming transition is removed from "NoConversion"

    # Build a full state space (make sure rows and columns are identical)
    full_states = temp_matrix.index.union(temp_matrix.columns)
    temp_matrix = temp_matrix.reindex(index=full_states, columns=full_states, fill_value=0)

    # Initialize a probability vector where all probability starts at "Start"
    probabilities = pd.Series(0, index=full_states)
    probabilities["Start"] = 1

    # Iterate the chain to let probabilities converge (increase iterations if needed)
    for _ in range(1000):
        probabilities = temp_matrix.dot(probabilities)

    # Get the conversion probability after removal
    removal_conv_rate = probabilities.get("Converted", 0)

    # Calculate removal effect as the relative drop in conversion rate
    removal_effect = 1 - (removal_conv_rate / baseline_conv_rate) if baseline_conv_rate > 0 else np.nan
    removal_effects[channel_to_remove] = removal_effect

removal_effects

  temp_matrix.loc[:, channel_to_remove] = 0
  temp_matrix.loc[:, "NoConversion"] = 0  # Ensure no incoming transition is removed from "NoConversion"
  temp_matrix.loc[:, channel_to_remove] = 0
  temp_matrix.loc[:, "NoConversion"] = 0  # Ensure no incoming transition is removed from "NoConversion"
  temp_matrix.loc[:, channel_to_remove] = 0
  temp_matrix.loc[:, "NoConversion"] = 0  # Ensure no incoming transition is removed from "NoConversion"
  temp_matrix.loc[:, channel_to_remove] = 0
  temp_matrix.loc[:, "NoConversion"] = 0  # Ensure no incoming transition is removed from "NoConversion"
  temp_matrix.loc[:, channel_to_remove] = 0
  temp_matrix.loc[:, "NoConversion"] = 0  # Ensure no incoming transition is removed from "NoConversion"


{'Direct': 1.0,
 'Paid Search': 1.0,
 'Phone Call': 1.0,
 'Email': 1.0,
 'Referral': 1.0}

In [197]:
# Step 4: Allocate weights from removal effects
# Compute channel weights by normalizing the removal effects:
total_effect = sum(removal_effects.values())
if total_effect > 0:
    channel_weights = {ch: (eff / total_effect)*100 for ch, eff in removal_effects.items()}
else:
    print("Warning: Total removal effect is zero, check transition matrix!")
    channel_weights = {ch: 0 for ch in removal_effects}
    
channel_weights

{'Direct': 20.0,
 'Paid Search': 20.0,
 'Phone Call': 20.0,
 'Email': 20.0,
 'Referral': 20.0}

https://medium.com/@bhaskarammu/a-machine-learning-approach-to-marketing-attribution-601ca22ac8bd