In [1]:
import numpy as np
import pandas as pd
import warnings
import re
import random

warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
cj = pd.read_csv('example_journey.csv')

In [3]:
cj.head()

Unnamed: 0.1,Unnamed: 0,path,conv,conv_null,Unnamed: 4
0,1,SEM BRAND > SEM NON BRAND > SHOPPING,1,0,
1,2,SHOPPING,1,0,
2,3,SHOPPING,1,0,
3,4,SHOPPING > OTHERS > RETARGETING > OTHERS > RET...,1,0,
4,5,SEM BRAND,1,0,


All journeys in this dataframe are converting journeys. This is indicated by column conv_null

In [4]:
def strip_repeat_channels(df, path = 'path'):
    """Strip journeys like > SHOPPING > SHOPPING > SHOPPNG > DIRECT into > SHOPPING > DIRECT
    For removale effect calculation same to same channel is not relevant
    Ends > conversion or no_conversion to path based on conv column
    
    INPUT: df = dataframe with paths, path = columnname of paths
    OUTPUT: same dataframe with cleaned journeys and list of channel names
    """
    df = df.copy()
    
    #Get all channel names
    all_chnl = []
    for p in df[path]:
        all_chnl.extend(p.split(' > '))
    
    #Get all unique channels
    all_chnl_unique = list(set(all_chnl))
    
    
    #Add start and end touchpint
    df[path] = [ 'start > ' + p + ' > conversion' if c==1 else 'start > ' + p + ' > no_conversion' for c, p in zip(cj['conv'], cj[path])]
    
    
    #Strip consecutive repeats
    for chnl in set(all_chnl):
        df.path =  [re.sub("(> "+chnl+" )+", "> "+chnl+" ", p) for p in df[path]]
        
    return all_chnl_unique, df
 
def calc_trans_matrix(df, channels, path = 'path'):
    trans = [[0]*(len(channels)+1) for _ in range(len(channels)+1)]
    for i, i_from in enumerate(channels):
        #print(i_from + ' > converion' )
        i = i+1
        trans[i][len(channels)] = sum([p.endswith(i_from + ' > conversion') for p in df[path]])/sum([p.count(i_from) for p in df[path]])
        #print(trans[i][len(set(channels))])
        

        for j, j_to in enumerate(channels):
            trans[0][j] = sum([p.startswith('start > ' + j_to) for p in df[path]])/len(df[path])
        
            if sum([p.count(i_from) for p in df[path]]) >0:   
                #print(i_from, " > ",  j_to)
                trans[i][j] = sum([p.count(i_from + ' > ' + j_to) for p in df[path]])/ sum([p.count(i_from) for p in df[path]])
                #print(trans[i][j])
    return trans

In [5]:
def simulate_removal_path(trans_matrix, simulation_n, all_chnl, max_length, remove_chnl = 'NONE'):
    """
    Function simulates n paths returns the amount of converted paths.
    If remove_chnl != NONE functions simulates paths after removal
    trans_matrix = transition matrix
    simulation_n = amount of paths to simulate
    remove_chnl = removed channel    
    """
    all_chnl = all_chnl + ['conversion']
    trans_val = dict(zip(all_chnl, range(1,len(all_chnl)+1)))
    count_conv_removal = 0

    for j in range(0,simulation_n):
        #Calculate removal effects --> Remove CRM = 1
        journey = []
        x = random.choices(population = all_chnl, weights=trans_matrix[0], k=1)

        journey.append(x[0])
        for cj_step in range(0,max_length):   
            if x[0] == 'conversion':
                count_conv_removal = count_conv_removal +1
                #print(journey)
                break
            if (x[0] == remove_chnl) | (x[0] == 'no_conversion'):
                #break journey, if next simulated channel is either removed or ends withour conversion
                break
            x = random.choices(population = all_chnl, weights = trans_matrix[trans_val[x[0]]], k=1)
            journey.append(x[0])
    return count_conv_removal

In [6]:
def removal_effects(cj, simulation_n, max_length = 'max', path = 'path'):
    """
    Function calculates removal effects for dataframe cj.
    cj = input df with paths
    simulation_n = paths to simulate
    max_length = max length of journey
    path = column name in cj containing paths as string
    """
        
    if max_length == 'max':
        max_length = cj.path.str.count('>').max()+1
        print('Max length of journey in dataset: ', max_length)
        
    all_chnl, cj_clean = strip_repeat_channels(cj, path = path)
    trans_matrix = calc_trans_matrix(cj_clean, all_chnl, path = path)
    
    count_conv = simulate_removal_path(trans_matrix, simulation_n, all_chnl, max_length)
    print(count_conv)
    removal_effects = []
    for remove_chnl in all_chnl:
        count_conv_removal = simulate_removal_path(trans_matrix, simulation_n, all_chnl, max_length, remove_chnl= remove_chnl)
        
        removal = 1 - count_conv_removal/count_conv
        removal_effects.append([remove_chnl, removal])
        print('removal ', remove_chnl, ': ',removal)
    return removal_effects

In [7]:
removal = removal_effects(cj, 10000, max_length = 'max', path = 'path')

Max length of journey in dataset:  12
9997
removal  SEM NON BRAND :  0.06651995598679605
removal  DIRECT :  0.2536761028308493
removal  CRM :  0.1399419825947784
removal  PARTNERS :  0.01920576172851851
removal  RETARGETING :  0.03320996298889667
removal  SEO NON BRAND :  0.08642592777833347
removal  SHOPPING :  0.7450235070521156
removal  SEM BRAND :  0.19405821746523955
removal  DISPLAY :  0.015604681404421283
removal  OTHERS :  0.021606481944583367


In [16]:
pd.DataFrame(columns = ['Channel', 'removal_effect'], data = removal)

Unnamed: 0,Channel,removal_effect
0,OTHERS,0.022207
1,RETARGETING,0.03181
2,DIRECT,0.250275
3,PARTNERS,0.016705
4,DISPLAY,0.017405
5,SEO NON BRAND,0.085226
6,CRM,0.141242
7,SEM NON BRAND,0.06592
8,SHOPPING,0.753926
9,SEM BRAND,0.20076
