In [95]:
import pandas as pd
import matplotlib.pyplot as ply
import numpy as np
import math
import ast

In [2]:
# Load the attribution data. Per the preview below, each row carries a
# convert_TF flag, up to five ordered touch_* channels and a spend tier.
df_c = pd.read_csv('attribution_allocation_student_data.csv')

In [3]:
df_c.head()

Unnamed: 0,convert_TF,touch_1,touch_2,touch_3,touch_4,touch_5,tier
0,True,email,direct,social,,,1
1,True,social,direct,organic_search,,,1
2,True,organic_search,display,social,,,1
3,True,social,direct,,,,1
4,True,social,display,direct,,,1


In [17]:
df_c.dtypes

convert_TF      bool
touch_1       object
touch_2       object
touch_3       object
touch_4       object
touch_5       object
tier           int64
dtype: object

In [20]:
# Every marketing channel that may appear in a touch_* column.
# The order is kept as-is because the count dictionaries are built from it.
channel = [
    'social',
    'organic_search',
    'referral',
    'email',
    'paid_search',
    'display',
    'direct',
]

In [70]:
# Load the channel spend table. Per the preview below, it has one row per tier
# plus a 'total' row, with the per-channel spend stored as a dict-like string.
df_s = pd.read_csv('channel_spend_student_data.csv')

In [80]:
df_s

Unnamed: 0,tier,spend by channel
0,1,"{'social': 50, 'organic_search': 0, 'referral'..."
1,2,"{'social': 100, 'organic_search': 0, 'referral..."
2,3,"{'social': 150, 'organic_search': 0, 'referral..."
3,total,"{'social': 300, 'organic_search': 0, 'referral..."


In [97]:
df_s.dtypes

tier                object
spend by channel    object
dtype: object

In [98]:
# The 'spend by channel' values look like dictionaries but are read from the CSV
# as plain strings. Parse each one and spread its entries into one column per
# channel so the spend figures can be looked up directly.

for idx, raw_spend in df_s['spend by channel'].items():
    # literal_eval safely turns the string into a real dict (no arbitrary code execution)
    for name, amount in ast.literal_eval(raw_spend).items():
        df_s.at[idx, name] = amount

In [102]:
# The raw string column is redundant once it has been expanded into channel columns
df_s = df_s.drop(columns=['spend by channel'])

In [103]:
df_s

Unnamed: 0,tier,social,organic_search,referral,email,paid_search,display,direct
0,1,50.0,0.0,50.0,50.0,50.0,50.0,0.0
1,2,100.0,0.0,100.0,100.0,100.0,100.0,0.0
2,3,150.0,0.0,150.0,150.0,150.0,150.0,0.0
3,total,300.0,0.0,300.0,300.0,300.0,300.0,0.0


## Part 1. Attribution

### Method -- Last Interaction

In [109]:
# Tally of converted ('True') and non-converted ('False') customers per channel,
# tagged with the name of the attribution method that filled it.

LI_count = {'Method': 'Last Interaction'}
LI_count.update({name: {'True': 0, 'False': 0} for name in channel})

In [110]:
# Credit each customer entirely to the last channel they touched.
for row in df_c.itertuples():
    # row[1] is convert_TF and row[2]..row[6] are touch_1..touch_5, so scanning
    # positions 6 -> 2 finds the latest touch that is not NaN.
    last_touch = next(
        (row[pos] for pos in range(6, 1, -1) if not pd.isnull(row[pos])),
        None,
    )

    # Rows without any recorded touch contribute nothing
    if last_touch is None:
        continue

    # convert_TF is a boolean, so it selects the tally bucket directly
    outcome = 'True' if row[1] else 'False'
    LI_count[last_touch][outcome] += 1
        

In [111]:
LI_count

{'Method': 'Last Interaction',
 'social': {'True': 352, 'False': 34287},
 'organic_search': {'True': 662, 'False': 51494},
 'referral': {'True': 10, 'False': 659},
 'email': {'True': 323, 'False': 25449},
 'paid_search': {'True': 12, 'False': 681},
 'display': {'True': 406, 'False': 57006},
 'direct': {'True': 613, 'False': 46458}}

In [118]:
# Sanity check: the per-channel tallies should add up to the number of customers

su = sum(
    tally['True'] + tally['False']
    for name, tally in LI_count.items()
    if name != 'Method'
)

print(su)
print(su == df_c.shape[0])

218412
True


In [121]:
# Calculate CAC for each channel without considering tiers

def CAC_calc(d, spend=None):

    '''
    Compute the customer acquisition cost (CAC) of every marketing channel.

    input -> d (dict): attribution counts. The 'Method' key holds the name of
    the attribution method; every other key is a marketing channel mapped to
    a sub-dictionary whose 'True' entry is the (possibly fractional) number
    of converted customers credited to that channel.

    input -> spend (DataFrame, optional): spend table with a 'tier' column and
    one column per channel. The row whose tier equals 'total' supplies each
    channel's total marketing cost. Defaults to the notebook-level df_s.

    return -> (dict) a dictionary with 'Method' copied from the input and one
    entry per channel holding total marketing cost divided by the number of
    converted customers. A channel with no conversions gets NaN, because its
    CAC is undefined.

    raises -> ValueError if the spend table has no 'total' row.
    '''

    if spend is None:
        spend = df_s

    # Look up the 'total' row once instead of re-filtering for every channel
    total_rows = spend.loc[spend['tier'] == 'total']
    if total_rows.empty:
        raise ValueError("spend table has no row with tier == 'total'")
    total_row = total_rows.iloc[0]

    # Initiate the result dictionary
    result = {'Method': d['Method']}

    # Calculation
    for key, counts in d.items():
        if key == 'Method':
            continue

        # Take the scalar explicitly; float() on a one-element Series is deprecated
        total_c = float(total_row[key])
        n_succ = counts['True']

        # Avoid ZeroDivisionError for channels that were never credited
        result[key] = total_c / n_succ if n_succ else float('nan')

    return result

In [122]:
CAC_calc(LI_count)

{'Method': 'Last Interaction',
 'social': 0.8522727272727273,
 'organic_search': 0.0,
 'referral': 30.0,
 'email': 0.9287925696594427,
 'paid_search': 25.0,
 'display': 0.7389162561576355,
 'direct': 0.0}

### Method -- First Interaction

In [132]:
# Tally of converted ('True') and non-converted ('False') customers per channel

FI_count = {'Method': 'First Interaction'}
FI_count.update((name, {'True': 0, 'False': 0}) for name in channel)


# Credit each customer entirely to the first channel they touched
for row in df_c.itertuples():
    # convert_TF (row[1]) is a boolean, so it picks the tally bucket directly
    outcome = 'True' if row[1] else 'False'

    # row[2]..row[6] are touch_1..touch_5; stop at the first one that is not NaN
    for pos in range(2, 7):
        touch = row[pos]
        if pd.isnull(touch):
            continue
        FI_count[touch][outcome] += 1
        break
        

In [133]:
FI_count

{'Method': 'First Interaction',
 'social': {'True': 340, 'False': 34696},
 'organic_search': {'True': 675, 'False': 51846},
 'referral': {'True': 10, 'False': 681},
 'email': {'True': 357, 'False': 25226},
 'paid_search': {'True': 12, 'False': 635},
 'display': {'True': 434, 'False': 56742},
 'direct': {'True': 550, 'False': 46208}}

In [134]:
# Sanity check: the per-channel tallies should add up to the number of customers

fi_tallies = [tally for name, tally in FI_count.items() if name != 'Method']
su = sum(t['True'] for t in fi_tallies) + sum(t['False'] for t in fi_tallies)

print(su)
print(su == df_c.shape[0])

218412
True


In [135]:
CAC_calc(FI_count)

{'Method': 'First Interaction',
 'social': 0.8823529411764706,
 'organic_search': 0.0,
 'referral': 30.0,
 'email': 0.8403361344537815,
 'paid_search': 25.0,
 'display': 0.6912442396313364,
 'direct': 0.0}

### Method -- Linear Attribution

In [154]:
# Per-channel credit totals for converted ('True') and non-converted ('False')
# customers under linear attribution.

LA_count = dict(
    [('Method', 'Linear')]
    + [(name, {'True': 0, 'False': 0}) for name in channel]
)

In [155]:
# Iterate over the dataframe and share each customer's single unit of credit
# evenly across every channel they touched.
for row in df_c.itertuples():
    # row[2]..row[6] are touch_1..touch_5; keep the ones that are not NaN
    touches = [row[pos] for pos in range(2, 7) if not pd.isnull(row[pos])]

    # A row with no recorded touch has nothing to credit. Skip it, as the
    # first/last interaction loops do, instead of dividing by zero.
    if not touches:
        continue

    # Allocate the credit linearly (evenly) to all channels used
    avg_credit = 1 / len(touches)

    # convert_TF (row[1]) is a boolean, so it picks the tally bucket directly
    outcome = 'True' if row[1] else 'False'

    for name in touches:
        LA_count[name][outcome] += avg_credit


In [156]:
LA_count

{'Method': 'Linear',
 'social': {'True': 353.16666666666396, 'False': 34408.71666665843},
 'organic_search': {'True': 667.3333333333375, 'False': 51663.73333335842},
 'referral': {'True': 10.283333333333335, 'False': 658.916666666671},
 'email': {'True': 337.61666666666446, 'False': 25357.6500000015},
 'paid_search': {'True': 10.566666666666668, 'False': 673.4500000000086},
 'display': {'True': 416.6666666666625, 'False': 56946.86666669063},
 'direct': {'True': 582.3666666666646, 'False': 46324.66666668092}}

In [157]:
# Check result: the linear credits must add back up to one per customer

su = 0

# Sum the linear attribution tallies (LA_count), not the first-interaction ones
for key in LA_count.keys():
    if key != 'Method':
        su += LA_count[key]['True']
        su += LA_count[key]['False']

# Credits are fractions (1/2, 1/3, ...), so the float total is rounded before
# it is compared with the integer row count
print(round(su))
print(round(su) == df_c.shape[0])

218412
True


In [158]:
CAC_calc(LA_count)

{'Method': 'Linear',
 'social': 0.8494572911750891,
 'organic_search': 0.0,
 'referral': 29.17341977309562,
 'email': 0.8885817248358651,
 'paid_search': 28.39116719242902,
 'display': 0.7200000000000072,
 'direct': 0.0}

### Method -- Position Based

In [166]:
# Position-based rule used below:
#   - one channel only        -> 100% of the credit to that channel
#   - two channels            -> 50% each
#   - three or more channels  -> 40% to the first touch, 40% to the last touch,
#                                remaining 20% split evenly among the middle touches


# Per-channel credit totals for converted ('True') and non-converted ('False') customers

PB_count = {'Method': 'Position Based'}
PB_count.update({name: {'True': 0, 'False': 0} for name in channel})

In [167]:
# Iterate over the dataframe and split each customer's single unit of credit
# according to the position of each touch in the journey.
for row in df_c.itertuples():
    # row[2]..row[6] are touch_1..touch_5; keep the ones that are not NaN
    touches = [row[pos] for pos in range(2, 7) if not pd.isnull(row[pos])]

    # A row with no recorded touch has nothing to credit. Skip it, as the
    # first/last interaction loops do, instead of failing on touches[0].
    if not touches:
        continue

    # convert_TF (row[1]) is a boolean, so it picks the tally bucket directly
    outcome = 'True' if row[1] else 'False'
    n_touches = len(touches)

    # One channel only situation
    if n_touches == 1:
        PB_count[touches[0]][outcome] += 1

    # Two channels situation
    elif n_touches == 2:
        for name in touches:
            PB_count[name][outcome] += 0.5

    # More than two channels
    else:
        # Allocate 40% credit to the first and to the last channel
        PB_count[touches[0]][outcome] += 0.4
        PB_count[touches[-1]][outcome] += 0.4

        # Allocate the remaining 20% credit evenly to the middle channel(s)
        middle_credit = 0.2 / (n_touches - 2)
        for name in touches[1:-1]:
            PB_count[name][outcome] += middle_credit

In [168]:
PB_count

{'Method': 'Position Based',
 'social': {'True': 349.29999999999717, 'False': 34452.20000002224},
 'organic_search': {'True': 669.0333333333288, 'False': 51675.233333359},
 'referral': {'True': 10.300000000000002, 'False': 666.1666666666621},
 'email': {'True': 340.5666666666644, 'False': 25354.166666679663},
 'paid_search': {'True': 11.233333333333336, 'False': 662.7666666666651},
 'display': {'True': 416.69999999999635, 'False': 56896.73333335448},
 'direct': {'True': 580.866666666661, 'False': 46326.73333335914}}

In [169]:
# Sanity check: the fractional credits should add back up to one per customer

su = 0

for name, tally in PB_count.items():
    if name == 'Method':
        continue
    su += tally['True']
    su += tally['False']

# The total is a float, so it is rounded before comparing with the row count
total_rounded = round(su)
print(total_rounded)
print(total_rounded == df_c.shape[0])

218412
True


In [170]:
CAC_calc(PB_count)

{'Method': 'Position Based',
 'social': 0.858860578299463,
 'organic_search': 0.0,
 'referral': 29.126213592233004,
 'email': 0.8808847998434042,
 'paid_search': 26.706231454005927,
 'display': 0.7199424046076377,
 'direct': 0.0}