In [169]:
import pandas as pd
from icecream import ic
import numpy as np

## Set the path to the input data (export with duplicate rows removed)

In [170]:
# Location of the de-duplicated event export (a CSV file, despite the `_dir` name).
export_dir = 'C:/Users/lavil/source/repos/LukVill/Misc Data/export_no_dup.csv'

# Row cap used for quick iteration. NOTE: capping the read can cut short the
# journey of whichever customer/account pair straddles the cutoff; set this to
# None to read the whole file.
N_ROWS = 1000

export = pd.read_csv(export_dir, nrows=N_ROWS)

In [171]:
# Peek at the first rows to check which columns were loaded.
export.head()

Unnamed: 0.1,Unnamed: 0,customer_id,account_id,ed_id,event_name,Date,Time
0,0,-784961211,1773350293,12,application_web_approved,2023-03-22,08:45:22
1,1,-784961211,1773350293,19,application_web_view,2023-03-22,13:32:10
2,14,-784961211,1773350293,3,application_web_submit,2023-03-22,13:32:10
3,15,-784961211,1773350293,2,campaign_click,2023-03-22,14:45:22
4,16,-784961211,1773350293,19,application_web_view,2023-07-27,14:57:56


In [172]:
# Longest journey = the largest number of event rows recorded for any single
# (customer_id, account_id) pair. `size()` counts rows per group directly, so the
# result does not depend on which column happens to come first, nor on that
# column containing missing values (which `count()` would leave out).
max_journey_len = export.groupby(["customer_id", "account_id"]).size().max()
max_journey_len

195

In [173]:
# One row per distinct (customer_id, account_id) pair, re-indexed from 0, plus a
# placeholder `events` column. The placeholder is cast to object dtype so that
# each cell can later hold a whole array instead of a single number.
res = (
    export[["customer_id", "account_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(events=0)
    .astype({"events": object})
)

In [174]:
# Build the zero-padded event sequence for every customer/account pair:
#   1. filter `export` down to the pair's rows
#   2. order them by Date, then Time
#   3. pull out the ed_id values as a NumPy array
#   4. right-pad with zeros up to max_journey_len
#   5. store the array in the pair's `events` cell of `res`

for i in range(len(res)):
    # get data pair for index
    pair = res.iloc[i]
    # look the keys up by column name; positional `pair[0]` on a labelled
    # Series is deprecated in recent pandas
    cust = pair["customer_id"]
    acct = pair["account_id"]

    # filter data to pair, earliest Date/Time first
    is_pair = (export["customer_id"] == cust) & (export["account_id"] == acct)
    df = export.loc[is_pair].sort_values(by=["Date", "Time"])

    # always continue with a plain ndarray, so that a journey that is already
    # max_journey_len long is stored the same way as the padded ones
    events = df["ed_id"].to_numpy()

    # number of padding zeros needed; negative means this journey is longer
    # than the supposed maximum, i.e. max_journey_len is stale or wrong
    num_zeros = max_journey_len - len(events)
    if num_zeros < 0:
        raise ValueError(
            f"customer {cust} / account {acct} has {len(events)} events, "
            f"which exceeds max journey length {max_journey_len}"
        )

    # pad events to reach max journey (appends nothing when num_zeros == 0)
    events = np.append(events, np.repeat(0, num_zeros))

    # convert and set; object dtype matches the dtype of the `events` column
    res.at[i, "events"] = events.astype(object)
    # res is now of unique cust/acct with events


In [175]:
# now, mutate a new column so that it labels the rows such that:
# if events 29,12,15 only exist, 
# then label as Activated, No Order
# if events 7 and 18 only exist,
# then label as Ordered, Not Activated
# if events from both lists exist,
# label as Activated and Ordered

# function for labeling
# var x is array
def event_label(x):
    """Classify one journey by which milestone events it contains.

    Parameters
    ----------
    x : array-like
        Event ids of a single journey. Zero padding has no effect on the
        label because 0 appears in neither id list below.

    Returns
    -------
    str
        "Activated and Ordered"  - at least one id from each list is present
        "Activated, No Order"    - some of 29, 12, 15 present, none of 7, 18
        "Ordered, Not Activated" - some of 7, 18 present, none of 29, 12, 15
        "Neither"                - no id from either list is present
    """
    act_list = np.array([29, 12, 15])
    ord_list = np.array([7, 18])

    # np.isin is the supported replacement for np.in1d (deprecated in
    # NumPy 2.0). Reducing with .any() on the mask also works when x is a
    # scalar or an empty array.
    is_act = bool(np.isin(x, act_list).any())
    is_ord = bool(np.isin(x, ord_list).any())

    # activated and ordered
    if is_act and is_ord:
        return "Activated and Ordered"

    # activated only
    if is_act:
        return "Activated, No Order"

    # ordered only
    if is_ord:
        return "Ordered, Not Activated"

    # accounts neither fit
    return "Neither"

# vectorize function: the wrapper calls event_label once per element of its
# input, so it can be applied to a column whose cells each hold one journey
event_label_vec = np.vectorize(event_label)

In [181]:
# Label every pair: run the vectorized labelling function over the `events`
# column (one journey array per row) and store the results in a new
# `event_label` column.

res["event_label"] = event_label_vec(res["events"])

In [182]:
# check output: display up to the first 15 rows of the labelled table
res.head(n = 15)

Unnamed: 0,customer_id,account_id,events,event_label
0,-784961211,1773350293,"[12, 19, 3, 2, 19, 19, 0, 0, 0, 0, 0, 0, 0, 0,...","Activated, No Order"
1,15849251,383997507,"[4, 4, 4, 4, 4, 2, 4, 4, 19, 19, 19, 19, 19, 3...","Activated, No Order"
2,155529381,1786111954,"[2, 12, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","Activated, No Order"
3,-1697747935,-1016455199,"[19, 19, 19, 19, 19, 3, 19, 12, 19, 19, 19, 19...","Activated, No Order"
4,-986357645,1289564882,"[12, 4, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","Activated, No Order"
5,689736168,900049321,"[12, 19, 3, 1, 4, 24, 0, 0, 0, 0, 0, 0, 0, 0, ...","Activated, No Order"
6,1938053063,1808210098,"[19, 19, 19, 19, 19, 19, 19, 19, 3, 12, 2, 0, ...","Activated, No Order"
7,-1522106248,-693958153,"[19, 19, 19, 19, 3, 19, 12, 4, 4, 4, 4, 4, 4, ...","Activated, No Order"
8,1483781929,-47514109,"[12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","Activated, No Order"
9,-484079419,1366678149,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 3, 12...","Activated, No Order"


## SAVE OUTPUT TO THIS FILE PATH

In [None]:
# SAVE OUTPUT

# FOLDER the prepared table is meant to be written into
save_dir = "C:/Users/lavil/source/repos/LukVill/Misc Data"

# The write below is currently disabled (commented out); nothing is saved when
# this cell runs.
# NOTE(review): each `events` cell holds an array, which a CSV can only store
# as text — confirm the downstream reader can parse that before enabling.
# res.to_csv(save_dir + "/export_prepped.csv")